Commit e954e954 authored by vogelheim's avatar vogelheim Committed by Commit bot

Remove unused Utf8ToUtf16CharacterStream.

- The static method CopyChars was actually used and has been extracted into
  a file-local helper, CopyUtf8CharsToUtf16Chars.
- The class itself was used only in tests, where it has been replaced with
  ExternalOneByteStringUtf16CharacterStream.
- Only one test actually relied on UTF-8 handling (as opposed to ASCII only),
  and that was the test testing Utf8ToUtf16CharacterStream itself.

+66 -277 LOC :)

BUG=v8:4947

Review-Url: https://codereview.chromium.org/2256273002
Cr-Commit-Position: refs/heads/master@{#38824}
parent 3a0a24dc
......@@ -16,6 +16,33 @@ namespace internal {
namespace {
// Decodes UTF-8 bytes from |src| into UTF-16 code units stored in |dest|.
//
//   dest:       output buffer for the decoded UTF-16 code units.
//   length:     capacity of |dest|, measured in code units.
//   src:        UTF-8 encoded input bytes.
//   src_pos:    in/out byte offset into |src|. It is incremented directly for
//               one-byte characters and handed to Utf8::CalculateValue for
//               multi-byte sequences.
//   src_length: size of |src| in bytes.
//
// Returns the number of code units written to |dest|. Characters above
// Utf16::kMaxNonSurrogateCharCode are written as a lead/trail surrogate pair.
size_t CopyUtf8CharsToUtf16Chars(uint16_t* dest, size_t length, const byte* src,
                                 size_t* src_pos, size_t src_length) {
  static const unibrow::uchar kMaxUtf16Character =
      unibrow::Utf16::kMaxNonSurrogateCharCode;
  size_t i = 0;
  // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
  // one character early (in the normal case), because we need to have at least
  // two free spaces in the buffer to be sure that the next character will fit.
  //
  // The bound is written as |i + 1 < length| instead of |i < length - 1| so
  // that length == 0 does not wrap around to SIZE_MAX and write past |dest|.
  while (i + 1 < length) {
    if (*src_pos == src_length) break;  // Input exhausted.
    unibrow::uchar c = src[*src_pos];
    if (c <= unibrow::Utf8::kMaxOneByteChar) {
      // One-byte character: the byte is the character value.
      *src_pos = *src_pos + 1;
    } else {
      // Multi-byte sequence: let the decoder compute the value from the
      // remaining input.
      c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
                                        src_pos);
    }
    if (c > kMaxUtf16Character) {
      // Does not fit in one UTF-16 code unit; emit a surrogate pair.
      dest[i++] = unibrow::Utf16::LeadSurrogate(c);
      dest[i++] = unibrow::Utf16::TrailSurrogate(c);
    } else {
      dest[i++] = static_cast<uc16>(c);
    }
  }
  return i;
}
size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src,
size_t* src_pos, size_t src_length,
ScriptCompiler::StreamedSource::Encoding encoding) {
......@@ -24,8 +51,7 @@ size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src,
if (length == 0) return 0;
if (encoding == ScriptCompiler::StreamedSource::UTF8) {
return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
dest, length, src, src_pos, src_length);
return CopyUtf8CharsToUtf16Chars(dest, length, src, src_pos, src_length);
}
size_t to_fill = length;
......@@ -175,163 +201,7 @@ size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
// ----------------------------------------------------------------------------
// Utf8ToUtf16CharacterStream
// Creates a stream over the |length| bytes of UTF-8 data at |data|. Only the
// pointer is stored (no copy is made here), and both the raw byte position and
// the raw character position start at zero. The constructor finishes by
// calling ReadBlock(), which is defined elsewhere -- presumably it fills the
// initial buffer; confirm against the base class.
Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
size_t length)
: BufferedUtf16CharacterStream(),
raw_data_(data),
raw_data_length_(length),
raw_data_pos_(0),
raw_character_position_(0) {
ReadBlock();
}
// Nothing to release here: the destructor body is empty.
Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
// Decodes UTF-8 bytes from |src| into UTF-16 code units stored in |dest|.
//
//   dest:       output buffer for the decoded UTF-16 code units.
//   length:     capacity of |dest|, measured in code units.
//   src:        UTF-8 encoded input bytes.
//   src_pos:    in/out byte offset into |src|. It is incremented directly for
//               one-byte characters and handed to Utf8::CalculateValue for
//               multi-byte sequences.
//   src_length: size of |src| in bytes.
//
// Returns the number of code units written to |dest|. Characters above
// Utf16::kMaxNonSurrogateCharCode are written as a lead/trail surrogate pair.
size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length,
                                             const byte* src, size_t* src_pos,
                                             size_t src_length) {
  static const unibrow::uchar kMaxUtf16Character =
      unibrow::Utf16::kMaxNonSurrogateCharCode;
  size_t i = 0;
  // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
  // one character early (in the normal case), because we need to have at least
  // two free spaces in the buffer to be sure that the next character will fit.
  //
  // The bound is written as |i + 1 < length| instead of |i < length - 1| so
  // that length == 0 does not wrap around to SIZE_MAX and write past |dest|.
  while (i + 1 < length) {
    if (*src_pos == src_length) break;  // Input exhausted.
    unibrow::uchar c = src[*src_pos];
    if (c <= unibrow::Utf8::kMaxOneByteChar) {
      // One-byte character: the byte is the character value.
      *src_pos = *src_pos + 1;
    } else {
      // Multi-byte sequence: let the decoder compute the value from the
      // remaining input.
      c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
                                        src_pos);
    }
    if (c > kMaxUtf16Character) {
      // Does not fit in one UTF-16 code unit; emit a surrogate pair.
      dest[i++] = unibrow::Utf16::LeadSurrogate(c);
      dest[i++] = unibrow::Utf16::TrailSurrogate(c);
    } else {
      dest[i++] = static_cast<uc16>(c);
    }
  }
  return i;
}
// Moves the stream forward by up to |delta| characters. The raw cursor is
// spooled to the requested position, |pos_| is synchronized with the position
// actually reached, and the block at that position is read. Returns how far
// |pos_| moved from where it started.
size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) {
  const size_t start = pos_;
  SetRawPosition(start + delta);
  // The raw cursor may have stopped short of the target; follow it.
  pos_ = raw_character_position_;
  ReadBlock();
  return pos_ - start;
}
// Refills |buffer_| with decoded characters starting at character position
// |char_position|. Returns the number of UTF-16 code units placed in the
// buffer, or zero when |char_position| cannot be reached.
size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) {
  SetRawPosition(char_position);
  if (raw_character_position_ == char_position) {
    const size_t copied = CopyChars(buffer_, kBufferSize, raw_data_,
                                    &raw_data_pos_, raw_data_length_);
    raw_character_position_ = char_position + copied;
    return copied;
  }
  // char_position was not a valid position in the stream (the end was hit
  // while spooling towards it).
  return 0u;
}
// Bit patterns used to classify a byte of a multi-byte UTF-8 sequence by its
// two most significant bits.
static const byte kUtf8MultiByteMask = 0xC0;
static const byte kUtf8MultiByteCharFollower = 0x80;

#ifdef DEBUG
static const byte kUtf8MultiByteCharStart = 0xC0;

// Debug-only helper: true when the two top bits of |first_byte| are both set.
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  const int top_bits = first_byte & kUtf8MultiByteMask;
  return top_bits == kUtf8MultiByteCharStart;
}
#endif

// True when the two top bits of |later_byte| are "10".
static bool IsUtf8MultiCharacterFollower(byte later_byte) {
  const int top_bits = later_byte & kUtf8MultiByteMask;
  return top_bits == kUtf8MultiByteCharFollower;
}
// Move the cursor back to point at the preceding UTF-8 character start
// in the buffer.
static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) {
byte character = buffer[--*cursor];
if (character > unibrow::Utf8::kMaxOneByteChar) {
DCHECK(IsUtf8MultiCharacterFollower(character));
// Last byte of a multi-byte character encoding. Step backwards until
// pointing to the first byte of the encoding, recognized by having the
// top two bits set.
while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
}
}
// Move the cursor forward to point at the next following UTF-8 character start
// in the buffer.
//
// |buffer| is the UTF-8 data and |*cursor| is an in/out byte index into it.
// On entry the cursor is expected to index the first byte of a character
// (DCHECKed for multi-byte characters); on return it has advanced by one byte
// for a one-byte character, or by one plus the number of additional bytes
// implied by the lead byte otherwise. No bounds are checked here.
static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) {
byte character = buffer[(*cursor)++];
if (character > unibrow::Utf8::kMaxOneByteChar) {
// First character of a multi-byte character encoding.
// The number of most-significant one-bits determines the length of the
// encoding:
// 110..... - (0xCx, 0xDx) one additional byte (minimum).
// 1110.... - (0xEx) two additional bytes.
// 11110... - (0xFx) three additional bytes (maximum).
DCHECK(IsUtf8MultiCharacterStart(character));
// Additional bytes is:
// 1 if value in range 0xC0 .. 0xDF.
// 2 if value in range 0xE0 .. 0xEF.
// 3 if value in range 0xF0 .. 0xF7.
// Encode that in a single value.
// 0x3211 is a four-entry table with one nibble per entry. The shift amount
// ((character - 0xC0) >> 2) & 0xC is 0, 4, 8 or 12 for lead bytes 0xCx, 0xDx,
// 0xEx and 0xFx respectively, selecting the nibbles 1, 1, 2 and 3.
size_t additional_bytes =
((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
*cursor += additional_bytes;
// NOTE(review): this check indexes |buffer| from its start instead of from
// the cursor, so it only inspects the byte after the character when the
// character began at index 0. It presumably was meant to look at
// buffer[*cursor]; confirm before changing, since that byte may lie one past
// the end of the data when the character is the last one.
DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
}
}
// Spools the raw UTF-8 cursor (|raw_data_pos_|) so that it corresponds to the
// UTF-16 character position |target_position|, keeping
// |raw_character_position_| in step.
//
// A position between the two halves of a surrogate pair cannot be reached,
// because no byte offset in the UTF-8 data corresponds to it. This relies on
// the pair being correctly coded as one 4 byte UTF-8 sequence; if it is
// illegally coded as two 3 byte sequences there is no problem here.
void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) {
  if (raw_character_position_ > target_position) {
    // Walk backwards through the UTF-8 data, one character at a time.
    do {
      const size_t before = raw_data_pos_;
      Utf8CharacterBack(raw_data_, &raw_data_pos_);
      const size_t width = before - raw_data_pos_;
      DCHECK(width <= 4);
      // A 4 byte sequence stands for a surrogate pair, i.e. two code units.
      raw_character_position_ -= (width == 4) ? 2 : 1;
    } while (raw_character_position_ > target_position);
    // No surrogate pair splitting.
    DCHECK(raw_character_position_ == target_position);
    return;
  }

  // Walk forwards through the UTF-8 data, giving up at the end of the input.
  while (raw_character_position_ < target_position) {
    if (raw_data_pos_ == raw_data_length_) return;
    const size_t before = raw_data_pos_;
    Utf8CharacterForward(raw_data_, &raw_data_pos_);
    const size_t width = raw_data_pos_ - before;
    DCHECK(width <= 4);
    // A 4 byte sequence stands for a surrogate pair, i.e. two code units.
    raw_character_position_ += (width == 4) ? 2 : 1;
  }
  // No surrogate pair splitting.
  DCHECK(raw_character_position_ == target_position);
}
// ExternalStreamingStream
size_t ExternalStreamingStream::FillBuffer(size_t position) {
// Ignore "position" which is the position in the decoded data. Instead,
......@@ -599,6 +469,16 @@ ExternalOneByteStringUtf16CharacterStream::
pos_ = start_position;
}
// Creates a stream over |length| chars starting at |data|. The pointer is
// stored reinterpreted as uint8_t data (no copy is made here), and the
// bookmark starts out as kNoBookmark.
ExternalOneByteStringUtf16CharacterStream::
ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length)
: raw_data_(reinterpret_cast<const uint8_t*>(data)),
length_(length),
bookmark_(kNoBookmark) {}
// Convenience overload that delegates to the (data, length) constructor with
// the length taken from strlen(), so |data| must be a non-null, NUL-terminated
// string.
ExternalOneByteStringUtf16CharacterStream::
ExternalOneByteStringUtf16CharacterStream(const char* data)
: ExternalOneByteStringUtf16CharacterStream(data, strlen(data)) {}
bool ExternalOneByteStringUtf16CharacterStream::SetBookmark() {
bookmark_ = pos_;
return true;
......
......@@ -64,29 +64,6 @@ class GenericStringUtf16CharacterStream: public BufferedUtf16CharacterStream {
};
// Utf16 stream based on a literal UTF-8 string.
class Utf8ToUtf16CharacterStream: public BufferedUtf16CharacterStream {
public:
// |data| points at |length| bytes of UTF-8 encoded input.
Utf8ToUtf16CharacterStream(const byte* data, size_t length);
~Utf8ToUtf16CharacterStream() override;
// Converts UTF-8 bytes from |src| (|src_length| bytes, starting at byte
// offset |*src_pos|) into UTF-16 code units in |dest|, which has room for
// |length| code units. Returns a count of type size_t.
static size_t CopyChars(uint16_t* dest, size_t length, const byte* src,
size_t* src_pos, size_t src_length);
protected:
// Overrides of the BufferedUtf16CharacterStream interface.
size_t BufferSeekForward(size_t delta) override;
size_t FillBuffer(size_t char_position) override;
// Takes a character position (not a byte offset), per the parameter name.
void SetRawPosition(size_t char_position);
const byte* raw_data_;
size_t raw_data_length_; // Measured in bytes, not characters.
size_t raw_data_pos_;
// The character position of the character at raw_data[raw_data_pos_].
// Not necessarily the same as pos_.
size_t raw_character_position_;
};
// ExternalStreamingStream is a wrapper around an ExternalSourceStream (see
// include/v8.h) subclass implemented by the embedder.
class ExternalStreamingStream : public BufferedUtf16CharacterStream {
......@@ -193,6 +170,10 @@ class ExternalOneByteStringUtf16CharacterStream
int end_position);
~ExternalOneByteStringUtf16CharacterStream() override;
// For testing:
explicit ExternalOneByteStringUtf16CharacterStream(const char* data);
ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length);
bool SetBookmark() override;
void ResetToBookmark() override;
......
......@@ -65,14 +65,13 @@ TEST(ScanKeywords) {
KeywordToken key_token;
i::UnicodeCache unicode_cache;
i::byte buffer[32];
char buffer[32];
for (int i = 0; (key_token = keywords[i]).keyword != NULL; i++) {
const i::byte* keyword =
reinterpret_cast<const i::byte*>(key_token.keyword);
int length = i::StrLength(key_token.keyword);
const char* keyword = key_token.keyword;
size_t length = strlen(key_token.keyword);
CHECK(static_cast<int>(sizeof(buffer)) >= length);
{
i::Utf8ToUtf16CharacterStream stream(keyword, length);
i::ExternalOneByteStringUtf16CharacterStream stream(keyword, length);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(key_token.token, scanner.Next());
......@@ -80,7 +79,7 @@ TEST(ScanKeywords) {
}
// Removing characters will make keyword matching fail.
{
i::Utf8ToUtf16CharacterStream stream(keyword, length - 1);
i::ExternalOneByteStringUtf16CharacterStream stream(keyword, length - 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
......@@ -91,7 +90,7 @@ TEST(ScanKeywords) {
for (int j = 0; j < static_cast<int>(arraysize(chars_to_append)); ++j) {
i::MemMove(buffer, keyword, length);
buffer[length] = chars_to_append[j];
i::Utf8ToUtf16CharacterStream stream(buffer, length + 1);
i::ExternalOneByteStringUtf16CharacterStream stream(buffer, length + 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
......@@ -101,7 +100,7 @@ TEST(ScanKeywords) {
{
i::MemMove(buffer, keyword, length);
buffer[length - 1] = '_';
i::Utf8ToUtf16CharacterStream stream(buffer, length);
i::ExternalOneByteStringUtf16CharacterStream stream(buffer, length);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
......@@ -166,9 +165,8 @@ TEST(ScanHTMLEndComments) {
i::GetCurrentStackPosition() - 128 * 1024);
uintptr_t stack_limit = CcTest::i_isolate()->stack_guard()->real_climit();
for (int i = 0; tests[i]; i++) {
const i::byte* source =
reinterpret_cast<const i::byte*>(tests[i]);
i::Utf8ToUtf16CharacterStream stream(source, i::StrLength(tests[i]));
const char* source = tests[i];
i::ExternalOneByteStringUtf16CharacterStream stream(source);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -184,9 +182,8 @@ TEST(ScanHTMLEndComments) {
}
for (int i = 0; fail_tests[i]; i++) {
const i::byte* source =
reinterpret_cast<const i::byte*>(fail_tests[i]);
i::Utf8ToUtf16CharacterStream stream(source, i::StrLength(fail_tests[i]));
const char* source = fail_tests[i];
i::ExternalOneByteStringUtf16CharacterStream stream(source);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -344,9 +341,7 @@ TEST(StandAlonePreParser) {
uintptr_t stack_limit = CcTest::i_isolate()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::ExternalOneByteStringUtf16CharacterStream stream(program);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -380,9 +375,7 @@ TEST(StandAlonePreParserNoNatives) {
uintptr_t stack_limit = CcTest::i_isolate()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::ExternalOneByteStringUtf16CharacterStream stream(program);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -451,9 +444,7 @@ TEST(RegressChromium62639) {
// and then used the invalid currently scanned literal. This always
// failed in debug mode, and sometimes crashed in release mode.
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::ExternalOneByteStringUtf16CharacterStream stream(program);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -537,9 +528,8 @@ TEST(PreParseOverflow) {
uintptr_t stack_limit = CcTest::i_isolate()->stack_guard()->real_climit();
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program.get()),
static_cast<unsigned>(kProgramSize));
i::ExternalOneByteStringUtf16CharacterStream stream(program.get(),
kProgramSize);
i::CompleteParserRecorder log;
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......@@ -607,8 +597,8 @@ void TestCharacterStream(const char* one_byte_source, unsigned length,
end);
i::GenericStringUtf16CharacterStream string_stream(one_byte_string, start,
end);
i::Utf8ToUtf16CharacterStream utf8_stream(
reinterpret_cast<const i::byte*>(one_byte_source), end);
i::ExternalOneByteStringUtf16CharacterStream utf8_stream(one_byte_source,
end);
utf8_stream.SeekForward(start);
unsigned i = start;
......@@ -715,6 +705,7 @@ void TestCharacterStream(const char* one_byte_source, unsigned length,
CHECK_LT(c4, 0);
}
#undef CHECK_EQU
TEST(CharacterStreams) {
v8::Isolate* isolate = CcTest::isolate();
......@@ -737,63 +728,6 @@ TEST(CharacterStreams) {
}
// Exercises Utf8ToUtf16CharacterStream over the UTF-8 encoding of every code
// point from 0 to Utf8::kMaxThreeByteChar: sequential reads, pushing every
// character back, and then reading again while seeking forward in steps.
TEST(Utf8CharacterStream) {
static const unsigned kMaxUC16CharU = unibrow::Utf8::kMaxThreeByteChar;
static const int kMaxUC16Char = static_cast<int>(kMaxUC16CharU);
// Total encoded size: one byte for each code point up to kMaxOneByteChar, two
// for each up to kMaxTwoByteChar, and three for each up to kMaxThreeByteChar.
static const int kAllUtf8CharsSize =
(unibrow::Utf8::kMaxOneByteChar + 1) +
(unibrow::Utf8::kMaxTwoByteChar - unibrow::Utf8::kMaxOneByteChar) * 2 +
(unibrow::Utf8::kMaxThreeByteChar - unibrow::Utf8::kMaxTwoByteChar) * 3;
static const unsigned kAllUtf8CharsSizeU =
static_cast<unsigned>(kAllUtf8CharsSize);
char buffer[kAllUtf8CharsSizeU];
unsigned cursor = 0;
// Encode every code point back to back into |buffer|.
for (int i = 0; i <= kMaxUC16Char; i++) {
cursor += unibrow::Utf8::Encode(buffer + cursor, i,
unibrow::Utf16::kNoPreviousCharacter, true);
}
// The encoder must have produced exactly the predicted number of bytes.
CHECK(cursor == kAllUtf8CharsSizeU);
i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
kAllUtf8CharsSizeU);
int32_t bad = unibrow::Utf8::kBadChar;
// Pass 1: read sequentially. Code points in the surrogate range
// 0xd800..0xdfff are expected to come back as kBadChar; all others must
// round-trip, and pos() must advance by one per character.
for (int i = 0; i <= kMaxUC16Char; i++) {
CHECK_EQU(i, stream.pos());
int32_t c = stream.Advance();
if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else {
CHECK_EQ(i, c);
}
CHECK_EQU(i + 1, stream.pos());
}
// Pass 2: push every character back, from the last to the first, checking
// that pos() steps down by one each time.
for (int i = kMaxUC16Char; i >= 0; i--) {
CHECK_EQU(i + 1, stream.pos());
stream.PushBack(i);
CHECK_EQU(i, stream.pos());
}
// Pass 3: repeatedly seek forward by 12 and read one character. |i| tracks
// the expected position, using the progress SeekForward() reports. Reads
// past the last code point are expected to return -1.
int i = 0;
while (stream.pos() < kMaxUC16CharU) {
CHECK_EQU(i, stream.pos());
int progress = static_cast<int>(stream.SeekForward(12));
i += progress;
int32_t c = stream.Advance();
if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else if (i <= kMaxUC16Char) {
CHECK_EQ(i, c);
} else {
CHECK_EQ(-1, c);
}
i += 1;
CHECK_EQU(i, stream.pos());
}
}
#undef CHECK_EQU
void TestStreamScanner(i::Utf16CharacterStream* stream,
i::Token::Value* expected_tokens,
int skip_pos = 0, // Zero means not skipping.
......@@ -818,8 +752,7 @@ TEST(StreamScanner) {
v8::V8::Initialize();
const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
i::Utf8ToUtf16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
static_cast<unsigned>(strlen(str1)));
i::ExternalOneByteStringUtf16CharacterStream stream1(str1);
i::Token::Value expectations1[] = {
i::Token::LBRACE,
i::Token::IDENTIFIER,
......@@ -837,8 +770,7 @@ TEST(StreamScanner) {
TestStreamScanner(&stream1, expectations1, 0, 0);
const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
i::Utf8ToUtf16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
static_cast<unsigned>(strlen(str2)));
i::ExternalOneByteStringUtf16CharacterStream stream2(str2);
i::Token::Value expectations2[] = {
i::Token::CASE,
i::Token::DEFAULT,
......@@ -868,18 +800,14 @@ TEST(StreamScanner) {
for (int i = 0; i <= 4; i++) {
expectations3[6 - i] = i::Token::ILLEGAL;
expectations3[5 - i] = i::Token::EOS;
i::Utf8ToUtf16CharacterStream stream3(
reinterpret_cast<const i::byte*>(str3),
static_cast<unsigned>(strlen(str3)));
i::ExternalOneByteStringUtf16CharacterStream stream3(str3);
TestStreamScanner(&stream3, expectations3, 1, 1 + i);
}
}
void TestScanRegExp(const char* re_source, const char* expected) {
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(re_source),
static_cast<unsigned>(strlen(re_source)));
i::ExternalOneByteStringUtf16CharacterStream stream(re_source);
i::HandleScope scope(CcTest::i_isolate());
i::Scanner scanner(CcTest::i_isolate()->unicode_cache());
scanner.Initialize(&stream);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment