Commit f93f8ded authored by dcarney@chromium.org's avatar dcarney@chromium.org

String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched
surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is
set.  This is done to avoid creating invalid UTF-8 output which can lead to
compatibility issues with software requiring valid UTF-8 inputs (e.g. the
WebSocket protocol requires valid UTF-8 and terminates connections when invalid
UTF-8 is encountered).

R=dcarney@chromium.org

BUG=

Review URL: https://codereview.chromium.org/121173009

Patch from Felix Geisendörfer <haimuiba@gmail.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 04b1baa4
...@@ -1628,7 +1628,11 @@ class V8_EXPORT String : public Primitive { ...@@ -1628,7 +1628,11 @@ class V8_EXPORT String : public Primitive {
NO_OPTIONS = 0, NO_OPTIONS = 0,
HINT_MANY_WRITES_EXPECTED = 1, HINT_MANY_WRITES_EXPECTED = 1,
NO_NULL_TERMINATION = 2, NO_NULL_TERMINATION = 2,
PRESERVE_ASCII_NULL = 4 PRESERVE_ASCII_NULL = 4,
// Used by WriteUtf8 to replace orphan surrogate code units with the
// unicode replacement character. Needs to be set to guarantee valid UTF-8
// output.
REPLACE_INVALID_UTF8 = 8
}; };
// 16-bit character codes. // 16-bit character codes.
......
...@@ -4454,28 +4454,35 @@ int String::Utf8Length() const { ...@@ -4454,28 +4454,35 @@ int String::Utf8Length() const {
class Utf8WriterVisitor { class Utf8WriterVisitor {
public: public:
Utf8WriterVisitor( Utf8WriterVisitor(
char* buffer, int capacity, bool skip_capacity_check) char* buffer,
: early_termination_(false), int capacity,
last_character_(unibrow::Utf16::kNoPreviousCharacter), bool skip_capacity_check,
buffer_(buffer), bool replace_invalid_utf8)
start_(buffer), : early_termination_(false),
capacity_(capacity), last_character_(unibrow::Utf16::kNoPreviousCharacter),
skip_capacity_check_(capacity == -1 || skip_capacity_check), buffer_(buffer),
utf16_chars_read_(0) { start_(buffer),
capacity_(capacity),
skip_capacity_check_(capacity == -1 || skip_capacity_check),
replace_invalid_utf8_(replace_invalid_utf8),
utf16_chars_read_(0) {
} }
static int WriteEndCharacter(uint16_t character, static int WriteEndCharacter(uint16_t character,
int last_character, int last_character,
int remaining, int remaining,
char* const buffer) { char* const buffer,
bool replace_invalid_utf8) {
using namespace unibrow; using namespace unibrow;
ASSERT(remaining > 0); ASSERT(remaining > 0);
// We can't use a local buffer here because Encode needs to modify // We can't use a local buffer here because Encode needs to modify
// previous characters in the stream. We know, however, that // previous characters in the stream. We know, however, that
// exactly one character will be advanced. // exactly one character will be advanced.
if (Utf16::IsTrailSurrogate(character) && if (Utf16::IsSurrogatePair(last_character, character)) {
Utf16::IsLeadSurrogate(last_character)) { int written = Utf8::Encode(buffer,
int written = Utf8::Encode(buffer, character, last_character); character,
last_character,
replace_invalid_utf8);
ASSERT(written == 1); ASSERT(written == 1);
return written; return written;
} }
...@@ -4484,7 +4491,8 @@ class Utf8WriterVisitor { ...@@ -4484,7 +4491,8 @@ class Utf8WriterVisitor {
// Can't encode using last_character as gcc has array bounds issues. // Can't encode using last_character as gcc has array bounds issues.
int written = Utf8::Encode(temp_buffer, int written = Utf8::Encode(temp_buffer,
character, character,
Utf16::kNoPreviousCharacter); Utf16::kNoPreviousCharacter,
replace_invalid_utf8);
// Won't fit. // Won't fit.
if (written > remaining) return 0; if (written > remaining) return 0;
// Copy over the character from temp_buffer. // Copy over the character from temp_buffer.
...@@ -4494,6 +4502,16 @@ class Utf8WriterVisitor { ...@@ -4494,6 +4502,16 @@ class Utf8WriterVisitor {
return written; return written;
} }
// Visit writes out a group of code units (chars) of a v8::String to the
// internal buffer_. This is done in two phases. The first phase calculates a
// pesimistic estimate (writable_length) on how many code units can be safely
// written without exceeding the buffer capacity and without writing the last
// code unit (it could be a lead surrogate). The estimated number of code
// units is then written out in one go, and the reported byte usage is used
// to correct the estimate. This is repeated until the estimate becomes <= 0
// or all code units have been written out. The second phase writes out code
// units until the buffer capacity is reached, would be exceeded by the next
// unit, or all units have been written out.
template<typename Char> template<typename Char>
void Visit(const Char* chars, const int length) { void Visit(const Char* chars, const int length) {
using namespace unibrow; using namespace unibrow;
...@@ -4531,7 +4549,10 @@ class Utf8WriterVisitor { ...@@ -4531,7 +4549,10 @@ class Utf8WriterVisitor {
} else { } else {
for (; i < fast_length; i++) { for (; i < fast_length; i++) {
uint16_t character = *chars++; uint16_t character = *chars++;
buffer += Utf8::Encode(buffer, character, last_character); buffer += Utf8::Encode(buffer,
character,
last_character,
replace_invalid_utf8_);
last_character = character; last_character = character;
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
} }
...@@ -4551,10 +4572,17 @@ class Utf8WriterVisitor { ...@@ -4551,10 +4572,17 @@ class Utf8WriterVisitor {
ASSERT(remaining_capacity >= 0); ASSERT(remaining_capacity >= 0);
for (; i < length && remaining_capacity > 0; i++) { for (; i < length && remaining_capacity > 0; i++) {
uint16_t character = *chars++; uint16_t character = *chars++;
// remaining_capacity is <= 3 bytes at this point, so we do not write out
// an umatched lead surrogate.
if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
early_termination_ = true;
break;
}
int written = WriteEndCharacter(character, int written = WriteEndCharacter(character,
last_character, last_character,
remaining_capacity, remaining_capacity,
buffer); buffer,
replace_invalid_utf8_);
if (written == 0) { if (written == 0) {
early_termination_ = true; early_termination_ = true;
break; break;
...@@ -4602,6 +4630,7 @@ class Utf8WriterVisitor { ...@@ -4602,6 +4630,7 @@ class Utf8WriterVisitor {
char* const start_; char* const start_;
int capacity_; int capacity_;
bool const skip_capacity_check_; bool const skip_capacity_check_;
bool const replace_invalid_utf8_;
int utf16_chars_read_; int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
}; };
...@@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer, ...@@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer,
} }
const int string_length = str->length(); const int string_length = str->length();
bool write_null = !(options & NO_NULL_TERMINATION); bool write_null = !(options & NO_NULL_TERMINATION);
bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
// First check if we can just write the string without checking capacity. // First check if we can just write the string without checking capacity.
if (capacity == -1 || capacity / 3 >= string_length) { if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
Utf8WriterVisitor writer(buffer, capacity, true); Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
const int kMaxRecursion = 100; const int kMaxRecursion = 100;
bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
if (success) return writer.CompleteWrite(write_null, nchars_ref); if (success) return writer.CompleteWrite(write_null, nchars_ref);
...@@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer, ...@@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer,
} }
// Recursive slow path can potentially be unreasonable slow. Flatten. // Recursive slow path can potentially be unreasonable slow. Flatten.
str = FlattenGetString(str); str = FlattenGetString(str);
Utf8WriterVisitor writer(buffer, capacity, false); Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
i::String::VisitFlat(&writer, *str); i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref); return writer.CompleteWrite(write_null, nchars_ref);
} }
......
...@@ -107,8 +107,14 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { ...@@ -107,8 +107,14 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
return 2; return 2;
} }
// Encode encodes the UTF-16 code units c and previous into the given str
unsigned Utf8::Encode(char* str, uchar c, int previous) { // buffer, and combines surrogate code units into single code points. If
// replace_invalid is set to true, orphan surrogate code units will be replaced
// with kBadChar.
unsigned Utf8::Encode(char* str,
uchar c,
int previous,
bool replace_invalid) {
static const int kMask = ~(1 << 6); static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) { if (c <= kMaxOneByteChar) {
str[0] = c; str[0] = c;
...@@ -118,12 +124,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) { ...@@ -118,12 +124,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) {
str[1] = 0x80 | (c & kMask); str[1] = 0x80 | (c & kMask);
return 2; return 2;
} else if (c <= kMaxThreeByteChar) { } else if (c <= kMaxThreeByteChar) {
if (Utf16::IsTrailSurrogate(c) && if (Utf16::IsSurrogatePair(previous, c)) {
Utf16::IsLeadSurrogate(previous)) {
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
return Encode(str - kUnmatchedSize, return Encode(str - kUnmatchedSize,
Utf16::CombineSurrogatePair(previous, c), Utf16::CombineSurrogatePair(previous, c),
Utf16::kNoPreviousCharacter) - kUnmatchedSize; Utf16::kNoPreviousCharacter,
replace_invalid) - kUnmatchedSize;
} else if (replace_invalid &&
(Utf16::IsLeadSurrogate(c) ||
Utf16::IsTrailSurrogate(c))) {
c = kBadChar;
} }
str[0] = 0xE0 | (c >> 12); str[0] = 0xE0 | (c >> 12);
str[1] = 0x80 | ((c >> 6) & kMask); str[1] = 0x80 | ((c >> 6) & kMask);
......
...@@ -102,6 +102,9 @@ class UnicodeData { ...@@ -102,6 +102,9 @@ class UnicodeData {
class Utf16 { class Utf16 {
public: public:
static inline bool IsSurrogatePair(int lead, int trail) {
return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
}
static inline bool IsLeadSurrogate(int code) { static inline bool IsLeadSurrogate(int code) {
if (code == kNoPreviousCharacter) return false; if (code == kNoPreviousCharacter) return false;
return (code & 0xfc00) == 0xd800; return (code & 0xfc00) == 0xd800;
...@@ -146,11 +149,16 @@ class Utf8 { ...@@ -146,11 +149,16 @@ class Utf8 {
public: public:
static inline uchar Length(uchar chr, int previous); static inline uchar Length(uchar chr, int previous);
static inline unsigned EncodeOneByte(char* out, uint8_t c); static inline unsigned EncodeOneByte(char* out, uint8_t c);
static inline unsigned Encode( static inline unsigned Encode(char* out,
char* out, uchar c, int previous); uchar c,
int previous,
bool replace_invalid = false);
static uchar CalculateValue(const byte* str, static uchar CalculateValue(const byte* str,
unsigned length, unsigned length,
unsigned* cursor); unsigned* cursor);
// The unicode replacement character, used to signal invalid unicode
// sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
static const uchar kBadChar = 0xFFFD; static const uchar kBadChar = 0xFFFD;
static const unsigned kMaxEncodedSize = 4; static const unsigned kMaxEncodedSize = 4;
static const unsigned kMaxOneByteChar = 0x7f; static const unsigned kMaxOneByteChar = 0x7f;
...@@ -162,6 +170,9 @@ class Utf8 { ...@@ -162,6 +170,9 @@ class Utf8 {
// that match are coded as a 4 byte UTF-8 sequence. // that match are coded as a 4 byte UTF-8 sequence.
static const unsigned kBytesSavedByCombiningSurrogates = 2; static const unsigned kBytesSavedByCombiningSurrogates = 2;
static const unsigned kSizeOfUnmatchedSurrogate = 3; static const unsigned kSizeOfUnmatchedSurrogate = 3;
// The maximum size a single UTF-16 code unit may take up when encoded as
// UTF-8.
static const unsigned kMax16BitCodeUnitSize = 3;
static inline uchar ValueOf(const byte* str, static inline uchar ValueOf(const byte* str,
unsigned length, unsigned length,
unsigned* cursor); unsigned* cursor);
......
...@@ -7614,6 +7614,22 @@ THREADED_TEST(StringWrite) { ...@@ -7614,6 +7614,22 @@ THREADED_TEST(StringWrite) {
v8::Handle<String> str2 = v8_str("abc\303\260\342\230\203"); v8::Handle<String> str2 = v8_str("abc\303\260\342\230\203");
v8::Handle<String> str3 = v8::String::NewFromUtf8( v8::Handle<String> str3 = v8::String::NewFromUtf8(
context->GetIsolate(), "abc\0def", v8::String::kNormalString, 7); context->GetIsolate(), "abc\0def", v8::String::kNormalString, 7);
// "ab" + lead surrogate + "cd" + trail surrogate + "ef"
uint16_t orphans[8] = { 0x61, 0x62, 0xd800, 0x63, 0x64, 0xdc00, 0x65, 0x66 };
v8::Handle<String> orphans_str = v8::String::NewFromTwoByte(
context->GetIsolate(), orphans, v8::String::kNormalString, 8);
// single lead surrogate
uint16_t lead[1] = { 0xd800 };
v8::Handle<String> lead_str = v8::String::NewFromTwoByte(
context->GetIsolate(), lead, v8::String::kNormalString, 1);
// single trail surrogate
uint16_t trail[1] = { 0xdc00 };
v8::Handle<String> trail_str = v8::String::NewFromTwoByte(
context->GetIsolate(), trail, v8::String::kNormalString, 1);
// surrogate pair
uint16_t pair[2] = { 0xd800, 0xdc00 };
v8::Handle<String> pair_str = v8::String::NewFromTwoByte(
context->GetIsolate(), pair, v8::String::kNormalString, 2);
const int kStride = 4; // Must match stride in for loops in JS below. const int kStride = 4; // Must match stride in for loops in JS below.
CompileRun( CompileRun(
"var left = '';" "var left = '';"
...@@ -7687,6 +7703,53 @@ THREADED_TEST(StringWrite) { ...@@ -7687,6 +7703,53 @@ THREADED_TEST(StringWrite) {
CHECK_EQ(2, charlen); CHECK_EQ(2, charlen);
CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3)); CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
// allow orphan surrogates by default
memset(utf8buf, 0x1, 1000);
len = orphans_str->WriteUtf8(utf8buf, sizeof(utf8buf), &charlen);
CHECK_EQ(13, len);
CHECK_EQ(8, charlen);
CHECK_EQ(0, strcmp(utf8buf, "ab\355\240\200cd\355\260\200ef"));
// replace orphan surrogates with unicode replacement character
memset(utf8buf, 0x1, 1000);
len = orphans_str->WriteUtf8(utf8buf,
sizeof(utf8buf),
&charlen,
String::REPLACE_INVALID_UTF8);
CHECK_EQ(13, len);
CHECK_EQ(8, charlen);
CHECK_EQ(0, strcmp(utf8buf, "ab\357\277\275cd\357\277\275ef"));
// replace single lead surrogate with unicode replacement character
memset(utf8buf, 0x1, 1000);
len = lead_str->WriteUtf8(utf8buf,
sizeof(utf8buf),
&charlen,
String::REPLACE_INVALID_UTF8);
CHECK_EQ(4, len);
CHECK_EQ(1, charlen);
CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
// replace single trail surrogate with unicode replacement character
memset(utf8buf, 0x1, 1000);
len = trail_str->WriteUtf8(utf8buf,
sizeof(utf8buf),
&charlen,
String::REPLACE_INVALID_UTF8);
CHECK_EQ(4, len);
CHECK_EQ(1, charlen);
CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
// do not replace / write anything if surrogate pair does not fit the buffer
// space
memset(utf8buf, 0x1, 1000);
len = pair_str->WriteUtf8(utf8buf,
3,
&charlen,
String::REPLACE_INVALID_UTF8);
CHECK_EQ(0, len);
CHECK_EQ(0, charlen);
memset(utf8buf, 0x1, sizeof(utf8buf)); memset(utf8buf, 0x1, sizeof(utf8buf));
len = GetUtf8Length(left_tree); len = GetUtf8Length(left_tree);
int utf8_expected = int utf8_expected =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment