// Copyright 2016 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/uri.h" #include <vector> #include "src/char-predicates-inl.h" #include "src/isolate-inl.h" #include "src/string-search.h" #include "src/unicode-inl.h" namespace v8 { namespace internal { namespace { // anonymous namespace for DecodeURI helper functions bool IsReservedPredicate(uc16 c) { switch (c) { case '#': case '$': case '&': case '+': case ',': case '/': case ':': case ';': case '=': case '?': case '@': return true; default: return false; } } bool IsReplacementCharacter(const uint8_t* octets, int length) { // The replacement character is at codepoint U+FFFD in the Unicode Specials // table. Its UTF-8 encoding is 0xEF 0xBF 0xBD. if (length != 3 || octets[0] != 0xEF || octets[1] != 0xBF || octets[2] != 0xBD) { return false; } return true; } bool DecodeOctets(const uint8_t* octets, int length, std::vector<uc16>* buffer) { size_t cursor = 0; uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor); if (value == unibrow::Utf8::kBadChar && !IsReplacementCharacter(octets, length)) { return false; } if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { buffer->push_back(value); } else { buffer->push_back(unibrow::Utf16::LeadSurrogate(value)); buffer->push_back(unibrow::Utf16::TrailSurrogate(value)); } return true; } int TwoDigitHex(uc16 character1, uc16 character2) { if (character1 > 'f') return -1; int high = HexValue(character1); if (high == -1) return -1; if (character2 > 'f') return -1; int low = HexValue(character2); if (low == -1) return -1; return (high << 4) + low; } template <typename T> void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index, bool is_uri, std::vector<T>* buffer) { if (is_uri && IsReservedPredicate(decoded)) { buffer->push_back('%'); uc16 first = uri_content->Get(index + 1); uc16 second = uri_content->Get(index + 2); DCHECK_GT(std::numeric_limits<T>::max(), first); DCHECK_GT(std::numeric_limits<T>::max(), second); buffer->push_back(first); buffer->push_back(second); } else { buffer->push_back(decoded); } } bool IntoTwoByte(int index, bool is_uri, int uri_length, String::FlatContent* uri_content, std::vector<uc16>* buffer) { for (int k = index; k < uri_length; k++) { uc16 code = uri_content->Get(k); if (code == '%') { int two_digits; if (k + 2 >= uri_length || (two_digits = TwoDigitHex(uri_content->Get(k + 1), uri_content->Get(k + 2))) < 0) { return false; } k += 2; uc16 decoded = static_cast<uc16>(two_digits); if (decoded > unibrow::Utf8::kMaxOneByteChar) { uint8_t octets[unibrow::Utf8::kMaxEncodedSize]; octets[0] = decoded; int number_of_continuation_bytes = 0; while ((decoded << ++number_of_continuation_bytes) & 0x80) { if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) { return false; } if (uri_content->Get(++k) != '%' || (two_digits = TwoDigitHex(uri_content->Get(k + 1), uri_content->Get(k + 2))) < 0) { return false; } k += 2; uc16 continuation_byte = static_cast<uc16>(two_digits); octets[number_of_continuation_bytes] = continuation_byte; } if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) { return false; } } else { AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer); } } else { buffer->push_back(code); } } return true; } bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri, std::vector<uint8_t>* one_byte_buffer, std::vector<uc16>* two_byte_buffer) { DisallowHeapAllocation no_gc; String::FlatContent uri_content = uri->GetFlatContent(); int uri_length = uri->length(); for (int k = 0; k < uri_length; k++) { uc16 code = uri_content.Get(k); if (code == '%') { int two_digits; if (k + 2 >= uri_length || (two_digits = TwoDigitHex(uri_content.Get(k + 1), uri_content.Get(k + 2))) < 0) { return false; } uc16 decoded = static_cast<uc16>(two_digits); if (decoded > unibrow::Utf8::kMaxOneByteChar) { return IntoTwoByte(k, is_uri, uri_length, &uri_content, two_byte_buffer); } AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer); k += 2; } else { if (code > unibrow::Utf8::kMaxOneByteChar) { return IntoTwoByte(k, is_uri, uri_length, &uri_content, two_byte_buffer); } one_byte_buffer->push_back(code); } } return true; } } // anonymous namespace MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri, bool is_uri) { uri = String::Flatten(isolate, uri); std::vector<uint8_t> one_byte_buffer; std::vector<uc16> two_byte_buffer; if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) { THROW_NEW_ERROR(isolate, NewURIError(), String); } if (two_byte_buffer.empty()) { return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>( one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size()))); } Handle<SeqTwoByteString> result; int result_length = static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size()); ASSIGN_RETURN_ON_EXCEPTION( isolate, result, isolate->factory()->NewRawTwoByteString(result_length), String); CopyChars(result->GetChars(), one_byte_buffer.data(), one_byte_buffer.size()); CopyChars(result->GetChars() + one_byte_buffer.size(), two_byte_buffer.data(), two_byte_buffer.size()); return result; } namespace { // anonymous namespace for EncodeURI helper functions bool IsUnescapePredicateInUriComponent(uc16 c) { if (IsAlphaNumeric(c)) { return true; } switch (c) { case '!': case '\'': case '(': case ')': case '*': case '-': case '.': case '_': case '~': return true; default: return false; } } bool IsUriSeparator(uc16 c) { switch (c) { case '#': case ':': case ';': case '/': case '?': case '$': case '&': case '+': case ',': case '@': case '=': return true; default: return false; } } void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) { buffer->push_back('%'); buffer->push_back(HexCharOfValue(octet >> 4)); buffer->push_back(HexCharOfValue(octet & 0x0F)); } void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) { char s[4] = {}; int number_of_bytes; number_of_bytes = unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false); for (int k = 0; k < number_of_bytes; k++) { AddEncodedOctetToBuffer(s[k], buffer); } } void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) { char s[4] = {}; int number_of_bytes = unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2), unibrow::Utf16::kNoPreviousCharacter, false); for (int k = 0; k < number_of_bytes; k++) { AddEncodedOctetToBuffer(s[k], buffer); } } } // anonymous namespace MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri, bool is_uri) { uri = String::Flatten(isolate, uri); int uri_length = uri->length(); std::vector<uint8_t> buffer; buffer.reserve(uri_length); { DisallowHeapAllocation no_gc; String::FlatContent uri_content = uri->GetFlatContent(); for (int k = 0; k < uri_length; k++) { uc16 cc1 = uri_content.Get(k); if (unibrow::Utf16::IsLeadSurrogate(cc1)) { k++; if (k < uri_length) { uc16 cc2 = uri->Get(k); if (unibrow::Utf16::IsTrailSurrogate(cc2)) { EncodePair(cc1, cc2, &buffer); continue; } } } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) { if (IsUnescapePredicateInUriComponent(cc1) || (is_uri && IsUriSeparator(cc1))) { buffer.push_back(cc1); } else { EncodeSingle(cc1, &buffer); } continue; } AllowHeapAllocation allocate_error_and_return; THROW_NEW_ERROR(isolate, NewURIError(), String); } } return isolate->factory()->NewStringFromOneByte( Vector<const uint8_t>(buffer.data(), static_cast<int>(buffer.size()))); } namespace { // Anonymous namespace for Escape and Unescape template <typename Char> int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) { uint16_t character = vector[i]; int32_t hi = 0; int32_t lo = 0; if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' && (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 && (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) { *step = 6; return (hi << 8) + lo; } else if (character == '%' && i <= length - 3 && (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) { *step = 3; return lo; } else { *step = 1; return character; } } template <typename Char> MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string, int start_index) { bool one_byte = true; int length = string->length(); int unescaped_length = 0; { DisallowHeapAllocation no_allocation; Vector<const Char> vector = string->GetCharVector<Char>(); for (int i = start_index; i < length; unescaped_length++) { int step; if (UnescapeChar(vector, i, length, &step) > String::kMaxOneByteCharCode) { one_byte = false; } i += step; } } DCHECK(start_index < length); Handle<String> first_part = isolate->factory()->NewProperSubString(string, 0, start_index); int dest_position = 0; Handle<String> second_part; DCHECK_LE(unescaped_length, String::kMaxLength); if (one_byte) { Handle<SeqOneByteString> dest = isolate->factory() ->NewRawOneByteString(unescaped_length) .ToHandleChecked(); DisallowHeapAllocation no_allocation; Vector<const Char> vector = string->GetCharVector<Char>(); for (int i = start_index; i < length; dest_position++) { int step; dest->SeqOneByteStringSet(dest_position, UnescapeChar(vector, i, length, &step)); i += step; } second_part = dest; } else { Handle<SeqTwoByteString> dest = isolate->factory() ->NewRawTwoByteString(unescaped_length) .ToHandleChecked(); DisallowHeapAllocation no_allocation; Vector<const Char> vector = string->GetCharVector<Char>(); for (int i = start_index; i < length; dest_position++) { int step; dest->SeqTwoByteStringSet(dest_position, UnescapeChar(vector, i, length, &step)); i += step; } second_part = dest; } return isolate->factory()->NewConsString(first_part, second_part); } bool IsNotEscaped(uint16_t c) { if (IsAlphaNumeric(c)) { return true; } // @*_+-./ switch (c) { case '@': case '*': case '_': case '+': case '-': case '.': case '/': return true; default: return false; } } template <typename Char> static MaybeHandle<String> UnescapePrivate(Isolate* isolate, Handle<String> source) { int index; { DisallowHeapAllocation no_allocation; StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%")); index = search.Search(source->GetCharVector<Char>(), 0); if (index < 0) return source; } return UnescapeSlow<Char>(isolate, source, index); } template <typename Char> static MaybeHandle<String> EscapePrivate(Isolate* isolate, Handle<String> string) { DCHECK(string->IsFlat()); int escaped_length = 0; int length = string->length(); { DisallowHeapAllocation no_allocation; Vector<const Char> vector = string->GetCharVector<Char>(); for (int i = 0; i < length; i++) { uint16_t c = vector[i]; if (c >= 256) { escaped_length += 6; } else if (IsNotEscaped(c)) { escaped_length++; } else { escaped_length += 3; } // We don't allow strings that are longer than a maximal length. DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow. if (escaped_length > String::kMaxLength) break; // Provoke exception. } } // No length change implies no change. Return original string if no change. if (escaped_length == length) return string; Handle<SeqOneByteString> dest; ASSIGN_RETURN_ON_EXCEPTION( isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length), String); int dest_position = 0; { DisallowHeapAllocation no_allocation; Vector<const Char> vector = string->GetCharVector<Char>(); for (int i = 0; i < length; i++) { uint16_t c = vector[i]; if (c >= 256) { dest->SeqOneByteStringSet(dest_position, '%'); dest->SeqOneByteStringSet(dest_position + 1, 'u'); dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12)); dest->SeqOneByteStringSet(dest_position + 3, HexCharOfValue((c >> 8) & 0xF)); dest->SeqOneByteStringSet(dest_position + 4, HexCharOfValue((c >> 4) & 0xF)); dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF)); dest_position += 6; } else if (IsNotEscaped(c)) { dest->SeqOneByteStringSet(dest_position, c); dest_position++; } else { dest->SeqOneByteStringSet(dest_position, '%'); dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4)); dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF)); dest_position += 3; } } } return dest; } } // Anonymous namespace MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) { Handle<String> result; string = String::Flatten(isolate, string); return string->IsOneByteRepresentationUnderneath() ? EscapePrivate<uint8_t>(isolate, string) : EscapePrivate<uc16>(isolate, string); } MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) { Handle<String> result; string = String::Flatten(isolate, string); return string->IsOneByteRepresentationUnderneath() ? UnescapePrivate<uint8_t>(isolate, string) : UnescapePrivate<uc16>(isolate, string); } } // namespace internal } // namespace v8