builtins-string.cc 17.8 KB
Newer Older
1 2 3 4
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5
#include "src/builtins/builtins-utils-inl.h"
6
#include "src/builtins/builtins.h"
7
#include "src/heap/heap-inl.h"  // For ToBoolean. TODO(jkummerow): Drop.
8
#include "src/logging/counters.h"
9
#include "src/numbers/conversions.h"
10
#include "src/objects/objects-inl.h"
11 12 13
#ifdef V8_INTL_SUPPORT
#include "src/objects/intl-objects.h"
#endif
14
#include "src/base/strings.h"
15
#include "src/regexp/regexp-utils.h"
16 17 18 19
#include "src/strings/string-builder-inl.h"
#include "src/strings/string-case.h"
#include "src/strings/unicode-inl.h"
#include "src/strings/unicode.h"
20 21 22 23 24 25 26

namespace v8 {
namespace internal {

namespace {  // for String.fromCodePoint

// Returns true if |value| converts to an integral number in the range
// [0, 0x10FFFF]. Non-number values are first converted with
// Object::ToNumber; if that conversion fails, false is returned.
bool IsValidCodePoint(Isolate* isolate, Handle<Object> value) {
  if (!value->IsNumber()) {
    if (!Object::ToNumber(isolate, value).ToHandle(&value)) return false;
  }

  // Reject anything that is not already an integer (this includes NaN, which
  // never compares equal to its integer conversion).
  const double number = value->Number();
  const double truncated =
      Object::ToInteger(isolate, value).ToHandleChecked()->Number();
  if (truncated != number) return false;

  const bool out_of_range = number < 0 || number > 0x10FFFF;
  return !out_of_range;
}

44
// Sentinel that NextCodePoint returns when it could not produce a code point,
// either because the number conversion failed or because the value was
// rejected by IsValidCodePoint.
static constexpr base::uc32 kInvalidCodePoint = static_cast<base::uc32>(-1);
45

46
// Reads the |index|-th code point argument (argument 0 is the receiver, so
// the value lives at args.at(1 + index)) and converts it to a code point.
//
// Returns kInvalidCodePoint if the number conversion fails, or after throwing
// a RangeError when the converted value is not a valid code point.
base::uc32 NextCodePoint(Isolate* isolate, BuiltinArguments args, int index) {
  Handle<Object> value = args.at(1 + index);
  ASSIGN_RETURN_ON_EXCEPTION_VALUE(
      isolate, value, Object::ToNumber(isolate, value), kInvalidCodePoint);

  if (IsValidCodePoint(isolate, value)) {
    return DoubleToUint32(value->Number());
  }

  isolate->Throw(*isolate->factory()->NewRangeError(
      MessageTemplate::kInvalidCodePoint, value));
  return kInvalidCodePoint;
}

}  // namespace

// ES6 section 21.1.2.2 String.fromCodePoint ( ...codePoints )
//
// Builds the result in two phases: code points are collected into a one-byte
// buffer for as long as they fit; from the first code point that does not
// fit, the remainder is collected as UTF-16 code units. If every code point
// fits into one byte, a one-byte string is returned, otherwise both buffers
// are copied into a freshly allocated two-byte string.
BUILTIN(StringFromCodePoint) {
  HandleScope scope(isolate);
  const int code_point_count = args.length() - 1;
  if (code_point_count == 0) return ReadOnlyRoots(isolate).empty_string();
  DCHECK_LT(0, code_point_count);

  // Phase 1: optimistically assume that all characters fit into one byte.
  std::vector<uint8_t> narrow_chars;
  narrow_chars.reserve(code_point_count);
  base::uc32 code_point = 0;
  int arg_index = 0;
  while (arg_index < code_point_count) {
    code_point = NextCodePoint(isolate, args, arg_index);
    if (code_point == kInvalidCodePoint) {
      return ReadOnlyRoots(isolate).exception();
    }
    // Leave |code_point| and |arg_index| pointing at the first wide character
    // so that phase 2 can pick up from there.
    if (code_point > String::kMaxOneByteCharCode) break;
    narrow_chars.push_back(static_cast<uint8_t>(code_point));
    ++arg_index;
  }

  if (arg_index == code_point_count) {
    RETURN_RESULT_OR_FAILURE(
        isolate, isolate->factory()->NewStringFromOneByte(base::Vector<uint8_t>(
                     narrow_chars.data(), narrow_chars.size())));
  }

  // Phase 2: |code_point| holds the first character that needs two bytes.
  std::vector<base::uc16> wide_chars;
  wide_chars.reserve(code_point_count - arg_index);

  for (;;) {
    if (code_point >
        static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
      // Supplementary code points are stored as a surrogate pair.
      wide_chars.push_back(unibrow::Utf16::LeadSurrogate(code_point));
      wide_chars.push_back(unibrow::Utf16::TrailSurrogate(code_point));
    } else {
      wide_chars.push_back(static_cast<base::uc16>(code_point));
    }

    ++arg_index;
    if (arg_index == code_point_count) break;

    code_point = NextCodePoint(isolate, args, arg_index);
    if (code_point == kInvalidCodePoint) {
      return ReadOnlyRoots(isolate).exception();
    }
  }

  const size_t narrow_length = narrow_chars.size();
  const size_t wide_length = wide_chars.size();
  Handle<SeqTwoByteString> result;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, result,
      isolate->factory()->NewRawTwoByteString(
          static_cast<int>(narrow_length + wide_length)));

  // One-byte prefix first, then the two-byte tail directly behind it.
  DisallowGarbageCollection no_gc;
  CopyChars(result->GetChars(no_gc), narrow_chars.data(), narrow_length);
  CopyChars(result->GetChars(no_gc) + narrow_length, wide_chars.data(),
            wide_length);

  return *result;
}

126 127 128 129 130 131 132 133 134
// ES6 section 21.1.3.9
// String.prototype.lastIndexOf ( searchString [ , position ] )
//
// Forwards the receiver and the first two arguments (undefined when absent)
// to String::LastIndexOf.
BUILTIN(StringPrototypeLastIndexOf) {
  HandleScope handle_scope(isolate);
  Handle<Object> search_string = args.atOrUndefined(isolate, 1);
  Handle<Object> position = args.atOrUndefined(isolate, 2);
  return String::LastIndexOf(isolate, args.receiver(), search_string,
                             position);
}

135 136 137 138 139 140
// ES6 section 21.1.3.10 String.prototype.localeCompare ( that )
//
// With V8_INTL_SUPPORT the comparison is delegated to
// Intl::StringLocaleCompare. Without it, the behaviour is implementation
// specific and nothing locale specific is done: the strings are compared
// character by character and, if one is a prefix of the other, by length.
BUILTIN(StringPrototypeLocaleCompare) {
  HandleScope handle_scope(isolate);

  isolate->CountUsage(v8::Isolate::UseCounterFeature::kStringLocaleCompare);
  const char* method = "String.prototype.localeCompare";

#ifdef V8_INTL_SUPPORT
  TO_THIS_STRING(lhs, method);
  Handle<String> rhs;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, rhs, Object::ToString(isolate, args.atOrUndefined(isolate, 1)));
  Handle<Object> locales = args.atOrUndefined(isolate, 2);
  Handle<Object> options = args.atOrUndefined(isolate, 3);
  RETURN_RESULT_OR_FAILURE(
      isolate,
      Intl::StringLocaleCompare(isolate, lhs, rhs, locales, options, method));
#else
  DCHECK_LE(2, args.length());

  TO_THIS_STRING(lhs, method);
  Handle<String> rhs;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, rhs,
                                     Object::ToString(isolate, args.at(1)));

  if (lhs.is_identical_to(rhs)) return Smi::zero();  // Equal.
  const int lhs_length = lhs->length();
  const int rhs_length = rhs->length();

  // Decide trivial cases without flattening: as soon as one side is empty the
  // answer is just the difference of the lengths (zero if both are empty).
  if (lhs_length == 0 || rhs_length == 0) {
    return Smi::FromInt(lhs_length - rhs_length);
  }

  // Both strings have at least one character here. If the first characters
  // already differ there is no need to flatten.
  const int first_difference = lhs->Get(0) - rhs->Get(0);
  if (first_difference != 0) return Smi::FromInt(first_difference);

  lhs = String::Flatten(isolate, lhs);
  rhs = String::Flatten(isolate, rhs);

  DisallowGarbageCollection no_gc;
  String::FlatContent lhs_content = lhs->GetFlatContent(no_gc);
  String::FlatContent rhs_content = rhs->GetFlatContent(no_gc);

  const int common_length = lhs_length < rhs_length ? lhs_length : rhs_length;
  for (int i = 0; i < common_length; ++i) {
    const int difference = lhs_content.Get(i) - rhs_content.Get(i);
    if (difference != 0) return Smi::FromInt(difference);
  }

  // One string is a prefix of the other; the shorter one sorts first.
  return Smi::FromInt(lhs_length - rhs_length);
#endif  // !V8_INTL_SUPPORT
}

199
#ifndef V8_INTL_SUPPORT
200 201 202
// ES6 section 21.1.3.12 String.prototype.normalize ( [form] )
//
// Simply checks the argument is valid and returns the string itself.
203
// If internationalization is enabled, then intl.js will override this function
204 205 206 207 208 209 210 211 212 213 214 215
// and provide the proper functionality, so this is just a fallback.
BUILTIN(StringPrototypeNormalize) {
  HandleScope handle_scope(isolate);
  TO_THIS_STRING(string, "String.prototype.normalize");

  Handle<Object> form_input = args.atOrUndefined(isolate, 1);
  if (form_input->IsUndefined(isolate)) return *string;

  Handle<String> form;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, form,
                                     Object::ToString(isolate, form_input));

216 217 218 219
  if (!(String::Equals(isolate, form, isolate->factory()->NFC_string()) ||
        String::Equals(isolate, form, isolate->factory()->NFD_string()) ||
        String::Equals(isolate, form, isolate->factory()->NFKC_string()) ||
        String::Equals(isolate, form, isolate->factory()->NFKD_string()))) {
220 221 222 223 224 225 226 227 228
    Handle<String> valid_forms =
        isolate->factory()->NewStringFromStaticChars("NFC, NFD, NFKC, NFKD");
    THROW_NEW_ERROR_RETURN_FAILURE(
        isolate,
        NewRangeError(MessageTemplate::kNormalizationForm, valid_forms));
  }

  return *string;
}
229
#endif  // !V8_INTL_SUPPORT
230

231

232
#ifndef V8_INTL_SUPPORT
233 234
namespace {

235
// Returns true for the characters that stop fitting into one byte when they
// are converted to upper case: y with umlauts (0xFF) and the micro sign
// (0xB5) are the only ones.
inline bool ToUpperOverflows(base::uc32 character) {
  switch (character) {
    case 0xFF:  // y with umlauts
    case 0xB5:  // micro sign
      return true;
    default:
      return false;
  }
}

template <class Converter>
244
V8_WARN_UNUSED_RESULT static Object ConvertCaseHelper(
245
    Isolate* isolate, String string, SeqString result, int result_length,
246
    unibrow::Mapping<Converter, 128>* mapping) {
247
  DisallowGarbageCollection no_gc;
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
  // We try this twice, once with the assumption that the result is no longer
  // than the input and, if that assumption breaks, again with the exact
  // length.  This may not be pretty, but it is nicer than what was here before
  // and I hereby claim my vaffel-is.
  //
  // NOTE: This assumes that the upper/lower case of an ASCII
  // character is also ASCII.  This is currently the case, but it
  // might break in the future if we implement more context and locale
  // dependent upper/lower conversions.
  bool has_changed_character = false;

  // Convert all characters to upper case, assuming that they will fit
  // in the buffer
  StringCharacterStream stream(string);
  unibrow::uchar chars[Converter::kMaxWidth];
  // We can assume that the string is not empty
264
  base::uc32 current = stream.GetNext();
265
  bool ignore_overflow = Converter::kIsToLower || result.IsSeqTwoByteString();
266 267
  for (int i = 0; i < result_length;) {
    bool has_next = stream.HasMore();
268
    base::uc32 next = has_next ? stream.GetNext() : 0;
269 270 271
    int char_length = mapping->get(current, next, chars);
    if (char_length == 0) {
      // The case conversion of this character is the character itself.
272
      result.Set(i, current);
273 274 275 276
      i++;
    } else if (char_length == 1 &&
               (ignore_overflow || !ToUpperOverflows(current))) {
      // Common case: converting the letter resulted in one character.
277
      DCHECK(static_cast<base::uc32>(chars[0]) != current);
278
      result.Set(i, chars[0]);
279 280
      has_changed_character = true;
      i++;
281
    } else if (result_length == string.length()) {
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
      bool overflows = ToUpperOverflows(current);
      // We've assumed that the result would be as long as the
      // input but here is a character that converts to several
      // characters.  No matter, we calculate the exact length
      // of the result and try the whole thing again.
      //
      // Note that this leaves room for optimization.  We could just
      // memcpy what we already have to the result string.  Also,
      // the result string is the last object allocated we could
      // "realloc" it and probably, in the vast majority of cases,
      // extend the existing string to be able to hold the full
      // result.
      int next_length = 0;
      if (has_next) {
        next_length = mapping->get(next, 0, chars);
        if (next_length == 0) next_length = 1;
      }
      int current_length = i + char_length + next_length;
      while (stream.HasMore()) {
        current = stream.GetNext();
        overflows |= ToUpperOverflows(current);
        // NOTE: we use 0 as the next character here because, while
        // the next character may affect what a character converts to,
        // it does not in any case affect the length of what it convert
        // to.
        int char_length = mapping->get(current, 0, chars);
        if (char_length == 0) char_length = 1;
        current_length += char_length;
        if (current_length > String::kMaxLength) {
311
          AllowGarbageCollection allocate_error_and_return;
312 313 314 315 316 317 318 319 320 321
          THROW_NEW_ERROR_RETURN_FAILURE(isolate,
                                         NewInvalidStringLengthError());
        }
      }
      // Try again with the real length.  Return signed if we need
      // to allocate a two-byte string for to uppercase.
      return (overflows && !ignore_overflow) ? Smi::FromInt(-current_length)
                                             : Smi::FromInt(current_length);
    } else {
      for (int j = 0; j < char_length; j++) {
322
        result.Set(i, chars[j]);
323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
        i++;
      }
      has_changed_character = true;
    }
    current = next;
  }
  if (has_changed_character) {
    return result;
  } else {
    // If we didn't actually change anything in doing the conversion
    // we simple return the result and let the converted string
    // become garbage; there is no reason to keep two identical strings
    // alive.
    return string;
  }
}

template <class Converter>
341
V8_WARN_UNUSED_RESULT static Object ConvertCase(
342 343
    Handle<String> s, Isolate* isolate,
    unibrow::Mapping<Converter, 128>* mapping) {
344
  s = String::Flatten(isolate, s);
345 346 347 348 349 350 351 352 353 354
  int length = s->length();
  // Assume that the string is not empty; we need this assumption later
  if (length == 0) return *s;

  // Simpler handling of ASCII strings.
  //
  // NOTE: This assumes that the upper/lower case of an ASCII
  // character is also ASCII.  This is currently the case, but it
  // might break in the future if we implement more context and locale
  // dependent upper/lower conversions.
355
  if (String::IsOneByteRepresentationUnderneath(*s)) {
356 357 358
    // Same length as input.
    Handle<SeqOneByteString> result =
        isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
359
    DisallowGarbageCollection no_gc;
360
    String::FlatContent flat_content = s->GetFlatContent(no_gc);
361 362 363
    DCHECK(flat_content.IsFlat());
    bool has_changed_character = false;
    int index_to_first_unprocessed = FastAsciiConvert<Converter::kIsToLower>(
364
        reinterpret_cast<char*>(result->GetChars(no_gc)),
365
        reinterpret_cast<const char*>(flat_content.ToOneByteVector().begin()),
366 367 368 369 370 371 372 373 374 375 376 377 378
        length, &has_changed_character);
    // If not ASCII, we discard the result and take the 2 byte path.
    if (index_to_first_unprocessed == length)
      return has_changed_character ? *result : *s;
  }

  Handle<SeqString> result;  // Same length as input.
  if (s->IsOneByteRepresentation()) {
    result = isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
  } else {
    result = isolate->factory()->NewRawTwoByteString(length).ToHandleChecked();
  }

379
  Object answer = ConvertCaseHelper(isolate, *s, *result, length, mapping);
380
  if (answer.IsException(isolate) || answer.IsString()) return answer;
381

382
  DCHECK(answer.IsSmi());
jgruber's avatar
jgruber committed
383
  length = Smi::ToInt(answer);
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
  if (s->IsOneByteRepresentation() && length > 0) {
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
        isolate, result, isolate->factory()->NewRawOneByteString(length));
  } else {
    if (length < 0) length = -length;
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
        isolate, result, isolate->factory()->NewRawTwoByteString(length));
  }
  return ConvertCaseHelper(isolate, *s, *result, length, mapping);
}

}  // namespace

// String.prototype.toLocaleLowerCase: converts the receiver string with the
// isolate's to-lower mapping.
BUILTIN(StringPrototypeToLocaleLowerCase) {
  HandleScope scope(isolate);
  TO_THIS_STRING(string, "String.prototype.toLocaleLowerCase");
  auto* lower_mapping = isolate->runtime_state()->to_lower_mapping();
  return ConvertCase(string, isolate, lower_mapping);
}

// String.prototype.toLocaleUpperCase: converts the receiver string with the
// isolate's to-upper mapping.
BUILTIN(StringPrototypeToLocaleUpperCase) {
  HandleScope scope(isolate);
  TO_THIS_STRING(string, "String.prototype.toLocaleUpperCase");
  auto* upper_mapping = isolate->runtime_state()->to_upper_mapping();
  return ConvertCase(string, isolate, upper_mapping);
}

// String.prototype.toLowerCase: converts the receiver string with the
// isolate's to-lower mapping.
BUILTIN(StringPrototypeToLowerCase) {
  HandleScope scope(isolate);
  TO_THIS_STRING(string, "String.prototype.toLowerCase");
  auto* lower_mapping = isolate->runtime_state()->to_lower_mapping();
  return ConvertCase(string, isolate, lower_mapping);
}

// String.prototype.toUpperCase: converts the receiver string with the
// isolate's to-upper mapping.
BUILTIN(StringPrototypeToUpperCase) {
  HandleScope scope(isolate);
  TO_THIS_STRING(string, "String.prototype.toUpperCase");
  auto* upper_mapping = isolate->runtime_state()->to_upper_mapping();
  return ConvertCase(string, isolate, upper_mapping);
}
424
#endif  // !V8_INTL_SUPPORT
425

426 427 428 429 430 431 432 433 434 435 436 437 438
// ES6 #sec-string.raw
// String.raw ( template, ...substitutions )
//
// Reads the "raw" property of |template|, converts it to an object and
// concatenates its first |length| elements. Before every element but the
// first, the corresponding substitution argument is appended, provided that
// the argument was actually passed.
BUILTIN(StringRaw) {
  HandleScope scope(isolate);
  Handle<Object> templ = args.atOrUndefined(isolate, 1);
  const uint32_t argc = args.length();
  Handle<String> raw_string =
      isolate->factory()->NewStringFromAsciiChecked("raw");

  Handle<Object> cooked;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, cooked,
                                     Object::ToObject(isolate, templ));

  Handle<Object> raw;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, raw, Object::GetProperty(isolate, cooked, raw_string));
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, raw,
                                     Object::ToObject(isolate, raw));

  Handle<Object> raw_len;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, raw_len,
      Object::GetProperty(isolate, raw, isolate->factory()->length_string()));
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, raw_len,
                                     Object::ToLength(isolate, raw_len));

  // Intentional spec violation: we ignore {length} values >= 2^32, because
  // assuming non-empty chunks they would generate too-long strings anyway.
  constexpr uint32_t kMaxSegments = std::numeric_limits<uint32_t>::max();
  const double raw_len_number = raw_len->Number();
  uint32_t length;
  if (raw_len_number > kMaxSegments) {
    length = kMaxSegments;
  } else {
    length = static_cast<uint32_t>(raw_len_number);
  }

  IncrementalStringBuilder result_builder(isolate);
  for (uint32_t i = 0; i < length; i++) {
    if (i > 0) {
      // Substitution i - 1 sits at argument index i + 1: index 0 is the
      // receiver and index 1 is the template object.
      const uint32_t arg_i = i + 1;
      if (arg_i < argc) {
        Handle<String> argument_string;
        ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
            isolate, argument_string,
            Object::ToString(isolate, args.at(arg_i)));
        result_builder.AppendString(argument_string);
      }
    }

    Handle<Object> element;
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, element,
                                       Object::GetElement(isolate, raw, i));

    Handle<String> element_string;
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, element_string,
                                       Object::ToString(isolate, element));
    result_builder.AppendString(element_string);
  }

  RETURN_RESULT_OR_FAILURE(isolate, result_builder.Finish());
}

491 492
}  // namespace internal
}  // namespace v8