// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_OBJECTS_STRING_H_
#define V8_OBJECTS_STRING_H_

#include <memory>

#include "src/base/bits.h"
#include "src/base/export-template.h"
#include "src/objects/instance-type.h"
#include "src/objects/name.h"
#include "src/objects/smi.h"
#include "src/strings/unicode-decoder.h"
#include "torque-generated/field-offsets.h"

// Has to be the last include (doesn't have include guards):
#include "src/objects/object-macros.h"

namespace v8 {
namespace internal {

// Forward declarations of types that are only named (not used by value) in
// this header.
class SharedStringAccessGuardIfNeeded;

enum InstanceType : uint16_t;

// Flags accepted by String::ToCString(): whether embedded nulls are allowed
// in the output, and whether the traversal must be robust against unexpected
// data (see the comment on String::ToCString()).
enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };
enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };

// The characteristics of a string are stored in its map.  Retrieving these
// few bits of information is moderately expensive, involving two memory
// loads where the second is dependent on the first.  To improve efficiency
// the shape of the string is given its own class so that it can be retrieved
// once and used for several string operations.  A StringShape is small enough
// to be passed by value and is immutable, but be aware that flattening a
// string can potentially alter its shape.  Also be aware that a GC caused by
// something else can alter the shape of a string due to ConsString
// shortcutting.  Keeping these restrictions in mind has proven to be error-
// prone and so we no longer put StringShapes in variables unless there is a
// concrete performance benefit at that particular point in the code.
42
class StringShape {
43
 public:
44
  inline explicit StringShape(const String s);
45
  inline explicit StringShape(Map s);
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
  inline explicit StringShape(InstanceType t);
  inline bool IsSequential();
  inline bool IsExternal();
  inline bool IsCons();
  inline bool IsSliced();
  inline bool IsThin();
  inline bool IsIndirect();
  inline bool IsExternalOneByte();
  inline bool IsExternalTwoByte();
  inline bool IsSequentialOneByte();
  inline bool IsSequentialTwoByte();
  inline bool IsInternalized();
  inline StringRepresentationTag representation_tag();
  inline uint32_t encoding_tag();
  inline uint32_t full_representation_tag();
#ifdef DEBUG
  inline uint32_t type() { return type_; }
  inline void invalidate() { valid_ = false; }
  inline bool valid() { return valid_; }
#else
  inline void invalidate() {}
#endif

69 70 71 72 73 74 75
  // Run different behavior for each concrete string class type, as defined by
  // the dispatcher.
  template <typename TDispatcher, typename TResult, typename... TArgs>
  inline TResult DispatchToSpecificTypeWithoutCast(TArgs&&... args);
  template <typename TDispatcher, typename TResult, typename... TArgs>
  inline TResult DispatchToSpecificType(String str, TArgs&&... args);

76 77 78 79 80 81 82 83 84 85
 private:
  uint32_t type_;
#ifdef DEBUG
  inline void set_valid() { valid_ = true; }
  bool valid_;
#else
  inline void set_valid() {}
#endif
};

#include "torque-generated/src/objects/string-tq.inc"

88 89 90 91 92 93 94 95
// The String abstract class captures JavaScript string values:
//
// Ecma-262:
//  4.3.16 String Value
//    A string value is a member of the type String and is a finite
//    ordered sequence of zero or more 16-bit unsigned integer values.
//
// All string values have a length field.
96
class String : public TorqueGeneratedString<String, Name> {
97 98 99 100 101 102 103 104
 public:
  enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };

  // Representation of the flat content of a String.
  // A non-flat string doesn't have flat content.
  // A flat string has content that's encoded as a sequence of either
  // one-byte chars or two-byte UC16.
  // Returned by String::GetFlatContent().
105 106 107 108
  // Not safe to use from concurrent background threads.
  // TODO(solanes): Move FlatContent into FlatStringReader, and make it private.
  // This would de-duplicate code, as well as taking advantage of the fact that
  // FlatStringReader is relocatable.
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
  class FlatContent {
   public:
    // Returns true if the string is flat and this structure contains content.
    bool IsFlat() const { return state_ != NON_FLAT; }
    // Returns true if the structure contains one-byte content.
    bool IsOneByte() const { return state_ == ONE_BYTE; }
    // Returns true if the structure contains two-byte content.
    bool IsTwoByte() const { return state_ == TWO_BYTE; }

    // Return the one byte content of the string. Only use if IsOneByte()
    // returns true.
    Vector<const uint8_t> ToOneByteVector() const {
      DCHECK_EQ(ONE_BYTE, state_);
      return Vector<const uint8_t>(onebyte_start, length_);
    }
    // Return the two-byte content of the string. Only use if IsTwoByte()
    // returns true.
    Vector<const uc16> ToUC16Vector() const {
      DCHECK_EQ(TWO_BYTE, state_);
      return Vector<const uc16>(twobyte_start, length_);
    }

    uc16 Get(int i) const {
      DCHECK(i < length_);
      DCHECK(state_ != NON_FLAT);
      if (state_ == ONE_BYTE) return onebyte_start[i];
      return twobyte_start[i];
    }

    bool UsesSameString(const FlatContent& other) const {
      return onebyte_start == other.onebyte_start;
    }

   private:
    enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };

    // Constructors only used by String::GetFlatContent().
146
    FlatContent(const uint8_t* start, int length,
147
                const DisallowGarbageCollection& no_gc)
148 149 150 151 152
        : onebyte_start(start),
          length_(length),
          state_(ONE_BYTE),
          no_gc_(no_gc) {}
    FlatContent(const uc16* start, int length,
153
                const DisallowGarbageCollection& no_gc)
154 155 156 157
        : twobyte_start(start),
          length_(length),
          state_(TWO_BYTE),
          no_gc_(no_gc) {}
158
    explicit FlatContent(const DisallowGarbageCollection& no_gc)
159
        : onebyte_start(nullptr), length_(0), state_(NON_FLAT), no_gc_(no_gc) {}
160 161 162 163 164 165 166

    union {
      const uint8_t* onebyte_start;
      const uc16* twobyte_start;
    };
    int length_;
    State state_;
167
    const DisallowGarbageCollection& no_gc_;
168 169 170 171 172

    friend class String;
    friend class IterableSubString;
  };

173 174
  void MakeThin(Isolate* isolate, String canonical);

175
  template <typename Char>
176
  V8_INLINE Vector<const Char> GetCharVector(
177
      const DisallowGarbageCollection& no_gc);
178

179 180 181
  // Get chars from sequential or external strings. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
182
  template <typename Char>
183
  inline const Char* GetChars(const DisallowGarbageCollection& no_gc);
184

185 186 187
  // Get chars from sequential or external strings.
  template <typename Char>
  inline const Char* GetChars(
188
      const DisallowGarbageCollection& no_gc,
189 190
      const SharedStringAccessGuardIfNeeded& access_guard);

191 192 193
  // Returns the address of the character at an offset into this string.
  // Requires: this->IsFlat()
  const byte* AddressOfCharacterAt(int start_index,
194
                                   const DisallowGarbageCollection& no_gc);
195

196 197
  // Get and set the length of the string using acquire loads and release
  // stores.
198
  DECL_SYNCHRONIZED_INT_ACCESSORS(length)
199 200 201 202 203

  // Returns whether this string has only one-byte chars, i.e. all of them can
  // be one-byte encoded.  This might be the case even if the string is
  // two-byte.  Such strings may appear when the embedder prefers
  // two-byte external representations even for one-byte data.
204 205
  DECL_GETTER(IsOneByteRepresentation, bool)
  DECL_GETTER(IsTwoByteRepresentation, bool)
206 207 208

  // Cons and slices have an encoding flag that may not represent the actual
  // encoding of the underlying string.  This is taken into account here.
209 210 211
  // This function is static because that helps it get inlined.
  // Requires: string.IsFlat()
  static inline bool IsOneByteRepresentationUnderneath(String string);
212 213 214 215 216

  // Get and set individual two byte chars in the string.
  inline void Set(int index, uint16_t value);
  // Get individual two byte char in the string.  Repeated calls
  // to this method are not efficient unless the string is flat.
217 218 219 220
  // If it is called from a background thread, the LocalIsolate version should
  // be used.
  V8_INLINE uint16_t Get(int index, Isolate* isolate = nullptr);
  V8_INLINE uint16_t Get(int index, LocalIsolate* local_isolate);
221 222

  // ES6 section 7.1.3.1 ToNumber Applied to the String Type
223
  static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);
224 225 226 227 228 229 230 231 232 233 234 235 236 237

  // Flattens the string.  Checks first inline to see if it is
  // necessary.  Does nothing if the string is not a cons string.
  // Flattening allocates a sequential string with the same data as
  // the given string and mutates the cons string to a degenerate
  // form, where the first component is the new sequential string and
  // the second component is the empty string.  If allocation fails,
  // this function returns a failure.  If flattening succeeds, this
  // function returns the sequential string that is now the first
  // component of the cons string.
  //
  // Degenerate cons strings are handled specially by the garbage
  // collector (see IsShortcutCandidate).

238 239 240
  static inline Handle<String> Flatten(
      Isolate* isolate, Handle<String> string,
      AllocationType allocation = AllocationType::kYoung);
241
  static inline Handle<String> Flatten(
242
      LocalIsolate* isolate, Handle<String> string,
243
      AllocationType allocation = AllocationType::kYoung);
244 245 246 247 248 249

  // Tries to return the content of a flat string as a structure holding either
  // a flat vector of char or of uc16.
  // If the string isn't flat, and therefore doesn't have flat content, the
  // returned structure will report so, and can't provide a vector of either
  // kind.
250
  V8_EXPORT_PRIVATE FlatContent
251
  GetFlatContent(const DisallowGarbageCollection& no_gc);
252 253 254

  // Returns the parent of a sliced string or first part of a flat cons string.
  // Requires: StringShape(this).IsIndirect() && this->IsFlat()
255
  inline String GetUnderlying();
256 257 258 259 260 261 262 263 264 265 266 267

  // String relational comparison, implemented according to ES6 section 7.2.11
  // Abstract Relational Comparison (step 5): The comparison of Strings uses a
  // simple lexicographic ordering on sequences of code unit values. There is no
  // attempt to use the more complex, semantically oriented definitions of
  // character or string equality and collating order defined in the Unicode
  // specification. Therefore String values that are canonically equal according
  // to the Unicode standard could test as unequal. In effect this algorithm
  // assumes that both Strings are already in normalized form. Also, note that
  // for strings containing supplementary characters, lexicographic ordering on
  // sequences of UTF-16 code unit values differs from that on sequences of code
  // point values.
268 269
  V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
                                                        Handle<String> x,
270
                                                        Handle<String> y);
271 272

  // Perform ES6 21.1.3.8, including checking arguments.
273 274
  static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
                        Handle<Object> search, Handle<Object> position);
275 276 277 278 279 280
  // Perform string match of pattern on subject, starting at start index.
  // Caller must ensure that 0 <= start_index <= sub->length(), as this does not
  // check any arguments.
  static int IndexOf(Isolate* isolate, Handle<String> receiver,
                     Handle<String> search, int start_index);

281 282
  static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
                            Handle<Object> search, Handle<Object> position);
283 284 285 286 287 288 289 290 291

  // Encapsulates logic related to a match and its capture groups as required
  // by GetSubstitution.
  class Match {
   public:
    virtual Handle<String> GetMatch() = 0;
    virtual Handle<String> GetPrefix() = 0;
    virtual Handle<String> GetSuffix() = 0;

292 293 294
    // A named capture can be unmatched (either not specified in the pattern,
    // or specified but unmatched in the current string), or matched.
    enum CaptureState { UNMATCHED, MATCHED };
295 296 297 298 299 300 301

    virtual int CaptureCount() = 0;
    virtual bool HasNamedCaptures() = 0;
    virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;
    virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
                                                CaptureState* state) = 0;

302
    virtual ~Match() = default;
303 304 305 306 307 308 309 310
  };

  // ES#sec-getsubstitution
  // GetSubstitution(matched, str, position, captures, replacement)
  // Expand the $-expressions in the string and return a new string with
  // the result.
  // A {start_index} can be passed to specify where to start scanning the
  // replacement string.
311
  V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
312 313 314 315
      Isolate* isolate, Match* match, Handle<String> replacement,
      int start_index = 0);

  // String equality operations.
316
  inline bool Equals(String other);
317 318
  inline static bool Equals(Isolate* isolate, Handle<String> one,
                            Handle<String> two);
319

320
  enum class EqualityType { kWholeString, kPrefix, kNoLengthCheck };
321

322 323 324 325
  // Check if this string matches the given vector of characters, either as a
  // whole string or just a prefix.
  //
  // The Isolate is passed as "evidence" that this call is on the main thread,
326 327 328 329
  // and to distiguish from the LocalIsolate overload.
  template <EqualityType kEqType = EqualityType::kWholeString, typename Char>
  inline bool IsEqualTo(Vector<const Char> str,
                        Isolate* isolate = nullptr) const;
330 331 332 333 334 335

  // Check if this string matches the given vector of characters, either as a
  // whole string or just a prefix.
  //
  // The LocalIsolate is passed to provide access to the string access lock,
  // which is taken when reading the string's contents on a background thread.
336 337
  template <EqualityType kEqType = EqualityType::kWholeString, typename Char>
  inline bool IsEqualTo(Vector<const Char> str, LocalIsolate* isolate) const;
338

339
  V8_EXPORT_PRIVATE bool HasOneBytePrefix(Vector<const char> str);
340
  V8_EXPORT_PRIVATE inline bool IsOneByteEqualTo(Vector<const char> str);
341 342 343 344 345 346 347 348 349 350 351

  // Return a UTF8 representation of the string.  The string is null
  // terminated but may optionally contain nulls.  Length is returned
  // in length_output if length_output is not a null pointer  The string
  // should be nearly flat, otherwise the performance of this method may
  // be very slow (quadratic in the length).  Setting robustness_flag to
  // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust  This means it
  // handles unexpected data without causing assert failures and it does not
  // do any heap allocations.  This is useful when printing stack traces.
  std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
                                    RobustnessFlag robustness_flag, int offset,
352
                                    int length, int* length_output = nullptr);
353
  V8_EXPORT_PRIVATE std::unique_ptr<char[]> ToCString(
354 355
      AllowNullsFlag allow_nulls = DISALLOW_NULLS,
      RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
356
      int* length_output = nullptr);
357 358

  // Externalization.
359 360 361 362
  V8_EXPORT_PRIVATE bool MakeExternal(
      v8::String::ExternalStringResource* resource);
  V8_EXPORT_PRIVATE bool MakeExternal(
      v8::String::ExternalOneByteStringResource* resource);
363
  V8_EXPORT_PRIVATE bool SupportsExternalization();
364 365

  // Conversion.
366
  // "array index": an index allowed by the ES spec for JSArrays.
367
  inline bool AsArrayIndex(uint32_t* index);
368 369 370 371 372 373 374 375 376 377 378 379 380

  // This is used for calculating array indices but differs from an
  // Array Index in the regard that this does not support the full
  // array index range. This only supports positive numbers less than
  // or equal to INT_MAX.
  //
  // String::AsArrayIndex might be a better fit if you're looking to
  // calculate the array index.
  //
  // if val < 0 or val > INT_MAX, returns -1
  // if 0 <= val <= INT_MAX, returns val
  static int32_t ToArrayIndex(Address addr);

381
  uint32_t inline ToValidIndex(Object number);
382 383 384
  // "integer index": the string is the decimal representation of an
  // integer in the range of a size_t. Useful for TypedArray accesses.
  inline bool AsIntegerIndex(size_t* index);
385 386

  // Trimming.
387
  enum TrimMode { kTrim, kTrimStart, kTrimEnd };
388

389
  V8_EXPORT_PRIVATE void PrintOn(FILE* out);
390 391 392 393

  // For use during stack traces.  Performs rudimentary sanity check.
  bool LooksValid();

394 395 396 397 398 399 400
  // Printing utility functions.
  // - PrintUC16 prints the raw string contents to the given stream.
  //   Non-printable characters are formatted as hex, but otherwise the string
  //   is printed as-is.
  // - StringShortPrint and StringPrint have extra formatting: they add a
  //   prefix and suffix depending on the string kind, may add other information
  //   such as the string heap object address, may truncate long strings, etc.
401 402 403
  const char* PrefixForDebugPrint() const;
  const char* SuffixForDebugPrint() const;
  void StringShortPrint(StringStream* accumulator);
404
  void PrintUC16(std::ostream& os, int start = 0, int end = -1);  // NOLINT
405
  void PrintUC16(StringStream* accumulator, int start, int end);
406 407

  // Dispatched behavior.
408 409 410
#if defined(DEBUG) || defined(OBJECT_PRINT)
  char* ToAsciiArray();
#endif
411 412
  DECL_PRINTER(String)
  DECL_VERIFIER(String)
413 414 415 416 417 418 419 420 421 422 423

  inline bool IsFlat();

  // Max char codes.
  static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
  static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
  static const int kMaxUtf16CodeUnit = 0xffff;
  static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
  static const uc32 kMaxCodePoint = 0x10ffff;

  // Maximal string length.
424 425 426
  // The max length is different on 32 and 64 bit platforms. Max length for
  // 32-bit platforms is ~268.4M chars. On 64-bit platforms, max length is
  // ~536.8M chars.
427 428
  // See include/v8.h for the definition.
  static const int kMaxLength = v8::String::kMaxLength;
429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
  // There are several defining limits imposed by our current implementation:
  // - any string's length must fit into a Smi.
  static_assert(kMaxLength <= kSmiMaxValue,
                "String length must fit into a Smi");
  // - adding two string lengths must still fit into a 32-bit int without
  //   overflow
  static_assert(kMaxLength * 2 <= kMaxInt,
                "String::kMaxLength * 2 must fit into an int32");
  // - any heap object's size in bytes must be able to fit into a Smi, because
  //   its space on the heap might be filled with a Filler; for strings this
  //   means SeqTwoByteString::kMaxSize must be able to fit into a Smi.
  static_assert(kMaxLength * 2 + kHeaderSize <= kSmiMaxValue,
                "String object size in bytes must fit into a Smi");
  // - any heap object's size in bytes must be able to fit into an int, because
  //   that's what our object handling code uses almost everywhere.
  static_assert(kMaxLength * 2 + kHeaderSize <= kMaxInt,
                "String object size in bytes must fit into an int");
446 447 448 449 450 451 452 453 454 455

  // Max length for computing hash. For strings longer than this limit the
  // string length is used as the hash value.
  static const int kMaxHashCalcLength = 16383;

  // Limit for truncation in short printing.
  static const int kMaxShortPrintLength = 1024;

  // Helper function for flattening strings.
  template <typename sinkchar>
456 457
  EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
  static void WriteToFlat(String source, sinkchar* sink, int from, int to);
458 459 460
  template <typename sinkchar>
  static void WriteToFlat(String source, sinkchar* sink, int from, int to,
                          const SharedStringAccessGuardIfNeeded&);
461 462

  static inline bool IsAscii(const char* chars, int length) {
463
    return IsAscii(reinterpret_cast<const uint8_t*>(chars), length);
464 465 466
  }

  static inline bool IsAscii(const uint8_t* chars, int length) {
467
    return NonAsciiStart(chars, length) >= length;
468 469 470
  }

  static inline int NonOneByteStart(const uc16* chars, int length) {
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499
    DCHECK(IsAligned(reinterpret_cast<Address>(chars), sizeof(uc16)));
    const uint16_t* start = chars;
    const uint16_t* limit = chars + length;

    if (static_cast<size_t>(length) >= kUIntptrSize) {
      // Check unaligned chars.
      while (!IsAligned(reinterpret_cast<Address>(chars), kUIntptrSize)) {
        if (*chars > unibrow::Latin1::kMaxChar) {
          return static_cast<int>(chars - start);
        }
        ++chars;
      }

      // Check aligned words.
      STATIC_ASSERT(unibrow::Latin1::kMaxChar == 0xFF);
#ifdef V8_TARGET_LITTLE_ENDIAN
      const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0xFF00;
#else
      const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0x00FF;
#endif
      while (chars + sizeof(uintptr_t) <= limit) {
        if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
          break;
        }
        chars += (sizeof(uintptr_t) / sizeof(uc16));
      }
    }

    // Check remaining unaligned chars, or find non-one-byte char in word.
500
    while (chars < limit) {
501 502 503
      if (*chars > unibrow::Latin1::kMaxChar) {
        return static_cast<int>(chars - start);
      }
504 505
      ++chars;
    }
506

507 508 509 510 511 512 513 514
    return static_cast<int>(chars - start);
  }

  static inline bool IsOneByte(const uc16* chars, int length) {
    return NonOneByteStart(chars, length) >= length;
  }

  template <class Visitor>
515 516
  static inline ConsString VisitFlat(Visitor* visitor, String string,
                                     int offset = 0);
517

518 519 520 521
  template <typename LocalIsolate>
  static Handle<FixedArray> CalculateLineEnds(LocalIsolate* isolate,
                                              Handle<String> string,
                                              bool include_ending_line);
522 523 524 525

 private:
  friend class Name;
  friend class StringTableInsertionKey;
526
  friend class InternalizedStringKey;
527

528 529 530
  // Implementation of the Get() public methods. Do not use directly.
  V8_INLINE uint16_t GetImpl(int index);

531
  // Implementation of the IsEqualTo() public methods. Do not use directly.
532
  template <EqualityType kEqType, typename Char>
533
  V8_INLINE bool IsEqualToImpl(
534
      Vector<const Char> str,
535 536
      const SharedStringAccessGuardIfNeeded& access_guard) const;

537 538
  V8_EXPORT_PRIVATE static Handle<String> SlowFlatten(
      Isolate* isolate, Handle<ConsString> cons, AllocationType allocation);
539 540 541

  // Slow case of String::Equals.  This implementation works on any strings
  // but it is most efficient on strings that are almost flat.
542
  V8_EXPORT_PRIVATE bool SlowEquals(String other);
543

544 545
  V8_EXPORT_PRIVATE static bool SlowEquals(Isolate* isolate, Handle<String> one,
                                           Handle<String> two);
546 547 548

  // Slow case of AsArrayIndex.
  V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
549
  V8_EXPORT_PRIVATE bool SlowAsIntegerIndex(size_t* index);
550 551

  // Compute and set the hash code.
552
  V8_EXPORT_PRIVATE uint32_t ComputeAndSetHash();
553

554
  TQ_OBJECT_CONSTRUCTORS(String)
555 556
};

// Explicit instantiation declarations of String::WriteToFlat for uint8_t and
// uint16_t sinks, with and without an access guard argument.
// clang-format off
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to,
                         const SharedStringAccessGuardIfNeeded&);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to,
                         const SharedStringAccessGuardIfNeeded&);
// clang-format on

570 571
class SubStringRange {
 public:
572
  inline SubStringRange(String string, const DisallowGarbageCollection& no_gc,
573
                        int first = 0, int length = -1);
574 575 576 577 578 579 580 581
  class iterator;
  inline iterator begin();
  inline iterator end();

 private:
  String string_;
  int first_;
  int length_;
582
  const DisallowGarbageCollection& no_gc_;
583 584 585
};

// The SeqString abstract class captures sequential string values.
586
class SeqString : public TorqueGeneratedSeqString<SeqString, String> {
587 588 589 590
 public:
  // Truncate the string in-place if possible and return the result.
  // In case of new_length == 0, the empty string is returned without
  // truncating the original string.
591 592
  V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
                                                       int new_length);
593

594
  TQ_OBJECT_CONSTRUCTORS(SeqString)
595 596
};

597 598
class InternalizedString
    : public TorqueGeneratedInternalizedString<InternalizedString, String> {
599 600 601
 public:
  // TODO(neis): Possibly move some stuff from String here.

602
  TQ_OBJECT_CONSTRUCTORS(InternalizedString)
603 604
};

605 606
// The OneByteString class captures sequential one-byte string objects.
// Each character in the OneByteString is an one-byte character.
607 608
class SeqOneByteString
    : public TorqueGeneratedSeqOneByteString<SeqOneByteString, SeqString> {
609 610
 public:
  static const bool kHasOneByteEncoding = true;
611
  using Char = uint8_t;
612 613

  // Dispatched behavior.
614
  inline uint8_t Get(int index);
615 616 617 618 619
  inline void SeqOneByteStringSet(int index, uint16_t value);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress();

620 621 622
  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
623
  inline uint8_t* GetChars(const DisallowGarbageCollection& no_gc);
624

625
  // Get a pointer to the characters of the string.
626
  inline uint8_t* GetChars(const DisallowGarbageCollection& no_gc,
627 628
                           const SharedStringAccessGuardIfNeeded& access_guard);

629 630 631 632
  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

633 634 635 636 637 638
  // Garbage collection support.  This method is called by the
  // garbage collector to compute the actual size of an OneByteString
  // instance.
  inline int SeqOneByteStringSize(InstanceType instance_type);

  // Maximal memory usage for a single sequential one-byte string.
639 640
  static const int kMaxCharsSize = kMaxLength;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
641 642
  STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);

643 644
  int AllocatedSize();

645 646
  class BodyDescriptor;

647
  TQ_OBJECT_CONSTRUCTORS(SeqOneByteString)
648 649 650 651
};

// The TwoByteString class captures sequential unicode string objects.
// Each character in the TwoByteString is a two-byte uint16_t.
652 653
class SeqTwoByteString
    : public TorqueGeneratedSeqTwoByteString<SeqTwoByteString, SeqString> {
654 655
 public:
  static const bool kHasOneByteEncoding = false;
656
  using Char = uint16_t;
657 658

  // Dispatched behavior.
659
  inline uint16_t Get(int index);
660 661 662 663 664
  inline void SeqTwoByteStringSet(int index, uint16_t value);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress();

665 666 667
  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
668
  inline uc16* GetChars(const DisallowGarbageCollection& no_gc);
669

670
  // Get a pointer to the characters of the string.
671
  inline uc16* GetChars(const DisallowGarbageCollection& no_gc,
672 673
                        const SharedStringAccessGuardIfNeeded& access_guard);

674 675 676 677
  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

678 679 680 681 682 683
  // Garbage collection support.  This method is called by the
  // garbage collector to compute the actual size of a TwoByteString
  // instance.
  inline int SeqTwoByteStringSize(InstanceType instance_type);

  // Maximal memory usage for a single sequential two-byte string.
684 685
  static const int kMaxCharsSize = kMaxLength * 2;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
686 687 688
  STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
                String::kMaxLength);

689 690
  int AllocatedSize();

691 692
  class BodyDescriptor;

693
  TQ_OBJECT_CONSTRUCTORS(SeqTwoByteString)
694 695 696 697 698 699 700 701 702 703
};

// The ConsString class describes string values built by using the
// addition operator on strings.  A ConsString is a pair where the
// first and second components are pointers to other string values.
// One or both components of a ConsString can be pointers to other
// ConsStrings, creating a binary tree of ConsStrings where the leaves
// are non-ConsString string values.  The string value represented by
// a ConsString can be obtained by concatenating the leaf string
// values in a left-to-right depth-first traversal of the tree.
704
class ConsString : public TorqueGeneratedConsString<ConsString, String> {
705 706 707
 public:
  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
708
  inline Object unchecked_first();
709 710 711

  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
712
  inline Object unchecked_second();
713 714

  // Dispatched behavior.
715
  V8_EXPORT_PRIVATE uint16_t Get(int index);
716 717 718 719

  // Minimum length for a cons string.
  static const int kMinLength = 13;

720
  class BodyDescriptor;
721

722
  DECL_VERIFIER(ConsString)
723

724
  TQ_OBJECT_CONSTRUCTORS(ConsString)
725 726 727 728 729 730 731 732 733
};

// The ThinString class describes string objects that are just references
// to another string object. They are used for in-place internalization when
// the original string cannot actually be internalized in-place: in these
// cases, the original string is converted to a ThinString pointing at its
// internalized version (which is allocated as a new object).
// In terms of memory layout and most algorithms operating on strings,
// ThinStrings can be thought of as "one-part cons strings".
734
class ThinString : public TorqueGeneratedThinString<ThinString, String> {
735
 public:
736
  DECL_GETTER(unchecked_actual, HeapObject)
737

738
  V8_EXPORT_PRIVATE uint16_t Get(int index);
739

740
  DECL_VERIFIER(ThinString)
741

742
  class BodyDescriptor;
743

744
  TQ_OBJECT_CONSTRUCTORS(ThinString)
745 746 747 748 749 750 751 752 753 754 755 756 757 758
};

// The Sliced String class describes strings that are substrings of another
// sequential string.  The motivation is to save time and memory when creating
// a substring.  A Sliced String is described as a pointer to the parent,
// the offset from the start of the parent string and the length.  Using
// a Sliced String therefore requires unpacking of the parent string and
// adding the offset to the start address.  A substring of a Sliced String
// are not nested since the double indirection is simplified when creating
// such a substring.
// Currently missing features are:
//  - handling externalized parent strings
//  - external strings as parent
//  - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
759
class SlicedString : public TorqueGeneratedSlicedString<SlicedString, String> {
760
 public:
761
  inline void set_parent(String parent,
762 763
                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
  // Dispatched behavior.
764
  V8_EXPORT_PRIVATE uint16_t Get(int index);
765 766 767 768

  // Minimum length for a sliced string.
  static const int kMinLength = 13;

769
  class BodyDescriptor;
770

771
  DECL_VERIFIER(SlicedString)
772

773
  TQ_OBJECT_CONSTRUCTORS(SlicedString)
774 775 776 777 778 779 780 781 782 783 784 785 786
};

// The ExternalString class describes string values that are backed by
// a string resource that lies outside the V8 heap.  ExternalStrings
// consist of the length field common to all strings, a pointer to the
// external resource.  It is important to ensure (externally) that the
// resource is not deallocated while the ExternalString is live in the
// V8 heap.
//
// The API expects that all ExternalStrings are created through the
// API.  Therefore, ExternalStrings should not be used internally.
class ExternalString : public String {
 public:
787
  DECL_CAST(ExternalString)
788
  DECL_VERIFIER(ExternalString)
789

790 791 792 793 794 795
  DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
                                TORQUE_GENERATED_EXTERNAL_STRING_FIELDS)

  // Size of uncached external strings.
  static const int kUncachedSize =
      kResourceOffset + FIELD_SIZE(kResourceOffset);
796

797 798
  inline void AllocateExternalPointerEntries(Isolate* isolate);

799 800
  // Return whether the external string data pointer is not cached.
  inline bool is_uncached() const;
801 802
  // Size in bytes of the external payload.
  int ExternalPayloadSize() const;
803

804
  // Used in the serializer/deserializer.
805 806
  DECL_GETTER(resource_as_address, Address)
  inline void set_address_as_resource(Isolate* isolate, Address address);
807 808
  inline uint32_t GetResourceRefForDeserialization();
  inline void SetResourceRefForSerialization(uint32_t ref);
809

810
  // Disposes string's resource object if it has not already been disposed.
811
  inline void DisposeResource(Isolate* isolate);
812

813
  STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);
814
  static const int kSizeOfAllExternalStrings = kHeaderSize;
815

816
  OBJECT_CONSTRUCTORS(ExternalString, String);
817 818 819 820 821 822 823 824
};

// The ExternalOneByteString class is an external string backed by an
// one-byte string.
class ExternalOneByteString : public ExternalString {
 public:
  static const bool kHasOneByteEncoding = true;

825
  using Resource = v8::String::ExternalOneByteStringResource;
826 827

  // The underlying resource.
828
  DECL_GETTER(resource, const Resource*)
829 830 831 832

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsability of the caller the handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);
833

834
  // Used only during serialization.
835
  inline void set_resource(Isolate* isolate, const Resource* buffer);
836 837 838 839 840

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does =
  // not move during lifetime.  Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
841
  inline void update_data_cache(Isolate* isolate);
842 843 844 845

  inline const uint8_t* GetChars();

  // Dispatched behavior.
846
  inline uint8_t Get(int index);
847

848
  DECL_CAST(ExternalOneByteString)
849 850 851

  class BodyDescriptor;

852 853 854 855 856 857
  DEFINE_FIELD_OFFSET_CONSTANTS(
      ExternalString::kHeaderSize,
      TORQUE_GENERATED_EXTERNAL_ONE_BYTE_STRING_FIELDS)

  STATIC_ASSERT(kSize == kSizeOfAllExternalStrings);

858
  OBJECT_CONSTRUCTORS(ExternalOneByteString, ExternalString);
859 860 861 862 863 864 865 866
};

// The ExternalTwoByteString class is an external string backed by a UTF-16
// encoded string.
class ExternalTwoByteString : public ExternalString {
 public:
  static const bool kHasOneByteEncoding = false;

867
  using Resource = v8::String::ExternalStringResource;
868 869

  // The underlying string resource.
870
  DECL_GETTER(resource, const Resource*)
871 872 873 874

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsability of the caller the handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);
875

876
  // Used only during serialization.
877
  inline void set_resource(Isolate* isolate, const Resource* buffer);
878 879 880 881 882

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does =
  // not move during lifetime.  Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
883
  inline void update_data_cache(Isolate* isolate);
884 885 886 887

  inline const uint16_t* GetChars();

  // Dispatched behavior.
888
  inline uint16_t Get(int index);
889 890 891 892

  // For regexp code.
  inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);

893
  DECL_CAST(ExternalTwoByteString)
894 895 896

  class BodyDescriptor;

897 898 899 900 901 902
  DEFINE_FIELD_OFFSET_CONSTANTS(
      ExternalString::kHeaderSize,
      TORQUE_GENERATED_EXTERNAL_TWO_BYTE_STRING_FIELDS)

  STATIC_ASSERT(kSize == kSizeOfAllExternalStrings);

903
  OBJECT_CONSTRUCTORS(ExternalTwoByteString, ExternalString);
904 905 906
};

// A flat string reader provides random access to the contents of a
907
// string independent of the character width of the string. The handle
908
// must be valid as long as the reader is being used.
909
// Not safe to use from concurrent background threads.
910
class V8_EXPORT_PRIVATE FlatStringReader : public Relocatable {
911 912
 public:
  FlatStringReader(Isolate* isolate, Handle<String> str);
913
  void PostGarbageCollection() override;
914 915 916 917 918 919
  inline uc32 Get(int index);
  template <typename Char>
  inline Char Get(int index);
  int length() { return length_; }

 private:
920
  Handle<String> str_;
921 922 923 924 925 926 927 928 929 930
  bool is_one_byte_;
  int length_;
  const void* start_;
};

// This maintains an off-stack representation of the stack frames required
// to traverse a ConsString, allowing an entirely iterative and restartable
// traversal of the entire string
class ConsStringIterator {
 public:
931
  inline ConsStringIterator() = default;
932
  inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) {
933 934
    Reset(cons_string, offset);
  }
935 936
  ConsStringIterator(const ConsStringIterator&) = delete;
  ConsStringIterator& operator=(const ConsStringIterator&) = delete;
937
  inline void Reset(ConsString cons_string, int offset = 0) {
938
    depth_ = 0;
939
    // Next will always return nullptr.
940
    if (cons_string.is_null()) return;
941 942
    Initialize(cons_string, offset);
  }
943
  // Returns nullptr when complete.
944
  inline String Next(int* offset_out) {
945
    *offset_out = 0;
946
    if (depth_ == 0) return String();
947 948 949 950 951 952 953
    return Continue(offset_out);
  }

 private:
  static const int kStackSize = 32;
  // Use a mask instead of doing modulo operations for stack wrapping.
  static const int kDepthMask = kStackSize - 1;
954 955
  static_assert(base::bits::IsPowerOfTwo(kStackSize),
                "kStackSize must be power of two");
956 957
  static inline int OffsetForDepth(int depth);

958 959
  inline void PushLeft(ConsString string);
  inline void PushRight(ConsString string);
960 961 962
  inline void AdjustMaximumDepth();
  inline void Pop();
  inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; }
963 964
  V8_EXPORT_PRIVATE void Initialize(ConsString cons_string, int offset);
  V8_EXPORT_PRIVATE String Continue(int* offset_out);
965 966
  String NextLeaf(bool* blew_stack);
  String Search(int* offset_out);
967 968 969

  // Stack must always contain only frames for which right traversal
  // has not yet been performed.
970 971
  ConsString frames_[kStackSize];
  ConsString root_;
972 973 974 975 976 977 978
  int depth_;
  int maximum_depth_;
  int consumed_;
};

class StringCharacterStream {
 public:
979
  inline explicit StringCharacterStream(String string, int offset = 0);
980 981
  StringCharacterStream(const StringCharacterStream&) = delete;
  StringCharacterStream& operator=(const StringCharacterStream&) = delete;
982 983
  inline uint16_t GetNext();
  inline bool HasMore();
984
  inline void Reset(String string, int offset = 0);
985 986 987 988 989 990 991 992 993 994 995 996 997
  inline void VisitOneByteString(const uint8_t* chars, int length);
  inline void VisitTwoByteString(const uint16_t* chars, int length);

 private:
  ConsStringIterator iter_;
  bool is_one_byte_;
  union {
    const uint8_t* buffer8_;
    const uint16_t* buffer16_;
  };
  const uint8_t* end_;
};

998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
template <typename Char>
struct CharTraits;

template <>
struct CharTraits<uint8_t> {
  using String = SeqOneByteString;
  using ExternalString = ExternalOneByteString;
};

template <>
struct CharTraits<uint16_t> {
  using String = SeqTwoByteString;
  using ExternalString = ExternalTwoByteString;
};

1013 1014 1015 1016 1017 1018
}  // namespace internal
}  // namespace v8

#include "src/objects/object-macros-undef.h"

#endif  // V8_OBJECTS_STRING_H_