regexp-macro-assembler.cc 15 KB
Newer Older
1
// Copyright 2012 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
#include "src/regexp/regexp-macro-assembler.h"
6

7
#include "src/codegen/assembler.h"
8 9
#include "src/execution/isolate-inl.h"
#include "src/execution/simulator.h"
10
#include "src/regexp/regexp-stack.h"
11
#include "src/strings/unicode-inl.h"
12

13
#ifdef V8_INTL_SUPPORT
14
#include "unicode/uchar.h"
15
#include "unicode/unistr.h"
16
#endif  // V8_INTL_SUPPORT
17

18 19
namespace v8 {
namespace internal {
20

21 22 23 24 25
// Creates a macro assembler bound to |isolate| and |zone|. The assembler
// starts with slow_safe_compiler_ set to false and global_mode_ set to
// NOT_GLOBAL.
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
    : slow_safe_compiler_(false),
      global_mode_(NOT_GLOBAL),
      isolate_(isolate),
      zone_(zone) {}
26

27
// Defaulted destructor, defined out of line in this translation unit.
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
28

29 30 31 32 33 34 35
// Compares two equally long UC16 substrings for equality, ignoring case.
//
// |byte_offset1| and |byte_offset2| are the addresses of the first code unit
// of each substring. |byte_length| is the length of each substring in bytes
// and must be even. |isolate| is only dereferenced when V8 is built without
// V8_INTL_SUPPORT, where it supplies the canonicalization mapping.
//
// Returns 1 if the substrings are equal ignoring case, and 0 otherwise.
int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
                                                     Address byte_offset2,
                                                     size_t byte_length,
                                                     Isolate* isolate) {
  // This function is not allowed to cause a garbage collection.
  // A GC might move the calling generated code and invalidate the
  // return address on the stack.
  DCHECK_EQ(0, byte_length % 2);

#ifdef V8_INTL_SUPPORT
  // ICU expects the length as an int32_t count of UTF-16 code units.
  int32_t length = static_cast<int32_t>(byte_length >> 1);
  icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
                               length);
  return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
                               length, U_FOLD_CASE_DEFAULT) == 0;
#else
  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
  size_t length = byte_length >> 1;
  DCHECK_NOT_NULL(isolate);
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      isolate->regexp_macro_assembler_canonicalize();
  for (size_t i = 0; i < length; i++) {
    unibrow::uchar c1 = substring1[i];
    unibrow::uchar c2 = substring2[i];
    if (c1 != c2) {
      // The buffers are pre-seeded with the raw character, so they still hold
      // it if get() does not write a mapping.
      unibrow::uchar s1[1] = {c1};
      canonicalize->get(c1, '\0', s1);
      if (s1[0] != c2) {
        unibrow::uchar s2[1] = {c2};
        canonicalize->get(c2, '\0', s2);
        if (s1[0] != s2[0]) {
          return 0;
        }
      }
    }
  }
  return 1;
#endif  // V8_INTL_SUPPORT
}


71 72 73 74 75 76 77 78 79 80 81 82
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
                                                   Label* on_failure) {
  Label ok;
  // Check that current character is not a trail surrogate.
  LoadCurrentCharacter(cp_offset, &ok);
  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
  // Check that previous character is not a lead surrogate.
  LoadCurrentCharacter(cp_offset - 1, &ok);
  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
  Bind(&ok);
}

83 84 85 86 87
// Performs a bounds-checked load of the character at |cp_offset|, passing
// |on_outside_input| as the label used when the load runs off the input.
void RegExpMacroAssembler::CheckPosition(int cp_offset,
                                         Label* on_outside_input) {
  const bool check_bounds = true;
  LoadCurrentCharacter(cp_offset, on_outside_input, check_bounds);
}

88 89 90 91 92 93 94 95 96 97 98 99 100 101
// Forwards to LoadCurrentCharacterImpl after resolving |eats_at_least|: the
// sentinel kUseCharactersValue stands for "use the value of |characters|".
// All other arguments are passed through unchanged.
void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
                                                Label* on_end_of_input,
                                                bool check_bounds,
                                                int characters,
                                                int eats_at_least) {
  const int resolved_eats_at_least =
      (eats_at_least == kUseCharactersValue) ? characters : eats_at_least;

  LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
                           resolved_eats_at_least);
}

102 103 104 105
// Base-class implementation: always returns false and reads neither |type|
// nor |on_no_match|.
// NOTE(review): presumably false tells the caller that no specialised check
// was emitted for this character class — confirm against the header.
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
                                                      Label* on_no_match) {
  return false;
}
106

107 108 109
// Forwards |isolate| and |zone| to the RegExpMacroAssembler base constructor;
// no additional state is initialised here.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
                                                       Zone* zone)
    : RegExpMacroAssembler(isolate, zone) {}
110

111
// Defaulted destructor, defined out of line in this translation unit.
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
lrn@chromium.org's avatar
lrn@chromium.org committed
112 113

bool NativeRegExpMacroAssembler::CanReadUnaligned() {
114
  return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
lrn@chromium.org's avatar
lrn@chromium.org committed
115 116
}

117
// Returns the address of the character at |start_index| in the character
// storage backing |subject|.
//
// One level of indirection is unwrapped first: a cons string is replaced by
// its first part, a sliced string by its parent (with the slice offset added
// to |start_index|), and after that a thin string by its actual string. The
// remaining string is expected to be sequential or external. |start_index| is
// added to the typed character pointer before the cast to byte*.
// |no_gc| is passed to the sequential strings' GetChars().
const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
    String subject, int start_index, const DisallowHeapAllocation& no_gc) {
  if (subject.IsConsString()) {
    subject = ConsString::cast(subject).first();
  } else if (subject.IsSlicedString()) {
    start_index += SlicedString::cast(subject).offset();
    subject = SlicedString::cast(subject).parent();
  }
  if (subject.IsThinString()) {
    subject = ThinString::cast(subject).actual();
  }
  DCHECK_LE(0, start_index);
  DCHECK_LE(start_index, subject.length());
  if (subject.IsSeqOneByteString()) {
    return reinterpret_cast<const byte*>(
        SeqOneByteString::cast(subject).GetChars(no_gc) + start_index);
  } else if (subject.IsSeqTwoByteString()) {
    return reinterpret_cast<const byte*>(
        SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index);
  } else if (subject.IsExternalOneByteString()) {
    return reinterpret_cast<const byte*>(
        ExternalOneByteString::cast(subject).GetChars() + start_index);
  } else {
    DCHECK(subject.IsExternalTwoByteString());
    return reinterpret_cast<const byte*>(
        ExternalTwoByteString::cast(subject).GetChars() + start_index);
  }
}

146
// This method may only be called after an interrupt.
147
int NativeRegExpMacroAssembler::CheckStackGuardState(
148
    Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
149
    Address* return_address, Code re_code, Address* subject,
150
    const byte** input_start, const byte** input_end) {
151 152
  DisallowHeapAllocation no_gc;

153 154
  DCHECK(re_code.raw_instruction_start() <= *return_address);
  DCHECK(*return_address <= re_code.raw_instruction_end());
155
  StackLimitCheck check(isolate);
156 157
  bool js_has_overflowed = check.JsHasOverflowed();

158
  if (call_origin == RegExp::CallOrigin::kFromJs) {
159 160 161 162 163
    // Direct calls from JavaScript can be interrupted in two ways:
    // 1. A real stack overflow, in which case we let the caller throw the
    //    exception.
    // 2. The stack guard was used to interrupt execution for another purpose,
    //    forcing the call through the runtime system.
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185

    // Bug(v8:9540) Investigate why this method is called from JS although no
    // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
    // to continue execution normally.
    if (js_has_overflowed) {
      return EXCEPTION;
    } else if (check.InterruptRequested()) {
      return RETRY;
    } else {
      return 0;
    }
  }
  DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);

  // Prepare for possible GC.
  HandleScope handles(isolate);
  Handle<Code> code_handle(re_code, isolate);
  Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
  int return_value = 0;

  if (js_has_overflowed) {
186
    AllowHeapAllocation yes_gc;
187 188
    isolate->StackOverflow();
    return_value = EXCEPTION;
189 190
  } else if (check.InterruptRequested()) {
    AllowHeapAllocation yes_gc;
191
    Object result = isolate->stack_guard()->HandleInterrupts();
192
    if (result.IsException(isolate)) return_value = EXCEPTION;
193 194 195
  }

  if (*code_handle != re_code) {  // Return address no longer valid
196
    intptr_t delta = code_handle->address() - re_code.address();
197 198 199 200 201 202 203
    // Overwrite the return address on the stack.
    *return_address += delta;
  }

  // If we continue, we need to update the subject string addresses.
  if (return_value == 0) {
    // String encoding might have changed.
204 205
    if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
        is_one_byte) {
206 207 208 209 210
      // If we changed between an LATIN1 and an UC16 string, the specialized
      // code cannot be used, and we need to restart regexp matching from
      // scratch (including, potentially, compiling a new version of the code).
      return_value = RETRY;
    } else {
211
      *subject = subject_handle->ptr();
212
      intptr_t byte_length = *input_end - *input_start;
213 214
      *input_start =
          StringCharacterPosition(*subject_handle, start_index, no_gc);
215 216
      *input_end = *input_start + byte_length;
    }
217
  }
218
  return return_value;
219 220
}

221
// Returns a {Result} sentinel, or the number of successful matches.
222
int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
223 224 225 226
                                      Handle<String> subject,
                                      int* offsets_vector,
                                      int offsets_vector_length,
                                      int previous_index, Isolate* isolate) {
227
  DCHECK(subject->IsFlat());
228 229
  DCHECK_LE(0, previous_index);
  DCHECK_LE(previous_index, subject->length());
230 231

  // No allocations before calling the regexp, but we can't use
232 233
  // DisallowHeapAllocation, since regexps might be preempted, and another
  // thread might do allocation anyway.
234

235
  String subject_ptr = *subject;
236 237
  // Character offsets into string.
  int start_offset = previous_index;
238
  int char_length = subject_ptr.length() - start_offset;
239
  int slice_offset = 0;
240

241
  // The string has been flattened, so if it is a cons string it contains the
242
  // full string in the first part.
243
  if (StringShape(subject_ptr).IsCons()) {
244 245
    DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
    subject_ptr = ConsString::cast(subject_ptr).first();
246
  } else if (StringShape(subject_ptr).IsSliced()) {
247
    SlicedString slice = SlicedString::cast(subject_ptr);
248 249
    subject_ptr = slice.parent();
    slice_offset = slice.offset();
250
  }
251
  if (StringShape(subject_ptr).IsThin()) {
252
    subject_ptr = ThinString::cast(subject_ptr).actual();
253
  }
254
  // Ensure that an underlying string has the same representation.
255 256
  bool is_one_byte = subject_ptr.IsOneByteRepresentation();
  DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
257
  // String is now either Sequential or External
258
  int char_size_shift = is_one_byte ? 0 : 1;
259

260
  DisallowHeapAllocation no_gc;
261
  const byte* input_start =
262
      StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
263 264
  int byte_length = char_length << char_size_shift;
  const byte* input_end = input_start + byte_length;
265 266
  return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
                 offsets_vector_length, isolate, *regexp);
267
}
268

269
// Returns a {Result} sentinel, or the number of successful matches.
270 271 272
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
// the signature of the interpreter. We should get rid of JS objects passed to
// internal methods.
273
int NativeRegExpMacroAssembler::Execute(
274
    String input,  // This needs to be the unpacked (sliced, cons) string.
275
    int start_offset, const byte* input_start, const byte* input_end,
276
    int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
277
  // Ensure that the minimum stack has been allocated.
278 279
  RegExpStackScope stack_scope(isolate);
  Address stack_base = stack_scope.stack()->stack_base();
280

281 282 283
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
  Code code = Code::cast(regexp.Code(is_one_byte));
  RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
284 285

  using RegexpMatcherSig = int(
286
      Address input_string, int start_offset,  // NOLINT(readability/casting)
287
      const byte* input_start, const byte* input_end, int* output,
288 289
      int output_size, Address stack_base, int call_origin, Isolate* isolate,
      Address regexp);
290 291

  auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
292 293 294
  int result = fn.CallIrregexp(input.ptr(), start_offset, input_start,
                               input_end, output, output_size, stack_base,
                               call_origin, isolate, regexp.ptr());
295
  DCHECK(result >= RETRY);
296

297
  if (result == EXCEPTION && !isolate->has_pending_exception()) {
298
    // We detected a stack overflow (on the backtrack stack) in RegExp code,
299 300 301 302
    // but haven't created the exception yet. Additionally, we allow heap
    // allocation because even though it invalidates {input_start} and
    // {input_end}, we are about to return anyway.
    AllowHeapAllocation allow_allocation;
303
    isolate->StackOverflow();
304
  }
305
  return result;
306 307
}

308
// clang-format off
309
const byte NativeRegExpMacroAssembler::word_character_map[] = {
310 311 312 313 314 315 316
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317 318 319 320 321 322 323 324 325 326 327 328
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
349
};
350
// clang-format on
351

lrn@chromium.org's avatar
lrn@chromium.org committed
352
// Requests twice the current capacity for the isolate's regexp stack.
//
// |stack_pointer| is the current stack pointer and |*stack_base| the current
// stack base; the base must equal the regexp stack's own base, and the
// pointer must not be above it. On success |*stack_base| is set to the new
// base and the returned address lies the same distance below the new base as
// |stack_pointer| did below the old one. Returns kNullAddress, leaving
// |*stack_base| untouched, if EnsureCapacity() returns kNullAddress.
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
                                              Address* stack_base,
                                              Isolate* isolate) {
  RegExpStack* regexp_stack = isolate->regexp_stack();
  size_t size = regexp_stack->stack_capacity();
  Address old_stack_base = regexp_stack->stack_base();
  DCHECK_EQ(old_stack_base, *stack_base);
  DCHECK_LE(stack_pointer, old_stack_base);
  DCHECK_LE(static_cast<size_t>(old_stack_base - stack_pointer), size);
  Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
  if (new_stack_base == kNullAddress) {
    return kNullAddress;
  }
  *stack_base = new_stack_base;
  // Keep the same amount of stack content below the new base.
  intptr_t stack_content_size = old_stack_base - stack_pointer;
  return new_stack_base - stack_content_size;
}

370 371
}  // namespace internal
}  // namespace v8