// Copyright 2019 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/objects/js-regexp.h" #include "src/base/strings.h" #include "src/common/globals.h" #include "src/objects/code.h" #include "src/objects/js-array-inl.h" #include "src/objects/js-regexp-inl.h" #include "src/regexp/regexp.h" namespace v8 { namespace internal { Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices( Isolate* isolate, Handle<RegExpMatchInfo> match_info, Handle<Object> maybe_names) { Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast( isolate->factory()->NewJSObjectFromMap( isolate->regexp_result_indices_map()))); // Initialize indices length to avoid having a partially initialized object // should GC be triggered by creating a NewFixedArray. indices->set_length(Smi::zero()); // Build indices array from RegExpMatchInfo. int num_indices = match_info->NumberOfCaptureRegisters(); int num_results = num_indices >> 1; Handle<FixedArray> indices_array = isolate->factory()->NewFixedArray(num_results); JSArray::SetContent(indices, indices_array); for (int i = 0; i < num_results; i++) { int base_offset = i * 2; int start_offset = match_info->Capture(base_offset); int end_offset = match_info->Capture(base_offset + 1); // Any unmatched captures are set to undefined, otherwise we set them to a // subarray of the indices. if (start_offset == -1) { indices_array->set(i, ReadOnlyRoots(isolate).undefined_value()); } else { Handle<FixedArray> indices_sub_array( isolate->factory()->NewFixedArray(2)); indices_sub_array->set(0, Smi::FromInt(start_offset)); indices_sub_array->set(1, Smi::FromInt(end_offset)); Handle<JSArray> indices_sub_jsarray = isolate->factory()->NewJSArrayWithElements(indices_sub_array, PACKED_SMI_ELEMENTS, 2); indices_array->set(i, *indices_sub_jsarray); } } // If there are no capture groups, set the groups property to undefined. FieldIndex groups_index = FieldIndex::ForDescriptor( indices->map(), InternalIndex(kGroupsDescriptorIndex)); if (maybe_names->IsUndefined(isolate)) { indices->FastPropertyAtPut(groups_index, ReadOnlyRoots(isolate).undefined_value()); return indices; } // Create a groups property which returns a dictionary of named captures to // their corresponding capture indices. Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names)); int num_names = names->length() >> 1; Handle<HeapObject> group_names; if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) { group_names = isolate->factory()->NewSwissNameDictionary(num_names); } else { group_names = isolate->factory()->NewNameDictionary(num_names); } for (int i = 0; i < num_names; i++) { int base_offset = i * 2; int name_offset = base_offset; int index_offset = base_offset + 1; Handle<String> name(String::cast(names->get(name_offset)), isolate); Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate); Handle<Object> capture_indices(indices_array->get(smi_index->value()), isolate); if (!capture_indices->IsUndefined(isolate)) { capture_indices = Handle<JSArray>::cast(capture_indices); } if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) { group_names = SwissNameDictionary::Add( isolate, Handle<SwissNameDictionary>::cast(group_names), name, capture_indices, PropertyDetails::Empty()); } else { group_names = NameDictionary::Add( isolate, Handle<NameDictionary>::cast(group_names), name, capture_indices, PropertyDetails::Empty()); } } // Convert group_names to a JSObject and store at the groups property of the // result indices. Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array(); Handle<HeapObject> null = Handle<HeapObject>::cast(isolate->factory()->null_value()); Handle<JSObject> js_group_names = isolate->factory()->NewSlowJSObjectWithPropertiesAndElements( null, group_names, elements); indices->FastPropertyAtPut(groups_index, *js_group_names); return indices; } uint32_t JSRegExp::backtrack_limit() const { CHECK_EQ(type_tag(), IRREGEXP); return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit))); } // static base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString( Isolate* isolate, Handle<String> flags) { const int length = flags->length(); // A longer flags string cannot be valid. if (length > JSRegExp::kFlagCount) return {}; RegExpFlags value; FlatStringReader reader(isolate, String::Flatten(isolate, flags)); for (int i = 0; i < length; i++) { base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i)); if (!flag.has_value()) return {}; if (value & flag.value()) return {}; // Duplicate. value |= flag.value(); } return JSRegExp::AsJSRegExpFlags(value); } // static Handle<String> JSRegExp::StringFromFlags(Isolate* isolate, JSRegExp::Flags flags) { static constexpr int kStringTerminator = 1; int cursor = 0; char buffer[kFlagCount + kStringTerminator]; #define V(Lower, Camel, LowerCamel, Char, Bit) \ if (flags & JSRegExp::k##Camel) buffer[cursor++] = Char; REGEXP_FLAG_LIST(V) #undef V buffer[cursor++] = '\0'; DCHECK_LE(cursor, kFlagCount + kStringTerminator); return isolate->factory()->NewStringFromAsciiChecked(buffer); } // static MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern, Flags flags, uint32_t backtrack_limit) { Handle<JSFunction> constructor = isolate->regexp_function(); Handle<JSRegExp> regexp = Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor)); return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit); } Object JSRegExp::code(bool is_latin1) const { DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); Object value = DataAt(code_index(is_latin1)); DCHECK_IMPLIES(V8_EXTERNAL_CODE_SPACE_BOOL, value.IsSmi() || value.IsCodeT()); return value; } void JSRegExp::set_code(bool is_latin1, Handle<Code> code) { SetDataAt(code_index(is_latin1), ToCodeT(*code)); } Object JSRegExp::bytecode(bool is_latin1) const { DCHECK(type_tag() == JSRegExp::IRREGEXP || type_tag() == JSRegExp::EXPERIMENTAL); return DataAt(bytecode_index(is_latin1)); } void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate, Handle<ByteArray> bytecode) { SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode); SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode); Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline); SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, ToCodeT(*trampoline)); SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, ToCodeT(*trampoline)); } bool JSRegExp::ShouldProduceBytecode() { return FLAG_regexp_interpret_all || (FLAG_regexp_tier_up && !MarkedForTierUp()); } // Only irregexps are subject to tier-up. bool JSRegExp::CanTierUp() { return FLAG_regexp_tier_up && type_tag() == JSRegExp::IRREGEXP; } // An irregexp is considered to be marked for tier up if the tier-up ticks // value reaches zero. bool JSRegExp::MarkedForTierUp() { DCHECK(data().IsFixedArray()); if (!CanTierUp()) { return false; } return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0; } void JSRegExp::ResetLastTierUpTick() { DCHECK(FLAG_regexp_tier_up); DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1; FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, Smi::FromInt(tier_up_ticks)); } void JSRegExp::TierUpTick() { DCHECK(FLAG_regexp_tier_up); DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)); if (tier_up_ticks == 0) { return; } FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, Smi::FromInt(tier_up_ticks - 1)); } void JSRegExp::MarkTierUpForNextExec() { DCHECK(FLAG_regexp_tier_up); DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, Smi::zero()); } // static MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, Handle<String> source, Handle<String> flags_string) { Isolate* isolate = regexp->GetIsolate(); base::Optional<Flags> flags = JSRegExp::FlagsFromString(isolate, flags_string); if (!flags.has_value()) { THROW_NEW_ERROR( isolate, NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string), JSRegExp); } return Initialize(regexp, source, flags.value()); } namespace { bool IsLineTerminator(int c) { // Expected to return true for '\n', '\r', 0x2028, and 0x2029. return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c)); } // TODO(jgruber): Consider merging CountAdditionalEscapeChars and // WriteEscapedRegExpSource into a single function to deduplicate dispatch logic // and move related code closer to each other. template <typename Char> int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) { DisallowGarbageCollection no_gc; int escapes = 0; bool needs_escapes = false; bool in_char_class = false; base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); for (int i = 0; i < src.length(); i++) { const Char c = src[i]; if (c == '\\') { if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) { // This '\' is ignored since the next character itself will be escaped. escapes--; } else { // Escape. Skip next character, which will be copied verbatim; i++; } } else if (c == '/' && !in_char_class) { // Not escaped forward-slash needs escape. needs_escapes = true; escapes++; } else if (c == '[') { in_char_class = true; } else if (c == ']') { in_char_class = false; } else if (c == '\n') { needs_escapes = true; escapes++; } else if (c == '\r') { needs_escapes = true; escapes++; } else if (static_cast<int>(c) == 0x2028) { needs_escapes = true; escapes += std::strlen("\\u2028") - 1; } else if (static_cast<int>(c) == 0x2029) { needs_escapes = true; escapes += std::strlen("\\u2029") - 1; } else { DCHECK(!IsLineTerminator(c)); } } DCHECK(!in_char_class); DCHECK_GE(escapes, 0); DCHECK_IMPLIES(escapes != 0, needs_escapes); *needs_escapes_out = needs_escapes; return escapes; } template <typename Char> void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) { int s = 0; while (string[s] != '\0') v[(*d)++] = string[s++]; } template <typename Char, typename StringType> Handle<StringType> WriteEscapedRegExpSource(Handle<String> source, Handle<StringType> result) { DisallowGarbageCollection no_gc; base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); base::Vector<Char> dst(result->GetChars(no_gc), result->length()); int s = 0; int d = 0; bool in_char_class = false; while (s < src.length()) { const Char c = src[s]; if (c == '\\') { if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) { // This '\' is ignored since the next character itself will be escaped. s++; continue; } else { // Escape. Copy this and next character. dst[d++] = src[s++]; } if (s == src.length()) break; } else if (c == '/' && !in_char_class) { // Not escaped forward-slash needs escape. dst[d++] = '\\'; } else if (c == '[') { in_char_class = true; } else if (c == ']') { in_char_class = false; } else if (c == '\n') { WriteStringToCharVector(dst, &d, "\\n"); s++; continue; } else if (c == '\r') { WriteStringToCharVector(dst, &d, "\\r"); s++; continue; } else if (static_cast<int>(c) == 0x2028) { WriteStringToCharVector(dst, &d, "\\u2028"); s++; continue; } else if (static_cast<int>(c) == 0x2029) { WriteStringToCharVector(dst, &d, "\\u2029"); s++; continue; } else { DCHECK(!IsLineTerminator(c)); } dst[d++] = src[s++]; } DCHECK_EQ(result->length(), d); DCHECK(!in_char_class); return result; } MaybeHandle<String> EscapeRegExpSource(Isolate* isolate, Handle<String> source) { DCHECK(source->IsFlat()); if (source->length() == 0) return isolate->factory()->query_colon_string(); bool one_byte = String::IsOneByteRepresentationUnderneath(*source); bool needs_escapes = false; int additional_escape_chars = one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes) : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes); if (!needs_escapes) return source; int length = source->length() + additional_escape_chars; if (one_byte) { Handle<SeqOneByteString> result; ASSIGN_RETURN_ON_EXCEPTION(isolate, result, isolate->factory()->NewRawOneByteString(length), String); return WriteEscapedRegExpSource<uint8_t>(source, result); } else { Handle<SeqTwoByteString> result; ASSIGN_RETURN_ON_EXCEPTION(isolate, result, isolate->factory()->NewRawTwoByteString(length), String); return WriteEscapedRegExpSource<base::uc16>(source, result); } } } // namespace // static MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, Handle<String> source, Flags flags, uint32_t backtrack_limit) { Isolate* isolate = regexp->GetIsolate(); Factory* factory = isolate->factory(); // If source is the empty string we set it to "(?:)" instead as // suggested by ECMA-262, 5th, section 15.10.4.1. if (source->length() == 0) source = factory->query_colon_string(); source = String::Flatten(isolate, source); RETURN_ON_EXCEPTION( isolate, RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags), backtrack_limit), JSRegExp); Handle<String> escaped_source; ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source, EscapeRegExpSource(isolate, source), JSRegExp); regexp->set_source(*escaped_source); regexp->set_flags(Smi::FromInt(flags)); Map map = regexp->map(); Object constructor = map.GetConstructor(); if (constructor.IsJSFunction() && JSFunction::cast(constructor).initial_map() == map) { // If we still have the original map, set in-object properties directly. regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex, Smi::FromInt(kInitialLastIndexValue), SKIP_WRITE_BARRIER); } else { // Map has changed, so use generic, but slower, method. RETURN_ON_EXCEPTION( isolate, Object::SetProperty( isolate, regexp, factory->lastIndex_string(), Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)), JSRegExp); } return regexp; } } // namespace internal } // namespace v8