Commit a2baaaac, authored by yangguo, committed by Commit bot

[regexp] implement case-insensitive unicode regexps.

BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1599303002

Cr-Commit-Position: refs/heads/master@{#33538}
parent 2a0e4225
...@@ -891,7 +891,7 @@ class Isolate { ...@@ -891,7 +891,7 @@ class Isolate {
unibrow::Mapping<unibrow::Ecma262Canonicalize>* unibrow::Mapping<unibrow::Ecma262Canonicalize>*
interp_canonicalize_mapping() { interp_canonicalize_mapping() {
return &interp_canonicalize_mapping_; return &regexp_macro_assembler_canonicalize_;
} }
Debug* debug() { return debug_; } Debug* debug() { return debug_; }
...@@ -1245,7 +1245,6 @@ class Isolate { ...@@ -1245,7 +1245,6 @@ class Isolate {
regexp_macro_assembler_canonicalize_; regexp_macro_assembler_canonicalize_;
RegExpStack* regexp_stack_; RegExpStack* regexp_stack_;
DateCache* date_cache_; DateCache* date_cache_;
unibrow::Mapping<unibrow::Ecma262Canonicalize> interp_canonicalize_mapping_;
CallInterfaceDescriptorData* call_descriptor_data_; CallInterfaceDescriptorData* call_descriptor_data_;
base::RandomNumberGenerator* random_number_generator_; base::RandomNumberGenerator* random_number_generator_;
......
...@@ -210,7 +210,7 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) { ...@@ -210,7 +210,7 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
__ ldr(r0, register_location(start_reg)); // Index of start of capture __ ldr(r0, register_location(start_reg)); // Index of start of capture
__ ldr(r1, register_location(start_reg + 1)); // Index of end of capture __ ldr(r1, register_location(start_reg + 1)); // Index of end of capture
...@@ -302,7 +302,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( ...@@ -302,7 +302,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
// r0: Address byte_offset1 - Address captured substring's start. // r0: Address byte_offset1 - Address captured substring's start.
// r1: Address byte_offset2 - Address of current character position. // r1: Address byte_offset2 - Address of current character position.
// r2: size_t byte_length - length of capture in bytes(!) // r2: size_t byte_length - length of capture in bytes(!)
// r3: Isolate* isolate // r3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture. // Address of start of capture.
__ add(r0, r0, Operand(end_of_input_address())); __ add(r0, r0, Operand(end_of_input_address()));
...@@ -316,7 +316,14 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( ...@@ -316,7 +316,14 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
__ sub(r1, r1, r4); __ sub(r1, r1, r4);
} }
// Isolate. // Isolate.
__ mov(r3, Operand(ExternalReference::isolate_address(isolate()))); #ifdef V8_I18N_SUPPORT
if (unicode) {
__ mov(r3, Operand(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
}
{ {
AllowExternalCallThatCantCauseGC scope(masm_); AllowExternalCallThatCantCauseGC scope(masm_);
......
...@@ -38,7 +38,7 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler { ...@@ -38,7 +38,7 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, virtual void CheckNotCharacterAfterAnd(unsigned c,
......
...@@ -274,7 +274,7 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) { ...@@ -274,7 +274,7 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
Register capture_start_offset = w10; Register capture_start_offset = w10;
...@@ -388,7 +388,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( ...@@ -388,7 +388,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
// x0: Address byte_offset1 - Address captured substring's start. // x0: Address byte_offset1 - Address captured substring's start.
// x1: Address byte_offset2 - Address of current character position. // x1: Address byte_offset2 - Address of current character position.
// w2: size_t byte_length - length of capture in bytes(!) // w2: size_t byte_length - length of capture in bytes(!)
// x3: Isolate* isolate // x3: Isolate* isolate or 0 if unicode flag
// Address of start of capture. // Address of start of capture.
__ Add(x0, input_end(), Operand(capture_start_offset, SXTW)); __ Add(x0, input_end(), Operand(capture_start_offset, SXTW));
...@@ -400,7 +400,14 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( ...@@ -400,7 +400,14 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
__ Sub(x1, x1, Operand(capture_length, SXTW)); __ Sub(x1, x1, Operand(capture_length, SXTW));
} }
// Isolate. // Isolate.
__ Mov(x3, ExternalReference::isolate_address(isolate())); #ifdef V8_I18N_SUPPORT
if (unicode) {
__ Mov(x3, Operand(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ Mov(x3, ExternalReference::isolate_address(isolate()));
}
{ {
AllowExternalCallThatCantCauseGC scope(masm_); AllowExternalCallThatCantCauseGC scope(masm_);
......
...@@ -43,7 +43,7 @@ class RegExpMacroAssemblerARM64: public NativeRegExpMacroAssembler { ...@@ -43,7 +43,7 @@ class RegExpMacroAssemblerARM64: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, virtual void CheckNotCharacterAfterAnd(unsigned c,
......
...@@ -20,56 +20,58 @@ const unsigned int MAX_FIRST_ARG = 0x7fffffu; ...@@ -20,56 +20,58 @@ const unsigned int MAX_FIRST_ARG = 0x7fffffu;
const int BYTECODE_SHIFT = 8; const int BYTECODE_SHIFT = 8;
#define BYTECODE_ITERATOR(V) \ #define BYTECODE_ITERATOR(V) \
V(BREAK, 0, 4) /* bc8 */ \ V(BREAK, 0, 4) /* bc8 */ \
V(PUSH_CP, 1, 4) /* bc8 pad24 */ \ V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \ V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \ V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \ V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \ V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \ V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \ V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \ V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \ V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
V(POP_CP, 10, 4) /* bc8 pad24 */ \ V(POP_CP, 10, 4) /* bc8 pad24 */ \
V(POP_BT, 11, 4) /* bc8 pad24 */ \ V(POP_BT, 11, 4) /* bc8 pad24 */ \
V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \ V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
V(FAIL, 13, 4) /* bc8 pad24 */ \ V(FAIL, 13, 4) /* bc8 pad24 */ \
V(SUCCEED, 14, 4) /* bc8 pad24 */ \ V(SUCCEED, 14, 4) /* bc8 pad24 */ \
V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \ V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \ V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \ V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \ V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \ V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \ V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \ V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \ V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \ V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \ V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \ V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \ V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \ V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \ V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \ V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \ V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */ V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */
#define DECLARE_BYTECODES(name, code, length) \ #define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code; static const int BC_##name = code;
......
...@@ -189,7 +189,7 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) { ...@@ -189,7 +189,7 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
__ mov(edx, register_location(start_reg)); // Index of start of capture __ mov(edx, register_location(start_reg)); // Index of start of capture
__ mov(ebx, register_location(start_reg + 1)); // Index of end of capture __ mov(ebx, register_location(start_reg + 1)); // Index of end of capture
...@@ -296,11 +296,18 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( ...@@ -296,11 +296,18 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start. // Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position. // Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!) // size_t byte_length - length of capture in bytes(!)
// Isolate* isolate // Isolate* isolate or 0 if unicode flag.
// Set isolate. // Set isolate.
__ mov(Operand(esp, 3 * kPointerSize), #ifdef V8_I18N_SUPPORT
Immediate(ExternalReference::isolate_address(isolate()))); if (unicode) {
__ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ mov(Operand(esp, 3 * kPointerSize),
Immediate(ExternalReference::isolate_address(isolate())));
}
// Set byte_length. // Set byte_length.
__ mov(Operand(esp, 2 * kPointerSize), ebx); __ mov(Operand(esp, 2 * kPointerSize), ebx);
// Set byte_offset2. // Set byte_offset2.
......
...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler { ...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c, virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
...@@ -15,37 +15,32 @@ ...@@ -15,37 +15,32 @@
#include "src/unicode.h" #include "src/unicode.h"
#include "src/utils.h" #include "src/utils.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uchar.h"
#endif // V8_I18N_SUPPORT
namespace v8 { namespace v8 {
namespace internal { namespace internal {
typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize; typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
int from, int len, Vector<const uc16> subject,
int current, bool unicode) {
int len, Address offset_a =
Vector<const uc16> subject) { reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
for (int i = 0; i < len; i++) { Address offset_b =
unibrow::uchar old_char = subject[from++]; reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
unibrow::uchar new_char = subject[current++]; size_t length = len * kUC16Size;
if (old_char == new_char) continue; return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
unibrow::uchar old_string[1] = { old_char }; offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
unibrow::uchar new_string[1] = { new_char };
interp_canonicalize->get(old_char, '\0', old_string);
interp_canonicalize->get(new_char, '\0', new_string);
if (old_string[0] != new_string[0]) {
return false;
}
}
return true;
} }
static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
int from, int len, Vector<const uint8_t> subject,
int current, bool unicode) {
int len, // For Latin1 characters the unicode flag makes no difference.
Vector<const uint8_t> subject) {
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++]; unsigned int old_char = subject[from++];
unsigned int new_char = subject[current++]; unsigned int new_char = subject[current++];
...@@ -523,13 +518,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate, ...@@ -523,13 +518,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
break; break;
} }
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
bool unicode =
(insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;
int from = registers[insn >> BYTECODE_SHIFT]; int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) { if (from >= 0 && len > 0) {
if (current + len > subject.length() || if (current + len > subject.length() ||
!BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), !BackRefMatchesNoCase(isolate, from, current, len, subject,
from, current, len, subject)) { unicode)) {
pc = code_base + Load32Aligned(pc + 4); pc = code_base + Load32Aligned(pc + 4);
break; break;
} }
...@@ -538,13 +536,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate, ...@@ -538,13 +536,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
break; break;
} }
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
bool unicode = (insn & BYTECODE_MASK) ==
BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;
int from = registers[insn >> BYTECODE_SHIFT]; int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) { if (from >= 0 && len > 0) {
if (current - len < 0 || if (current - len < 0 ||
!BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
from, current - len, len, subject)) { unicode)) {
pc = code_base + Load32Aligned(pc + 4); pc = code_base + Load32Aligned(pc + 4);
break; break;
} }
......
...@@ -25,6 +25,11 @@ ...@@ -25,6 +25,11 @@
#include "src/string-search.h" #include "src/string-search.h"
#include "src/unicode-decoder.h" #include "src/unicode-decoder.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uset.h"
#include "unicode/utypes.h"
#endif // V8_I18N_SUPPORT
#ifndef V8_INTERPRETED_REGEXP #ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32 #if V8_TARGET_ARCH_IA32
#include "src/regexp/ia32/regexp-macro-assembler-ia32.h" #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
...@@ -3420,10 +3425,7 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { ...@@ -3420,10 +3425,7 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
// independent case and it slows us down if we don't know that. // independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue; if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone()); ZoneList<CharacterRange>* ranges = cc->ranges(zone());
int range_count = ranges->length(); CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
for (int j = 0; j < range_count; j++) {
ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
}
} }
} }
} }
...@@ -3586,13 +3588,6 @@ class AlternativeGenerationList { ...@@ -3586,13 +3588,6 @@ class AlternativeGenerationList {
AlternativeGeneration a_few_alt_gens_[kAFew]; AlternativeGeneration a_few_alt_gens_[kAFew];
}; };
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
static const uc32 kRangeEndMarker = 0x110000; static const uc32 kRangeEndMarker = 0x110000;
// The '2' variant has inclusive from and exclusive to.
...@@ -4395,8 +4390,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { ...@@ -4395,8 +4390,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
DCHECK_EQ(start_reg_ + 1, end_reg_); DCHECK_EQ(start_reg_ + 1, end_reg_);
if (compiler->ignore_case()) { if (compiler->ignore_case()) {
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), assembler->CheckNotBackReferenceIgnoreCase(
trace->backtrack()); start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
} else { } else {
assembler->CheckNotBackReference(start_reg_, read_backward(), assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack()); trace->backtrack());
...@@ -4866,21 +4861,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) { ...@@ -4866,21 +4861,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) {
} }
bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {
ZoneList<CharacterRange>* ranges = this->ranges(zone);
CharacterRange::Canonicalize(ranges);
for (int i = ranges->length() - 1; i >= 0; i--) {
uc32 from = ranges->at(i).from();
uc32 to = ranges->at(i).to();
// Check for non-BMP characters.
if (to >= kNonBmpStart) return true;
// Check for lone surrogates.
if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
}
return false;
}
UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
ZoneList<CharacterRange>* base) ZoneList<CharacterRange>* base)
: zone_(zone), : zone_(zone),
...@@ -5120,11 +5100,53 @@ void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result, ...@@ -5120,11 +5100,53 @@ void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
} }
// Replaces the contents of |ranges| with its case-insensitive closure and
// canonicalizes the result. Intended for regexps compiled with both the
// unicode and ignore-case flags (checked by DCHECK when ICU is available).
void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
                               ZoneList<CharacterRange>* ranges) {
#ifdef V8_I18N_SUPPORT
  // With ICU available, let it compute the case fold closure for us.
  DCHECK(compiler->unicode());
  DCHECK(compiler->ignore_case());

  // Copy the incoming ranges into an ICU set, then empty the list so it can
  // be refilled from the closed-over set below.
  USet* closure = uset_openEmpty();
  const int input_count = ranges->length();
  for (int index = 0; index < input_count; index++) {
    CharacterRange range = ranges->at(index);
    uset_addRange(closure, range.from(), range.to());
  }
  ranges->Clear();
  uset_closeOver(closure, USET_CASE_INSENSITIVE);

  // Full case mappings map single characters to multiple characters; the
  // set represents those as strings. Drop them so that only simple and
  // common case mappings remain.
  uset_removeAllStrings(closure);

  const int item_count = uset_getItemCount(closure);
  int string_length_sum = 0;
  UErrorCode status = U_ZERO_ERROR;
  Zone* zone = compiler->zone();
  for (int index = 0; index < item_count; index++) {
    uc32 first = 0;
    uc32 last = 0;
    // The accumulated return values are expected to stay at zero when every
    // item is a range (verified by the DCHECK after the loop).
    string_length_sum +=
        uset_getItem(closure, index, &first, &last, nullptr, 0, &status);
    ranges->Add(CharacterRange::Range(first, last), zone);
  }

  // ICU reported no error, and every collected item was a range.
  DCHECK_EQ(U_ZERO_ERROR, status);
  DCHECK_EQ(0, string_length_sum);
  uset_close(closure);
#else
  // Without ICU, fall back to the built-in case equivalence tables.
  CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
                                     ranges, compiler->one_byte());
#endif  // V8_I18N_SUPPORT
  CharacterRange::Canonicalize(ranges);
}
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) { RegExpNode* on_success) {
set_.Canonicalize(); set_.Canonicalize();
Zone* zone = compiler->zone(); Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone); ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (compiler->unicode() && compiler->ignore_case()) {
AddUnicodeCaseEquivalents(compiler, ranges);
}
if (compiler->unicode() && !compiler->one_byte()) { if (compiler->unicode() && !compiler->one_byte()) {
if (is_negated()) { if (is_negated()) {
ZoneList<CharacterRange>* negated = ZoneList<CharacterRange>* negated =
...@@ -5853,16 +5875,19 @@ Vector<const int> CharacterRange::GetWordBounds() { ...@@ -5853,16 +5875,19 @@ Vector<const int> CharacterRange::GetWordBounds() {
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, ZoneList<CharacterRange>* ranges,
bool is_one_byte) { bool is_one_byte) {
uc32 bottom = from(); int range_count = ranges->length();
uc32 top = to(); for (int i = 0; i < range_count; i++) {
// Nothing to be done for surrogates. CharacterRange range = ranges->at(i);
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; uc32 bottom = range.from();
if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { uc32 top = range.to();
if (bottom > String::kMaxOneByteCharCode) return; // Nothing to be done for surrogates.
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
} if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; if (bottom > String::kMaxOneByteCharCode) return;
if (top == bottom) { if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (top == bottom) {
// If this is a singleton we just expand the one character. // If this is a singleton we just expand the one character.
int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
...@@ -5914,6 +5939,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ...@@ -5914,6 +5939,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
pos = end + 1; pos = end + 1;
} }
} }
}
} }
...@@ -6284,7 +6310,7 @@ void TextNode::CalculateOffsets() { ...@@ -6284,7 +6310,7 @@ void TextNode::CalculateOffsets() {
void Analysis::VisitText(TextNode* that) { void Analysis::VisitText(TextNode* that) {
if (ignore_case_) { if (ignore_case()) {
that->MakeCaseIndependent(isolate(), is_one_byte_); that->MakeCaseIndependent(isolate(), is_one_byte_);
} }
EnsureAnalyzed(that->on_success()); EnsureAnalyzed(that->on_success());
...@@ -6649,7 +6675,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ...@@ -6649,7 +6675,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node; data->node = node;
Analysis analysis(isolate, ignore_case, is_one_byte); Analysis analysis(isolate, flags, is_one_byte);
analysis.EnsureAnalyzed(node); analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) { if (analysis.has_failed()) {
const char* error_message = analysis.error_message(); const char* error_message = analysis.error_message();
......
...@@ -19,6 +19,15 @@ class RegExpNode; ...@@ -19,6 +19,15 @@ class RegExpNode;
class RegExpTree; class RegExpTree;
class BoyerMooreLookahead; class BoyerMooreLookahead;
// Code point boundaries shared by the regexp implementation.
// Inclusive bounds of the lead surrogate range.
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
// Inclusive bounds of the trail surrogate range.
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
// Inclusive bounds of the code points outside the Basic Multilingual Plane.
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
class RegExpImpl { class RegExpImpl {
public: public:
// Whether V8 is compiled with native regexp support or not. // Whether V8 is compiled with native regexp support or not.
...@@ -1478,9 +1487,9 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) ...@@ -1478,9 +1487,9 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
// +-------+ +------------+ // +-------+ +------------+
class Analysis: public NodeVisitor { class Analysis: public NodeVisitor {
public: public:
Analysis(Isolate* isolate, bool ignore_case, bool is_one_byte) Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
: isolate_(isolate), : isolate_(isolate),
ignore_case_(ignore_case), flags_(flags),
is_one_byte_(is_one_byte), is_one_byte_(is_one_byte),
error_message_(NULL) {} error_message_(NULL) {}
void EnsureAnalyzed(RegExpNode* node); void EnsureAnalyzed(RegExpNode* node);
...@@ -1502,9 +1511,12 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) ...@@ -1502,9 +1511,12 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
Isolate* isolate() const { return isolate_; } Isolate* isolate() const { return isolate_; }
// Returns true when the stored regexp flags include JSRegExp::kIgnoreCase.
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
// Returns true when the stored regexp flags include JSRegExp::kUnicode.
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
private: private:
Isolate* isolate_; Isolate* isolate_;
bool ignore_case_; JSRegExp::Flags flags_;
bool is_one_byte_; bool is_one_byte_;
const char* error_message_; const char* error_message_;
......
...@@ -215,7 +215,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { ...@@ -215,7 +215,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
__ lw(a0, register_location(start_reg)); // Index of start of capture. __ lw(a0, register_location(start_reg)); // Index of start of capture.
__ lw(a1, register_location(start_reg + 1)); // Index of end of capture. __ lw(a1, register_location(start_reg + 1)); // Index of end of capture.
...@@ -310,7 +310,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( ...@@ -310,7 +310,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start. // a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position. // a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!). // a2: size_t byte_length - length of capture in bytes(!).
// a3: Isolate* isolate. // a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture. // Address of start of capture.
__ Addu(a0, a0, Operand(end_of_input_address())); __ Addu(a0, a0, Operand(end_of_input_address()));
...@@ -324,7 +324,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( ...@@ -324,7 +324,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Subu(a1, a1, Operand(s3)); __ Subu(a1, a1, Operand(s3));
} }
// Isolate. // Isolate.
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); #ifdef V8_I18N_SUPPORT
if (unicode) {
__ li(a3, Operand(zero_reg));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
}
{ {
AllowExternalCallThatCantCauseGC scope(masm_); AllowExternalCallThatCantCauseGC scope(masm_);
......
...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler { ...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c, virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
...@@ -251,7 +251,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { ...@@ -251,7 +251,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
__ ld(a0, register_location(start_reg)); // Index of start of capture. __ ld(a0, register_location(start_reg)); // Index of start of capture.
__ ld(a1, register_location(start_reg + 1)); // Index of end of capture. __ ld(a1, register_location(start_reg + 1)); // Index of end of capture.
...@@ -346,7 +346,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( ...@@ -346,7 +346,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start. // a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position. // a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!). // a2: size_t byte_length - length of capture in bytes(!).
// a3: Isolate* isolate. // a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture. // Address of start of capture.
__ Daddu(a0, a0, Operand(end_of_input_address())); __ Daddu(a0, a0, Operand(end_of_input_address()));
...@@ -360,7 +360,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( ...@@ -360,7 +360,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Dsubu(a1, a1, Operand(s3)); __ Dsubu(a1, a1, Operand(s3));
} }
// Isolate. // Isolate.
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); #ifdef V8_I18N_SUPPORT
if (unicode) {
__ li(a3, Operand(zero_reg));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
}
{ {
AllowExternalCallThatCantCauseGC scope(masm_); AllowExternalCallThatCantCauseGC scope(masm_);
......
...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler { ...@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c, virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
...@@ -108,8 +108,9 @@ class CharacterRange { ...@@ -108,8 +108,9 @@ class CharacterRange {
bool is_valid() { return from_ <= to_; } bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); } bool IsSingleton() { return (from_ == to_); }
void AddCaseEquivalents(Isolate* isolate, Zone* zone, static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, bool is_one_byte); ZoneList<CharacterRange>* ranges,
bool is_one_byte);
// Whether a range list is in canonical form: Ranges ordered by from value, // Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent. // and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges); static bool IsCanonical(ZoneList<CharacterRange>* ranges);
...@@ -293,7 +294,6 @@ class RegExpCharacterClass final : public RegExpTree { ...@@ -293,7 +294,6 @@ class RegExpCharacterClass final : public RegExpTree {
RegExpCharacterClass* AsCharacterClass() override; RegExpCharacterClass* AsCharacterClass() override;
bool IsCharacterClass() override; bool IsCharacterClass() override;
bool IsTextElement() override { return true; } bool IsTextElement() override { return true; }
bool NeedsDesugaringForUnicode(Zone* zone);
int min_match() override { return 1; } int min_match() override { return 1; }
int max_match() override { return 1; } int max_match() override { return 1; }
void AppendToText(RegExpText* text, Zone* zone) override; void AppendToText(RegExpText* text, Zone* zone) override;
...@@ -310,7 +310,7 @@ class RegExpCharacterClass final : public RegExpTree { ...@@ -310,7 +310,7 @@ class RegExpCharacterClass final : public RegExpTree {
// W : non-ASCII word character // W : non-ASCII word character
// d : ASCII digit // d : ASCII digit
// D : non-ASCII digit // D : non-ASCII digit
// . : non-unicode non-newline // . : non-newline
// * : All characters, for advancing unanchored regexp // * : All characters, for advancing unanchored regexp
uc16 standard_type() { return set_.standard_set_type(); } uc16 standard_type() { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
......
...@@ -381,11 +381,13 @@ void RegExpMacroAssemblerIrregexp::CheckNotBackReference(int start_reg, ...@@ -381,11 +381,13 @@ void RegExpMacroAssemblerIrregexp::CheckNotBackReference(int start_reg,
void RegExpMacroAssemblerIrregexp::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerIrregexp::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_not_equal) { int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
DCHECK(start_reg >= 0); DCHECK(start_reg >= 0);
DCHECK(start_reg <= kMaxRegister); DCHECK(start_reg <= kMaxRegister);
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
: BC_CHECK_NOT_BACK_REF_NO_CASE, : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
: (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
: BC_CHECK_NOT_BACK_REF_NO_CASE),
start_reg); start_reg);
EmitOrLink(on_not_equal); EmitOrLink(on_not_equal);
} }
......
...@@ -82,16 +82,10 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler { ...@@ -82,16 +82,10 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
uc16 to, uc16 to,
Label* on_not_in_range); Label* on_not_in_range);
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set); virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set);
virtual void CheckPosition(int cp_offset, Label* on_outside_input) {
LoadCurrentCharacter(cp_offset, on_outside_input, true);
}
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) {
return false; // No custom support for character classes.
}
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt); virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge); virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
......
...@@ -360,11 +360,11 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, ...@@ -360,11 +360,11 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n", PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
start_reg, read_backward ? "backward" : "forward", start_reg, read_backward ? "backward" : "forward",
LabelToInt(on_no_match)); unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
on_no_match); on_no_match);
} }
......
...@@ -34,7 +34,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { ...@@ -34,7 +34,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, virtual void CheckNotCharacterAfterAnd(unsigned c,
......
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
#include "src/regexp/regexp-stack.h" #include "src/regexp/regexp-stack.h"
#include "src/simulator.h" #include "src/simulator.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uchar.h"
#endif // V8_I18N_SUPPORT
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -23,6 +27,67 @@ RegExpMacroAssembler::~RegExpMacroAssembler() { ...@@ -23,6 +27,67 @@ RegExpMacroAssembler::~RegExpMacroAssembler() {
} }
// Compares two UC16 strings of the same byte length case-insensitively.
// Called from generated RegExp code.
//
// byte_offset1 / byte_offset2: start addresses of the two strings.
// byte_length: length of each string in bytes; must be even.
// isolate: the current isolate, or nullptr when the regexp has the unicode
//     flag. With V8_I18N_SUPPORT a null isolate selects ICU simple case
//     folding on whole code points (surrogate pairs are combined first).
//     Otherwise the isolate's Ecma262Canonicalize mapping is applied to
//     individual UC16 code units.
//
// Returns 1 if the strings are equal modulo case, 0 otherwise.
int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
                                                     Address byte_offset2,
                                                     size_t byte_length,
                                                     Isolate* isolate) {
  // This function is not allowed to cause a garbage collection.
  // A GC might move the calling generated code and invalidate the
  // return address on the stack.
  DCHECK(byte_length % 2 == 0);
  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
  size_t length = byte_length >> 1;

#ifdef V8_I18N_SUPPORT
  if (isolate == nullptr) {
    for (size_t i = 0; i < length; i++) {
      uc32 c1 = substring1[i];
      uc32 c2 = substring2[i];
      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
        // Non-BMP characters do not have case-equivalents in the BMP.
        // Both have to be non-BMP for them to be able to match.
        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
        if (i + 1 < length) {
          uc16 c1t = substring1[i + 1];
          uc16 c2t = substring2[i + 1];
          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
              unibrow::Utf16::IsTrailSurrogate(c2t)) {
            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
            // Both code units of the pair have been consumed.
            i++;
          }
        }
      }
      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
      if (c1 != c2) return 0;
    }
    return 1;
  }
#endif  // V8_I18N_SUPPORT

  DCHECK_NOT_NULL(isolate);
  // Fetch the mapping only after the null-isolate case has been handled:
  // calling a member function through a null Isolate* is undefined behavior.
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      isolate->regexp_macro_assembler_canonicalize();
  for (size_t i = 0; i < length; i++) {
    unibrow::uchar c1 = substring1[i];
    unibrow::uchar c2 = substring2[i];
    if (c1 != c2) {
      // Canonicalize c1 first; c2 only needs canonicalizing if that is not
      // already enough to make the two equal.
      unibrow::uchar s1[1] = {c1};
      canonicalize->get(c1, '\0', s1);
      if (s1[0] != c2) {
        unibrow::uchar s2[1] = {c2};
        canonicalize->get(c2, '\0', s2);
        if (s1[0] != s2[0]) {
          return 0;
        }
      }
    }
  }
  return 1;
}
#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM. #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
...@@ -245,40 +310,6 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = { ...@@ -245,40 +310,6 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
}; };
// Case-insensitive comparison of two UC16 strings of byte_length bytes each,
// using the isolate's Ecma262Canonicalize mapping on every code unit.
// Returns 1 when the strings match and 0 when they do not.
int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
    Address byte_offset1, Address byte_offset2, size_t byte_length,
    Isolate* isolate) {
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* mapping =
      isolate->regexp_macro_assembler_canonicalize();
  // Must not trigger a garbage collection: a GC could move the generated
  // code that called us and thereby invalidate the return address on the
  // stack.
  DCHECK(byte_length % 2 == 0);
  uc16* lhs = reinterpret_cast<uc16*>(byte_offset1);
  uc16* rhs = reinterpret_cast<uc16*>(byte_offset2);
  const size_t char_count = byte_length / 2;
  for (size_t pos = 0; pos < char_count; ++pos) {
    unibrow::uchar a = lhs[pos];
    unibrow::uchar b = rhs[pos];
    if (a == b) continue;
    // Canonicalize the left character; that alone may make them equal.
    unibrow::uchar canon_a[1] = {a};
    mapping->get(a, '\0', canon_a);
    if (canon_a[0] == b) continue;
    // Otherwise compare the canonical forms of both characters.
    unibrow::uchar canon_b[1] = {b};
    mapping->get(b, '\0', canon_b);
    if (canon_a[0] != canon_b[0]) return 0;
  }
  return 1;
}
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
Address* stack_base, Address* stack_base,
Isolate* isolate) { Isolate* isolate) {
......
...@@ -76,7 +76,7 @@ class RegExpMacroAssembler { ...@@ -76,7 +76,7 @@ class RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0; Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match) = 0; Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we // Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always // fail to match then goto the on_failure label. End of input always
...@@ -146,6 +146,12 @@ class RegExpMacroAssembler { ...@@ -146,6 +146,12 @@ class RegExpMacroAssembler {
virtual void ClearRegisters(int reg_from, int reg_to) = 0; virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0; virtual void WriteStackPointerToRegister(int reg) = 0;
// Compares two-byte strings case insensitively.
// Called from generated RegExp code.
// byte_offset1 and byte_offset2 are the start addresses of the two strings
// and byte_length is their common length in bytes (it must be even).
// Returns 1 if the strings match and 0 otherwise.
// Generated code passes a null isolate to signal the unicode flag; with
// V8_I18N_SUPPORT the comparison then uses ICU case folding on code points
// instead of the isolate's Ecma262Canonicalize mapping.
static int CaseInsensitiveCompareUC16(Address byte_offset1,
Address byte_offset2,
size_t byte_length, Isolate* isolate);
// Controls the generation of large inlined constants in the code. // Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; } bool slow_safe() { return slow_safe_compiler_; }
...@@ -199,13 +205,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { ...@@ -199,13 +205,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
int previous_index, int previous_index,
Isolate* isolate); Isolate* isolate);
// Compares two-byte strings case insensitively.
// Called from generated RegExp code.
static int CaseInsensitiveCompareUC16(Address byte_offset1,
Address byte_offset2,
size_t byte_length,
Isolate* isolate);
// Called from RegExp if the backtrack stack limit is hit. // Called from RegExp if the backtrack stack limit is hit.
// Tries to expand the stack. Returns the new stack-pointer if // Tries to expand the stack. Returns the new stack-pointer if
// successful, and updates the stack_top address, or returns 0 if unable // successful, and updates the stack_top address, or returns 0 if unable
......
...@@ -11,6 +11,10 @@ ...@@ -11,6 +11,10 @@
#include "src/regexp/jsregexp.h" #include "src/regexp/jsregexp.h"
#include "src/utils.h" #include "src/utils.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uset.h"
#endif // V8_I18N_SUPPORT
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { ...@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
if (pending_surrogate_ != kNoPendingSurrogate) { if (pending_surrogate_ != kNoPendingSurrogate) {
uc16 lead_surrogate = pending_surrogate_; uc16 lead_surrogate = pending_surrogate_;
DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
ZoneList<uc16> surrogate_pair(2, zone());
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
pending_surrogate_ = kNoPendingSurrogate; pending_surrogate_ = kNoPendingSurrogate;
AddAtom(atom); DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
uc32 combined =
unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
if (NeedsDesugaringForIgnoreCase(combined)) {
AddCharacterClass(combined);
} else {
ZoneList<uc16> surrogate_pair(2, zone());
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom =
new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
AddAtom(atom);
}
} else { } else {
pending_surrogate_ = trail_surrogate; pending_surrogate_ = trail_surrogate;
FlushPendingSurrogate(); FlushPendingSurrogate();
...@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { ...@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
void RegExpBuilder::FlushPendingSurrogate() { void RegExpBuilder::FlushPendingSurrogate() {
if (pending_surrogate_ != kNoPendingSurrogate) { if (pending_surrogate_ != kNoPendingSurrogate) {
// Use character class to desugar lone surrogate matching.
RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
CharacterRange::List(zone(),
CharacterRange::Singleton(pending_surrogate_)),
false);
pending_surrogate_ = kNoPendingSurrogate;
DCHECK(unicode()); DCHECK(unicode());
AddCharacterClass(cc); uc32 c = pending_surrogate_;
pending_surrogate_ = kNoPendingSurrogate;
AddCharacterClass(c);
} }
} }
...@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() { ...@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() {
void RegExpBuilder::AddCharacter(uc16 c) { void RegExpBuilder::AddCharacter(uc16 c) {
FlushPendingSurrogate(); FlushPendingSurrogate();
pending_empty_ = false; pending_empty_ = false;
if (characters_ == NULL) { if (NeedsDesugaringForIgnoreCase(c)) {
characters_ = new (zone()) ZoneList<uc16>(4, zone()); AddCharacterClass(c);
} else {
if (characters_ == NULL) {
characters_ = new (zone()) ZoneList<uc16>(4, zone());
}
characters_->Add(c, zone());
LAST(ADD_CHAR);
} }
characters_->Add(c, zone());
LAST(ADD_CHAR);
} }
...@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } ...@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { if (NeedsDesugaringForUnicode(cc)) {
// In unicode mode, character class needs to be desugared, so it // In unicode mode, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText. // must be a standalone term instead of being part of a RegExpText.
AddTerm(cc); AddTerm(cc);
...@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { ...@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
} }
// Adds a non-negated character class that contains exactly the code point c.
void RegExpBuilder::AddCharacterClass(uc32 c) {
  auto singleton_ranges =
      CharacterRange::List(zone(), CharacterRange::Singleton(c));
  RegExpCharacterClass* single_char_class =
      new (zone()) RegExpCharacterClass(singleton_ranges, false);
  AddCharacterClass(single_char_class);
}
void RegExpBuilder::AddAtom(RegExpTree* term) { void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) { if (term->IsEmpty()) {
AddEmpty(); AddEmpty();
...@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() { ...@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() {
} }
// Returns true if, in unicode mode, the character class contains a range
// that reaches into non-BMP code points or overlaps the surrogate range.
// Always false when the unicode flag is not set.
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
  if (!unicode()) return false;

  // The standard classes for white space ('s'), ASCII word characters ('w')
  // and ASCII digits ('d') never need desugaring.
  const uc16 standard = cc->standard_type();
  if (standard == 's' || standard == 'w' || standard == 'd') return false;

  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
  CharacterRange::Canonicalize(ranges);
  const int range_count = ranges->length();
  for (int index = 0; index < range_count; index++) {
    const uc32 low = ranges->at(index).from();
    const uc32 high = ranges->at(index).to();
    const bool reaches_non_bmp = high >= kNonBmpStart;
    const bool touches_surrogates =
        low <= kTrailSurrogateEnd && high >= kLeadSurrogateStart;
    if (reaches_non_bmp || touches_surrogates) return true;
  }
  return false;
}
// Returns true if c has to be matched through a character class because the
// regexp is both unicode and ignore-case and the case-insensitive closure of
// {c} (with multi-character strings removed) holds more than one code point.
bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
#ifdef V8_I18N_SUPPORT
  if (!unicode() || !ignore_case()) return false;
  USet* closure = uset_open(c, c);
  uset_closeOver(closure, USET_CASE_INSENSITIVE);
  uset_removeAllStrings(closure);
  const bool has_case_equivalents = uset_size(closure) > 1;
  uset_close(closure);
  return has_case_equivalents;
#else
  // In the case where ICU is not included, we act as if the unicode flag is
  // not set, and do not desugar.
  return false;
#endif  // V8_I18N_SUPPORT
}
RegExpTree* RegExpBuilder::ToRegExp() { RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms(); FlushTerms();
int num_alternatives = alternatives_.length(); int num_alternatives = alternatives_.length();
......
...@@ -106,6 +106,7 @@ class RegExpBuilder : public ZoneObject { ...@@ -106,6 +106,7 @@ class RegExpBuilder : public ZoneObject {
// following quantifier // following quantifier
void AddEmpty(); void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc); void AddCharacterClass(RegExpCharacterClass* cc);
void AddCharacterClass(uc32 c);
void AddAtom(RegExpTree* tree); void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree); void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree); void AddAssertion(RegExpTree* tree);
...@@ -122,8 +123,11 @@ class RegExpBuilder : public ZoneObject { ...@@ -122,8 +123,11 @@ class RegExpBuilder : public ZoneObject {
void FlushCharacters(); void FlushCharacters();
void FlushText(); void FlushText();
void FlushTerms(); void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; } Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
// Whether the JSRegExp::kIgnoreCase bit is set in flags_.
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
Zone* zone_; Zone* zone_;
bool pending_empty_; bool pending_empty_;
......
...@@ -203,7 +203,7 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) { ...@@ -203,7 +203,7 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) { int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough; Label fallthrough;
ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture
ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture
...@@ -308,8 +308,10 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( ...@@ -308,8 +308,10 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start. // Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position. // Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!) // size_t byte_length - length of capture in bytes(!)
// Isolate* isolate // Isolate* isolate or 0 if unicode flag.
#ifdef _WIN64 #ifdef _WIN64
DCHECK(rcx.is(arg_reg_1));
DCHECK(rdx.is(arg_reg_2));
// Compute and set byte_offset1 (start of capture). // Compute and set byte_offset1 (start of capture).
__ leap(rcx, Operand(rsi, rdx, times_1, 0)); __ leap(rcx, Operand(rsi, rdx, times_1, 0));
// Set byte_offset2. // Set byte_offset2.
...@@ -317,11 +319,9 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( ...@@ -317,11 +319,9 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
if (read_backward) { if (read_backward) {
__ subq(rdx, rbx); __ subq(rdx, rbx);
} }
// Set byte_length.
__ movp(r8, rbx);
// Isolate.
__ LoadAddress(r9, ExternalReference::isolate_address(isolate()));
#else // AMD64 calling convention #else // AMD64 calling convention
DCHECK(rdi.is(arg_reg_1));
DCHECK(rsi.is(arg_reg_2));
// Compute byte_offset2 (current position = rsi+rdi). // Compute byte_offset2 (current position = rsi+rdi).
__ leap(rax, Operand(rsi, rdi, times_1, 0)); __ leap(rax, Operand(rsi, rdi, times_1, 0));
// Compute and set byte_offset1 (start of capture). // Compute and set byte_offset1 (start of capture).
...@@ -331,11 +331,19 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( ...@@ -331,11 +331,19 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
if (read_backward) { if (read_backward) {
__ subq(rsi, rbx); __ subq(rsi, rbx);
} }
#endif // _WIN64
// Set byte_length. // Set byte_length.
__ movp(rdx, rbx); __ movp(arg_reg_3, rbx);
// Isolate. // Isolate.
__ LoadAddress(rcx, ExternalReference::isolate_address(isolate())); #ifdef V8_I18N_SUPPORT
#endif if (unicode) {
__ movp(arg_reg_4, Immediate(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
}
{ // NOLINT: Can't find a way to open this scope without confusing the { // NOLINT: Can't find a way to open this scope without confusing the
// linter. // linter.
......
...@@ -38,7 +38,7 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler { ...@@ -38,7 +38,7 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward, virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool read_backward, bool unicode,
Label* on_no_match); Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c, virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
...@@ -1186,16 +1186,16 @@ TEST(MacroAssemblerNativeBackRefNoCase) { ...@@ -1186,16 +1186,16 @@ TEST(MacroAssemblerNativeBackRefNoCase) {
m.WriteCurrentPositionToRegister(2, 0); m.WriteCurrentPositionToRegister(2, 0);
m.AdvanceCurrentPosition(3); m.AdvanceCurrentPosition(3);
m.WriteCurrentPositionToRegister(3, 0); m.WriteCurrentPositionToRegister(3, 0);
m.CheckNotBackReferenceIgnoreCase(2, false, &fail); // Match "AbC". m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail); // Match "AbC".
m.CheckNotBackReferenceIgnoreCase(2, false, &fail); // Match "ABC". m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail); // Match "ABC".
Label expected_fail; Label expected_fail;
m.CheckNotBackReferenceIgnoreCase(2, false, &expected_fail); m.CheckNotBackReferenceIgnoreCase(2, false, false, &expected_fail);
m.Bind(&fail); m.Bind(&fail);
m.Fail(); m.Fail();
m.Bind(&expected_fail); m.Bind(&expected_fail);
m.AdvanceCurrentPosition(3); // Skip "xYz" m.AdvanceCurrentPosition(3); // Skip "xYz"
m.CheckNotBackReferenceIgnoreCase(2, false, &succ); m.CheckNotBackReferenceIgnoreCase(2, false, false, &succ);
m.Fail(); m.Fail();
m.Bind(&succ); m.Bind(&succ);
...@@ -1629,7 +1629,9 @@ static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input, ...@@ -1629,7 +1629,9 @@ static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
int count = expected.length(); int count = expected.length();
ZoneList<CharacterRange>* list = ZoneList<CharacterRange>* list =
new(&zone) ZoneList<CharacterRange>(count, &zone); new(&zone) ZoneList<CharacterRange>(count, &zone);
input.AddCaseEquivalents(isolate, &zone, list, false); list->Add(input, &zone);
CharacterRange::AddCaseEquivalents(isolate, &zone, list, false);
list->Remove(0); // Remove the input before checking results.
CHECK_EQ(count, list->length()); CHECK_EQ(count, list->length());
for (int i = 0; i < list->length(); i++) { for (int i = 0; i < list->length(); i++) {
CHECK_EQ(expected[i].from(), list->at(i).from()); CHECK_EQ(expected[i].from(), list->at(i).from());
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Non-unicode use toUpperCase mappings.
// The three characters exercised below are U+00E5 (a with ring, small),
// U+00C5 (A with ring, capital) and U+212B (ANGSTROM SIGN).
assertFalse(/[\u00e5]/i.test("\u212b"));
assertFalse(/[\u212b]/i.test("\u00e5\u1234"));
assertFalse(/[\u212b]/i.test("\u00e5"));
// String case conversion: both U+212B and U+00C5 lower-case to U+00E5,
// and U+00E5 upper-cases to U+00C5.
assertTrue("\u212b".toLowerCase() == "\u00e5");
assertTrue("\u00c5".toLowerCase() == "\u00e5");
assertTrue("\u00e5".toUpperCase() == "\u00c5");
// Unicode uses case folding mappings.
// NOTE(review): every /ui expectation in this script treats U+212B as NOT
// equivalent to U+00E5/U+00C5, i.e. the same result as the non-unicode
// cases above. This looks like the variant for builds without ICU, where
// /ui is handled like /i -- confirm against the test's file name.
assertFalse(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00e5/ui.test("\u00c5"));
assertTrue(/\u00e5/ui.test("\u00e5"));
assertFalse(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00e5"));
assertFalse(/\u00c5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00c5"));
assertFalse(/\u212b/ui.test("\u00c5"));
assertFalse(/\u212b/ui.test("\u00e5"));
assertTrue(/\u212b/ui.test("\u212b"));
// Non-BMP.
// U+10400 and U+10428 are expected not to match each other here, with or
// without the u flag.
assertFalse(/\u{10400}/i.test("\u{10428}"));
assertFalse(/\u{10400}/ui.test("\u{10428}"));
assertFalse(/\ud801\udc00/ui.test("\u{10428}"));
assertFalse(/[\u{10428}]/ui.test("\u{10400}"));
assertFalse(/[\ud801\udc28]/ui.test("\u{10400}"));
// Character class ranges whose end points straddle case-equivalent
// characters, including a negated class.
assertEquals(["\uff21\u{10400}"],
/[\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc"));
assertEquals(["abc"], /[^\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc\uff23"));
assertEquals(["\uff53\u24bb"],
/[\u24d5-\uff33]+/ui.exec("\uff54\uff53\u24bb\u24ba"));
// Full mappings are ignored.
// A single pattern character is never expected to match a two-character
// subject string.
assertFalse(/\u00df/ui.test("SS"));
assertFalse(/\u1f8d/ui.test("\u1f05\u03b9"));
// Simple mappings.
// Expected not to match in this variant.
assertFalse(/\u1f8d/ui.test("\u1f85"));
// Common mappings.
assertTrue(/\u1f6b/ui.test("\u1f63"));
// Back references.
// Both back reference matches are expected to fail in this variant.
assertNull(/(.)\1\1/ui.exec("\u00e5\u212b\u00c5"));
assertNull(/(.)\1/ui.exec("\u{118aa}\u{118ca}"));
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Expectations for case-insensitive regexp matching with and without /u.
// NOTE(review): the /ui expectations below require case-folding data,
// including non-BMP pairs -- presumably supplied by ICU; confirm against the
// test status file.
//
// Without /u, /i uses toUpperCase mappings: U+212B does not match U+00E5 even
// though it lowercases to U+00E5, because U+00E5 uppercases to U+00C5.
assertFalse(/[\u00e5]/i.test("\u212b"));
assertFalse(/[\u212b]/i.test("\u00e5\u1234"));
assertFalse(/[\u212b]/i.test("\u00e5"));
// The plain string case conversions that the expectations above rely on.
assertTrue("\u212b".toLowerCase() == "\u00e5");
assertTrue("\u00c5".toLowerCase() == "\u00e5");
assertTrue("\u00e5".toUpperCase() == "\u00c5");
// With /u, case folding makes U+00E5, U+00C5 and U+212B all match one
// another. Pattern/subject pairs over {U+00E5, U+00C5, U+212B}.
assertTrue(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00e5/ui.test("\u00c5"));
assertTrue(/\u00e5/ui.test("\u00e5"));
// NOTE(review): same assertion as the first one of this group.
assertTrue(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00e5"));
assertTrue(/\u00c5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00c5"));
assertTrue(/\u212b/ui.test("\u00c5"));
assertTrue(/\u212b/ui.test("\u00e5"));
assertTrue(/\u212b/ui.test("\u212b"));
// Non-BMP: U+10400 and U+10428 match each other only with /u, whether
// written as \u{...} or as a surrogate pair (\ud801\udc00 encodes U+10400,
// \ud801\udc28 encodes U+10428), as an atom or inside a character class.
assertFalse(/\u{10400}/i.test("\u{10428}"));
assertTrue(/\u{10400}/ui.test("\u{10428}"));
assertTrue(/\ud801\udc00/ui.test("\u{10428}"));
assertTrue(/[\u{10428}]/ui.test("\u{10400}"));
assertTrue(/[\ud801\udc28]/ui.test("\u{10400}"));
// Ranges spanning BMP and non-BMP. U+FF21 is below the range's lower bound,
// so it can only match through a case equivalent inside the range; U+10400
// lies inside the range literally.
assertEquals(["\uff21\u{10400}"],
/[\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc"));
// Negated form of the same range: only "abc" falls outside it.
assertEquals(["abc"], /[^\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc\uff23"));
// Both ends of a range: the first and last subject characters are expected
// not to match, the two in between are.
assertEquals(["\uff53\u24bb"],
/[\u24d5-\uff33]+/ui.exec("\uff54\uff53\u24bb\u24ba"));
// Full (multi-character) case mappings are ignored: a single pattern
// character never matches a two-character subject sequence.
assertFalse(/\u00df/ui.test("SS"));
assertFalse(/\u1f8d/ui.test("\u1f05\u03b9"));
// Simple mappings work: U+1F8D matches U+1F85.
assertTrue(/\u1f8d/ui.test("\u1f85"));
// Common mappings work: U+1F6B matches U+1F63.
assertTrue(/\u1f6b/ui.test("\u1f63"));
// Back references compare case-insensitively too: the repeated text may
// differ from the capture by case folding (U+00E5 / U+212B / U+00C5 in the
// first case, a non-BMP case pair in the second).
assertEquals(["\u00e5\u212b\u00c5", "\u00e5"],
/(.)\1\1/ui.exec("\u00e5\u212b\u00c5"));
assertEquals(["\u{118aa}\u{118ca}", "\u{118aa}"],
/(.)\1/ui.exec("\u{118aa}\u{118ca}"));
...@@ -289,6 +289,10 @@ ...@@ -289,6 +289,10 @@
# TODO(titzer): SSE 4.1 required for asm-wasm test (floor). # TODO(titzer): SSE 4.1 required for asm-wasm test (floor).
'wasm/asm-wasm': [SKIP], 'wasm/asm-wasm': [SKIP],
# case-insensitive unicode regexp relies on case mapping provided by ICU.
'harmony/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]],
'harmony/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]],
}], # ALWAYS }], # ALWAYS
['novfp3 == True', { ['novfp3 == True', {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment