Commit a2baaaac authored by yangguo, committed by Commit bot

[regexp] implement case-insensitive unicode regexps.

BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1599303002

Cr-Commit-Position: refs/heads/master@{#33538}
parent 2a0e4225
......@@ -891,7 +891,7 @@ class Isolate {
unibrow::Mapping<unibrow::Ecma262Canonicalize>*
interp_canonicalize_mapping() {
return &interp_canonicalize_mapping_;
return &regexp_macro_assembler_canonicalize_;
}
Debug* debug() { return debug_; }
......@@ -1245,7 +1245,6 @@ class Isolate {
regexp_macro_assembler_canonicalize_;
RegExpStack* regexp_stack_;
DateCache* date_cache_;
unibrow::Mapping<unibrow::Ecma262Canonicalize> interp_canonicalize_mapping_;
CallInterfaceDescriptorData* call_descriptor_data_;
base::RandomNumberGenerator* random_number_generator_;
......
......@@ -210,7 +210,7 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ ldr(r0, register_location(start_reg)); // Index of start of capture
__ ldr(r1, register_location(start_reg + 1)); // Index of end of capture
......@@ -302,7 +302,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
// r0: Address byte_offset1 - Address captured substring's start.
// r1: Address byte_offset2 - Address of current character position.
// r2: size_t byte_length - length of capture in bytes(!)
// r3: Isolate* isolate
// r3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ add(r0, r0, Operand(end_of_input_address()));
......@@ -316,7 +316,14 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
__ sub(r1, r1, r4);
}
// Isolate.
__ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
#ifdef V8_I18N_SUPPORT
if (unicode) {
__ mov(r3, Operand(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
}
{
AllowExternalCallThatCantCauseGC scope(masm_);
......
......@@ -38,7 +38,7 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
......
......@@ -274,7 +274,7 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
Register capture_start_offset = w10;
......@@ -388,7 +388,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
// x0: Address byte_offset1 - Address captured substring's start.
// x1: Address byte_offset2 - Address of current character position.
// w2: size_t byte_length - length of capture in bytes(!)
// x3: Isolate* isolate
// x3: Isolate* isolate or 0 if unicode flag
// Address of start of capture.
__ Add(x0, input_end(), Operand(capture_start_offset, SXTW));
......@@ -400,7 +400,14 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
__ Sub(x1, x1, Operand(capture_length, SXTW));
}
// Isolate.
__ Mov(x3, ExternalReference::isolate_address(isolate()));
#ifdef V8_I18N_SUPPORT
if (unicode) {
__ Mov(x3, Operand(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ Mov(x3, ExternalReference::isolate_address(isolate()));
}
{
AllowExternalCallThatCantCauseGC scope(masm_);
......
......@@ -43,7 +43,7 @@ class RegExpMacroAssemblerARM64: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
......
......@@ -20,56 +20,58 @@ const unsigned int MAX_FIRST_ARG = 0x7fffffu;
const int BYTECODE_SHIFT = 8;
// Table of interpreter bytecodes. Each entry is V(name, opcode, length); the
// trailing comment on an entry sketches its operand layout (bit widths after
// the 8-bit opcode), and length is the instruction size that layout adds up
// to.
#define BYTECODE_ITERATOR(V) \
V(BREAK, 0, 4) /* bc8 */ \
V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
V(POP_CP, 10, 4) /* bc8 pad24 */ \
V(POP_BT, 11, 4) /* bc8 pad24 */ \
V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
V(FAIL, 13, 4) /* bc8 pad24 */ \
V(SUCCEED, 14, 4) /* bc8 pad24 */ \
V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \
V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */
V(BREAK, 0, 4) /* bc8 */ \
V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
V(POP_CP, 10, 4) /* bc8 pad24 */ \
V(POP_BT, 11, 4) /* bc8 pad24 */ \
V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
V(FAIL, 13, 4) /* bc8 pad24 */ \
V(SUCCEED, 14, 4) /* bc8 pad24 */ \
V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */
#define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code;
......
......@@ -189,7 +189,7 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ mov(edx, register_location(start_reg)); // Index of start of capture
__ mov(ebx, register_location(start_reg + 1)); // Index of end of capture
......@@ -296,11 +296,18 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
// Isolate* isolate
// Isolate* isolate or 0 if unicode flag.
// Set isolate.
__ mov(Operand(esp, 3 * kPointerSize),
Immediate(ExternalReference::isolate_address(isolate())));
#ifdef V8_I18N_SUPPORT
if (unicode) {
__ mov(Operand(esp, 3 * kPointerSize), Immediate(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ mov(Operand(esp, 3 * kPointerSize),
Immediate(ExternalReference::isolate_address(isolate())));
}
// Set byte_length.
__ mov(Operand(esp, 2 * kPointerSize), ebx);
// Set byte_offset2.
......
......@@ -37,7 +37,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
......@@ -15,37 +15,32 @@
#include "src/unicode.h"
#include "src/utils.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uchar.h"
#endif // V8_I18N_SUPPORT
namespace v8 {
namespace internal {
typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
int from,
int current,
int len,
Vector<const uc16> subject) {
for (int i = 0; i < len; i++) {
unibrow::uchar old_char = subject[from++];
unibrow::uchar new_char = subject[current++];
if (old_char == new_char) continue;
unibrow::uchar old_string[1] = { old_char };
unibrow::uchar new_string[1] = { new_char };
interp_canonicalize->get(old_char, '\0', old_string);
interp_canonicalize->get(new_char, '\0', new_string);
if (old_string[0] != new_string[0]) {
return false;
}
}
return true;
// Case-insensitively compares |len| UC16 code units of |subject| starting at
// index |from| (the captured text) with those starting at index |current|.
//
// The comparison is delegated to
// RegExpMacroAssembler::CaseInsensitiveCompareUC16, which takes byte
// addresses and a byte length. Passing a null isolate to that helper selects
// its unicode (ICU case folding) path; that path only exists when
// V8_I18N_SUPPORT is defined, so without ICU the real isolate is always
// passed and the unicode flag falls back to the non-unicode comparison,
// matching what the native macro assemblers do.
//
// Returns true if the two substrings match.
static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
                                 int len, Vector<const uc16> subject,
                                 bool unicode) {
  Address offset_a =
      reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
  Address offset_b =
      reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
  size_t length = len * kUC16Size;
  Isolate* isolate_or_null = isolate;
#ifdef V8_I18N_SUPPORT
  // Null isolate == "use ICU case folding" in the comparison helper.
  if (unicode) isolate_or_null = nullptr;
#endif  // V8_I18N_SUPPORT
  return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
             offset_a, offset_b, length, isolate_or_null) == 1;
}
static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
int from,
int current,
int len,
Vector<const uint8_t> subject) {
static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
int len, Vector<const uint8_t> subject,
bool unicode) {
// For Latin1 characters the unicode flag makes no difference.
for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++];
unsigned int new_char = subject[current++];
......@@ -523,13 +518,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
break;
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
bool unicode =
(insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current + len > subject.length() ||
!BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
from, current, len, subject)) {
!BackRefMatchesNoCase(isolate, from, current, len, subject,
unicode)) {
pc = code_base + Load32Aligned(pc + 4);
break;
}
......@@ -538,13 +536,16 @@ static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
break;
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
bool unicode = (insn & BYTECODE_MASK) ==
BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current - len < 0 ||
!BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
from, current - len, len, subject)) {
!BackRefMatchesNoCase(isolate, from, current - len, len, subject,
unicode)) {
pc = code_base + Load32Aligned(pc + 4);
break;
}
......
......@@ -25,6 +25,11 @@
#include "src/string-search.h"
#include "src/unicode-decoder.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uset.h"
#include "unicode/utypes.h"
#endif // V8_I18N_SUPPORT
#ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32
#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
......@@ -3420,10 +3425,7 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
int range_count = ranges->length();
for (int j = 0; j < range_count; j++) {
ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
}
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
}
}
}
......@@ -3586,13 +3588,6 @@ class AlternativeGenerationList {
AlternativeGeneration a_few_alt_gens_[kAFew];
};
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
static const uc32 kRangeEndMarker = 0x110000;
// The '2' variant has inclusive from and exclusive to.
......@@ -4395,8 +4390,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (compiler->ignore_case()) {
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
trace->backtrack());
assembler->CheckNotBackReferenceIgnoreCase(
start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
......@@ -4866,21 +4861,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) {
}
// Canonicalizes this class's range list and reports whether any range
// reaches into the non-BMP code points or overlaps the surrogate block
// [kLeadSurrogateStart, kTrailSurrogateEnd].
bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {
  ZoneList<CharacterRange>* list = this->ranges(zone);
  CharacterRange::Canonicalize(list);
  int index = list->length();
  while (index-- > 0) {
    CharacterRange range = list->at(index);
    uc32 low = range.from();
    uc32 high = range.to();
    // The range extends past the BMP.
    bool reaches_non_bmp = high >= kNonBmpStart;
    // The range intersects the lead/trail surrogate block.
    bool touches_surrogates =
        low <= kTrailSurrogateEnd && high >= kLeadSurrogateStart;
    if (reaches_non_bmp || touches_surrogates) return true;
  }
  return false;
}
UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
ZoneList<CharacterRange>* base)
: zone_(zone),
......@@ -5120,11 +5100,53 @@ void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
}
// Expands |ranges| in place so that it also contains the case equivalents of
// every character it already covers, then canonicalizes the list.
//
// With V8_I18N_SUPPORT the closure is computed by ICU; otherwise the
// CharacterRange::AddCaseEquivalents fallback is used.
void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
                               ZoneList<CharacterRange>* ranges) {
#ifdef V8_I18N_SUPPORT
  // Let ICU compute the case-fold closure of the given ranges.
  DCHECK(compiler->unicode());
  DCHECK(compiler->ignore_case());
  USet* closure = uset_openEmpty();
  const int input_count = ranges->length();
  for (int index = 0; index < input_count; index++) {
    CharacterRange range = ranges->at(index);
    uset_addRange(closure, range.from(), range.to());
  }
  ranges->Clear();
  uset_closeOver(closure, USET_CASE_INSENSITIVE);
  // Full case mappings map a single character to several characters; the set
  // represents those as strings. Drop them so that only simple and common
  // case mappings remain.
  uset_removeAllStrings(closure);
  Zone* zone = compiler->zone();
  UErrorCode status = U_ZERO_ERROR;
  int string_lengths = 0;
  const int closure_items = uset_getItemCount(closure);
  for (int index = 0; index < closure_items; index++) {
    uc32 first = 0;
    uc32 last = 0;
    string_lengths +=
        uset_getItem(closure, index, &first, &last, nullptr, 0, &status);
    ranges->Add(CharacterRange::Range(first, last), zone);
  }
  // ICU reported no error and every collected item was a range.
  DCHECK_EQ(U_ZERO_ERROR, status);
  DCHECK_EQ(0, string_lengths);
  uset_close(closure);
#else
  // Without ICU, fall back to the built-in case-equivalence tables.
  CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
                                     ranges, compiler->one_byte());
#endif  // V8_I18N_SUPPORT
  CharacterRange::Canonicalize(ranges);
}
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
set_.Canonicalize();
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (compiler->unicode() && compiler->ignore_case()) {
AddUnicodeCaseEquivalents(compiler, ranges);
}
if (compiler->unicode() && !compiler->one_byte()) {
if (is_negated()) {
ZoneList<CharacterRange>* negated =
......@@ -5853,16 +5875,19 @@ Vector<const int> CharacterRange::GetWordBounds() {
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
bool is_one_byte) {
uc32 bottom = from();
uc32 top = to();
// Nothing to be done for surrogates.
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
if (bottom > String::kMaxOneByteCharCode) return;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (top == bottom) {
int range_count = ranges->length();
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 bottom = range.from();
uc32 top = range.to();
// Nothing to be done for surrogates.
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
if (bottom > String::kMaxOneByteCharCode) return;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (top == bottom) {
// If this is a singleton we just expand the one character.
int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
for (int i = 0; i < length; i++) {
......@@ -5914,6 +5939,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
pos = end + 1;
}
}
}
}
......@@ -6284,7 +6310,7 @@ void TextNode::CalculateOffsets() {
void Analysis::VisitText(TextNode* that) {
if (ignore_case_) {
if (ignore_case()) {
that->MakeCaseIndependent(isolate(), is_one_byte_);
}
EnsureAnalyzed(that->on_success());
......@@ -6649,7 +6675,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
Analysis analysis(isolate, ignore_case, is_one_byte);
Analysis analysis(isolate, flags, is_one_byte);
analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) {
const char* error_message = analysis.error_message();
......
......@@ -19,6 +19,15 @@ class RegExpNode;
class RegExpTree;
class BoyerMooreLookahead;
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
class RegExpImpl {
public:
// Whether V8 is compiled with native regexp support or not.
......@@ -1478,9 +1487,9 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
// +-------+ +------------+
class Analysis: public NodeVisitor {
public:
Analysis(Isolate* isolate, bool ignore_case, bool is_one_byte)
Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
: isolate_(isolate),
ignore_case_(ignore_case),
flags_(flags),
is_one_byte_(is_one_byte),
error_message_(NULL) {}
void EnsureAnalyzed(RegExpNode* node);
......@@ -1502,9 +1511,12 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
Isolate* isolate() const { return isolate_; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
private:
Isolate* isolate_;
bool ignore_case_;
JSRegExp::Flags flags_;
bool is_one_byte_;
const char* error_message_;
......
......@@ -215,7 +215,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ lw(a0, register_location(start_reg)); // Index of start of capture.
__ lw(a1, register_location(start_reg + 1)); // Index of end of capture.
......@@ -310,7 +310,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
// a3: Isolate* isolate.
// a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ Addu(a0, a0, Operand(end_of_input_address()));
......@@ -324,7 +324,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Subu(a1, a1, Operand(s3));
}
// Isolate.
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
#ifdef V8_I18N_SUPPORT
    if (unicode) {
      // a3 = 0 (instead of the isolate) when the unicode flag is set.
      // Use a register move: li() loads an immediate, and Operand(zero_reg)
      // is a register operand, which li() does not accept.
      __ mov(a3, zero_reg);
    } else  // NOLINT
#endif  // V8_I18N_SUPPORT
    {
      __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
    }
{
AllowExternalCallThatCantCauseGC scope(masm_);
......
......@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
......@@ -251,7 +251,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
__ ld(a0, register_location(start_reg)); // Index of start of capture.
__ ld(a1, register_location(start_reg + 1)); // Index of end of capture.
......@@ -346,7 +346,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
// a3: Isolate* isolate.
// a3: Isolate* isolate or 0 if unicode flag.
// Address of start of capture.
__ Daddu(a0, a0, Operand(end_of_input_address()));
......@@ -360,7 +360,14 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Dsubu(a1, a1, Operand(s3));
}
// Isolate.
__ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
#ifdef V8_I18N_SUPPORT
    if (unicode) {
      // a3 = 0 (instead of the isolate) when the unicode flag is set.
      // Use a register move: li() loads an immediate, and Operand(zero_reg)
      // is a register operand, which li() does not accept.
      __ mov(a3, zero_reg);
    } else  // NOLINT
#endif  // V8_I18N_SUPPORT
    {
      __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
    }
{
AllowExternalCallThatCantCauseGC scope(masm_);
......
......@@ -37,7 +37,7 @@ class RegExpMacroAssemblerMIPS: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
......@@ -108,8 +108,9 @@ class CharacterRange {
bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
void AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, bool is_one_byte);
static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
bool is_one_byte);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
......@@ -293,7 +294,6 @@ class RegExpCharacterClass final : public RegExpTree {
RegExpCharacterClass* AsCharacterClass() override;
bool IsCharacterClass() override;
bool IsTextElement() override { return true; }
bool NeedsDesugaringForUnicode(Zone* zone);
int min_match() override { return 1; }
int max_match() override { return 1; }
void AppendToText(RegExpText* text, Zone* zone) override;
......@@ -310,7 +310,7 @@ class RegExpCharacterClass final : public RegExpTree {
// W : non-ASCII word character
// d : ASCII digit
// D : non-ASCII digit
// . : non-unicode non-newline
// . : non-newline
// * : All characters, for advancing unanchored regexp
uc16 standard_type() { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
......
......@@ -381,11 +381,13 @@ void RegExpMacroAssemblerIrregexp::CheckNotBackReference(int start_reg,
// Emits a case-insensitive back-reference check for the capture whose start
// register is start_reg. The opcode is one of the
// BC_CHECK_NOT_BACK_REF_NO_CASE* bytecodes, selected by read_backward and
// unicode; start_reg is emitted as its argument, followed by the
// on_not_equal label (emitted or linked via EmitOrLink).
void RegExpMacroAssemblerIrregexp::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_not_equal) {
int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
// The register index must lie in [0, kMaxRegister].
DCHECK(start_reg >= 0);
DCHECK(start_reg <= kMaxRegister);
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
: BC_CHECK_NOT_BACK_REF_NO_CASE,
Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
: BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
: (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
: BC_CHECK_NOT_BACK_REF_NO_CASE),
start_reg);
EmitOrLink(on_not_equal);
}
......
......@@ -82,16 +82,10 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
uc16 to,
Label* on_not_in_range);
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set);
virtual void CheckPosition(int cp_offset, Label* on_outside_input) {
LoadCurrentCharacter(cp_offset, on_outside_input, true);
}
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) {
return false; // No custom support for character classes.
}
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
......
......@@ -360,11 +360,11 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
// Tracing wrapper: prints one line describing the call (register, direction,
// unicode mode and the label as an integer) and then forwards the same
// arguments to the wrapped assembler_.
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
start_reg, read_backward ? "backward" : "forward",
LabelToInt(on_no_match));
assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
on_no_match);
}
......
......@@ -34,7 +34,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
......
......@@ -9,6 +9,10 @@
#include "src/regexp/regexp-stack.h"
#include "src/simulator.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uchar.h"
#endif // V8_I18N_SUPPORT
namespace v8 {
namespace internal {
......@@ -23,6 +27,67 @@ RegExpMacroAssembler::~RegExpMacroAssembler() {
}
// Compares two UC16 substrings of equal length, ignoring case.
//
//   byte_offset1, byte_offset2: addresses of the first code unit of each
//       substring.
//   byte_length: length of each substring in bytes; must be even.
//   isolate: provides the Ecma262Canonicalize mapping used for the
//       comparison. When V8_I18N_SUPPORT is defined, a null isolate instead
//       selects ICU case folding, with surrogate pairs combined into a
//       single code point before folding.
//
// Returns 1 if the substrings match and 0 otherwise.
int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
                                                     Address byte_offset2,
                                                     size_t byte_length,
                                                     Isolate* isolate) {
  // This function is not allowed to cause a garbage collection.
  // A GC might move the calling generated code and invalidate the
  // return address on the stack.
  DCHECK(byte_length % 2 == 0);
  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
  size_t length = byte_length >> 1;

#ifdef V8_I18N_SUPPORT
  if (isolate == nullptr) {
    for (size_t i = 0; i < length; i++) {
      uc32 c1 = substring1[i];
      uc32 c2 = substring2[i];
      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
        // Non-BMP characters do not have case-equivalents in the BMP.
        // Both have to be non-BMP for them to be able to match.
        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
        if (i + 1 < length) {
          uc16 c1t = substring1[i + 1];
          uc16 c2t = substring2[i + 1];
          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
              unibrow::Utf16::IsTrailSurrogate(c2t)) {
            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
            // The trail units were consumed as part of the pair.
            i++;
          }
        }
      }
      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
      if (c1 != c2) return 0;
    }
    return 1;
  }
#endif  // V8_I18N_SUPPORT

  DCHECK_NOT_NULL(isolate);
  // Look up the mapping only now: on the null-isolate path above, calling a
  // member function through |isolate| would be undefined behavior.
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      isolate->regexp_macro_assembler_canonicalize();
  for (size_t i = 0; i < length; i++) {
    unibrow::uchar c1 = substring1[i];
    unibrow::uchar c2 = substring2[i];
    if (c1 != c2) {
      unibrow::uchar s1[1] = {c1};
      canonicalize->get(c1, '\0', s1);
      if (s1[0] != c2) {
        unibrow::uchar s2[1] = {c2};
        canonicalize->get(c2, '\0', s2);
        if (s1[0] != s2[0]) {
          return 0;
        }
      }
    }
  }
  return 1;
}
#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
......@@ -245,40 +310,6 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
};
// Case-insensitive comparison of two UC16 substrings of byte_length bytes
// each, using the isolate's Ecma262Canonicalize mapping.
// Returns 1 when every character pair is equal or canonicalizes to the same
// value, 0 otherwise.
//
// This function is not allowed to cause a garbage collection.
// A GC might move the calling generated code and invalidate the
// return address on the stack.
int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
    Address byte_offset1, Address byte_offset2, size_t byte_length,
    Isolate* isolate) {
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* mapping =
      isolate->regexp_macro_assembler_canonicalize();
  DCHECK(byte_length % 2 == 0);
  uc16* lhs = reinterpret_cast<uc16*>(byte_offset1);
  uc16* rhs = reinterpret_cast<uc16*>(byte_offset2);
  size_t char_count = byte_length >> 1;

  for (size_t pos = 0; pos < char_count; pos++) {
    unibrow::uchar left = lhs[pos];
    unibrow::uchar right = rhs[pos];
    // Identical code units need no canonicalization.
    if (left == right) continue;

    unibrow::uchar canon_left[1] = {left};
    mapping->get(left, '\0', canon_left);
    // The canonical form of the left character may already be the right one.
    if (canon_left[0] == right) continue;

    unibrow::uchar canon_right[1] = {right};
    mapping->get(right, '\0', canon_right);
    if (canon_left[0] != canon_right[0]) return 0;
  }
  return 1;
}
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
Address* stack_base,
Isolate* isolate) {
......
......@@ -76,7 +76,7 @@ class RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
......@@ -146,6 +146,12 @@ class RegExpMacroAssembler {
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0;
// Compares two-byte strings case insensitively.
// Called from generated RegExp code.
static int CaseInsensitiveCompareUC16(Address byte_offset1,
Address byte_offset2,
size_t byte_length, Isolate* isolate);
// Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; }
......@@ -199,13 +205,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
int previous_index,
Isolate* isolate);
// Compares two-byte strings case insensitively.
// Called from generated RegExp code.
static int CaseInsensitiveCompareUC16(Address byte_offset1,
Address byte_offset2,
size_t byte_length,
Isolate* isolate);
// Called from RegExp if the backtrack stack limit is hit.
// Tries to expand the stack. Returns the new stack-pointer if
// successful, and updates the stack_top address, or returns 0 if unable
......
......@@ -11,6 +11,10 @@
#include "src/regexp/jsregexp.h"
#include "src/utils.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uset.h"
#endif // V8_I18N_SUPPORT
namespace v8 {
namespace internal {
......@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
if (pending_surrogate_ != kNoPendingSurrogate) {
uc16 lead_surrogate = pending_surrogate_;
DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
ZoneList<uc16> surrogate_pair(2, zone());
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
pending_surrogate_ = kNoPendingSurrogate;
AddAtom(atom);
DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
uc32 combined =
unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
if (NeedsDesugaringForIgnoreCase(combined)) {
AddCharacterClass(combined);
} else {
ZoneList<uc16> surrogate_pair(2, zone());
surrogate_pair.Add(lead_surrogate, zone());
surrogate_pair.Add(trail_surrogate, zone());
RegExpAtom* atom =
new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
AddAtom(atom);
}
} else {
pending_surrogate_ = trail_surrogate;
FlushPendingSurrogate();
......@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
void RegExpBuilder::FlushPendingSurrogate() {
if (pending_surrogate_ != kNoPendingSurrogate) {
// Use character class to desugar lone surrogate matching.
RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
CharacterRange::List(zone(),
CharacterRange::Singleton(pending_surrogate_)),
false);
pending_surrogate_ = kNoPendingSurrogate;
DCHECK(unicode());
AddCharacterClass(cc);
uc32 c = pending_surrogate_;
pending_surrogate_ = kNoPendingSurrogate;
AddCharacterClass(c);
}
}
......@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() {
void RegExpBuilder::AddCharacter(uc16 c) {
FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ == NULL) {
characters_ = new (zone()) ZoneList<uc16>(4, zone());
if (NeedsDesugaringForIgnoreCase(c)) {
AddCharacterClass(c);
} else {
if (characters_ == NULL) {
characters_ = new (zone()) ZoneList<uc16>(4, zone());
}
characters_->Add(c, zone());
LAST(ADD_CHAR);
}
characters_->Add(c, zone());
LAST(ADD_CHAR);
}
......@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
if (NeedsDesugaringForUnicode(cc)) {
// In unicode mode, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText.
AddTerm(cc);
......@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
}
// Convenience overload: wraps the single code point |c| in a non-negated
// character class and adds it through the RegExpCharacterClass* overload.
void RegExpBuilder::AddCharacterClass(uc32 c) {
  ZoneList<CharacterRange>* single_range =
      CharacterRange::List(zone(), CharacterRange::Singleton(c));
  RegExpCharacterClass* char_class =
      new (zone()) RegExpCharacterClass(single_range, false);
  AddCharacterClass(char_class);
}
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
......@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() {
}
// Returns true if, in unicode mode, the character class |cc| contains
// non-BMP characters or lone surrogates. Always returns false when the
// unicode flag is not set, and for the standard classes \s, \w and \d.
// Note: canonicalizes the ranges of |cc| as a side effect.
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
  if (!unicode()) return false;

  // White space ('s'), ASCII word characters ('w') and ASCII digits ('d')
  // do not need desugaring.
  const uc16 type = cc->standard_type();
  if (type == 's' || type == 'w' || type == 'd') return false;

  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
  CharacterRange::Canonicalize(ranges);
  const int range_count = ranges->length();
  for (int index = 0; index < range_count; index++) {
    CharacterRange range = ranges->at(index);
    const uc32 first = range.from();
    const uc32 last = range.to();
    const bool reaches_non_bmp = last >= kNonBmpStart;
    const bool overlaps_surrogates =
        first <= kTrailSurrogateEnd && last >= kLeadSurrogateStart;
    if (reaches_non_bmp || overlaps_surrogates) return true;
  }
  return false;
}
// Returns true if |c| must be matched through a character class because,
// with both the unicode and ignore-case flags set, its case-insensitive
// closure (with multi-character strings removed) holds more than one
// code point.
bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
#ifdef V8_I18N_SUPPORT
  if (!unicode() || !ignore_case()) return false;
  USet* closure = uset_open(c, c);
  uset_closeOver(closure, USET_CASE_INSENSITIVE);
  uset_removeAllStrings(closure);
  const bool has_case_variants = uset_size(closure) > 1;
  uset_close(closure);
  return has_case_variants;
#else
  // In the case where ICU is not included, we act as if the unicode flag is
  // not set, and do not desugar.
  return false;
#endif  // V8_I18N_SUPPORT
}
RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms();
int num_alternatives = alternatives_.length();
......
......@@ -106,6 +106,7 @@ class RegExpBuilder : public ZoneObject {
// following quantifier
void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddCharacterClass(uc32 c);
void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
......@@ -122,8 +123,11 @@ class RegExpBuilder : public ZoneObject {
void FlushCharacters();
void FlushText();
void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
Zone* zone_;
bool pending_empty_;
......
......@@ -203,7 +203,7 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) {
void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
int start_reg, bool read_backward, Label* on_no_match) {
int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
Label fallthrough;
ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture
ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture
......@@ -308,8 +308,10 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
// Isolate* isolate
// Isolate* isolate or 0 if unicode flag.
#ifdef _WIN64
DCHECK(rcx.is(arg_reg_1));
DCHECK(rdx.is(arg_reg_2));
// Compute and set byte_offset1 (start of capture).
__ leap(rcx, Operand(rsi, rdx, times_1, 0));
// Set byte_offset2.
......@@ -317,11 +319,9 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
if (read_backward) {
__ subq(rdx, rbx);
}
// Set byte_length.
__ movp(r8, rbx);
// Isolate.
__ LoadAddress(r9, ExternalReference::isolate_address(isolate()));
#else // AMD64 calling convention
DCHECK(rdi.is(arg_reg_1));
DCHECK(rsi.is(arg_reg_2));
// Compute byte_offset2 (current position = rsi+rdi).
__ leap(rax, Operand(rsi, rdi, times_1, 0));
// Compute and set byte_offset1 (start of capture).
......@@ -331,11 +331,19 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
if (read_backward) {
__ subq(rsi, rbx);
}
#endif // _WIN64
// Set byte_length.
__ movp(rdx, rbx);
__ movp(arg_reg_3, rbx);
// Isolate.
__ LoadAddress(rcx, ExternalReference::isolate_address(isolate()));
#endif
#ifdef V8_I18N_SUPPORT
if (unicode) {
__ movp(arg_reg_4, Immediate(0));
} else // NOLINT
#endif // V8_I18N_SUPPORT
{
__ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
}
{ // NOLINT: Can't find a way to open this scope without confusing the
// linter.
......
......@@ -38,7 +38,7 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
bool read_backward, bool unicode,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
......@@ -1186,16 +1186,16 @@ TEST(MacroAssemblerNativeBackRefNoCase) {
m.WriteCurrentPositionToRegister(2, 0);
m.AdvanceCurrentPosition(3);
m.WriteCurrentPositionToRegister(3, 0);
m.CheckNotBackReferenceIgnoreCase(2, false, &fail); // Match "AbC".
m.CheckNotBackReferenceIgnoreCase(2, false, &fail); // Match "ABC".
m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail); // Match "AbC".
m.CheckNotBackReferenceIgnoreCase(2, false, false, &fail); // Match "ABC".
Label expected_fail;
m.CheckNotBackReferenceIgnoreCase(2, false, &expected_fail);
m.CheckNotBackReferenceIgnoreCase(2, false, false, &expected_fail);
m.Bind(&fail);
m.Fail();
m.Bind(&expected_fail);
m.AdvanceCurrentPosition(3); // Skip "xYz"
m.CheckNotBackReferenceIgnoreCase(2, false, &succ);
m.CheckNotBackReferenceIgnoreCase(2, false, false, &succ);
m.Fail();
m.Bind(&succ);
......@@ -1629,7 +1629,9 @@ static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
int count = expected.length();
ZoneList<CharacterRange>* list =
new(&zone) ZoneList<CharacterRange>(count, &zone);
input.AddCaseEquivalents(isolate, &zone, list, false);
list->Add(input, &zone);
CharacterRange::AddCaseEquivalents(isolate, &zone, list, false);
list->Remove(0); // Remove the input before checking results.
CHECK_EQ(count, list->length());
for (int i = 0; i < list->length(); i++) {
CHECK_EQ(expected[i].from(), list->at(i).from());
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Without the unicode flag, ignore-case matching uses toUpperCase mappings.
// U+212B lower-cases to U+00E5 (asserted below), yet /i does not treat the
// two as equivalent.
assertFalse(/[\u00e5]/i.test("\u212b"));
assertFalse(/[\u212b]/i.test("\u00e5\u1234"));
assertFalse(/[\u212b]/i.test("\u00e5"));
assertTrue("\u212b".toLowerCase() == "\u00e5");
assertTrue("\u00c5".toLowerCase() == "\u00e5");
assertTrue("\u00e5".toUpperCase() == "\u00c5");
// With the unicode flag, these expectations still keep U+212B apart from
// U+00E5 / U+00C5, while U+00E5 and U+00C5 match each other.
// NOTE(review): presumably this is the expected behavior of a build without
// ICU case folding - confirm against the test status file.
assertFalse(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00e5/ui.test("\u00c5"));
assertTrue(/\u00e5/ui.test("\u00e5"));
assertFalse(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00e5"));
assertFalse(/\u00c5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00c5"));
assertFalse(/\u212b/ui.test("\u00c5"));
assertFalse(/\u212b/ui.test("\u00e5"));
assertTrue(/\u212b/ui.test("\u212b"));
// Non-BMP. The first case has no unicode flag; the remaining ones use /ui
// and expect U+10400 and U+10428 not to match each other, whether written
// as code point escapes or as surrogate pairs.
assertFalse(/\u{10400}/i.test("\u{10428}"));
assertFalse(/\u{10400}/ui.test("\u{10428}"));
assertFalse(/\ud801\udc00/ui.test("\u{10428}"));
assertFalse(/[\u{10428}]/ui.test("\u{10400}"));
assertFalse(/[\ud801\udc28]/ui.test("\u{10400}"));
// Character class ranges (plain and negated) under /ui, including a range
// that spans BMP and non-BMP code points.
assertEquals(["\uff21\u{10400}"],
/[\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc"));
assertEquals(["abc"], /[^\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc\uff23"));
assertEquals(["\uff53\u24bb"],
/[\u24d5-\uff33]+/ui.exec("\uff54\uff53\u24bb\u24ba"));
// Full mappings are ignored: a single character never matches a
// multi-character string.
assertFalse(/\u00df/ui.test("SS"));
assertFalse(/\u1f8d/ui.test("\u1f05\u03b9"));
// Simple mappings: expected not to match here.
assertFalse(/\u1f8d/ui.test("\u1f85"));
// Common mappings: expected to match.
assertTrue(/\u1f6b/ui.test("\u1f63"));
// Back references under /ui: neither pattern is expected to match.
assertNull(/(.)\1\1/ui.exec("\u00e5\u212b\u00c5"));
assertNull(/(.)\1/ui.exec("\u{118aa}\u{118ca}"));
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Without the unicode flag, ignore-case matching uses toUpperCase mappings.
// U+212B lower-cases to U+00E5 (asserted below), yet /i does not treat the
// two as equivalent.
assertFalse(/[\u00e5]/i.test("\u212b"));
assertFalse(/[\u212b]/i.test("\u00e5\u1234"));
assertFalse(/[\u212b]/i.test("\u00e5"));
assertTrue("\u212b".toLowerCase() == "\u00e5");
assertTrue("\u00c5".toLowerCase() == "\u00e5");
assertTrue("\u00e5".toUpperCase() == "\u00c5");
// With the unicode flag, matching uses case folding mappings: U+00E5,
// U+00C5 and U+212B are all expected to match one another.
assertTrue(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00e5/ui.test("\u00c5"));
assertTrue(/\u00e5/ui.test("\u00e5"));
assertTrue(/\u00e5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00e5"));
assertTrue(/\u00c5/ui.test("\u212b"));
assertTrue(/\u00c5/ui.test("\u00c5"));
assertTrue(/\u212b/ui.test("\u00c5"));
assertTrue(/\u212b/ui.test("\u00e5"));
assertTrue(/\u212b/ui.test("\u212b"));
// Non-BMP. The first case has no unicode flag and must not match; the
// remaining ones use /ui and expect U+10400 and U+10428 to match each other,
// whether written as code point escapes or as surrogate pairs.
assertFalse(/\u{10400}/i.test("\u{10428}"));
assertTrue(/\u{10400}/ui.test("\u{10428}"));
assertTrue(/\ud801\udc00/ui.test("\u{10428}"));
assertTrue(/[\u{10428}]/ui.test("\u{10400}"));
assertTrue(/[\ud801\udc28]/ui.test("\u{10400}"));
// Character class ranges (plain and negated) under /ui, including a range
// that spans BMP and non-BMP code points.
assertEquals(["\uff21\u{10400}"],
/[\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc"));
assertEquals(["abc"], /[^\uff40-\u{10428}]+/ui.exec("\uff21\u{10400}abc\uff23"));
assertEquals(["\uff53\u24bb"],
/[\u24d5-\uff33]+/ui.exec("\uff54\uff53\u24bb\u24ba"));
// Full mappings are ignored: a single character never matches a
// multi-character string.
assertFalse(/\u00df/ui.test("SS"));
assertFalse(/\u1f8d/ui.test("\u1f05\u03b9"));
// Simple mappings work.
assertTrue(/\u1f8d/ui.test("\u1f85"));
// Common mappings work.
assertTrue(/\u1f6b/ui.test("\u1f63"));
// Back references under /ui also compare case-insensitively, for both BMP
// and non-BMP captures.
assertEquals(["\u00e5\u212b\u00c5", "\u00e5"],
/(.)\1\1/ui.exec("\u00e5\u212b\u00c5"));
assertEquals(["\u{118aa}\u{118ca}", "\u{118aa}"],
/(.)\1/ui.exec("\u{118aa}\u{118ca}"));
......@@ -289,6 +289,10 @@
# TODO(titzer): SSE 4.1 required for asm-wasm test (floor).
'wasm/asm-wasm': [SKIP],
# case-insensitive unicode regexp relies on case mapping provided by ICU.
'harmony/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]],
'harmony/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]],
}], # ALWAYS
['novfp3 == True', {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment