Commit b4aa41d0 authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Add dedicated enums for standard character sets

.. instead of referring to them through magic chars {s,S,w,W,d,D,n,.,*}.

Change-Id: Ib50937a2a7d4229a021377586a54be3db9ed8c1d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3217196
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Patrick Thier <pthier@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77337}
parent ad89fd9f
......@@ -494,118 +494,116 @@ void RegExpMacroAssemblerARM::CheckBitInTable(
BranchOrBacktrack(ne, on_bit_set);
}
bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(base::uc16 type,
Label* on_no_match) {
bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(
StandardCharacterSet type, Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
// TODO(jgruber): No custom implementation (yet): s(UC16), S(UC16).
switch (type) {
case 's':
// Match space-characters
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmp(current_character(), Operand(' '));
__ b(eq, &success);
// Check range 0x09..0x0D
__ sub(r0, current_character(), Operand('\t'));
__ cmp(r0, Operand('\r' - '\t'));
__ b(ls, &success);
// \u00a0 (NBSP).
__ cmp(r0, Operand(0x00A0 - '\t'));
BranchOrBacktrack(ne, on_no_match);
__ bind(&success);
case StandardCharacterSet::kWhitespace:
// Match space-characters.
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmp(current_character(), Operand(' '));
__ b(eq, &success);
// Check range 0x09..0x0D.
__ sub(r0, current_character(), Operand('\t'));
__ cmp(r0, Operand('\r' - '\t'));
__ b(ls, &success);
// \u00a0 (NBSP).
__ cmp(r0, Operand(0x00A0 - '\t'));
BranchOrBacktrack(ne, on_no_match);
__ bind(&success);
return true;
}
return false;
case StandardCharacterSet::kNotWhitespace:
// The emitted code for generic character classes is good enough.
return false;
case StandardCharacterSet::kDigit:
// Match ASCII digits ('0'..'9')
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(hi, on_no_match);
return true;
}
return false;
case 'S':
// The emitted code for generic character classes is good enough.
return false;
case 'd':
// Match ASCII digits ('0'..'9')
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(hi, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(ls, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(r0, r0, Operand(0x0B));
__ cmp(r0, Operand(0x0C - 0x0B));
BranchOrBacktrack(ls, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(r0, r0, Operand(0x2028 - 0x0B));
__ cmp(r0, Operand(1));
case StandardCharacterSet::kNotDigit:
// Match non ASCII-digits
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(ls, on_no_match);
return true;
case StandardCharacterSet::kNotLineTerminator: {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(r0, r0, Operand(0x0B));
__ cmp(r0, Operand(0x0C - 0x0B));
BranchOrBacktrack(ls, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(r0, r0, Operand(0x2028 - 0x0B));
__ cmp(r0, Operand(1));
BranchOrBacktrack(ls, on_no_match);
}
return true;
}
return true;
}
case 'n': {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(r0, r0, Operand(0x0B));
__ cmp(r0, Operand(0x0C - 0x0B));
if (mode_ == LATIN1) {
BranchOrBacktrack(hi, on_no_match);
} else {
Label done;
__ b(ls, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(r0, r0, Operand(0x2028 - 0x0B));
__ cmp(r0, Operand(1));
BranchOrBacktrack(hi, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Operand('z'));
BranchOrBacktrack(hi, on_no_match);
case StandardCharacterSet::kLineTerminator: {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(r0, r0, Operand(0x0B));
__ cmp(r0, Operand(0x0C - 0x0B));
if (mode_ == LATIN1) {
BranchOrBacktrack(hi, on_no_match);
} else {
Label done;
__ b(ls, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(r0, r0, Operand(0x2028 - 0x0B));
__ cmp(r0, Operand(1));
BranchOrBacktrack(hi, on_no_match);
__ bind(&done);
}
return true;
}
ExternalReference map = ExternalReference::re_word_character_map();
__ mov(r0, Operand(map));
__ ldrb(r0, MemOperand(r0, current_character()));
__ cmp(r0, Operand::Zero());
BranchOrBacktrack(eq, on_no_match);
return true;
}
case 'W': {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Operand('z'));
__ b(hi, &done);
case StandardCharacterSet::kWord: {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Operand('z'));
BranchOrBacktrack(hi, on_no_match);
}
ExternalReference map = ExternalReference::re_word_character_map();
__ mov(r0, Operand(map));
__ ldrb(r0, MemOperand(r0, current_character()));
__ cmp(r0, Operand::Zero());
BranchOrBacktrack(eq, on_no_match);
return true;
}
ExternalReference map = ExternalReference::re_word_character_map();
__ mov(r0, Operand(map));
__ ldrb(r0, MemOperand(r0, current_character()));
__ cmp(r0, Operand::Zero());
BranchOrBacktrack(ne, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
case StandardCharacterSet::kNotWord: {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Operand('z'));
__ b(hi, &done);
}
ExternalReference map = ExternalReference::re_word_character_map();
__ mov(r0, Operand(map));
__ ldrb(r0, MemOperand(r0, current_character()));
__ cmp(r0, Operand::Zero());
BranchOrBacktrack(ne, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
}
return true;
}
return true;
}
case '*':
// Match any character.
return true;
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
case StandardCharacterSet::kEverything:
// Match any character.
return true;
}
}
......
......@@ -53,7 +53,8 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM
// Checks whether the given offset from the current position is before
// the end of the string.
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
......
......@@ -588,106 +588,104 @@ void RegExpMacroAssemblerARM64::CheckBitInTable(
CompareAndBranchOrBacktrack(w11, 0, ne, on_bit_set);
}
bool RegExpMacroAssemblerARM64::CheckSpecialCharacterClass(base::uc16 type,
Label* on_no_match) {
bool RegExpMacroAssemblerARM64::CheckSpecialCharacterClass(
StandardCharacterSet type, Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
// TODO(jgruber): No custom implementation (yet): s(UC16), S(UC16).
switch (type) {
case 's':
// Match space-characters
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
// Check for ' ' or 0x00A0.
__ Cmp(current_character(), ' ');
__ Ccmp(current_character(), 0x00A0, ZFlag, ne);
__ B(eq, &success);
// Check range 0x09..0x0D.
__ Sub(w10, current_character(), '\t');
CompareAndBranchOrBacktrack(w10, '\r' - '\t', hi, on_no_match);
__ Bind(&success);
case StandardCharacterSet::kWhitespace:
// Match space-characters.
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
// Check for ' ' or 0x00A0.
__ Cmp(current_character(), ' ');
__ Ccmp(current_character(), 0x00A0, ZFlag, ne);
__ B(eq, &success);
// Check range 0x09..0x0D.
__ Sub(w10, current_character(), '\t');
CompareAndBranchOrBacktrack(w10, '\r' - '\t', hi, on_no_match);
__ Bind(&success);
return true;
}
return false;
case StandardCharacterSet::kNotWhitespace:
// The emitted code for generic character classes is good enough.
return false;
case StandardCharacterSet::kDigit:
// Match ASCII digits ('0'..'9').
__ Sub(w10, current_character(), '0');
CompareAndBranchOrBacktrack(w10, '9' - '0', hi, on_no_match);
return true;
case StandardCharacterSet::kNotDigit:
// Match ASCII non-digits.
__ Sub(w10, current_character(), '0');
CompareAndBranchOrBacktrack(w10, '9' - '0', ls, on_no_match);
return true;
case StandardCharacterSet::kNotLineTerminator: {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
// Here we emit the conditional branch only once at the end to make branch
// prediction more efficient, even though we could branch out of here
// as soon as a character matches.
__ Cmp(current_character(), 0x0A);
__ Ccmp(current_character(), 0x0D, ZFlag, ne);
if (mode_ == UC16) {
__ Sub(w10, current_character(), 0x2028);
// If the Z flag was set we clear the flags to force a branch.
__ Ccmp(w10, 0x2029 - 0x2028, NoFlag, ne);
// ls -> !((C==1) && (Z==0))
BranchOrBacktrack(ls, on_no_match);
} else {
BranchOrBacktrack(eq, on_no_match);
}
return true;
}
return false;
case 'S':
// The emitted code for generic character classes is good enough.
return false;
case 'd':
// Match ASCII digits ('0'..'9').
__ Sub(w10, current_character(), '0');
CompareAndBranchOrBacktrack(w10, '9' - '0', hi, on_no_match);
return true;
case 'D':
// Match ASCII non-digits.
__ Sub(w10, current_character(), '0');
CompareAndBranchOrBacktrack(w10, '9' - '0', ls, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
// Here we emit the conditional branch only once at the end to make branch
// prediction more efficient, even though we could branch out of here
// as soon as a character matches.
__ Cmp(current_character(), 0x0A);
__ Ccmp(current_character(), 0x0D, ZFlag, ne);
if (mode_ == UC16) {
__ Sub(w10, current_character(), 0x2028);
// If the Z flag was set we clear the flags to force a branch.
__ Ccmp(w10, 0x2029 - 0x2028, NoFlag, ne);
// ls -> !((C==1) && (Z==0))
BranchOrBacktrack(ls, on_no_match);
} else {
BranchOrBacktrack(eq, on_no_match);
}
return true;
}
case 'n': {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
// We have to check all 4 newline characters before emitting
// the conditional branch.
__ Cmp(current_character(), 0x0A);
__ Ccmp(current_character(), 0x0D, ZFlag, ne);
if (mode_ == UC16) {
__ Sub(w10, current_character(), 0x2028);
// If the Z flag was set we clear the flags to force a fall-through.
__ Ccmp(w10, 0x2029 - 0x2028, NoFlag, ne);
// hi -> (C==1) && (Z==0)
BranchOrBacktrack(hi, on_no_match);
} else {
BranchOrBacktrack(ne, on_no_match);
case StandardCharacterSet::kLineTerminator: {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
// We have to check all 4 newline characters before emitting
// the conditional branch.
__ Cmp(current_character(), 0x0A);
__ Ccmp(current_character(), 0x0D, ZFlag, ne);
if (mode_ == UC16) {
__ Sub(w10, current_character(), 0x2028);
// If the Z flag was set we clear the flags to force a fall-through.
__ Ccmp(w10, 0x2029 - 0x2028, NoFlag, ne);
// hi -> (C==1) && (Z==0)
BranchOrBacktrack(hi, on_no_match);
} else {
BranchOrBacktrack(ne, on_no_match);
}
return true;
}
return true;
}
case 'w': {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
CompareAndBranchOrBacktrack(current_character(), 'z', hi, on_no_match);
case StandardCharacterSet::kWord: {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
CompareAndBranchOrBacktrack(current_character(), 'z', hi, on_no_match);
}
ExternalReference map = ExternalReference::re_word_character_map();
__ Mov(x10, map);
__ Ldrb(w10, MemOperand(x10, current_character(), UXTW));
CompareAndBranchOrBacktrack(w10, 0, eq, on_no_match);
return true;
}
ExternalReference map = ExternalReference::re_word_character_map();
__ Mov(x10, map);
__ Ldrb(w10, MemOperand(x10, current_character(), UXTW));
CompareAndBranchOrBacktrack(w10, 0, eq, on_no_match);
return true;
}
case 'W': {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ Cmp(current_character(), 'z');
__ B(hi, &done);
case StandardCharacterSet::kNotWord: {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ Cmp(current_character(), 'z');
__ B(hi, &done);
}
ExternalReference map = ExternalReference::re_word_character_map();
__ Mov(x10, map);
__ Ldrb(w10, MemOperand(x10, current_character(), UXTW));
CompareAndBranchOrBacktrack(w10, 0, ne, on_no_match);
__ Bind(&done);
return true;
}
ExternalReference map = ExternalReference::re_word_character_map();
__ Mov(x10, map);
__ Ldrb(w10, MemOperand(x10, current_character(), UXTW));
CompareAndBranchOrBacktrack(w10, 0, ne, on_no_match);
__ Bind(&done);
return true;
}
case '*':
// Match any character.
return true;
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
case StandardCharacterSet::kEverything:
// Match any character.
return true;
}
}
......
......@@ -57,7 +57,8 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64
// Checks whether the given offset from the current position is before
// the end of the string.
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void BindJumpTarget(Label* label = nullptr) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
......
......@@ -517,125 +517,124 @@ void RegExpMacroAssemblerIA32::CheckBitInTable(
BranchOrBacktrack(not_equal, on_bit_set);
}
bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(base::uc16 type,
Label* on_no_match) {
bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(
StandardCharacterSet type, Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
// TODO(jgruber): No custom implementation (yet): s(UC16), S(UC16).
switch (type) {
case 's':
// Match space-characters
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmp(current_character(), ' ');
__ j(equal, &success, Label::kNear);
// Check range 0x09..0x0D
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
__ j(below_equal, &success, Label::kNear);
// \u00a0 (NBSP).
__ cmp(eax, 0x00A0 - '\t');
BranchOrBacktrack(not_equal, on_no_match);
__ bind(&success);
case StandardCharacterSet::kWhitespace:
// Match space-characters.
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmp(current_character(), ' ');
__ j(equal, &success, Label::kNear);
// Check range 0x09..0x0D.
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
__ j(below_equal, &success, Label::kNear);
// \u00a0 (NBSP).
__ cmp(eax, 0x00A0 - '\t');
BranchOrBacktrack(not_equal, on_no_match);
__ bind(&success);
return true;
}
return false;
case StandardCharacterSet::kNotWhitespace:
// The emitted code for generic character classes is good enough.
return false;
case StandardCharacterSet::kDigit:
// Match ASCII digits ('0'..'9').
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(above, on_no_match);
return true;
}
return false;
case 'S':
// The emitted code for generic character classes is good enough.
return false;
case 'd':
// Match ASCII digits ('0'..'9')
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ mov(eax, current_character());
__ xor_(eax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(eax, Immediate(0x0B));
__ cmp(eax, 0x0C - 0x0B);
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(eax, Immediate(0x2028 - 0x0B));
__ cmp(eax, 0x2029 - 0x2028);
case StandardCharacterSet::kNotDigit:
// Match non ASCII-digits.
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(below_equal, on_no_match);
return true;
case StandardCharacterSet::kLineTerminator:
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 or 0x2029).
// The opposite of '.'.
__ mov(eax, current_character());
__ xor_(eax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C.
__ sub(eax, Immediate(0x0B));
__ cmp(eax, 0x0C - 0x0B);
if (mode_ == LATIN1) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
DCHECK_EQ(UC16, mode_);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(eax, Immediate(0x2028 - 0x0B));
__ cmp(eax, 1);
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
case StandardCharacterSet::kNotLineTerminator: {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029).
__ mov(eax, current_character());
__ xor_(eax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C.
__ sub(eax, Immediate(0x0B));
__ cmp(eax, 0x0C - 0x0B);
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(eax, Immediate(0x2028 - 0x0B));
__ cmp(eax, 0x2029 - 0x2028);
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
return true;
}
case 'w': {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Immediate('z'));
BranchOrBacktrack(above, on_no_match);
}
DCHECK_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
ExternalReference word_map = ExternalReference::re_word_character_map();
__ test_b(current_character(),
Operand(current_character(), times_1, word_map.address(),
RelocInfo::EXTERNAL_REFERENCE));
BranchOrBacktrack(zero, on_no_match);
return true;
}
case 'W': {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Immediate('z'));
__ j(above, &done);
case StandardCharacterSet::kWord: {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Immediate('z'));
BranchOrBacktrack(above, on_no_match);
}
DCHECK_EQ(0,
word_character_map[0]); // Character '\0' is not a word char.
ExternalReference word_map = ExternalReference::re_word_character_map();
__ test_b(current_character(),
Operand(current_character(), times_1, word_map.address(),
RelocInfo::EXTERNAL_REFERENCE));
BranchOrBacktrack(zero, on_no_match);
return true;
}
DCHECK_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
ExternalReference word_map = ExternalReference::re_word_character_map();
__ test_b(current_character(),
Operand(current_character(), times_1, word_map.address(),
RelocInfo::EXTERNAL_REFERENCE));
BranchOrBacktrack(not_zero, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
case StandardCharacterSet::kNotWord: {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmp(current_character(), Immediate('z'));
__ j(above, &done);
}
DCHECK_EQ(0,
word_character_map[0]); // Character '\0' is not a word char.
ExternalReference word_map = ExternalReference::re_word_character_map();
__ test_b(current_character(),
Operand(current_character(), times_1, word_map.address(),
RelocInfo::EXTERNAL_REFERENCE));
BranchOrBacktrack(not_zero, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
}
return true;
}
return true;
}
// Non-standard classes (with no syntactic shorthand) used internally.
case '*':
case StandardCharacterSet::kEverything:
// Match any character.
return true;
case 'n': {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 or 0x2029).
// The opposite of '.'.
__ mov(eax, current_character());
__ xor_(eax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ sub(eax, Immediate(0x0B));
__ cmp(eax, 0x0C - 0x0B);
if (mode_ == LATIN1) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
DCHECK_EQ(UC16, mode_);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ sub(eax, Immediate(0x2028 - 0x0B));
__ cmp(eax, 1);
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
}
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
}
......
......@@ -54,7 +54,8 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerIA32
// Checks whether the given offset from the current position is before
// the end of the string.
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
......
......@@ -71,6 +71,19 @@ class Interval {
int to_;
};
// Names the predefined character sets. The underlying value of each
// enumerator is the single character that was previously used as a bare
// "magic char" tag for that set.
enum class StandardCharacterSet : char {
  // Whitespace, /\s/, and its complement, /\S/.
  kWhitespace = 's',
  kNotWhitespace = 'S',

  // Digits, /\d/, and the complement, /\D/.
  kDigit = 'd',
  kNotDigit = 'D',

  // Word characters, /\w/, and the complement, /\W/.
  kWord = 'w',
  kNotWord = 'W',

  // What /./ matches, and the inverse of that set (the line terminators).
  kNotLineTerminator = '.',
  kLineTerminator = 'n',

  // Every character without exception, i.e. what /./s matches.
  kEverything = '*',
};
// Represents code points (with values up to 0x10FFFF) in the range from from_
// to to_, both ends are inclusive.
class CharacterRange {
......@@ -99,13 +112,14 @@ class CharacterRange {
return list;
}
V8_EXPORT_PRIVATE static void AddClassEscape(char type,
ZoneList<CharacterRange>* ranges,
Zone* zone);
V8_EXPORT_PRIVATE static void AddClassEscape(
StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges, Zone* zone);
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
V8_EXPORT_PRIVATE static void AddClassEscape(
char type, ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents, Zone* zone);
StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
Zone* zone);
V8_EXPORT_PRIVATE static void AddCaseEquivalents(
Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
bool is_one_byte);
......@@ -238,24 +252,23 @@ class RegExpAssertion final : public RegExpTree {
class CharacterSet final {
public:
explicit CharacterSet(base::uc16 standard_set_type)
explicit CharacterSet(StandardCharacterSet standard_set_type)
: standard_set_type_(standard_set_type) {}
explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {}
ZoneList<CharacterRange>* ranges(Zone* zone);
base::uc16 standard_set_type() const { return standard_set_type_; }
void set_standard_set_type(base::uc16 special_set_type) {
standard_set_type_ = special_set_type;
StandardCharacterSet standard_set_type() const {
return standard_set_type_.value();
}
bool is_standard() const { return standard_set_type_ != 0; }
void set_standard_set_type(StandardCharacterSet standard_set_type) {
standard_set_type_ = standard_set_type;
}
bool is_standard() const { return standard_set_type_.has_value(); }
V8_EXPORT_PRIVATE void Canonicalize();
private:
ZoneList<CharacterRange>* ranges_ = nullptr;
// If set, the value names a standard set (e.g., all whitespace
// characters) without having to expand the ranges. See the
// `StandardCharacterSet` enum for the available sets.
base::uc16 standard_set_type_ = 0;
base::Optional<StandardCharacterSet> standard_set_type_;
};
class RegExpCharacterClass final : public RegExpTree {
......@@ -280,8 +293,8 @@ class RegExpCharacterClass final : public RegExpTree {
character_class_flags_ ^= NEGATED;
}
}
explicit RegExpCharacterClass(base::uc16 type)
: set_(type), character_class_flags_(CharacterClassFlags()) {}
explicit RegExpCharacterClass(StandardCharacterSet standard_set_type)
: set_(standard_set_type), character_class_flags_() {}
DECL_BOILERPLATE(CharacterClass);
......@@ -299,16 +312,9 @@ class RegExpCharacterClass final : public RegExpTree {
bool is_standard(Zone* zone);
// Returns a value representing the standard character set if is_standard()
// returns true.
// Currently used values are:
// s : unicode whitespace
// S : unicode non-whitespace
// w : ASCII word character (digit, letter, underscore)
// W : non-ASCII word character
// d : ASCII digit
// D : non-ASCII digit
// . : non-newline
// * : All characters, for advancing unanchored regexp
base::uc16 standard_type() const { return set_.standard_set_type(); }
StandardCharacterSet standard_type() const {
return set_.standard_set_type();
}
CharacterSet character_set() const { return set_; }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
......@@ -323,7 +329,6 @@ class RegExpCharacterClass final : public RegExpTree {
CharacterClassFlags character_class_flags_;
};
class RegExpAtom final : public RegExpTree {
public:
explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
......
......@@ -101,29 +101,29 @@ bool RegExpCharacterClass::is_standard(Zone* zone) {
return true;
}
if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
set_.set_standard_set_type('s');
set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
set_.set_standard_set_type('S');
set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
kLineTerminatorRangeCount)) {
set_.set_standard_set_type('.');
set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
return true;
}
if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
kLineTerminatorRangeCount)) {
set_.set_standard_set_type('n');
set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
return true;
}
if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('w');
set_.set_standard_set_type(StandardCharacterSet::kWord);
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('W');
set_.set_standard_set_type(StandardCharacterSet::kNotWord);
return true;
}
return false;
......@@ -423,7 +423,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
zone->New<RegExpCharacterClass>(zone, ranges);
return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
}
if (standard_type() == '*') {
if (set_.is_standard() &&
standard_type() == StandardCharacterSet::kEverything) {
return UnanchoredAdvance(compiler, on_success);
} else {
ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
......@@ -748,7 +749,8 @@ RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* word_range =
zone->New<ZoneList<CharacterRange>>(2, zone);
CharacterRange::AddClassEscape('w', word_range, true, zone);
CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
zone);
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
......@@ -808,8 +810,10 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
// Create a newline atom.
ZoneList<CharacterRange>* newline_ranges =
zone->New<ZoneList<CharacterRange>>(3, zone);
CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>('n');
CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
newline_ranges, false, zone);
RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>(
StandardCharacterSet::kLineTerminator);
TextNode* newline_matcher =
zone->New<TextNode>(newline_atom, false,
ActionNode::PositiveSubmatchSuccess(
......@@ -1057,10 +1061,13 @@ static void AddClassNegated(const int* elmv, int elmc,
ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
}
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents,
Zone* zone) {
if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
if (add_unicode_case_equivalents &&
(standard_character_set == StandardCharacterSet::kWord ||
standard_character_set == StandardCharacterSet::kNotWord)) {
// See #sec-runtime-semantics-wordcharacters-abstract-operation
// In case of unicode and ignore_case, we need to create the closure over
// case equivalent characters before negating.
......@@ -1068,7 +1075,7 @@ void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
zone->New<ZoneList<CharacterRange>>(2, zone);
AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
AddUnicodeCaseEquivalents(new_ranges, zone);
if (type == 'W') {
if (standard_character_set == StandardCharacterSet::kNotWord) {
ZoneList<CharacterRange>* negated =
zone->New<ZoneList<CharacterRange>>(2, zone);
CharacterRange::Negate(new_ranges, negated, zone);
......@@ -1077,47 +1084,46 @@ void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
ranges->AddAll(*new_ranges, zone);
return;
}
AddClassEscape(type, ranges, zone);
AddClassEscape(standard_character_set, ranges, zone);
}
// Adds the ranges of one standard character set to `ranges`. Every case hands
// a predefined range table to AddClass, or to AddClassNegated for the "not"
// variant of the set; the everything set adds CharacterRange::Everything().
// NOTE(review): this text comes from a commit diff view, so the replaced
// char-based lines ('s', 'S', ...) are interleaved with the enum-based lines
// that supersede them.
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges,
Zone* zone) {
switch (type) {
case 's':
switch (standard_character_set) {
case StandardCharacterSet::kWhitespace:
AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
break;
case 'S':
case StandardCharacterSet::kNotWhitespace:
AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
break;
case 'w':
case StandardCharacterSet::kWord:
AddClass(kWordRanges, kWordRangeCount, ranges, zone);
break;
case 'W':
case StandardCharacterSet::kNotWord:
AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
break;
case 'd':
case StandardCharacterSet::kDigit:
AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
break;
case 'D':
case StandardCharacterSet::kNotDigit:
AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
break;
case '.':
// This is the set of characters matched by the $ and ^ symbols
// in multiline mode.
case StandardCharacterSet::kLineTerminator:
AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
break;
case StandardCharacterSet::kNotLineTerminator:
AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
zone);
break;
// This is not a character range as defined by the spec but a
// convenient shorthand for a character class that matches any
// character.
case '*':
case StandardCharacterSet::kEverything:
ranges->Add(CharacterRange::Everything(), zone);
break;
// This is the set of characters matched by the $ and ^ symbols
// in multiline mode.
case 'n':
AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
break;
default:
UNREACHABLE();
}
}
......@@ -1256,7 +1262,8 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
// Returns the range list of this character set, creating it on first use.
// While ranges_ is still null, a list is allocated in `zone` and filled from
// the stored standard set type; later calls return the same list.
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
if (ranges_ == nullptr) {
ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
// The `false` argument is add_unicode_case_equivalents.
// NOTE(review): .value() suggests standard_set_type_ is an optional-like
// member — confirm it always holds a value when this path runs.
CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
zone);
}
return ranges_;
}
......
......@@ -2073,7 +2073,8 @@ namespace {
void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
Label* non_word, bool fall_through_on_word) {
if (assembler->CheckSpecialCharacterClass(
fall_through_on_word ? 'w' : 'W',
fall_through_on_word ? StandardCharacterSet::kWord
: StandardCharacterSet::kNotWord,
fall_through_on_word ? non_word : word)) {
// Optimized implementation available.
return;
......@@ -2119,7 +2120,8 @@ void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
new_trace.backtrack(), can_skip_bounds_check);
if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
if (!assembler->CheckSpecialCharacterClass(
StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
// Newline means \n, \r, 0x2028 or 0x2029.
if (!compiler->one_byte()) {
assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
......@@ -3882,7 +3884,8 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
RegExpNode* loop_node = RegExpQuantifier::ToNode(
0, RegExpTree::kInfinity, false, zone()->New<RegExpCharacterClass>('*'),
0, RegExpTree::kInfinity, false,
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
this, captured_body, data->contains_anchor);
if (data->contains_anchor) {
......@@ -3891,7 +3894,8 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
zone()->New<RegExpCharacterClass>('*'), false, loop_node)));
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
false, loop_node)));
node = first_step_node;
} else {
node = loop_node;
......
......@@ -354,12 +354,11 @@ void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
}
// Tracing wrapper: forwards the check to the wrapped assembler, prints one
// trace line with the set type, the label and whether custom support was
// reported, and returns the wrapped assembler's result unchanged.
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
base::uc16 type, Label* on_no_match) {
StandardCharacterSet type, Label* on_no_match) {
bool supported = assembler_->CheckSpecialCharacterClass(type,
on_no_match);
PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n",
type,
LabelToInt(on_no_match),
// The enum value is narrowed to char so it can be printed with '%c'.
static_cast<char>(type), LabelToInt(on_no_match),
supported ? "true" : "false");
return supported;
}
......
......@@ -50,7 +50,8 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
Label* on_not_in_range) override;
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
......
......@@ -107,7 +107,8 @@ class RegExpMacroAssembler {
// character. Returns false if the type of special character class does
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) {
virtual bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) {
// Base implementation: emits nothing and reports no custom support.
return false;
}
......
......@@ -671,10 +671,12 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
if (builder->dotall()) {
// Everything.
CharacterRange::AddClassEscape('*', ranges, false, zone());
CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
ranges, false, zone());
} else {
// Everything except \x0A, \x0D, \u2028 and \u2029
CharacterRange::AddClassEscape('.', ranges, false, zone());
// Everything except \x0A, \x0D, \u2028 and \u2029.
CharacterRange::AddClassEscape(
StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
}
RegExpCharacterClass* cc =
......@@ -1950,8 +1952,9 @@ bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
case 'S':
case 'w':
case 'W':
CharacterRange::AddClassEscape(static_cast<char>(next), ranges,
add_unicode_case_equivalents, zone);
CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
ranges, add_unicode_case_equivalents,
zone);
Advance(2);
return true;
case 'p':
......
......@@ -532,123 +532,123 @@ void RegExpMacroAssemblerX64::CheckBitInTable(
BranchOrBacktrack(not_equal, on_bit_set);
}
// Emits x64 code that tests the current character against a standard
// character set and leaves through `on_no_match` (via BranchOrBacktrack) when
// the character is not in the set. Returns true when a custom sequence was
// emitted and false when the set has no custom implementation here.
// NOTE(review): this text comes from a commit diff view; the replaced
// char-based cases ('s', 'd', ...) are interleaved with the enum-based cases
// that supersede them, and the two versions differ only in case labels,
// comment punctuation and line wrapping.
bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(base::uc16 type,
Label* on_no_match) {
bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(
StandardCharacterSet type, Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check, using the sequence:
// leal(rax, Operand(current_character(), -min)) or sub(rax, Immediate(min))
// cmpl(rax, Immediate(max - min))
// TODO(jgruber): No custom implementation (yet): s(UC16), S(UC16).
switch (type) {
case 's':
// Match space-characters
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmpl(current_character(), Immediate(' '));
__ j(equal, &success, Label::kNear);
// Check range 0x09..0x0D
__ leal(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
__ j(below_equal, &success, Label::kNear);
// \u00a0 (NBSP).
__ cmpl(rax, Immediate(0x00A0 - '\t'));
BranchOrBacktrack(not_equal, on_no_match);
__ bind(&success);
case StandardCharacterSet::kWhitespace:
// Match space-characters.
if (mode_ == LATIN1) {
// One byte space characters are '\t'..'\r', ' ' and \u00a0.
Label success;
__ cmpl(current_character(), Immediate(' '));
__ j(equal, &success, Label::kNear);
// Check range 0x09..0x0D.
__ leal(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
__ j(below_equal, &success, Label::kNear);
// \u00a0 (NBSP).
// rax still holds (c - '\t'), so NBSP is compared with the same offset.
__ cmpl(rax, Immediate(0x00A0 - '\t'));
BranchOrBacktrack(not_equal, on_no_match);
__ bind(&success);
return true;
}
return false;
case StandardCharacterSet::kNotWhitespace:
// The emitted code for generic character classes is good enough.
return false;
case StandardCharacterSet::kDigit:
// Match ASCII digits ('0'..'9').
__ leal(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(above, on_no_match);
return true;
}
return false;
case 'S':
// The emitted code for generic character classes is good enough.
return false;
case 'd':
// Match ASCII digits ('0'..'9')
__ leal(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
__ leal(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ movl(rax, current_character());
__ xorl(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ subl(rax, Immediate(0x0B));
__ cmpl(rax, Immediate(0x0C - 0x0B));
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ subl(rax, Immediate(0x2028 - 0x0B));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
case StandardCharacterSet::kNotDigit:
// Match non ASCII-digits.
__ leal(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
case StandardCharacterSet::kNotLineTerminator: {
// Match non-newlines (not 0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029).
__ movl(rax, current_character());
__ xorl(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C.
__ subl(rax, Immediate(0x0B));
__ cmpl(rax, Immediate(0x0C - 0x0B));
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ subl(rax, Immediate(0x2028 - 0x0B));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
return true;
}
case 'n': {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029)
__ movl(rax, current_character());
__ xorl(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C
__ subl(rax, Immediate(0x0B));
__ cmpl(rax, Immediate(0x0C - 0x0B));
if (mode_ == LATIN1) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ subl(rax, Immediate(0x2028 - 0x0B));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmpl(current_character(), Immediate('z'));
BranchOrBacktrack(above, on_no_match);
case StandardCharacterSet::kLineTerminator: {
// Match newlines (0x0A('\n'), 0x0D('\r'), 0x2028 and 0x2029).
__ movl(rax, current_character());
__ xorl(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0B or 0x0C.
__ subl(rax, Immediate(0x0B));
__ cmpl(rax, Immediate(0x0C - 0x0B));
if (mode_ == LATIN1) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0B). I.e., check for
// 0x201D (0x2028 - 0x0B) or 0x201E.
__ subl(rax, Immediate(0x2028 - 0x0B));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
}
__ Move(rbx, ExternalReference::re_word_character_map());
DCHECK_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
__ testb(Operand(rbx, current_character(), times_1, 0),
current_character());
BranchOrBacktrack(zero, on_no_match);
return true;
}
case 'W': {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
__ cmpl(current_character(), Immediate('z'));
__ j(above, &done);
case StandardCharacterSet::kWord: {
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
// Outside LATIN1 mode, anything above 'z' fails without a table lookup.
__ cmpl(current_character(), Immediate('z'));
BranchOrBacktrack(above, on_no_match);
}
__ Move(rbx, ExternalReference::re_word_character_map());
DCHECK_EQ(0,
word_character_map[0]); // Character '\0' is not a word char.
__ testb(Operand(rbx, current_character(), times_1, 0),
current_character());
BranchOrBacktrack(zero, on_no_match);
return true;
}
__ Move(rbx, ExternalReference::re_word_character_map());
DCHECK_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
__ testb(Operand(rbx, current_character(), times_1, 0),
current_character());
BranchOrBacktrack(not_zero, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
case StandardCharacterSet::kNotWord: {
Label done;
if (mode_ != LATIN1) {
// Table is 256 entries, so all Latin1 characters can be tested.
// Outside LATIN1 mode, anything above 'z' is accepted as a non-word
// character by jumping straight past the table lookup.
__ cmpl(current_character(), Immediate('z'));
__ j(above, &done);
}
__ Move(rbx, ExternalReference::re_word_character_map());
DCHECK_EQ(0,
word_character_map[0]); // Character '\0' is not a word char.
__ testb(Operand(rbx, current_character(), times_1, 0),
current_character());
BranchOrBacktrack(not_zero, on_no_match);
if (mode_ != LATIN1) {
__ bind(&done);
}
return true;
}
return true;
}
case '*':
// Match any character.
return true;
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
case StandardCharacterSet::kEverything:
// Match any character.
return true;
}
}
......
......@@ -53,7 +53,8 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerX64
// Checks whether the given offset from the current position is before
// the end of the string.
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(base::uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
......
......@@ -505,7 +505,8 @@ static bool NotLineTerminator(base::uc32 c) {
return !unibrow::IsLineTerminator(c);
}
static void TestCharacterClassEscapes(base::uc32 c, bool(pred)(base::uc32 c)) {
static void TestCharacterClassEscapes(StandardCharacterSet c,
bool(pred)(base::uc32 c)) {
Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME);
ZoneList<CharacterRange>* ranges =
zone.New<ZoneList<CharacterRange>>(2, &zone);
......@@ -521,13 +522,16 @@ static void TestCharacterClassEscapes(base::uc32 c, bool(pred)(base::uc32 c)) {
}
// Runs TestCharacterClassEscapes once per character class escape, pairing each
// set with the predicate that is passed alongside it.
// NOTE(review): the enum-based calls cover seven sets; kLineTerminator and
// kEverything are not exercised here.
// NOTE(review): `NotWhiteSpaceNorLineTermiantor` is misspelled ("Termiantor");
// renaming it requires touching its definition, which is outside this span.
TEST(CharacterClassEscapes) {
TestCharacterClassEscapes('.', NotLineTerminator);
TestCharacterClassEscapes('d', IsDigit);
TestCharacterClassEscapes('D', NotDigit);
TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
TestCharacterClassEscapes('w', IsRegExpWord);
TestCharacterClassEscapes('W', NotWord);
TestCharacterClassEscapes(StandardCharacterSet::kNotLineTerminator,
NotLineTerminator);
TestCharacterClassEscapes(StandardCharacterSet::kDigit, IsDigit);
TestCharacterClassEscapes(StandardCharacterSet::kNotDigit, NotDigit);
TestCharacterClassEscapes(StandardCharacterSet::kWhitespace,
IsWhiteSpaceOrLineTerminator);
TestCharacterClassEscapes(StandardCharacterSet::kNotWhitespace,
NotWhiteSpaceNorLineTermiantor);
TestCharacterClassEscapes(StandardCharacterSet::kWord, IsRegExpWord);
TestCharacterClassEscapes(StandardCharacterSet::kNotWord, NotWord);
}
static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment