Commit dae25c02 authored by Toon Verwaest, committed by Commit Bot

[char-predicates] Use OneByte flag table and add line terminator support

Using a OneByte table allows branches to be removed if the function is inlined
in a place where we statically know the character is onebyte.

This adds support for line terminators. To support the two-byte line
terminators as well, this adds entries for their low bytes to the table, so we
can often take a faster path in that case too.

Change-Id: Ibd08d540e0e13047d6c1f675c187f14fda4336c5
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2445471
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70286}
parent 896627db
......@@ -74,64 +74,102 @@ inline constexpr bool IsRegExpWord(uc32 c) {
}
// Constexpr cache table for character flags.
// Every enumerator occupies its own bit, so the flags that apply to one
// character can be OR-ed together into a single uint8_t table entry.
enum AsciiCharFlags {
enum OneByteCharFlags {
// Flag tested by IsIdentifierStart().
kIsIdentifierStart = 1 << 0,
// Flag tested by IsIdentifierPart().
kIsIdentifierPart = 1 << 1,
// Flag tested by IsWhiteSpace().
kIsWhiteSpace = 1 << 2,
// Flag tested by IsWhiteSpaceOrLineTerminator().
kIsWhiteSpaceOrLineTerminator = 1 << 3
kIsWhiteSpaceOrLineTerminator = 1 << 3,
// Set for '\r' and '\n', and also for the low bytes of 0x2028 and 0x2029.
// It is only a hint: the full code point still has to be checked.
kMaybeLineEnd = 1 << 4
};
// Computes the flag byte stored in the ASCII flag table for |c|.
//  - ASCII identifier characters and '\\' are identifier parts; all of those
//    except decimal digits are identifier starts as well.
//  - Space, tab, vertical tab and form feed are white space.
//  - '\r' and '\n' only count for the "white space or line terminator" flag.
constexpr uint8_t BuildAsciiCharFlags(uc32 c) {
  uint8_t flags = 0;
  if (IsAsciiIdentifier(c) || c == '\\') {
    flags |= kIsIdentifierPart;
    if (!IsDecimalDigit(c)) flags |= kIsIdentifierStart;
  }
  switch (c) {
    case ' ':
    case '\t':
    case '\v':
    case '\f':
      flags |= kIsWhiteSpace | kIsWhiteSpaceOrLineTerminator;
      break;
    case '\r':
    case '\n':
      flags |= kIsWhiteSpaceOrLineTerminator;
      break;
    default:
      break;
  }
  return flags;
}
const constexpr uint8_t kAsciiCharFlags[128] = {
#define BUILD_CHAR_FLAGS(N) BuildAsciiCharFlags(N),
// One-byte (0x00..0xFF) subset of the Unicode ID_Start property, extended with
// '$' and '_'. Reference data:
// http://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
constexpr bool IsOneByteIDStart(uc32 c) {
  // '$' and '_'.
  if (c == 0x0024 || c == 0x005F) return true;
  // 'A'..'Z' and 'a'..'z'.
  if (c >= 0x0041 && c <= 0x005A) return true;
  if (c >= 0x0061 && c <= 0x007A) return true;
  // Feminine ordinal indicator, micro sign, masculine ordinal indicator.
  if (c == 0x00AA || c == 0x00B5 || c == 0x00BA) return true;
  // 0xC0..0xFF, except the multiplication sign (0xD7) and the division sign
  // (0xF7).
  return c >= 0x00C0 && c <= 0x00FF && c != 0x00D7 && c != 0x00F7;
}
// One-byte (0x00..0xFF) subset of the Unicode ID_Continue property, extended
// with '$' and '_'. Reference data:
// http://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
constexpr bool IsOneByteIDContinue(uc32 c) {
  // '$' and '_'.
  if (c == 0x0024 || c == 0x005F) return true;
  // '0'..'9'.
  if (c >= 0x0030 && c <= 0x0039) return true;
  // 'A'..'Z' and 'a'..'z'.
  if (c >= 0x0041 && c <= 0x005A) return true;
  if (c >= 0x0061 && c <= 0x007A) return true;
  // Feminine ordinal indicator, micro sign, middle dot, masculine ordinal
  // indicator.
  if (c == 0x00AA || c == 0x00B5 || c == 0x00B7 || c == 0x00BA) return true;
  // 0xC0..0xFF, except the multiplication sign (0xD7) and the division sign
  // (0xF7).
  return c >= 0x00C0 && c <= 0x00FF && c != 0x00D7 && c != 0x00F7;
}
// True for the one-byte characters flagged as white space: tab, vertical tab,
// form feed, space and 0xA0. '\r' and '\n' are deliberately not included here.
constexpr bool IsOneByteWhitespace(uc32 c) {
  switch (c) {
    case '\t':
    case '\v':
    case '\f':
    case ' ':
    case 0x00A0:
      return true;
    default:
      return false;
  }
}
// Computes the flag byte stored in the one-byte flag table for |c|.
// '\\' is flagged as both identifier start and identifier part in addition to
// whatever IsOneByteIDStart / IsOneByteIDContinue accept.
constexpr uint8_t BuildOneByteCharFlags(uc32 c) {
  const bool is_backslash = c == '\\';
  const bool is_cr_or_lf = c == '\r' || c == '\n';
  // 0x2028 and 0x2029 do not fit into one byte. Their low bytes are marked so
  // that a lookup by low byte can flag them as line-end candidates.
  const bool is_separator_low_byte =
      c == (0x2028 & 0xFF) || c == (0x2029 & 0xFF);

  const int identifier_bits =
      ((IsOneByteIDStart(c) || is_backslash) ? kIsIdentifierStart : 0) |
      ((IsOneByteIDContinue(c) || is_backslash) ? kIsIdentifierPart : 0);
  const int whitespace_bits =
      IsOneByteWhitespace(c) ? (kIsWhiteSpace | kIsWhiteSpaceOrLineTerminator)
                             : 0;
  const int line_end_bits =
      (is_cr_or_lf ? (kIsWhiteSpaceOrLineTerminator | kMaybeLineEnd) : 0) |
      (is_separator_low_byte ? kMaybeLineEnd : 0);

  return static_cast<uint8_t>(identifier_bits | whitespace_bits |
                              line_end_bits);
}
const constexpr uint8_t kOneByteCharFlags[256] = {
#define BUILD_CHAR_FLAGS(N) BuildOneByteCharFlags(N),
INT_0_TO_127_LIST(BUILD_CHAR_FLAGS)
#undef BUILD_CHAR_FLAGS
#define BUILD_CHAR_FLAGS(N) BuildOneByteCharFlags(N + 128),
INT_0_TO_127_LIST(BUILD_CHAR_FLAGS)
#undef BUILD_CHAR_FLAGS
};
// Reports the kIsIdentifierStart flag for |c|. Code points inside the flag
// table's range are answered by a table lookup; everything else is delegated
// to IsIdentifierStartSlow().
bool IsIdentifierStart(uc32 c) {
// Range guard: code points outside the table go to the slow predicate.
if (!base::IsInRange(c, 0, 127)) return IsIdentifierStartSlow(c);
if (!base::IsInRange(c, 0, 255)) return IsIdentifierStartSlow(c);
// Consistency check: the table entry must agree with the slow predicate.
DCHECK_EQ(IsIdentifierStartSlow(c),
static_cast<bool>(kAsciiCharFlags[c] & kIsIdentifierStart));
return kAsciiCharFlags[c] & kIsIdentifierStart;
static_cast<bool>(kOneByteCharFlags[c] & kIsIdentifierStart));
return kOneByteCharFlags[c] & kIsIdentifierStart;
}
// Reports the kIsIdentifierPart flag for |c|. Code points inside the flag
// table's range are answered by a table lookup; everything else is delegated
// to IsIdentifierPartSlow().
bool IsIdentifierPart(uc32 c) {
// Range guard: code points outside the table go to the slow predicate.
if (!base::IsInRange(c, 0, 127)) return IsIdentifierPartSlow(c);
if (!base::IsInRange(c, 0, 255)) return IsIdentifierPartSlow(c);
// Consistency check: the table entry must agree with the slow predicate.
DCHECK_EQ(IsIdentifierPartSlow(c),
static_cast<bool>(kAsciiCharFlags[c] & kIsIdentifierPart));
return kAsciiCharFlags[c] & kIsIdentifierPart;
static_cast<bool>(kOneByteCharFlags[c] & kIsIdentifierPart));
return kOneByteCharFlags[c] & kIsIdentifierPart;
}
// Reports the kIsWhiteSpace flag for |c|. Code points inside the flag table's
// range are answered by a table lookup; everything else is delegated to
// IsWhiteSpaceSlow().
bool IsWhiteSpace(uc32 c) {
// Range guard: code points outside the table go to the slow predicate.
if (!base::IsInRange(c, 0, 127)) return IsWhiteSpaceSlow(c);
if (!base::IsInRange(c, 0, 255)) return IsWhiteSpaceSlow(c);
// Consistency check: the table entry must agree with the slow predicate.
DCHECK_EQ(IsWhiteSpaceSlow(c),
static_cast<bool>(kAsciiCharFlags[c] & kIsWhiteSpace));
return kAsciiCharFlags[c] & kIsWhiteSpace;
static_cast<bool>(kOneByteCharFlags[c] & kIsWhiteSpace));
return kOneByteCharFlags[c] & kIsWhiteSpace;
}
// Reports the kIsWhiteSpaceOrLineTerminator flag for |c|. Code points inside
// the flag table's range are answered by a table lookup; everything else is
// delegated to IsWhiteSpaceOrLineTerminatorSlow().
bool IsWhiteSpaceOrLineTerminator(uc32 c) {
// Range guard: code points outside the table go to the slow predicate.
if (!base::IsInRange(c, 0, 127)) return IsWhiteSpaceOrLineTerminatorSlow(c);
if (!base::IsInRange(c, 0, 255)) return IsWhiteSpaceOrLineTerminatorSlow(c);
// Consistency check: the table entry must agree with the slow predicate.
DCHECK_EQ(
IsWhiteSpaceOrLineTerminatorSlow(c),
static_cast<bool>(kAsciiCharFlags[c] & kIsWhiteSpaceOrLineTerminator));
return kAsciiCharFlags[c] & kIsWhiteSpaceOrLineTerminator;
static_cast<bool>(kOneByteCharFlags[c] & kIsWhiteSpaceOrLineTerminator));
return kOneByteCharFlags[c] & kIsWhiteSpaceOrLineTerminator;
}
// Reports whether |c| ends a line, given the character |next| that follows it.
// A '\r' directly followed by '\n' yields false, so a CR LF pair is reported
// only once (at the LF).
bool IsLineTerminatorSequence(uc32 c, uc32 next) {
if (!unibrow::IsLineTerminator(c)) return false;
if (c == 0x000d && next == 0x000a) return false; // CR with following LF.
return true;
// The table is indexed with the low byte of |c| only, so a set kMaybeLineEnd
// bit merely marks |c| as a candidate; the checks below confirm it against the
// full code point.
if (kOneByteCharFlags[static_cast<uint8_t>(c)] & kMaybeLineEnd) {
if (c == '\n') return true;
// CR ends a line unless it is the first half of CR LF.
if (c == '\r') return next != '\n';
// Of the remaining candidates, only 0x2028 and 0x2029 are accepted.
return base::IsInRange(static_cast<unsigned int>(c), 0x2028u, 0x2029u);
}
return false;
}
} // namespace internal
} // namespace v8
#endif // V8_STRINGS_CHAR_PREDICATES_INL_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment