Commit 2e17aaca authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Fix CharacterRange limits again again again

When emitting code, character ranges must only specify ranges which
the actual subject string (one- or two-byte) may contain.

This was not always the case, specifically for ranges with
`from <= kMaxUint8` and `to > kMaxUint8`.

The reason this is so tricky: 1. not all parts of the pipeline know
whether we are compiling for one- or two-byte subjects; 2. for
case-insensitive regexps, an out-of-bounds CharacterRange may have an
in-bounds case equivalent (e.g. /[Ÿ]/i also matches 'ÿ' == \u{ff}),
which only gets added somewhere in the middle of the pipeline.

Our current solution is to clamp immediately before code emission. We
also keep the existing handling/dchecks of the 0x10ffff marker value
which may occur in the two-byte subject case.

Bug: v8:11069
Change-Id: Ic7b34a13a900ea2aa3df032daac9236bf5682a42
Fixed: chromium:1275096
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3306569
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/main@{#78186}
parent f19ea33e
......@@ -138,6 +138,9 @@ class CharacterRange {
static void Negate(ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst, Zone* zone);
// Remove all ranges outside the one-byte range.
static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
private:
CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
......
......@@ -1420,6 +1420,7 @@ void CharacterSet::Canonicalize() {
CharacterRange::Canonicalize(ranges_);
}
// static
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
if (character_ranges->length() <= 1) return;
// Check whether ranges are already canonical (increasing, non-overlapping,
......@@ -1455,6 +1456,7 @@ void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
DCHECK(CharacterRange::IsCanonical(character_ranges));
}
// static
void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
ZoneList<CharacterRange>* negated_ranges,
Zone* zone) {
......@@ -1478,6 +1480,27 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
}
}
// static
void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
  // The backwards scan below is only valid for a canonical list: because the
  // ranges are sorted and non-overlapping, every range lying entirely outside
  // the one-byte range sits at the tail of the list.
  DCHECK(IsCanonical(ranges));

  static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;

  // Step back over the trailing ranges that begin past the largest one-byte
  // code unit; they contain no one-byte code units at all.
  int kept = ranges->length();
  while (kept > 0 && ranges->at(kept - 1).from() > max_char) kept--;

  // The last surviving range (if any) may still reach beyond the limit, so
  // truncate its upper bound to the largest one-byte code unit.
  if (kept > 0) {
    CharacterRange& last = ranges->at(kept - 1);
    if (last.to_ > max_char) last.to_ = max_char;
  }

  // Shrink the list to the ranges that were kept.
  ranges->Rewind(kept);
}
namespace {
// Scoped object to keep track of how much we unroll quantifier loops in the
......
......@@ -1222,20 +1222,11 @@ void EmitCharClass(RegExpMacroAssembler* macro_assembler,
ZoneList<CharacterRange>* ranges = cc->ranges(zone);
CharacterRange::Canonicalize(ranges);
const base::uc32 max_char = MaxCodeUnit(one_byte);
// Determine the 'interesting' set of ranges; may be a subset of the given
// range set if it contains ranges not representable by the current string
// representation.
int ranges_length = ranges->length();
while (ranges_length > 0) {
CharacterRange& range = ranges->at(ranges_length - 1);
if (range.from() <= max_char) break;
ranges_length--;
}
ranges->Rewind(ranges_length); // Drop all uninteresting ranges.
// Now that all processing (like case-insensitivity) is done, clamp the
// ranges to the set of ranges that may actually occur in the subject string.
if (one_byte) CharacterRange::ClampToOneByte(ranges);
const int ranges_length = ranges->length();
if (ranges_length == 0) {
if (!cc->is_negated()) {
macro_assembler->GoTo(on_failure);
......@@ -1246,6 +1237,7 @@ void EmitCharClass(RegExpMacroAssembler* macro_assembler,
return;
}
const base::uc32 max_char = MaxCodeUnit(one_byte);
if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
if (cc->is_negated()) {
macro_assembler->GoTo(on_failure);
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --no-regexp-tier-up
// Regression test: a case-insensitive unicode character class containing a
// range that starts inside the one-byte range (\u{d3}) and ends far outside
// it (the surrogate pair \ud809\udccc). Only the absence of a crash matters;
// the match result is discarded.
/[\u{0}zPudf\u{d3}-\ud809\udccc]/iu.exec(""); // Don't crash :)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment