Commit e80ca24c authored by Iain Ireland's avatar Iain Ireland Committed by Commit Bot

[regexp] Rewrite error handling

This patch modifies irregexp's error handling. Instead of representing
errors as C strings, they are represented as an enumeration value
(RegExpError), and only converted to strings when throwing the error
object in regexp.cc. This makes it significantly easier to integrate
into SpiderMonkey. A few notes:

1. Depending on whether the stack overflows during parsing or
   analysis, the stack overflow message can vary ("Stack overflow" or
   "Maximum call stack size exceeded"). I kept that behaviour in this
   patch, under the assumption that stack overflow messages are
   (sadly) the sorts of things that real world code ends up depending
   on.

2. Depending on the point in code where the error was identified,
   invalid unicode escapes could be reported as "Invalid Unicode
   escape", "Invalid unicode escape", or "Invalid Unicode escape
   sequence". I fervently hope that nobody depends on the specific
   wording of a syntax error, so I standardized on the first one. (It
   was both the most common, and the most consistent with other
   "Invalid X escape" messages.)

3. In addition to changing the representation, this patch also adds an
   error_pos field to RegExpParser and RegExpCompileData, which stores
   the position at which an error occurred. This is used by
   SpiderMonkey to provide more helpful messages about where a syntax
   error occurred in large regular expressions.

4. This model is closer to V8's existing MessageTemplate
   infrastructure. I considered trying to integrate it more closely
   with MessageTemplate, but since one of our stated goals for this
   project was to make it easier to use irregexp outside of V8, I
   decided to hold off.

R=jgruber@chromium.org

Bug: v8:10303
Change-Id: I62605fd2def2fc539f38a7e0eefa04d36e14bbde
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2091863
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: 's avatarJakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66784}
parent fabea6af
......@@ -2833,6 +2833,8 @@ v8_source_set("v8_base_without_compiler") {
"src/regexp/regexp-compiler.h",
"src/regexp/regexp-dotprinter.cc",
"src/regexp/regexp-dotprinter.h",
"src/regexp/regexp-error.cc",
"src/regexp/regexp-error.h",
"src/regexp/regexp-interpreter.cc",
"src/regexp/regexp-interpreter.h",
"src/regexp/regexp-macro-assembler-arch.h",
......
......@@ -3594,7 +3594,9 @@ template <typename... Propagators>
class Analysis : public NodeVisitor {
public:
Analysis(Isolate* isolate, bool is_one_byte)
: isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
: isolate_(isolate),
is_one_byte_(is_one_byte),
error_(RegExpError::kNone) {}
void EnsureAnalyzed(RegExpNode* that) {
StackLimitCheck check(isolate());
......@@ -3602,7 +3604,7 @@ class Analysis : public NodeVisitor {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Analysis: Aborting on stack overflow");
}
fail("Stack overflow");
fail(RegExpError::kAnalysisStackOverflow);
return;
}
if (that->info()->been_analyzed || that->info()->being_analyzed) return;
......@@ -3612,12 +3614,12 @@ class Analysis : public NodeVisitor {
that->info()->been_analyzed = true;
}
bool has_failed() { return error_message_ != nullptr; }
const char* error_message() {
DCHECK(error_message_ != nullptr);
return error_message_;
bool has_failed() { return error_ != RegExpError::kNone; }
RegExpError error() {
DCHECK(error_ != RegExpError::kNone);
return error_;
}
void fail(const char* error_message) { error_message_ = error_message; }
void fail(RegExpError error) { error_ = error; }
Isolate* isolate() const { return isolate_; }
......@@ -3702,19 +3704,19 @@ class Analysis : public NodeVisitor {
private:
Isolate* isolate_;
bool is_one_byte_;
const char* error_message_;
RegExpError error_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
};
const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
RegExpNode* node) {
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
is_one_byte);
DCHECK_EQ(node->info()->been_analyzed, false);
analysis.EnsureAnalyzed(node);
DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr);
return analysis.has_failed() ? analysis.error_message() : nullptr;
DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
}
void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
......
......@@ -423,10 +423,7 @@ struct PreloadState {
// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
//
// This method returns nullptr on success or a null-terminated failure message
// on failure.
const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
class FrequencyCollator {
public:
......@@ -503,18 +500,17 @@ class RegExpCompiler {
}
struct CompilationResult final {
explicit CompilationResult(const char* error_message)
: error_message(error_message) {}
explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResult(Object code, int registers)
: code(code), num_registers(registers) {}
static CompilationResult RegExpTooBig() {
return CompilationResult("RegExp too big");
return CompilationResult(RegExpError::kTooLarge);
}
bool Succeeded() const { return error_message == nullptr; }
bool Succeeded() const { return error == RegExpError::kNone; }
const char* const error_message = nullptr;
const RegExpError error = RegExpError::kNone;
Object code;
int num_registers = 0;
};
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/regexp-error.h"
namespace v8 {
namespace internal {
const char* kRegExpErrorStrings[] = {
#define TEMPLATE(NAME, STRING) STRING,
REGEXP_ERROR_MESSAGES(TEMPLATE)
#undef TEMPLATE
};
const char* RegExpErrorString(RegExpError error) {
DCHECK_LT(error, RegExpError::NumErrors);
return kRegExpErrorStrings[static_cast<int>(error)];
}
} // namespace internal
} // namespace v8
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_ERROR_H_
#define V8_REGEXP_REGEXP_ERROR_H_
#include "src/base/logging.h"
#include "src/base/macros.h"
namespace v8 {
namespace internal {
#define REGEXP_ERROR_MESSAGES(T) \
T(None, "") \
T(StackOverflow, "Maximum call stack size exceeded") \
T(AnalysisStackOverflow, "Stack overflow") \
T(TooLarge, "Regular expression too large") \
T(UnterminatedGroup, "Unterminated group") \
T(UnmatchedParen, "Unmatched ')'") \
T(EscapeAtEndOfPattern, "\\ at end of pattern") \
T(InvalidPropertyName, "Invalid property name") \
T(InvalidEscape, "Invalid escape") \
T(InvalidDecimalEscape, "Invalid decimal escape") \
T(InvalidUnicodeEscape, "Invalid Unicode escape") \
T(NothingToRepeat, "Nothing to repeat") \
T(LoneQuantifierBrackets, "Lone quantifier brackets") \
T(RangeOutOfOrder, "Numbers out of order in {} quantifier") \
T(IncompleteQuantifier, "Incomplete quantifier") \
T(InvalidQuantifier, "Invalid quantifier") \
T(InvalidGroup, "Invalid group") \
T(MultipleFlagDashes, "Multiple dashes in flag group") \
T(RepeatedFlag, "Repeated flag in flag group") \
T(InvalidFlagGroup, "Invalid flag group") \
T(TooManyCaptures, "Too many captures") \
T(InvalidCaptureGroupName, "Invalid capture group name") \
T(DuplicateCaptureGroupName, "Duplicate capture group name") \
T(InvalidNamedReference, "Invalid named reference") \
T(InvalidNamedCaptureReference, "Invalid named capture referenced") \
T(InvalidClassEscape, "Invalid class escape") \
T(InvalidClassPropertyName, "Invalid property name in character class") \
T(InvalidCharacterClass, "Invalid character class") \
T(UnterminatedCharacterClass, "Unterminated character class") \
T(OutOfOrderCharacterClass, "Range out of order in character class")
enum class RegExpError : uint32_t {
#define TEMPLATE(NAME, STRING) k##NAME,
REGEXP_ERROR_MESSAGES(TEMPLATE)
#undef TEMPLATE
NumErrors
};
V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_ERROR_H_
This diff is collapsed.
......@@ -8,6 +8,7 @@
#include "src/objects/js-regexp.h"
#include "src/objects/objects.h"
#include "src/regexp/regexp-ast.h"
#include "src/regexp/regexp-error.h"
#include "src/zone/zone.h"
namespace v8 {
......@@ -153,8 +154,8 @@ class RegExpBuilder : public ZoneObject {
class V8_EXPORT_PRIVATE RegExpParser {
public:
RegExpParser(FlatStringReader* in, Handle<String>* error,
JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
......@@ -202,7 +203,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
char ParseClassEscape();
RegExpTree* ReportError(Vector<const char> message);
RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void Reset(int pos);
......@@ -335,7 +336,8 @@ class V8_EXPORT_PRIVATE RegExpParser {
Isolate* isolate_;
Zone* zone_;
Handle<String>* error_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
......
......@@ -92,9 +92,15 @@ class RegExpImpl final : public AllStatic {
};
V8_WARN_UNUSED_RESULT
static inline MaybeHandle<Object> ThrowRegExpException(
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
Handle<String> error_text) {
static inline MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> pattern,
RegExpError error) {
Vector<const char> error_data = CStrVector(RegExpErrorString(error));
Handle<String> error_text =
isolate->factory()
->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
.ToHandleChecked();
THROW_NEW_ERROR(
isolate,
NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
......@@ -102,7 +108,7 @@ static inline MaybeHandle<Object> ThrowRegExpException(
}
inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> error_text) {
RegExpError error_text) {
USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
error_text));
}
......@@ -408,7 +414,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
is_one_byte, re->BacktrackLimit());
if (!compilation_succeeded) {
DCHECK(!compile_data.error.is_null());
DCHECK(compile_data.error != RegExpError::kNone);
ThrowRegExpException(isolate, re, compile_data.error);
return false;
}
......@@ -741,8 +747,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
Handle<String> sample_subject, bool is_one_byte,
uint32_t backtrack_limit) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
data->error =
isolate->factory()->NewStringFromAsciiChecked("RegExp too big");
data->error = RegExpError::kTooLarge;
return false;
}
......@@ -810,8 +815,8 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
if (const char* error_message = AnalyzeRegExp(isolate, is_one_byte, node)) {
data->error = isolate->factory()->NewStringFromAsciiChecked(error_message);
data->error = AnalyzeRegExp(isolate, is_one_byte, node);
if (data->error != RegExpError::kNone) {
return false;
}
......@@ -913,13 +918,12 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
}
}
if (result.error_message != nullptr) {
if (result.error != RegExpError::kNone) {
if (FLAG_correctness_fuzzer_suppressions &&
strncmp(result.error_message, "Stack overflow", 15) == 0) {
result.error == RegExpError::kStackOverflow) {
FATAL("Aborting on stack overflow");
}
data->error =
isolate->factory()->NewStringFromAsciiChecked(result.error_message);
data->error = result.error;
}
data->code = result.code;
......
......@@ -6,6 +6,7 @@
#define V8_REGEXP_REGEXP_H_
#include "src/objects/js-regexp.h"
#include "src/regexp/regexp-error.h"
namespace v8 {
namespace internal {
......@@ -42,7 +43,11 @@ struct RegExpCompileData {
// The error message. Only used if an error occurred during parsing or
// compilation.
Handle<String> error;
RegExpError error = RegExpError::kNone;
// The position at which the error was detected. Only used if an
// error occurred.
int error_pos = 0;
// The number of capture groups, without the global capture \0.
int capture_count = 0;
......
......@@ -76,7 +76,7 @@ static void CheckParseEq(const char* input, const char* expected,
CHECK(v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
&reader, flags, &result));
CHECK_NOT_NULL(result.tree);
CHECK(result.error.is_null());
CHECK(result.error == RegExpError::kNone);
std::ostringstream os;
result.tree->Print(os, &zone);
if (strcmp(expected, os.str().c_str()) != 0) {
......@@ -94,7 +94,7 @@ static bool CheckSimple(const char* input) {
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK_NOT_NULL(result.tree);
CHECK(result.error.is_null());
CHECK(result.error == RegExpError::kNone);
return result.simple;
}
......@@ -112,7 +112,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK_NOT_NULL(result.tree);
CHECK(result.error.is_null());
CHECK(result.error == RegExpError::kNone);
int min_match = result.tree->min_match();
int max_match = result.tree->max_match();
MinMaxPair pair = { min_match, max_match };
......@@ -428,9 +428,8 @@ static void ExpectError(const char* input, const char* expected,
CHECK(!v8::internal::RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
&result));
CHECK_NULL(result.tree);
CHECK(!result.error.is_null());
std::unique_ptr<char[]> str = result.error->ToCString(ALLOW_NULLS);
CHECK_EQ(0, strcmp(expected, str.get()));
CHECK(result.error != RegExpError::kNone);
CHECK_EQ(0, strcmp(expected, RegExpErrorString(result.error)));
}
......@@ -468,7 +467,7 @@ TEST(Errors) {
ExpectError("\\k<a", kInvalidCaptureName, true);
const char* kDuplicateCaptureName = "Duplicate capture group name";
ExpectError("(?<a>.)(?<a>.)", kDuplicateCaptureName, true);
const char* kInvalidUnicodeEscape = "Invalid Unicode escape sequence";
const char* kInvalidUnicodeEscape = "Invalid Unicode escape";
ExpectError("(?<\\u{FISK}", kInvalidUnicodeEscape, true);
const char* kInvalidCaptureReferenced = "Invalid named capture referenced";
ExpectError("\\k<a>", kInvalidCaptureReferenced, true);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment