Commit 0a808704 authored by yangguo, committed by Commit bot

[regexp] move regexp parser into own files.

R=rossberg@chromium.org, ulan@chromium.org

Review URL: https://codereview.chromium.org/1565183002

Cr-Commit-Position: refs/heads/master@{#33169}
parent 493aa231
......@@ -1185,6 +1185,8 @@ source_set("v8_base") {
"src/regexp/jsregexp-inl.h",
"src/regexp/jsregexp.cc",
"src/regexp/jsregexp.h",
"src/regexp/regexp-ast.cc",
"src/regexp/regexp-ast.h",
"src/regexp/regexp-macro-assembler-irregexp-inl.h",
"src/regexp/regexp-macro-assembler-irregexp.cc",
"src/regexp/regexp-macro-assembler-irregexp.h",
......@@ -1192,6 +1194,8 @@ source_set("v8_base") {
"src/regexp/regexp-macro-assembler-tracer.h",
"src/regexp/regexp-macro-assembler.cc",
"src/regexp/regexp-macro-assembler.h",
"src/regexp/regexp-parser.cc",
"src/regexp/regexp-parser.h",
"src/regexp/regexp-stack.cc",
"src/regexp/regexp-stack.h",
"src/register-configuration.cc",
......
......@@ -798,336 +798,6 @@ void AstVisitor::VisitExpressions(ZoneList<Expression*>* expressions) {
}
// ----------------------------------------------------------------------------
// Regular expressions
// Defines RegExp<Name>::Accept for every name listed by
// FOR_EACH_REG_EXP_TREE_TYPE. Each definition forwards to the visitor's
// Visit<Name> method and returns whatever that call returns.
#define MAKE_ACCEPT(Name) \
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
return visitor->Visit##Name(this, data); \
}
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
#undef MAKE_ACCEPT
// Base-class defaults: for every listed tree type, RegExpTree::As<Name>()
// returns NULL and RegExpTree::Is<Name>() returns false.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExpTree::As##Name() { \
return NULL; \
} \
bool RegExpTree::Is##Name() { return false; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Per-type definitions: RegExp<Name>::As<Name>() returns this and
// RegExp<Name>::Is<Name>() returns true for the matching type.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExp##Name::As##Name() { \
return this; \
} \
bool RegExp##Name::Is##Name() { return true; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Returns the union of the capture-register intervals of all trees in
// |children|. The accumulation starts from Interval::Empty(), which is also
// the result when the list has no elements.
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
  Interval merged = Interval::Empty();
  int index = 0;
  while (index < children->length()) {
    RegExpTree* child = children->at(index);
    merged = merged.Union(child->CaptureRegisters());
    index++;
  }
  return merged;
}
// Union of the registers used by every node of the sequence.
Interval RegExpAlternative::CaptureRegisters() {
  ZoneList<RegExpTree*>* terms = nodes();
  return ListCaptureRegisters(terms);
}

// Union of the registers used by every alternative.
Interval RegExpDisjunction::CaptureRegisters() {
  ZoneList<RegExpTree*>* branches = alternatives();
  return ListCaptureRegisters(branches);
}

// A lookaround uses exactly the registers of its body.
Interval RegExpLookaround::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}

// A capture uses its own start/end registers plus those of its body.
Interval RegExpCapture::CaptureRegisters() {
  Interval own(StartRegister(index()), EndRegister(index()));
  Interval inner = body()->CaptureRegisters();
  return own.Union(inner);
}

// A quantifier uses exactly the registers of its body.
Interval RegExpQuantifier::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}
// True only for the START_OF_INPUT assertion.
bool RegExpAssertion::IsAnchoredAtStart() {
  const bool at_input_start =
      (RegExpAssertion::START_OF_INPUT == assertion_type());
  return at_input_start;
}

// True only for the END_OF_INPUT assertion.
bool RegExpAssertion::IsAnchoredAtEnd() {
  const bool at_input_end =
      (RegExpAssertion::END_OF_INPUT == assertion_type());
  return at_input_end;
}
bool RegExpAlternative::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtStart()) { return true; }
if (node->max_match() > 0) { return false; }
}
return false;
}
bool RegExpAlternative::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = nodes->length() - 1; i >= 0; i--) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtEnd()) { return true; }
if (node->max_match() > 0) { return false; }
}
return false;
}
bool RegExpDisjunction::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtStart())
return false;
}
return true;
}
bool RegExpDisjunction::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtEnd())
return false;
}
return true;
}
// Only a positive lookahead can be anchored at the start, and only when its
// body is.
bool RegExpLookaround::IsAnchoredAtStart() {
  if (!is_positive()) return false;
  if (type() != LOOKAHEAD) return false;
  return body()->IsAnchoredAtStart();
}

// A capture is anchored at the start exactly when its body is.
bool RegExpCapture::IsAnchoredAtStart() {
  const bool body_anchored = body()->IsAnchoredAtStart();
  return body_anchored;
}

// A capture is anchored at the end exactly when its body is.
bool RegExpCapture::IsAnchoredAtEnd() {
  const bool body_anchored = body()->IsAnchoredAtEnd();
  return body_anchored;
}
// Converts regular expression trees to a simple s-expression representation.
// The representation should differ from the input grammar in as many cases
// as possible: if the input and output formats were alike, an incorrect
// parse could easily print as something that looks correct.
class RegExpUnparser final : public RegExpVisitor {
public:
// Output is written to |os|; |zone| is forwarded to
// RegExpCharacterClass::ranges() when a character class is printed.
RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
// Prints one range as "from", or as "from-to" when it is not a singleton.
void VisitCharacterRange(CharacterRange that);
// Declares one Visit<Name> override for every regexp tree type.
#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
private:
// Destination stream; held by reference.
std::ostream& os_;
Zone* zone_;
};
// Prints a disjunction as "(|" followed by each alternative (each preceded
// by a space) and a closing ")".
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  os_ << "(|";
  ZoneList<RegExpTree*>* branches = that->alternatives();
  for (int b = 0; b < branches->length(); ++b) {
    os_ << " ";
    branches->at(b)->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints a sequence as "(:" followed by each node (each preceded by a space)
// and a closing ")".
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  os_ << "(:";
  ZoneList<RegExpTree*>* terms = that->nodes();
  for (int t = 0; t < terms->length(); ++t) {
    os_ << " ";
    terms->at(t)->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints the lower bound, and "-<upper bound>" unless the range is a
// singleton.
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  os_ << AsUC16(that.from());
  if (that.IsSingleton()) return;
  os_ << "-" << AsUC16(that.to());
}
// Prints a character class as "[r1 r2 ...]", prefixed with "^" when the
// class is negated. Ranges are separated by single spaces.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    os_ << "^";
  }
  os_ << "[";
  int r = 0;
  while (r < that->ranges(zone_)->length()) {
    if (r != 0) os_ << " ";
    VisitCharacterRange(that->ranges(zone_)->at(r));
    r++;
  }
  os_ << "]";
  return NULL;
}
// Prints the "@..." tag of the assertion. Nothing is printed for a type not
// listed below.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  const char* tag = NULL;
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      tag = "@^i";
      break;
    case RegExpAssertion::END_OF_INPUT:
      tag = "@$i";
      break;
    case RegExpAssertion::START_OF_LINE:
      tag = "@^l";
      break;
    case RegExpAssertion::END_OF_LINE:
      tag = "@$l";
      break;
    case RegExpAssertion::BOUNDARY:
      tag = "@b";
      break;
    case RegExpAssertion::NON_BOUNDARY:
      tag = "@B";
      break;
  }
  if (tag != NULL) os_ << tag;
  return NULL;
}
// Prints the atom's code units between single quotes.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  os_ << "'";
  Vector<const uc16> units = that->data();
  int u = 0;
  while (u < units.length()) {
    os_ << AsUC16(units[u]);
    ++u;
  }
  os_ << "'";
  return NULL;
}
// A text node with exactly one element prints as that element alone; any
// other text node prints as "(!" followed by each element and ")".
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  if (that->elements()->length() == 1) {
    that->elements()->at(0).tree()->Accept(this, data);
    return NULL;
  }
  os_ << "(!";
  for (int e = 0; e < that->elements()->length(); ++e) {
    os_ << " ";
    that->elements()->at(e).tree()->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints "(# <min> <max> <mode> <body>)". An unbounded maximum prints as
// "-"; the mode is "g" (greedy), "p" (possessive) or "n" (neither).
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  os_ << "(# " << that->min() << " ";
  if (that->max() != RegExpTree::kInfinity) {
    os_ << that->max() << " ";
  } else {
    os_ << "- ";
  }
  const char* mode;
  if (that->is_greedy()) {
    mode = "g ";
  } else if (that->is_possessive()) {
    mode = "p ";
  } else {
    mode = "n ";
  }
  os_ << mode;
  that->body()->Accept(this, data);
  os_ << ")";
  return NULL;
}
// Prints a capture group as "(^ <body>)".
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  static const char kOpen[] = "(^ ";
  static const char kClose[] = ")";
  os_ << kOpen;
  that->body()->Accept(this, data);
  os_ << kClose;
  return NULL;
}
// Prints "(-> + body)" style output: "->" for lookahead, "<-" otherwise,
// then " + " for a positive or " - " for a negative lookaround.
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  os_ << "(";
  if (that->type() == RegExpLookaround::LOOKAHEAD) {
    os_ << "->";
  } else {
    os_ << "<-";
  }
  if (that->is_positive()) {
    os_ << " + ";
  } else {
    os_ << " - ";
  }
  that->body()->Accept(this, data);
  os_ << ")";
  return NULL;
}
// Prints a back reference as "(<- <index>)".
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
                                         void* data) {
  os_ << "(<- ";
  os_ << that->index();
  os_ << ")";
  return NULL;
}
// The empty expression prints as a single '%' character.
void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
  const char kEmptyMarker = '%';
  os_ << kEmptyMarker;
  return NULL;
}
std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { // NOLINT
RegExpUnparser unparser(os, zone);
Accept(&unparser, NULL);
return os;
}
// Stores the alternatives and derives the match-length bounds: the minimum
// over all alternatives' min_match() and the maximum over their max_match().
// Debug builds check that more than one alternative is supplied.
RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  DCHECK(alternatives->length() > 1);
  RegExpTree* head = alternatives->at(0);
  min_match_ = head->min_match();
  max_match_ = head->max_match();
  int b = 1;
  while (b < alternatives->length()) {
    RegExpTree* branch = alternatives->at(b);
    min_match_ = Min(min_match_, branch->min_match());
    max_match_ = Max(max_match_, branch->max_match());
    ++b;
  }
}
// Adds |increase| to |previous|, clamping the result at
// RegExpTree::kInfinity instead of exceeding it.
static int IncreaseBy(int previous, int increase) {
  const int headroom = RegExpTree::kInfinity - previous;
  if (headroom < increase) return RegExpTree::kInfinity;
  return previous + increase;
}
// Stores the nodes and derives the match-length bounds by summing the
// nodes' min_match()/max_match() values with IncreaseBy. Debug builds check
// that more than one node is supplied.
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    : nodes_(nodes) {
  DCHECK(nodes->length() > 1);
  min_match_ = 0;
  max_match_ = 0;
  int t = 0;
  while (t < nodes->length()) {
    RegExpTree* term = nodes->at(t);
    min_match_ = IncreaseBy(min_match_, term->min_match());
    max_match_ = IncreaseBy(max_match_, term->max_match());
    ++t;
  }
}
CaseClause::CaseClause(Zone* zone, Expression* label,
ZoneList<Statement*>* statements, int pos)
: Expression(zone, pos),
......
This diff is collapsed.
......@@ -32,6 +32,7 @@
#include "src/heap/store-buffer.h"
#include "src/interpreter/interpreter.h"
#include "src/profiler/cpu-profiler.h"
#include "src/regexp/jsregexp.h"
#include "src/runtime-profiler.h"
#include "src/snapshot/natives.h"
#include "src/snapshot/serialize.h"
......
......@@ -42,6 +42,7 @@
#include "src/profiler/cpu-profiler.h"
#include "src/property-descriptor.h"
#include "src/prototype.h"
#include "src/regexp/jsregexp.h"
#include "src/safepoint-table.h"
#include "src/string-builder.h"
#include "src/string-search.h"
......
This diff is collapsed.
......@@ -288,264 +288,6 @@ class ParseData {
DISALLOW_COPY_AND_ASSIGN(ParseData);
};
// ----------------------------------------------------------------------------
// REGEXP PARSING
// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be NULL pointers.
template <typename T, int initial_size>
class BufferedZoneList {
public:
BufferedZoneList() : list_(NULL), last_(NULL) {}
// Adds element at end of list. This element is buffered and can
// be read using last() or removed using RemoveLast until a new Add or until
// RemoveLast or GetList has been called.
void Add(T* value, Zone* zone) {
if (last_ != NULL) {
if (list_ == NULL) {
list_ = new(zone) ZoneList<T*>(initial_size, zone);
}
list_->Add(last_, zone);
}
last_ = value;
}
T* last() {
DCHECK(last_ != NULL);
return last_;
}
T* RemoveLast() {
DCHECK(last_ != NULL);
T* result = last_;
if ((list_ != NULL) && (list_->length() > 0))
last_ = list_->RemoveLast();
else
last_ = NULL;
return result;
}
T* Get(int i) {
DCHECK((0 <= i) && (i < length()));
if (list_ == NULL) {
DCHECK_EQ(0, i);
return last_;
} else {
if (i == list_->length()) {
DCHECK(last_ != NULL);
return last_;
} else {
return list_->at(i);
}
}
}
void Clear() {
list_ = NULL;
last_ = NULL;
}
int length() {
int length = (list_ == NULL) ? 0 : list_->length();
return length + ((last_ == NULL) ? 0 : 1);
}
ZoneList<T*>* GetList(Zone* zone) {
if (list_ == NULL) {
list_ = new(zone) ZoneList<T*>(initial_size, zone);
}
if (last_ != NULL) {
list_->Add(last_, zone);
last_ = NULL;
}
return list_;
}
private:
ZoneList<T*>* list_;
T* last_;
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
// Only the interface is declared here; the member functions are defined
// elsewhere.
class RegExpBuilder: public ZoneObject {
public:
explicit RegExpBuilder(Zone* zone);
void AddCharacter(uc16 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier.
void AddEmpty();
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType type);
// NOTE(review): presumably returns the tree for everything added so far;
// confirm against the definition.
RegExpTree* ToRegExp();
private:
void FlushCharacters();
void FlushText();
void FlushTerms();
Zone* zone() const { return zone_; }
Zone* zone_;
bool pending_empty_;
ZoneList<uc16>* characters_;
// Each buffered list starts its backing ZoneList with a capacity of 2.
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
// In DEBUG builds LAST(x) records x in last_added_; otherwise it expands to
// nothing. Being a preprocessor macro, LAST is not scoped to this class.
#ifdef DEBUG
enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
// Declares the regular expression pattern parser. Only the interface and the
// trivial inline accessors are visible here; the parsing functions are
// defined elsewhere.
class RegExpParser BASE_EMBEDDED {
public:
RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
bool unicode, Isolate* isolate, Zone* zone);
// Static entry point taking the input reader and flags and an out-parameter
// |result|. NOTE(review): the meaning of the bool return is not visible
// here; confirm against the definition.
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
bool multiline, bool unicode,
RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
RegExpTree* ParseCharacterClass();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handled specially.
uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
CharacterRange ParseClassAtom(uc16* char_class);
RegExpTree* ReportError(Vector<const char> message);
void Advance();
void Advance(int dist);
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_started_; }
// One less than next_pos_.
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
static bool IsSyntaxCharacter(uc32 c);
static const int kMaxCaptures = 1 << 16;
// 1 << 21 (0x200000) is larger than the highest Unicode code point
// (0x10FFFF). NOTE(review): presumably used as an end-of-input sentinel;
// confirm at the use sites.
static const uc32 kEndMarker = (1 << 21);
private:
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
// One entry of the parser's state stack; entries are linked through
// previous_state_.
class RegExpParserState : public ZoneObject {
public:
// Allocates a fresh RegExpBuilder in |zone| for the new state.
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index, Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
// True when there is a containing state.
bool IsSubexpression() { return previous_state_ != NULL; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() { return group_type_; }
// Lookahead or Lookbehind.
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index);
private:
// Linked list implementation of stack of states.
RegExpParserState* previous_state_;
// Builder for the stored disjunction.
RegExpBuilder* builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
SubexpressionType group_type_;
// Stored read direction.
RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
uc32 current() { return current_; }
bool has_more() { return has_more_; }
// True while next_pos_ is still below the input length.
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
Isolate* isolate_;
Zone* zone_;
Handle<String>* error_;
ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_;
uc32 current_;
int next_pos_;
int captures_started_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
bool has_more_;
bool multiline_;
bool unicode_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool failed_;
};
// ----------------------------------------------------------------------------
// JAVASCRIPT PARSING
......
......@@ -28,17 +28,6 @@ class ParserRecorder;
class UnicodeCache;
// Maps a hexadecimal digit character to its value (0 .. 15).
// Any character that is not a legal hexadecimal digit yields a value < 0.
inline int HexValue(uc32 c) {
  // Decimal digits: after rebasing on '0' they fall into 0..9. The unsigned
  // comparison rejects negative values in the same test.
  const uc32 digit = c - '0';
  if (static_cast<unsigned>(digit) <= 9) return digit;
  // Letters: setting bit 0x20 folds 0x11..0x16 ('A'..'F') onto 0x31..0x36
  // ('a'..'f'), which is then rebased so that 'a' maps to 0.
  const uc32 letter = (digit | 0x20) - ('a' - '0');
  if (static_cast<unsigned>(letter) <= 5) return letter + 10;
  return -1;
}
// ---------------------------------------------------------------------
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16 bit value representing either a 16 bit code point
......
......@@ -13,12 +13,12 @@
#include "src/isolate-inl.h"
#include "src/messages.h"
#include "src/ostreams.h"
#include "src/parsing/parser.h"
#include "src/regexp/interpreter-irregexp.h"
#include "src/regexp/jsregexp-inl.h"
#include "src/regexp/regexp-macro-assembler.h"
#include "src/regexp/regexp-macro-assembler-irregexp.h"
#include "src/regexp/regexp-macro-assembler-tracer.h"
#include "src/regexp/regexp-parser.h"
#include "src/regexp/regexp-stack.h"
#include "src/runtime/runtime.h"
#include "src/splay-tree-inl.h"
......
......@@ -7,6 +7,7 @@
#include "src/allocation.h"
#include "src/assembler.h"
#include "src/regexp/regexp-ast.h"
namespace v8 {
namespace internal {
......@@ -226,63 +227,6 @@ enum ElementInSetsRelation {
};
// Represents code units in the range from from_ to to_, both ends are
// inclusive.
class CharacterRange {
 public:
  CharacterRange() : from_(0), to_(0) {}
  // For compatibility with the CHECK_OK macro. Both bounds are zeroed so the
  // placeholder object never exposes indeterminate values when it is copied
  // or inspected.
  CharacterRange(void* null) : from_(0), to_(0) {  // NOLINT
    DCHECK_NULL(null);
  }
  CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}

  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
                             Zone* zone);
  static Vector<const int> GetWordBounds();

  // Range containing exactly |value|.
  static inline CharacterRange Singleton(uc16 value) {
    return CharacterRange(value, value);
  }
  // Range [from, to]; debug builds check that from <= to.
  static inline CharacterRange Range(uc16 from, uc16 to) {
    DCHECK(from <= to);
    return CharacterRange(from, to);
  }
  // Range covering every 16-bit code unit, 0 through 0xFFFF.
  static inline CharacterRange Everything() {
    return CharacterRange(0, 0xFFFF);
  }

  // The predicates below only read the bounds, so they are const and can be
  // called on const ranges.
  bool Contains(uc16 i) const { return from_ <= i && i <= to_; }
  uc16 from() const { return from_; }
  void set_from(uc16 value) { from_ = value; }
  uc16 to() const { return to_; }
  void set_to(uc16 value) { to_ = value; }
  bool is_valid() const { return from_ <= to_; }
  // True when the range starts at 0 and reaches at least |max|.
  bool IsEverything(uc16 max) const { return from_ == 0 && to_ >= max; }
  bool IsSingleton() const { return (from_ == to_); }

  void AddCaseEquivalents(Isolate* isolate, Zone* zone,
                          ZoneList<CharacterRange>* ranges, bool is_one_byte);
  static void Split(ZoneList<CharacterRange>* base,
                    Vector<const int> overlay,
                    ZoneList<CharacterRange>** included,
                    ZoneList<CharacterRange>** excluded,
                    Zone* zone);
  // Whether a range list is in canonical form: Ranges ordered by from value,
  // and ranges non-overlapping and non-adjacent.
  static bool IsCanonical(ZoneList<CharacterRange>* ranges);
  // Convert range list to canonical form. The characters covered by the ranges
  // will still be the same, but no character is in more than one range, and
  // adjacent ranges are merged. The resulting list may be shorter than the
  // original, but cannot be longer.
  static void Canonicalize(ZoneList<CharacterRange>* ranges);
  // Negate the contents of a character range in canonical form.
  static void Negate(ZoneList<CharacterRange>* src,
                     ZoneList<CharacterRange>* dst,
                     Zone* zone);
  static const int kStartMarker = (1 << 24);
  static const int kPayloadMask = (1 << 24) - 1;

 private:
  uc16 from_;
  uc16 to_;
};
// A set of unsigned integers that behaves especially well on small
// integers (< 32). May do zone-allocation.
class OutSet: public ZoneObject {
......@@ -380,63 +324,6 @@ class DispatchTable : public ZoneObject {
VISIT(Text)
// X-macro listing the regexp tree type names. VISIT is applied once per
// name, in the order given below.
#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
VISIT(Disjunction) \
VISIT(Alternative) \
VISIT(Assertion) \
VISIT(CharacterClass) \
VISIT(Atom) \
VISIT(Quantifier) \
VISIT(Capture) \
VISIT(Lookaround) \
VISIT(BackReference) \
VISIT(Empty) \
VISIT(Text)
// Forward-declares class RegExp<Name> for every listed tree type.
#define FORWARD_DECLARE(Name) class RegExp##Name;
FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
#undef FORWARD_DECLARE
// A tagged wrapper around a RegExpTree pointer that is either an atom or a
// character class, together with a cp_offset value.
class TextElement final BASE_EMBEDDED {
public:
enum TextType {
ATOM,
CHAR_CLASS
};
// Factory functions; instances cannot be built directly because the
// constructor is private.
static TextElement Atom(RegExpAtom* atom);
static TextElement CharClass(RegExpCharacterClass* char_class);
// cp_offset is -1 until set_cp_offset() is called.
int cp_offset() const { return cp_offset_; }
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
int length() const;
TextType text_type() const { return text_type_; }
RegExpTree* tree() const { return tree_; }
// Returns the tree as an atom; debug builds check that the tag is ATOM.
RegExpAtom* atom() const {
DCHECK(text_type() == ATOM);
return reinterpret_cast<RegExpAtom*>(tree());
}
// Returns the tree as a character class; debug builds check that the tag is
// CHAR_CLASS.
RegExpCharacterClass* char_class() const {
DCHECK(text_type() == CHAR_CLASS);
return reinterpret_cast<RegExpCharacterClass*>(tree());
}
private:
TextElement(TextType text_type, RegExpTree* tree)
: cp_offset_(-1), text_type_(text_type), tree_(tree) {}
int cp_offset_;
TextType text_type_;
RegExpTree* tree_;
};
class Trace;
struct PreloadState;
class GreedyLoopState;
......@@ -688,33 +575,6 @@ class RegExpNode: public ZoneObject {
};
// A simple closed interval [from, to]. An interval whose from value is kNone
// is treated as empty.
class Interval {
 public:
  // Creates the empty interval.
  Interval() : from_(kNone), to_(kNone) {}
  Interval(int from, int to) : from_(from), to_(to) {}

  // Returns the smallest interval covering both this one and |that|. An empty
  // operand contributes nothing, so the other operand is returned unchanged.
  // Const: neither operand is modified.
  Interval Union(Interval that) const {
    if (that.from_ == kNone) return *this;
    if (from_ == kNone) return that;
    return Interval(Min(from_, that.from_), Max(to_, that.to_));
  }
  // True when from <= value <= to.
  bool Contains(int value) const {
    return (from_ <= value) && (value <= to_);
  }
  bool is_empty() const { return from_ == kNone; }
  int from() const { return from_; }
  int to() const { return to_; }
  static Interval Empty() { return Interval(); }
  // Marker stored in from_ (and to_) of the empty interval.
  static const int kNone = -1;

 private:
  int from_;
  int to_;
};
class SeqRegExpNode: public RegExpNode {
public:
explicit SeqRegExpNode(RegExpNode* on_success)
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/ostreams.h"
#include "src/regexp/regexp-ast.h"
namespace v8 {
namespace internal {
// Defines RegExp<Name>::Accept for every name listed by
// FOR_EACH_REG_EXP_TREE_TYPE. Each definition forwards to the visitor's
// Visit<Name> method and returns whatever that call returns.
#define MAKE_ACCEPT(Name) \
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
return visitor->Visit##Name(this, data); \
}
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
#undef MAKE_ACCEPT
// Base-class defaults: for every listed tree type, RegExpTree::As<Name>()
// returns NULL and RegExpTree::Is<Name>() returns false.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExpTree::As##Name() { return NULL; } \
bool RegExpTree::Is##Name() { return false; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Per-type definitions: RegExp<Name>::As<Name>() returns this and
// RegExp<Name>::Is<Name>() returns true for the matching type.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExp##Name::As##Name() { return this; } \
bool RegExp##Name::Is##Name() { return true; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Returns the union of the capture-register intervals of all trees in
// |children|. The accumulation starts from Interval::Empty(), which is also
// the result when the list has no elements.
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
  Interval merged = Interval::Empty();
  int index = 0;
  while (index < children->length()) {
    RegExpTree* child = children->at(index);
    merged = merged.Union(child->CaptureRegisters());
    index++;
  }
  return merged;
}
// Union of the registers used by every node of the sequence.
Interval RegExpAlternative::CaptureRegisters() {
  ZoneList<RegExpTree*>* terms = nodes();
  return ListCaptureRegisters(terms);
}

// Union of the registers used by every alternative.
Interval RegExpDisjunction::CaptureRegisters() {
  ZoneList<RegExpTree*>* branches = alternatives();
  return ListCaptureRegisters(branches);
}

// A lookaround uses exactly the registers of its body.
Interval RegExpLookaround::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}

// A capture uses its own start/end registers plus those of its body.
Interval RegExpCapture::CaptureRegisters() {
  Interval own(StartRegister(index()), EndRegister(index()));
  Interval inner = body()->CaptureRegisters();
  return own.Union(inner);
}

// A quantifier uses exactly the registers of its body.
Interval RegExpQuantifier::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}
// True only for the START_OF_INPUT assertion.
bool RegExpAssertion::IsAnchoredAtStart() {
  const bool at_input_start =
      (RegExpAssertion::START_OF_INPUT == assertion_type());
  return at_input_start;
}

// True only for the END_OF_INPUT assertion.
bool RegExpAssertion::IsAnchoredAtEnd() {
  const bool at_input_end =
      (RegExpAssertion::END_OF_INPUT == assertion_type());
  return at_input_end;
}
bool RegExpAlternative::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtStart()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpAlternative::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = nodes->length() - 1; i >= 0; i--) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtEnd()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpDisjunction::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtStart()) return false;
}
return true;
}
bool RegExpDisjunction::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtEnd()) return false;
}
return true;
}
// Only a positive lookahead can be anchored at the start, and only when its
// body is.
bool RegExpLookaround::IsAnchoredAtStart() {
  if (!is_positive()) return false;
  if (type() != LOOKAHEAD) return false;
  return body()->IsAnchoredAtStart();
}

// A capture is anchored at the start exactly when its body is.
bool RegExpCapture::IsAnchoredAtStart() {
  const bool body_anchored = body()->IsAnchoredAtStart();
  return body_anchored;
}

// A capture is anchored at the end exactly when its body is.
bool RegExpCapture::IsAnchoredAtEnd() {
  const bool body_anchored = body()->IsAnchoredAtEnd();
  return body_anchored;
}
// Converts regular expression trees to a simple s-expression representation.
// The representation should differ from the input grammar in as many cases
// as possible: if the input and output formats were alike, an incorrect
// parse could easily print as something that looks correct.
class RegExpUnparser final : public RegExpVisitor {
public:
// Output is written to |os|; |zone| is forwarded to
// RegExpCharacterClass::ranges() when a character class is printed.
RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
// Prints one range as "from", or as "from-to" when it is not a singleton.
void VisitCharacterRange(CharacterRange that);
// Declares one Visit<Name> override for every regexp tree type.
#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
private:
// Destination stream; held by reference.
std::ostream& os_;
Zone* zone_;
};
// Prints a disjunction as "(|" followed by each alternative (each preceded
// by a space) and a closing ")".
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  os_ << "(|";
  ZoneList<RegExpTree*>* branches = that->alternatives();
  for (int b = 0; b < branches->length(); ++b) {
    os_ << " ";
    branches->at(b)->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints a sequence as "(:" followed by each node (each preceded by a space)
// and a closing ")".
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  os_ << "(:";
  ZoneList<RegExpTree*>* terms = that->nodes();
  for (int t = 0; t < terms->length(); ++t) {
    os_ << " ";
    terms->at(t)->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints the lower bound, and "-<upper bound>" unless the range is a
// singleton.
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  os_ << AsUC16(that.from());
  if (that.IsSingleton()) return;
  os_ << "-" << AsUC16(that.to());
}
// Prints a character class as "[r1 r2 ...]", prefixed with "^" when the
// class is negated. Ranges are separated by single spaces.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    os_ << "^";
  }
  os_ << "[";
  int r = 0;
  while (r < that->ranges(zone_)->length()) {
    if (r != 0) os_ << " ";
    VisitCharacterRange(that->ranges(zone_)->at(r));
    r++;
  }
  os_ << "]";
  return NULL;
}
// Prints the "@..." tag of the assertion. Nothing is printed for a type not
// listed below.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  const char* tag = NULL;
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      tag = "@^i";
      break;
    case RegExpAssertion::END_OF_INPUT:
      tag = "@$i";
      break;
    case RegExpAssertion::START_OF_LINE:
      tag = "@^l";
      break;
    case RegExpAssertion::END_OF_LINE:
      tag = "@$l";
      break;
    case RegExpAssertion::BOUNDARY:
      tag = "@b";
      break;
    case RegExpAssertion::NON_BOUNDARY:
      tag = "@B";
      break;
  }
  if (tag != NULL) os_ << tag;
  return NULL;
}
// Prints the atom's code units between single quotes.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  os_ << "'";
  Vector<const uc16> units = that->data();
  int u = 0;
  while (u < units.length()) {
    os_ << AsUC16(units[u]);
    ++u;
  }
  os_ << "'";
  return NULL;
}
// A text node with exactly one element prints as that element alone; any
// other text node prints as "(!" followed by each element and ")".
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  if (that->elements()->length() == 1) {
    that->elements()->at(0).tree()->Accept(this, data);
    return NULL;
  }
  os_ << "(!";
  for (int e = 0; e < that->elements()->length(); ++e) {
    os_ << " ";
    that->elements()->at(e).tree()->Accept(this, data);
  }
  os_ << ")";
  return NULL;
}
// Prints "(# <min> <max> <mode> <body>)". An unbounded maximum prints as
// "-"; the mode is "g" (greedy), "p" (possessive) or "n" (neither).
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  os_ << "(# " << that->min() << " ";
  if (that->max() != RegExpTree::kInfinity) {
    os_ << that->max() << " ";
  } else {
    os_ << "- ";
  }
  const char* mode;
  if (that->is_greedy()) {
    mode = "g ";
  } else if (that->is_possessive()) {
    mode = "p ";
  } else {
    mode = "n ";
  }
  os_ << mode;
  that->body()->Accept(this, data);
  os_ << ")";
  return NULL;
}
// Prints a capture group as "(^ <body>)".
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  static const char kOpen[] = "(^ ";
  static const char kClose[] = ")";
  os_ << kOpen;
  that->body()->Accept(this, data);
  os_ << kClose;
  return NULL;
}
// Prints "(-> + body)" style output: "->" for lookahead, "<-" otherwise,
// then " + " for a positive or " - " for a negative lookaround.
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  os_ << "(";
  if (that->type() == RegExpLookaround::LOOKAHEAD) {
    os_ << "->";
  } else {
    os_ << "<-";
  }
  if (that->is_positive()) {
    os_ << " + ";
  } else {
    os_ << " - ";
  }
  that->body()->Accept(this, data);
  os_ << ")";
  return NULL;
}
// Prints a back reference as "(<- <index>)".
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
                                         void* data) {
  os_ << "(<- ";
  os_ << that->index();
  os_ << ")";
  return NULL;
}
// The empty expression prints as a single '%' character.
void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
  const char kEmptyMarker = '%';
  os_ << kEmptyMarker;
  return NULL;
}
std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { // NOLINT
RegExpUnparser unparser(os, zone);
Accept(&unparser, NULL);
return os;
}
// Stores the alternatives and derives the match-length bounds: the minimum
// over all alternatives' min_match() and the maximum over their max_match().
// Debug builds check that more than one alternative is supplied.
RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  DCHECK(alternatives->length() > 1);
  RegExpTree* head = alternatives->at(0);
  min_match_ = head->min_match();
  max_match_ = head->max_match();
  int b = 1;
  while (b < alternatives->length()) {
    RegExpTree* branch = alternatives->at(b);
    min_match_ = Min(min_match_, branch->min_match());
    max_match_ = Max(max_match_, branch->max_match());
    ++b;
  }
}
// Adds |increase| to |previous|, clamping the result at
// RegExpTree::kInfinity instead of exceeding it.
static int IncreaseBy(int previous, int increase) {
  const int headroom = RegExpTree::kInfinity - previous;
  if (headroom < increase) return RegExpTree::kInfinity;
  return previous + increase;
}
// Stores the nodes and derives the match-length bounds by summing the
// nodes' min_match()/max_match() values with IncreaseBy. Debug builds check
// that more than one node is supplied.
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    : nodes_(nodes) {
  DCHECK(nodes->length() > 1);
  min_match_ = 0;
  max_match_ = 0;
  int t = 0;
  while (t < nodes->length()) {
    RegExpTree* term = nodes->at(t);
    min_match_ = IncreaseBy(min_match_, term->min_match());
    max_match_ = IncreaseBy(max_match_, term->max_match());
    ++t;
  }
}
} // namespace internal
} // namespace v8
This diff is collapsed.
......@@ -5,7 +5,8 @@
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
#include "src/ast/ast.h"
#include "src/assembler.h"
#include "src/regexp/regexp-ast.h"
namespace v8 {
namespace internal {
......
This diff is collapsed.
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_
#include "src/objects.h"
#include "src/regexp/regexp-ast.h"
#include "src/zone.h"
namespace v8 {
namespace internal {
struct RegExpCompileData;
// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be NULL pointers.
template <typename T, int initial_size>
class BufferedZoneList {
public:
BufferedZoneList() : list_(NULL), last_(NULL) {}
// Adds element at end of list. This element is buffered and can
// be read using last() or removed using RemoveLast until a new Add or until
// RemoveLast or GetList has been called.
void Add(T* value, Zone* zone) {
if (last_ != NULL) {
if (list_ == NULL) {
list_ = new (zone) ZoneList<T*>(initial_size, zone);
}
list_->Add(last_, zone);
}
last_ = value;
}
T* last() {
DCHECK(last_ != NULL);
return last_;
}
T* RemoveLast() {
DCHECK(last_ != NULL);
T* result = last_;
if ((list_ != NULL) && (list_->length() > 0))
last_ = list_->RemoveLast();
else
last_ = NULL;
return result;
}
T* Get(int i) {
DCHECK((0 <= i) && (i < length()));
if (list_ == NULL) {
DCHECK_EQ(0, i);
return last_;
} else {
if (i == list_->length()) {
DCHECK(last_ != NULL);
return last_;
} else {
return list_->at(i);
}
}
}
void Clear() {
list_ = NULL;
last_ = NULL;
}
int length() {
int length = (list_ == NULL) ? 0 : list_->length();
return length + ((last_ == NULL) ? 0 : 1);
}
ZoneList<T*>* GetList(Zone* zone) {
if (list_ == NULL) {
list_ = new (zone) ZoneList<T*>(initial_size, zone);
}
if (last_ != NULL) {
list_->Add(last_, zone);
last_ = NULL;
}
return list_;
}
private:
ZoneList<T*>* list_;
T* last_;
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
// Only the interface is declared here: apart from zone(), every member
// function is defined out of line. Instances derive from ZoneObject and are
// constructed with the Zone they keep in zone_.
class RegExpBuilder : public ZoneObject {
public:
explicit RegExpBuilder(Zone* zone);
// NOTE(review): presumably buffers |character| in characters_ until the next
// flush -- confirm against the out-of-line definition.
void AddCharacter(uc16 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
// Takes the quantifier bounds (|min|, |max|) and its type.
// NOTE(review): the name suggests it applies to the most recently added
// atom -- confirm against the definition.
void AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
// Returns the accumulated state as a single RegExpTree.
RegExpTree* ToRegExp();
private:
// Internal flush helpers; their definitions are not part of this header.
void FlushCharacters();
void FlushText();
void FlushTerms();
Zone* zone() const { return zone_; }
Zone* zone_;
// NOTE(review): presumably set by AddEmpty() so a following quantifier can be
// consumed -- confirm.
bool pending_empty_;
ZoneList<uc16>* characters_;
// Buffered lists of sub-trees; each is created with an initial backing size
// of 2.
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
// Debug-only bookkeeping: in DEBUG builds LAST(x) stores x in last_added_,
// in other builds it expands to nothing. LAST is a preprocessor macro, so it
// stays defined after this class for every file that includes this header
// unless it is #undef'd.
#ifdef DEBUG
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
// Parser for regular expression patterns. It is constructed over a
// FlatStringReader input and its Parse* members return RegExpTree nodes.
// Only the interface and a few trivial accessors are defined here; the parsing
// routines themselves are defined out of line.
class RegExpParser BASE_EMBEDDED {
public:
// |in| is the pattern source, |error| is a handle slot kept in error_.
// NOTE(review): |error| presumably receives the message passed to
// ReportError() -- confirm against the definition.
RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
bool unicode, Isolate* isolate, Zone* zone);
// Static entry point taking the input and flags plus a RegExpCompileData
// out parameter.
// NOTE(review): presumably returns true on success and fills |result| --
// confirm against the definition.
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
bool multiline, bool unicode,
RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
RegExpTree* ParseCharacterClass();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handled specially.
uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
CharacterRange ParseClassAtom(uc16* char_class);
// Error hook: takes the message and returns a RegExpTree* so it can be used
// directly as the result of a Parse* function.
RegExpTree* ReportError(Vector<const char> message);
// Input cursor control: advance by one position, advance by |dist|
// positions, or reposition to |pos|.
void Advance();
void Advance(int dist);
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_started_; }
// Position of the current character: next_pos_ is one past it.
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
static bool IsSyntaxCharacter(uc32 c);
// 1 << 16 == 65536.
static const int kMaxCaptures = 1 << 16;
// 1 << 21 == 0x200000, which lies above the largest Unicode code point
// (0x10FFFF), so it cannot collide with a real character value.
// NOTE(review): presumably what current() yields once the input is
// exhausted -- confirm.
static const uc32 kEndMarker = (1 << 21);
private:
// Kind of (sub)expression a RegExpParserState is parsing.
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
// One entry of the parser's stack of nested (sub)expressions. Each state
// owns a freshly zone-allocated RegExpBuilder and links to the state of the
// enclosing expression through previous_state_.
class RegExpParserState : public ZoneObject {
public:
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index, Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
// True when there is an enclosing state, i.e. previous_state_ is not NULL.
bool IsSubexpression() { return previous_state_ != NULL; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() { return group_type_; }
// Lookahead or Lookbehind.
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index);
private:
// Linked list implementation of stack of states.
RegExpParserState* previous_state_;
// Builder for the stored disjunction.
RegExpBuilder* builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
SubexpressionType group_type_;
// Stored read direction.
RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
uc32 current() { return current_; }
bool has_more() { return has_more_; }
// True while next_pos_ is still inside the input.
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
Isolate* isolate_;
Zone* zone_;
Handle<String>* error_;
ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_;
uc32 current_;
int next_pos_;
int captures_started_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
bool has_more_;
bool multiline_;
bool unicode_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool failed_;
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_PARSER_H_
......@@ -26,6 +26,16 @@ namespace internal {
// ----------------------------------------------------------------------------
// General helper functions
// Converts the hexadecimal digit c to its numeric value in the range 0..15.
// Any character that is not a hexadecimal digit yields a negative result.
inline int HexValue(uc32 c) {
  // Decimal digits: rebasing on '0' maps '0'..'9' onto 0..9.
  const uc32 offset = c - '0';
  if (static_cast<unsigned>(offset) <= 9) return offset;
  // Letters: setting bit 0x20 folds the upper-case offsets 0x11..0x16 onto
  // the lower-case ones 0x31..0x36; rebasing on 'a' then maps them to 0..5.
  const uc32 letter = (offset | 0x20) - ('a' - '0');
  if (static_cast<unsigned>(letter) <= 5) return letter + 10;
  return -1;
}
// Maps a boolean onto an integer: true becomes 1 and false becomes 0.
inline int BoolToInt(bool b) {
  return static_cast<int>(b);
}
......
......@@ -38,6 +38,7 @@
#include "src/heap/memory-reducer.h"
#include "src/ic/ic.h"
#include "src/macro-assembler.h"
#include "src/regexp/jsregexp.h"
#include "src/snapshot/snapshot.h"
#include "test/cctest/cctest.h"
#include "test/cctest/heap/heap-tester.h"
......
......@@ -34,10 +34,10 @@
#include "src/ast/ast.h"
#include "src/char-predicates-inl.h"
#include "src/ostreams.h"
#include "src/parsing/parser.h"
#include "src/regexp/jsregexp.h"
#include "src/regexp/regexp-macro-assembler.h"
#include "src/regexp/regexp-macro-assembler-irregexp.h"
#include "src/regexp/regexp-parser.h"
#include "src/splay-tree-inl.h"
#include "src/string-stream.h"
#ifdef V8_INTERPRETED_REGEXP
......
......@@ -964,6 +964,8 @@
'../../src/regexp/jsregexp-inl.h',
'../../src/regexp/jsregexp.cc',
'../../src/regexp/jsregexp.h',
'../../src/regexp/regexp-ast.cc',
'../../src/regexp/regexp-ast.h',
'../../src/regexp/regexp-macro-assembler-irregexp-inl.h',
'../../src/regexp/regexp-macro-assembler-irregexp.cc',
'../../src/regexp/regexp-macro-assembler-irregexp.h',
......@@ -971,6 +973,8 @@
'../../src/regexp/regexp-macro-assembler-tracer.h',
'../../src/regexp/regexp-macro-assembler.cc',
'../../src/regexp/regexp-macro-assembler.h',
'../../src/regexp/regexp-parser.cc',
'../../src/regexp/regexp-parser.h',
'../../src/regexp/regexp-stack.cc',
'../../src/regexp/regexp-stack.h',
'../../src/register-configuration.cc',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment