- Specialized jscre on the type of the string involved.

- Specialized jscre on the type of the string involved.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@476 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 4f7b6654
......@@ -223,7 +223,6 @@ void Heap::ReportStatisticsAfterGC() {
void Heap::GarbageCollectionPrologue() {
RegExpImpl::NewSpaceCollectionPrologue();
gc_count_++;
#ifdef DEBUG
ASSERT(allocation_allowed_ && gc_state_ == NOT_IN_GC);
......@@ -424,7 +423,6 @@ void Heap::MarkCompact(GCTracer* tracer) {
void Heap::MarkCompactPrologue() {
CompilationCache::MarkCompactPrologue();
RegExpImpl::OldSpaceCollectionPrologue();
Top::MarkCompactPrologue();
ThreadManager::MarkCompactPrologue();
}
......
......@@ -65,27 +65,6 @@ static void JSREFree(void* p) {
}
// Single-entry conversion cache used by CachedStringToTwoByte: the most
// recently converted subject string and the result of converting it.
// Both are raw pointers (not handles) and are reset to NULL by the
// collection prologues in this file.
String* RegExpImpl::last_ascii_string_ = NULL;
String* RegExpImpl::two_byte_cached_string_ = NULL;
// Drops the two-byte conversion cache when the cached subject string is in
// new space. According to the original author, the two-byte string is always
// in old space while the ASCII string may be in either space; a cached
// subject in old space needs no action here.
// NOTE(review): StringToTwoByte returns its (flattened) input unchanged when
// that input is not in ASCII representation, so the cached two-byte string is
// not necessarily freshly TENURED -- confirm the old-space assumption.
void RegExpImpl::NewSpaceCollectionPrologue() {
  if (!Heap::InNewSpace(last_ascii_string_)) return;

  // Invalidate both halves of the cache together.
  two_byte_cached_string_ = NULL;
  last_ascii_string_ = NULL;
}
// Unconditionally clears the two-byte conversion cache: both the cached
// subject string and its converted counterpart are reset to NULL.
void RegExpImpl::OldSpaceCollectionPrologue() {
  last_ascii_string_ = two_byte_cached_string_ = NULL;
}
Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
Handle<String> pattern,
Handle<String> flags,
......@@ -102,47 +81,6 @@ Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
}
// Cached front end for StringToTwoByte. If |subject| is the same object as
// the most recently converted string, the remembered conversion result is
// returned; otherwise the string is converted and the single-entry cache
// (last_ascii_string_ / two_byte_cached_string_) is updated.
Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
  if (*subject != last_ascii_string_) {
    // Cache miss: convert, then remember both the input and the result.
    Handle<String> converted = StringToTwoByte(subject);
    last_ascii_string_ = *subject;
    two_byte_cached_string_ = *converted;
    return converted;
  }

  // Cache hit: the two entries are always set and cleared together.
  ASSERT(two_byte_cached_string_ != NULL);
  return Handle<String>(String::cast(two_byte_cached_string_));
}
// Converts a source string to a 16 bit flat string (or a SlicedString /
// ExternalString holding 16 bit data). The input is flattened first. If the
// flat string is not in ASCII representation it is returned as-is; otherwise
// its characters are copied into a newly allocated TENURED raw two-byte
// string.
Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
  if (!pattern->IsFlat()) FlattenString(pattern);

  // For a cons string, continue with its first component.
  String* raw_flat = *pattern;
  if (pattern->IsConsString()) {
    raw_flat = String::cast(ConsString::cast(*pattern)->first());
  }
  Handle<String> flat(raw_flat);

  ASSERT(!flat->IsConsString());
  ASSERT(flat->IsSeqString() || flat->IsSlicedString() ||
         flat->IsExternalString());

  // Nothing to widen when the string is not ASCII.
  if (!flat->IsAsciiRepresentation()) return flat;

  Handle<String> widened =
      Factory::NewRawTwoByteString(flat->length(), TENURED);

  // The input buffer is a function-local static that is reset on every call.
  static StringInputBuffer convert_to_two_byte_buffer;
  convert_to_two_byte_buffer.Reset(*flat);
  int position = 0;
  while (convert_to_two_byte_buffer.has_more()) {
    widened->Set(position, convert_to_two_byte_buffer.GetNext());
    position++;
  }
  return widened;
}
// File-level predicate object over unibrow::RegExpSpecialChar.
// NOTE(review): presumably used to test whether a character is a regexp
// metacharacter; its uses are not visible in this excerpt -- confirm.
unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char;
......@@ -189,7 +127,14 @@ Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
Handle<String> subject) {
switch (regexp->type_tag()) {
case JSRegExp::JSCRE:
return JsreExecGlobal(regexp, subject);
FlattenString(subject);
if (subject->IsAsciiRepresentation()) {
Vector<const char> contents = subject->ToAsciiVector();
return JsreExecGlobal(regexp, subject, contents);
} else {
Vector<const uc16> contents = subject->ToUC16Vector();
return JsreExecGlobal(regexp, subject, contents);
}
case JSRegExp::ATOM:
return AtomExecGlobal(regexp, subject);
default:
......@@ -268,16 +213,34 @@ Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
if (flags->Get(i) == 'm') multiline_option = JSRegExpMultiline;
}
Handle<String> two_byte_pattern = StringToTwoByte(pattern);
unsigned number_of_captures;
const char* error_message = NULL;
malloc_failure = Failure::Exception();
JscreRegExp* code = jsRegExpCompile(two_byte_pattern->GetTwoByteData(),
pattern->length(), case_option,
multiline_option, &number_of_captures,
&error_message, &JSREMalloc, &JSREFree);
JscreRegExp* code;
FlattenString(pattern);
if (pattern->IsAsciiRepresentation()) {
Vector<const char> contents = pattern->ToAsciiVector();
code = jsRegExpCompile(contents.start(),
contents.length(),
case_option,
multiline_option,
&number_of_captures,
&error_message,
&JSREMalloc,
&JSREFree);
} else {
Vector<const uc16> contents = pattern->ToUC16Vector();
code = jsRegExpCompile(contents.start(),
contents.length(),
case_option,
multiline_option,
&number_of_captures,
&error_message,
&JSREMalloc,
&JSREFree);
}
if (code == NULL && malloc_failure->IsRetryAfterGC()) {
// Performs a GC, then retries.
......@@ -287,10 +250,29 @@ Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
V8::FatalProcessOutOfMemory("RegExpImpl::JsreCompile");
}
malloc_failure = Failure::Exception();
code = jsRegExpCompile(two_byte_pattern->GetTwoByteData(),
pattern->length(), case_option,
multiline_option, &number_of_captures,
&error_message, &JSREMalloc, &JSREFree);
if (pattern->IsAsciiRepresentation()) {
Vector<const char> contents = pattern->ToAsciiVector();
code = jsRegExpCompile(contents.start(),
contents.length(),
case_option,
multiline_option,
&number_of_captures,
&error_message,
&JSREMalloc,
&JSREFree);
} else {
Vector<const uc16> contents = pattern->ToUC16Vector();
code = jsRegExpCompile(contents.start(),
contents.length(),
case_option,
multiline_option,
&number_of_captures,
&error_message,
&JSREMalloc,
&JSREFree);
}
if (code == NULL && malloc_failure->IsRetryAfterGC()) {
// TODO(1181417): Fix this.
V8::FatalProcessOutOfMemory("RegExpImpl::JsreCompile");
......@@ -299,10 +281,8 @@ Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
if (error_message != NULL) {
// Throw an exception.
SmartPointer<char> char_pattern =
two_byte_pattern->ToCString(DISALLOW_NULLS);
Handle<JSArray> array = Factory::NewJSArray(2);
SetElement(array, 0, Factory::NewStringFromUtf8(CStrVector(*char_pattern)));
SetElement(array, 0, pattern);
SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(error_message)));
Handle<Object> regexp_err =
Factory::NewSyntaxError("malformed_regexp", array);
......@@ -325,11 +305,12 @@ Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
}
template <typename T>
Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,
int num_captures,
Handle<String> subject,
int previous_index,
const uc16* two_byte_subject,
Vector<const T> contents,
int* offsets_vector,
int offsets_vector_length) {
int rc;
......@@ -341,12 +322,12 @@ Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,
LOG(RegExpExecEvent(regexp, previous_index, subject));
rc = jsRegExpExecute(js_regexp,
two_byte_subject,
subject->length(),
previous_index,
offsets_vector,
offsets_vector_length);
rc = jsRegExpExecute<T>(js_regexp,
contents.start(),
contents.length(),
previous_index,
offsets_vector,
offsets_vector_length);
}
// The KJS JavaScript engine returns null (ie, a failed match) when
......@@ -428,19 +409,29 @@ Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp,
int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
Handle<String> subject16 = CachedStringToTwoByte(subject);
Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,
previous_index,
subject16->GetTwoByteData(),
offsets.vector(), offsets.length()));
return result;
FlattenString(subject);
if (subject->IsAsciiRepresentation()) {
Vector<const char> contents = subject->ToAsciiVector();
Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,
previous_index,
contents,
offsets.vector(), offsets.length()));
return result;
} else {
Vector<const uc16> contents = subject->ToUC16Vector();
Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,
previous_index,
contents,
offsets.vector(), offsets.length()));
return result;
}
}
template <typename T>
Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,
Handle<String> subject) {
Handle<String> subject,
Vector<const T> contents) {
// Prepare space for the return values.
int num_captures = JsreCapture(regexp);
......@@ -452,17 +443,19 @@ Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,
int i = 0;
Handle<Object> matches;
Handle<String> subject16 = CachedStringToTwoByte(subject);
do {
if (previous_index > subject->length() || previous_index < 0) {
// Per ECMA-262 15.10.6.2, if the previous index is greater than the
// string length, there is no match.
matches = Factory::null_value();
} else {
matches = JsreExecOnce(regexp, num_captures, subject, previous_index,
subject16->GetTwoByteData(),
offsets.vector(), offsets.length());
matches = JsreExecOnce<T>(regexp,
num_captures,
subject,
previous_index,
contents,
offsets.vector(),
offsets.length());
if (matches->IsJSArray()) {
SetElement(result, i, matches);
......
......@@ -79,32 +79,24 @@ class RegExpImpl {
Handle<String> subject,
Handle<Object> index);
template <typename T>
static Handle<Object> JsreExecGlobal(Handle<JSRegExp> regexp,
Handle<String> subject);
static void NewSpaceCollectionPrologue();
static void OldSpaceCollectionPrologue();
Handle<String> subject,
Vector<const T> contents);
private:
// Converts a source string to a 16 bit flat string. The string
// will be either sequential or it will be a SlicedString backed
// by a flat string.
static Handle<String> StringToTwoByte(Handle<String> pattern);
static Handle<String> CachedStringToTwoByte(Handle<String> pattern);
static String* last_ascii_string_;
static String* two_byte_cached_string_;
// Returns the number of captures from the re.
static int JsreCapture(Handle<JSRegExp> re);
static ByteArray* JsreInternal(Handle<JSRegExp> re);
// Call jsRegExpExecute once
template <typename T>
static Handle<Object> JsreExecOnce(Handle<JSRegExp> regexp,
int num_captures,
Handle<String> subject,
int previous_index,
const uc16* utf8_subject,
Vector<const T> contents,
int* ovector,
int ovector_length);
......
......@@ -964,7 +964,21 @@ Object* JSObject::AddFastProperty(String* name,
return AddSlowProperty(name, value, attributes);
}
// Replace a CONSTANT_TRANSITION flag with a transition.
// Do this by removing it, and the standard code for adding a map transition
// will then run.
DescriptorArray* old_descriptors = map()->instance_descriptors();
int old_name_index = old_descriptors->Search(name);
bool constant_transition = false; // Only used in assertions.
if (old_name_index != DescriptorArray::kNotFound && CONSTANT_TRANSITION ==
PropertyDetails(old_descriptors->GetDetails(old_name_index)).type()) {
constant_transition = true;
Object* r = old_descriptors->CopyRemove(name);
if (r->IsFailure()) return r;
old_descriptors = DescriptorArray::cast(r);
old_name_index = DescriptorArray::kNotFound;
}
// Compute the new index for new field.
int index = map()->NextFreePropertyIndex();
......@@ -979,43 +993,64 @@ Object* JSObject::AddFastProperty(String* name,
bool allow_map_transition =
!old_descriptors->Contains(name) &&
(Top::context()->global_context()->object_function()->map() != map());
ASSERT(allow_map_transition || !constant_transition);
ASSERT(index < properties()->length() ||
map()->unused_property_fields() == 0);
// Allocate a new map for the object.
Object* r = map()->Copy();
if (r->IsFailure()) return r;
Map* new_map = Map::cast(r);
if (allow_map_transition) {
// Allocate new instance descriptors for the old map with map transition.
MapTransitionDescriptor d(name, Map::cast(new_map), attributes);
Object* r = old_descriptors->CopyInsert(&d, KEEP_TRANSITIONS);
if (map()->unused_property_fields() > 0) {
ASSERT(index < properties()->length());
// Allocate a new map for the object.
Object* r = map()->Copy();
if (r->IsFailure()) return r;
old_descriptors = DescriptorArray::cast(r);
}
Map* new_map = Map::cast(r);
if (allow_map_transition) {
// Allocate new instance descriptors for the old map with map transition.
MapTransitionDescriptor d(name, Map::cast(new_map), attributes);
Object* r = old_descriptors->CopyInsert(&d, KEEP_TRANSITIONS);
if (r->IsFailure()) return r;
old_descriptors = DescriptorArray::cast(r);
}
// We have now allocated all the necessary objects.
// All the changes can be applied at once, so they are atomic.
map()->set_instance_descriptors(old_descriptors);
new_map->set_instance_descriptors(DescriptorArray::cast(new_descriptors));
new_map->set_unused_property_fields(map()->unused_property_fields() - 1);
set_map(new_map);
properties()->set(index, value);
} else {
ASSERT(map()->unused_property_fields() == 0);
if (map()->unused_property_fields() == 0) {
if (properties()->length() > kMaxFastProperties) {
Object* obj = NormalizeProperties();
if (obj->IsFailure()) return obj;
return AddSlowProperty(name, value, attributes);
}
static const int kExtraFields = 3;
// Make room for the new value
Object* values =
properties()->CopySize(properties()->length() + kFieldsAdded);
properties()->CopySize(properties()->length() + kExtraFields);
if (values->IsFailure()) return values;
FixedArray::cast(values)->set(index, value);
// Allocate a new map for the object.
Object* r = map()->Copy();
if (r->IsFailure()) return r;
Map* new_map = Map::cast(r);
if (allow_map_transition) {
MapTransitionDescriptor d(name, Map::cast(new_map), attributes);
// Allocate new instance descriptors for the old map with map transition.
Object* r = old_descriptors->CopyInsert(&d, KEEP_TRANSITIONS);
if (r->IsFailure()) return r;
old_descriptors = DescriptorArray::cast(r);
}
// We have now allocated all the necessary objects.
// All changes can be done at once, atomically.
map()->set_instance_descriptors(old_descriptors);
new_map->set_instance_descriptors(DescriptorArray::cast(new_descriptors));
new_map->set_unused_property_fields(kExtraFields - 1);
set_map(new_map);
set_properties(FixedArray::cast(values));
new_map->set_unused_property_fields(kFieldsAdded - 1);
} else {
new_map->set_unused_property_fields(map()->unused_property_fields() - 1);
}
// We have now allocated all the necessary objects.
// All the changes can be applied at once, so they are atomic.
map()->set_instance_descriptors(old_descriptors);
new_map->set_instance_descriptors(DescriptorArray::cast(new_descriptors));
set_map(new_map);
properties()->set(index, value);
return value;
}
......@@ -1069,6 +1104,74 @@ Object* JSObject::AddConstantFunctionProperty(String* name,
}
// Replaces the constant-function property |name| on a fast-case object with
// |value|. Returns |value| on success, or the Failure object produced by
// whichever allocation failed. In the field-conversion path all allocations
// are performed before the object is modified.
Object* JSObject::ReplaceConstantFunctionProperty(String* name,
Object* value) {
// There are two situations to handle here:
// 1: Replace a constant function with another function.
// 2: Replace a constant function with an object.
if (value->IsJSFunction()) {
// Case 1: stay a constant function, but on a private copy of the map
// (made with CopyDropTransitions) so the descriptor can be patched in place.
JSFunction* function = JSFunction::cast(value);
Object* new_map = map()->CopyDropTransitions();
if (new_map->IsFailure()) return new_map;
set_map(Map::cast(new_map));
// Replace the function entry
int index = map()->instance_descriptors()->Search(name);
ASSERT(index != DescriptorArray::kNotFound);
map()->instance_descriptors()->ReplaceConstantFunction(index, function);
} else {
// Case 2: turn the descriptor into a field stored in the properties array.
// Allocate new instance descriptors with updated property index.
int index = map()->NextFreePropertyIndex();
Object* new_descriptors =
map()->instance_descriptors()->CopyReplace(name, index, NONE);
if (new_descriptors->IsFailure()) return new_descriptors;
if (map()->unused_property_fields() > 0) {
// There is still a free slot in the existing properties array.
ASSERT(index < properties()->length());
// Allocate a new map for the object.
Object* new_map = map()->Copy();
if (new_map->IsFailure()) return new_map;
Map::cast(new_map)->
set_instance_descriptors(DescriptorArray::cast(new_descriptors));
Map::cast(new_map)->
set_unused_property_fields(map()->unused_property_fields()-1);
set_map(Map::cast(new_map));
properties()->set(index, value);
} else {
ASSERT(map()->unused_property_fields() == 0);
// Beyond this many properties the object is normalized (switched to
// dictionary mode) and the store is redone through SetProperty.
static const int kFastNofProperties = 20;
if (properties()->length() > kFastNofProperties) {
Object* obj = NormalizeProperties();
if (obj->IsFailure()) return obj;
return SetProperty(name, value, NONE);
}
// Grow the properties array by more than one slot at a time.
static const int kExtraFields = 5;
// Make room for more properties.
Object* values =
properties()->CopySize(properties()->length() + kExtraFields);
if (values->IsFailure()) return values;
FixedArray::cast(values)->set(index, value);
// Allocate a new map for the object.
Object* new_map = map()->Copy();
if (new_map->IsFailure()) return new_map;
Map::cast(new_map)->
set_instance_descriptors(DescriptorArray::cast(new_descriptors));
// One of the kExtraFields new slots is consumed by this property.
Map::cast(new_map)->
set_unused_property_fields(kExtraFields - 1);
set_map(Map::cast(new_map));
set_properties(FixedArray::cast(values));
}
}
return value;
}
// Add property in slow mode
Object* JSObject::AddSlowProperty(String* name,
Object* value,
......@@ -1120,103 +1223,6 @@ Object* JSObject::SetPropertyPostInterceptor(String* name,
}
// Replaces (or adds) the dictionary-mode property |name| with |value| as a
// NORMAL property with the given attributes. Returns |value| on success or
// the Failure object if the dictionary could not be updated.
//
// The index stored in the new details is taken from the existing entry,
// except when that entry is a transition, in which case 0 is used.
// NOTE(review): 0 presumably asks the dictionary to assign a fresh
// enumeration index -- confirm against Dictionary::SetOrAddStringEntry.
Object* JSObject::ReplaceSlowProperty(String* name,
                                      Object* value,
                                      PropertyAttributes attributes) {
  Dictionary* dictionary = property_dictionary();
  PropertyDetails old_details =
      dictionary->DetailsAt(dictionary->FindStringEntry(name));
  int new_index = old_details.index();
  if (old_details.IsTransition()) new_index = 0;

  // Use the adjusted index; previously new_index was computed but the
  // details were built from old_details.index(), ignoring the reset above.
  PropertyDetails new_details(attributes, NORMAL, new_index);
  Object* result =
      property_dictionary()->SetOrAddStringEntry(name, value, new_details);
  if (result->IsFailure()) return result;
  // The dictionary may have been reallocated; install the new backing store.
  if (property_dictionary() != result) {
    set_properties(Dictionary::cast(result));
  }
  return value;
}
// Converts the descriptor for |name| into a real field via
// ConvertDescriptorToField and then, on a best-effort basis, records a map
// transition from the object's previous map to its new map. Returns whatever
// ConvertDescriptorToField returned; a failure while adding the transition is
// deliberately not reported.
Object* JSObject::ConvertDescriptorToFieldAndMapTransition(
    String* name,
    Object* new_value,
    PropertyAttributes attributes) {
  Map* previous_map = map();
  Object* outcome = ConvertDescriptorToField(name, new_value, attributes);
  if (outcome->IsFailure()) return outcome;

  // From here on the conversion has succeeded, so never return a failure.
  // The transition is optional: skip it for objects that left fast mode and
  // for the map of "new Object()".
  if (!HasFastProperties() ||
      map() == Top::context()->global_context()->object_function()->map()) {
    return outcome;
  }

  MapTransitionDescriptor transition(name, map(), attributes);
  Object* maybe_descriptors =
      previous_map->instance_descriptors()->
          CopyInsert(&transition, KEEP_TRANSITIONS);
  // On allocation failure simply leave the previous map without a transition.
  if (!maybe_descriptors->IsFailure()) {
    previous_map->set_instance_descriptors(
        DescriptorArray::cast(maybe_descriptors));
  }
  return outcome;
}
// Turns the descriptor for |name| into a field backed by the properties
// array and stores |new_value| in it. If no unused field is left and the
// properties array already exceeds kMaxFastProperties, the object is
// normalized and the property is replaced in the dictionary instead.
// Returns |new_value| on success (or the result of ReplaceSlowProperty on the
// slow path), or a Failure object if an allocation failed; the object is only
// modified after every allocation has succeeded.
Object* JSObject::ConvertDescriptorToField(String* name,
                                           Object* new_value,
                                           PropertyAttributes attributes) {
  if (map()->unused_property_fields() == 0 &&
      properties()->length() > kMaxFastProperties) {
    Object* normalized = NormalizeProperties();
    if (normalized->IsFailure()) return normalized;
    return ReplaceSlowProperty(name, new_value, attributes);
  }

  const int field_index = map()->NextFreePropertyIndex();
  FieldDescriptor field(name, field_index, attributes);

  // Build a descriptor array in which the entry is a FieldDescriptor.
  Object* maybe_descriptors =
      map()->instance_descriptors()->CopyInsert(&field, REMOVE_TRANSITIONS);
  if (maybe_descriptors->IsFailure()) return maybe_descriptors;
  DescriptorArray* field_descriptors =
      DescriptorArray::cast(maybe_descriptors);

  // Build the map the object will switch to.
  Object* maybe_map = map()->Copy();
  if (maybe_map->IsFailure()) return maybe_map;
  Map* fresh_map = Map::cast(maybe_map);
  fresh_map->set_instance_descriptors(field_descriptors);

  // Grow the properties array only when no unused field remains.
  FixedArray* grown_properties = NULL;
  int unused_after;
  if (map()->unused_property_fields() == 0) {
    Object* maybe_properties =
        properties()->CopySize(properties()->length() + kFieldsAdded);
    if (maybe_properties->IsFailure()) return maybe_properties;
    grown_properties = FixedArray::cast(maybe_properties);
    unused_after = kFieldsAdded - 1;
  } else {
    unused_after = map()->unused_property_fields() - 1;
  }

  // All allocations succeeded: commit by updating pointers.
  fresh_map->set_unused_property_fields(unused_after);
  set_map(fresh_map);
  if (grown_properties != NULL) {
    set_properties(FixedArray::cast(grown_properties));
  }
  properties()->set(field_index, new_value);
  return new_value;
}
Object* JSObject::SetPropertyWithInterceptor(String* name,
Object* value,
PropertyAttributes attributes) {
......@@ -1522,12 +1528,13 @@ Object* JSObject::SetProperty(LookupResult* result,
return AddFastPropertyUsingMap(result->GetTransitionMap(),
name,
value);
} else {
return AddFastProperty(name, value, attributes);
}
return ConvertDescriptorToField(name, value, attributes);
case CONSTANT_FUNCTION:
if (value == result->GetConstantFunction()) return value;
// Only replace the function if necessary.
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
return ReplaceConstantFunctionProperty(name, value);
case CALLBACKS:
return SetPropertyWithCallback(result->GetCallbackObject(),
name,
......@@ -1538,9 +1545,10 @@ Object* JSObject::SetProperty(LookupResult* result,
case CONSTANT_TRANSITION:
// Replace with a MAP_TRANSITION to a new map with a FIELD, even
// if the value is a function.
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
// AddProperty has been extended to do this, in this case.
return AddFastProperty(name, value, attributes);
case NULL_DESCRIPTOR:
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
UNREACHABLE();
default:
UNREACHABLE();
}
......@@ -1572,14 +1580,33 @@ Object* JSObject::IgnoreAttributesAndSetLocalProperty(
&& !Top::MayNamedAccess(this, name, v8::ACCESS_SET)) {
return SetPropertyWithFailedAccessCheck(result, name, value);
}
// Check for accessor in prototype chain removed here in clone.
/*
REMOVED FROM CLONE
if (result->IsNotFound() || !result->IsProperty()) {
// We could not find a local property so let's check whether there is an
// accessor that wants to handle the property.
LookupResult accessor_result;
LookupCallbackSetterInPrototypes(name, &accessor_result);
if (accessor_result.IsValid()) {
return SetPropertyWithCallback(accessor_result.GetCallbackObject(),
name,
value,
accessor_result.holder());
}
}
*/
if (result->IsNotFound()) {
return AddProperty(name, value, attributes);
}
if (!result->IsLoaded()) {
return SetLazyProperty(result, name, value, attributes);
}
// Check of IsReadOnly removed from here in clone.
/*
REMOVED FROM CLONE
if (result->IsReadOnly() && result->IsProperty()) return value;
*/
// This is a real property that is not read-only, or it is a
// transition or null descriptor and there are no setters in the prototypes.
switch (result->type()) {
case NORMAL:
property_dictionary()->ValueAtPut(result->GetDictionaryEntry(), value);
......@@ -1594,12 +1621,12 @@ Object* JSObject::IgnoreAttributesAndSetLocalProperty(
name,
value);
} else {
return ConvertDescriptorToField(name, value, attributes);
return AddFastProperty(name, value, attributes);
}
case CONSTANT_FUNCTION:
if (value == result->GetConstantFunction()) return value;
// Only replace the function if necessary.
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
return ReplaceConstantFunctionProperty(name, value);
case CALLBACKS:
return SetPropertyWithCallback(result->GetCallbackObject(),
name,
......@@ -1610,9 +1637,10 @@ Object* JSObject::IgnoreAttributesAndSetLocalProperty(
case CONSTANT_TRANSITION:
// Replace with a MAP_TRANSITION to a new map with a FIELD, even
// if the value is a function.
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
// AddProperty has been extended to do this, in this case.
return AddFastProperty(name, value, attributes);
case NULL_DESCRIPTOR:
return ConvertDescriptorToFieldAndMapTransition(name, value, attributes);
UNREACHABLE();
default:
UNREACHABLE();
}
......@@ -2635,6 +2663,14 @@ void DescriptorArray::SetEnumCache(FixedArray* bridge_storage,
}
// Overwrites, in place, the value slot of descriptor |descriptor_number| in
// the content array with |value|. |value| must not be in new space.
// NOTE(review): fast_set presumably skips the write barrier, which would be
// why the new-space assertion is required -- confirm.
void DescriptorArray::ReplaceConstantFunction(int descriptor_number,
                                              JSFunction* value) {
  ASSERT(!Heap::InNewSpace(value));
  fast_set(GetContentArray(), ToValueIndex(descriptor_number), value);
}
Object* DescriptorArray::CopyInsert(Descriptor* descriptor,
TransitionFlag transition_flag) {
// Transitions are only kept when inserting another transition.
......@@ -2735,6 +2771,69 @@ Object* DescriptorArray::CopyInsert(Descriptor* descriptor,
}
// Returns a copy of this descriptor array in which the descriptor keyed by
// |name| is replaced by a FieldDescriptor with the given |index| and
// |attributes|; the replaced descriptor's enumeration index is preserved and
// all other descriptors are copied unchanged. Returns a Failure object if the
// new array or the symbol for |name| could not be allocated.
Object* DescriptorArray::CopyReplace(String* name,
                                     int index,
                                     PropertyAttributes attributes) {
  // The copy has exactly as many descriptors as this array.
  Object* copy = DescriptorArray::Allocate(number_of_descriptors());
  if (copy->IsFailure()) return copy;

  // Only symbols may be used as keys in instance descriptors.
  if (!name->IsSymbol()) {
    Object* symbol = Heap::LookupSymbol(name);
    if (symbol->IsFailure()) return symbol;
    name = String::cast(symbol);
  }

  DescriptorWriter w(DescriptorArray::cast(copy));
  for (DescriptorReader r(this); !r.eos(); r.advance()) {
    if (!r.Equals(name)) {
      w.WriteFrom(&r);
      continue;
    }
    FieldDescriptor replacement(name, index, attributes);
    replacement.SetEnumerationIndex(r.GetDetails().index());
    w.Write(&replacement);
  }

  // Carry over the next enumeration index.
  DescriptorArray::cast(copy)->
      SetNextEnumerationIndex(NextEnumerationIndex());

  ASSERT(w.eos());
  return copy;
}
// Returns a copy of this descriptor array without the descriptor keyed by
// |name|. The result is allocated with one descriptor fewer than this array,
// so the caller must ensure that exactly one descriptor with that key exists
// (checked by the final assertion). Returns a Failure object if the symbol
// lookup or the allocation fails.
Object* DescriptorArray::CopyRemove(String* name) {
  if (!name->IsSymbol()) {
    Object* symbol = Heap::LookupSymbol(name);
    if (symbol->IsFailure()) return symbol;
    name = String::cast(symbol);
  }
  ASSERT(name->IsSymbol());

  Object* allocation = Allocate(number_of_descriptors() - 1);
  if (allocation->IsFailure()) return allocation;
  DescriptorArray* trimmed = DescriptorArray::cast(allocation);

  // The copy continues enumeration numbering where this array left off.
  trimmed->SetNextEnumerationIndex(NextEnumerationIndex());

  // Copy every descriptor except the one being removed.
  DescriptorWriter w(trimmed);
  DescriptorReader r(this);
  for (; !r.eos(); r.advance()) {
    // Both keys are symbols, so pointer identity is a sufficient comparison.
    if (r.GetKey() == name) continue;
    w.WriteFrom(&r);
  }
  ASSERT(w.eos());
  return trimmed;
}
Object* DescriptorArray::RemoveTransitions() {
// Remove all transitions. Return a copy of the array with all transitions
// removed, or a Failure object if the new array could not be allocated.
......
......@@ -1303,26 +1303,9 @@ class JSObject: public HeapObject {
JSFunction* function,
PropertyAttributes attributes);
Object* ReplaceSlowProperty(String* name,
Object* value,
PropertyAttributes attributes);
// Converts a descriptor of any other type to a real field,
// backed by the properties array. Descriptors of visible
// types, such as CONSTANT_FUNCTION, keep their enumeration order.
// Converts the descriptor on the original object's map to a
// map transition, and the new field is on the object's new map.
Object* ConvertDescriptorToFieldAndMapTransition(
String* name,
Object* new_value,
PropertyAttributes attributes);
// Converts a descriptor of any other type to a real field,
// backed by the properties array. Descriptors of visible
// types, such as CONSTANT_FUNCTION, keep their enumeration order.
Object* ConvertDescriptorToField(String* name,
Object* new_value,
PropertyAttributes attributes);
// Replace a constant function property on a fast-case object.
Object* ReplaceConstantFunctionProperty(String* name,
Object* value);
// Add a property to a fast-case object.
Object* AddFastProperty(String* name,
......@@ -1394,10 +1377,6 @@ class JSObject: public HeapObject {
static const uint32_t kMaxGap = 1024;
static const int kMaxFastElementsLength = 5000;
static const int kMaxFastProperties = 8;
// When extending the backing storage for property values, we increase
// its size by more than the 1 entry necessary, so sequentially adding fields
// to the same object requires fewer allocations and copies.
static const int kFieldsAdded = 3;
// Layout description.
static const int kPropertiesOffset = HeapObject::kHeaderSize;
......@@ -1583,6 +1562,7 @@ class DescriptorArray: public FixedArray {
inline void Get(int descriptor_number, Descriptor* desc);
inline void Set(int descriptor_number, Descriptor* desc);
void ReplaceConstantFunction(int descriptor_number, JSFunction* value);
// Copy the descriptor array, insert a new descriptor and optionally
// remove map transitions. If the descriptor is already present, it is
......@@ -1592,6 +1572,20 @@ class DescriptorArray: public FixedArray {
// a transition, they must not be removed. All null descriptors are removed.
Object* CopyInsert(Descriptor* descriptor, TransitionFlag transition_flag);
// Makes a copy of the descriptor array with the descriptor with key name
// removed. If name is the empty string, the descriptor array is copied.
// Transitions are removed if TransitionFlag is REMOVE_TRANSITIONS.
// All null descriptors are removed.
Object* CopyRemove(TransitionFlag remove_transitions, String* name);
// Copy the descriptor array, replace the property index and attributes
// of the named property, but preserve its enumeration index.
Object* CopyReplace(String* name, int index, PropertyAttributes attributes);
// Copy the descriptor array, removing the property index and attributes
// of the named property.
Object* CopyRemove(String* name);
// Remove all transitions. Return a copy of the array with all transitions
// removed, or a Failure object if the new array could not be allocated.
Object* RemoveTransitions();
......
......@@ -66,13 +66,15 @@ const int JSRegExpErrorInternal = -4;
typedef void* malloc_t(size_t size);
typedef void free_t(void* address);
JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
template <typename Char>
JSRegExp* jsRegExpCompile(const Char* pattern, int patternLength,
JSRegExpIgnoreCaseOption, JSRegExpMultilineOption,
unsigned* numSubpatterns, const char** errorMessage,
malloc_t* allocate_function, free_t* free_function);
template <typename Char>
int jsRegExpExecute(const JSRegExp*,
const UChar* subject, int subjectLength, int startOffset,
const Char* subject, int subjectLength, int startOffset,
int* offsetsVector, int offsetsVectorLength);
void jsRegExpFree(JSRegExp*);
......
......@@ -147,7 +147,8 @@ struct CompileData {
/* Definitions to allow mutual recursion */
static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
template <typename Char>
static bool compileBracket(int, int*, unsigned char**, const Char**, const Char*, ErrorCode*, int, int*, int*, CompileData&);
static bool bracketIsAnchored(const unsigned char* code);
static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
......@@ -174,9 +175,10 @@ Returns: zero or positive => a data character
on error, errorptr is set
*/
/* Decodes the escape sequence that follows a backslash. The function starts
reading at *ptrptr + 1, so callers pass the address of a pointer that sits on
the backslash, and before returning it stores its final read position back
through ptrptr. bracount is compared against a parsed decimal number when
deciding how to treat a digit escape outside a class; isclass selects the
in-class handling.
NOTE(review): both the UChar form and the templated Char form of the signature
and of the local pointer declarations appear below, and hunk markers hide some
lines of the body -- confirm against the full file before relying on details. */
static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
template <typename Char>
static int checkEscape(const Char** ptrptr, const Char* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
{
const UChar* ptr = *ptrptr + 1;
const Char* ptr = *ptrptr + 1;
/* If backslash is at the end of the pattern, it's an error. */
if (ptr == patternEnd) {
......@@ -184,13 +186,13 @@ static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode*
*ptrptr = ptr;
return 0;
}
int c = *ptr;
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. */
/* The first branch is deliberately empty: c is returned unchanged. */
if (c < '0' || c > 'z') { /* Not alphameric */
/* The table is indexed relative to the digit zero; a non-zero entry replaces c. */
} else if (int escapeValue = escapes[c - '0']) {
c = escapeValue;
......@@ -201,7 +203,7 @@ static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode*
c = 'B'; /* and \B is a capital B in a class (in browsers even though ECMAScript 15.10.2.19 says it raises an error) */
}
/* Escapes that need further processing, or are illegal. */
} else {
switch (c) {
case '1':
......@@ -217,9 +219,9 @@ static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode*
unless there are insufficient brackets, in which case they are octal
escape sequences. Those sequences end on the first non-octal character
or when we overflow 0-255, whichever comes first. */
if (!isclass) {
const UChar* oldptr = ptr;
const Char* oldptr = ptr;
/* Accumulate a decimal number from the following digits; the loop stops
consuming digits once the value exceeds bracount or the pattern ends. */
c -= '0';
while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
c = c * 10 + *(++ptr) - '0';
......@@ -229,10 +231,10 @@ static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode*
}
ptr = oldptr; /* Put the pointer back and fall through */
}
/* Handle an octal number following \. If the first digit is 8 or 9,
this is not octal. */
if ((c = *ptr) >= '8')
break;
......@@ -296,14 +298,14 @@ static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode*
return 0;
}
c = *ptr;
/* A letter is upper-cased; then the 0x40 bit is flipped. This coding
is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
c = toASCIIUpper(c) ^ 0x40;
break;
}
}
/* Hand the final read position back to the caller along with the result. */
*ptrptr = ptr;
return c;
}
......@@ -323,7 +325,8 @@ Arguments:
Returns: true or false
*/
/* Reports whether the text starting at p has the shape of a counted repeat
body: digits, optionally a comma, optionally more digits, then a closing brace.
The visible call site passes the position just after an opening brace. Every
dereference of p shown here is guarded by a comparison against patternEnd.
NOTE(review): both the UChar and the templated Char signatures appear below and
a hunk marker hides a few lines after the first check -- confirm against the
full file. */
static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
template <typename Char>
static bool isCountedRepeat(const Char* p, const Char* patternEnd)
{
/* The body must begin with a digit. */
if (p >= patternEnd || !isASCIIDigit(*p))
return false;
......@@ -332,18 +335,18 @@ static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
p++;
/* Digits followed directly by the closing brace: the {n} form. */
if (p < patternEnd && *p == '}')
return true;
/* Otherwise a comma is required next; it is consumed by the post-increment. */
if (p >= patternEnd || *p++ != ',')
return false;
/* Comma followed directly by the closing brace: the {n,} form. */
if (p < patternEnd && *p == '}')
return true;
/* The {n,m} form: at least one further digit, then the closing brace. */
if (p >= patternEnd || !isASCIIDigit(*p))
return false;
p++;
while (p < patternEnd && isASCIIDigit(*p))
p++;
return (p < patternEnd && *p == '}');
}
......@@ -366,24 +369,25 @@ Returns: pointer to '}' on success;
current ptr on error, with errorcodeptr set non-zero
*/
static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorcodeptr)
template <typename Char>
static const Char* readRepeatCounts(const Char* p, int* minp, int* maxp, ErrorCode* errorcodeptr)
{
int min = 0;
int max = -1;
/* Read the minimum value and do a paranoid check: a negative value indicates
an integer overflow. */
while (isASCIIDigit(*p))
min = min * 10 + *p++ - '0';
if (min < 0 || min > 65535) {
*errorcodeptr = ERR5;
return p;
}
/* Read the maximum value if there is one, and again do a paranoid check on its size.
Also, max must not be less than min. */
if (*p == '}')
max = min;
else {
......@@ -401,10 +405,10 @@ static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, Error
}
}
}
/* Fill in the required variables, and pass back the pointer to the terminating
'}'. */
*minp = min;
*maxp = max;
return p;
......@@ -472,27 +476,27 @@ Yield: true when range returned; false when no more
/* Finds the next run of other-case characters for the range [*cptr, d].
Scanning starts at *cptr and skips characters for which
kjs_pcre_ucp_othercase() returns a negative value. If none is left up to and
including d, the result is false and no output is written. Otherwise *ocptr
receives the other-case value of the first hit, *odptr the end of the run of
consecutive other-case values that follows it, and *cptr the position at which
the next call should resume. */
static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
{
    int current = *cptr;
    int firstOther = 0;

    /* Skip ahead to the first character that has an other-case value. */
    while (current <= d) {
        firstOther = kjs_pcre_ucp_othercase(current);
        if (firstOther >= 0)
            break;
        current++;
    }

    if (current > d)
        return false;

    *ocptr = firstOther;

    /* Extend the run while successive characters map to successive values. */
    int expected = firstOther + 1;
    current++;
    while (current <= d && kjs_pcre_ucp_othercase(current) == expected) {
        expected++;
        current++;
    }

    *odptr = expected - 1;
    *cptr = current;
    return true;
}
......@@ -502,11 +506,11 @@ static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 0 to 6 bytes.
Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
Returns: number of characters placed in the buffer
*/
......@@ -545,14 +549,16 @@ Returns: true on success
false, with *errorcodeptr set non-zero on error
*/
/* Returns true only when ptr + 1 is still before patternEnd and the character
stored there equals expected; the bounds test comes first, so ptr[1] is never
read at or past patternEnd.
NOTE(review): both the UChar signature and the templated Char signature appear
below; the templated one takes expected as a plain char -- confirm which is
current. */
static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
template <typename Char>
static inline bool safelyCheckNextChar(const Char* ptr, const Char* patternEnd, char expected)
{
return ((ptr + 1 < patternEnd) && ptr[1] == expected);
}
template <typename Char>
static bool
compileBranch(int options, int* brackets, unsigned char** codeptr,
const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
const Char** ptrptr, const Char* patternEnd, ErrorCode* errorcodeptr, int *firstbyteptr,
int* reqbyteptr, CompileData& cd)
{
int repeat_type, op_type;
......@@ -563,39 +569,39 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
unsigned char* code = *codeptr;
unsigned char* tempcode;
bool groupsetfirstbyte = false;
const UChar* ptr = *ptrptr;
const UChar* tempptr;
const Char* ptr = *ptrptr;
const Char* tempptr;
unsigned char* previous = NULL;
unsigned char classbits[32];
bool class_utf8;
unsigned char* class_utf8data;
unsigned char utf8_char[6];
/* Initialize no first byte, no required byte. REQ_UNSET means "no char
matching encountered yet". It gets changed to REQ_NONE if we hit something that
matches a non-fixed char first char; reqbyte just remains unset if we never
find one.
When we hit a repeat whose minimum is zero, we may have to adjust these values
to take the zero repeat into account. This is implemented by setting them to
zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
item types that can be repeated set these backoff variables appropriately. */
int firstbyte = REQ_UNSET;
int reqbyte = REQ_UNSET;
int zeroreqbyte = REQ_UNSET;
int zerofirstbyte = REQ_UNSET;
/* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zero,
according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
value > 255. It is added into the firstbyte or reqbyte variables to record the
case status of the value. This is used only for ASCII characters. */
int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
/* Switch on next character until the end of the branch */
for (;; ptr++) {
bool negate_class;
bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
......@@ -606,19 +612,19 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
int subfirstbyte;
int mclength;
unsigned char mcbuffer[8];
/* Next byte in the pattern */
c = ptr < patternEnd ? *ptr : 0;
/* Fill in length of a previous callout, except when the next thing is
a quantifier. */
bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
switch (c) {
/* The branch terminates at end of string, |, or ). */
case 0:
if (ptr < patternEnd)
goto NORMAL_CHAR;
......@@ -630,7 +636,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
*codeptr = code;
*ptrptr = ptr;
return true;
/* Handle single-character metacharacters. In multiline mode, ^ disables
the setting of any following char as a first character. */
......@@ -663,26 +669,26 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
previous = code;
*code++ = OP_NOT_NEWLINE;
break;
/* Character classes. If the included characters are all < 256, we build a
32-byte bitmap of the permitted characters, except in the special case
where there is only one such character. For negated classes, we build the
map as usual, then invert it at the end. However, we use a different opcode
so that data characters > 255 can be handled correctly.
If the class contains characters outside the 0-255 range, a different
opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag byte tells
whether the bitmap is present, and whether this is a negated class or not.
*/
case '[': {
previous = code;
should_flip_negation = false;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
they are encountered at the top level, so we'll do that too. */
/* If the first character is '^', set the negation flag and skip it. */
if (ptr + 1 >= patternEnd) {
......@@ -695,24 +701,24 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
++ptr;
} else
negate_class = false;
/* Keep a count of chars with values < 256 so that we can optimize the case
of just a single character (as long as it's < 256). For higher valued UTF-8
characters, we don't yet do any optimization. */
class_charcount = 0;
class_lastchar = -1;
class_utf8 = false; /* No chars >= 256 */
class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
/* Initialize the 32-char bit map to all zeros. We have to build the
map in a temporary bit of store, in case the class contains only 1
character (< 256), because in that case the compiled code doesn't use the
bit map. */
memset(classbits, 0, 32 * sizeof(unsigned char));
/* Process characters until ] is reached. The first pass
through the regex checked the overall syntax, so we don't need to be very
strict here. At the start of the loop, c contains the first byte of the
......@@ -726,7 +732,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
it marks a word boundary. Other escapes have preset maps ready to
or into the one we are building. We assume they have more than one
character in them, so set class_charcount bigger than one. */
if (c == '\\') {
c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
if (c < 0) {
......@@ -736,92 +742,92 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
for (c = 0; c < 32; c++)
classbits[c] |= classBitmapForChar(c + cbit_digit);
continue;
case ESC_D:
should_flip_negation = true;
for (c = 0; c < 32; c++)
classbits[c] |= ~classBitmapForChar(c + cbit_digit);
continue;
case ESC_w:
for (c = 0; c < 32; c++)
classbits[c] |= classBitmapForChar(c + cbit_word);
continue;
case ESC_W:
should_flip_negation = true;
for (c = 0; c < 32; c++)
classbits[c] |= ~classBitmapForChar(c + cbit_word);
continue;
case ESC_s:
for (c = 0; c < 32; c++)
classbits[c] |= classBitmapForChar(c + cbit_space);
continue;
case ESC_S:
should_flip_negation = true;
for (c = 0; c < 32; c++)
classbits[c] |= ~classBitmapForChar(c + cbit_space);
continue;
/* Unrecognized escapes are faulted if PCRE is running in its
strict mode. By default, for compatibility with Perl, they are
treated as literals. */
default:
c = *ptr; /* The final character */
class_charcount -= 2; /* Undo the default count from above */
}
}
/* Fall through if we have a single character (c >= 0). This may be
> 256 in UTF-8 mode. */
} /* End of backslash handling */
/* A single character may be followed by '-' to form a range. However,
Perl does not permit ']' to be the end of the range. A '-' character
here is treated as a literal. */
if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
ptr += 2;
int d = *ptr;
/* The second part of a range can be a single-character escape, but
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
in such circumstances. */
if (d == '\\') {
const UChar* oldptr = ptr;
const Char* oldptr = ptr;
d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, true);
/* \X is literal X; any other special means the '-' was literal */
if (d < 0) {
ptr = oldptr - 2;
goto LONE_SINGLE_CHARACTER; /* A few lines below */
}
}
/* The check that the two values are in the correct order happens in
the pre-pass. Optimize one-character ranges */
if (d == c)
goto LONE_SINGLE_CHARACTER; /* A few lines below */
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
matching, we have to use an XCLASS with extra data items. Caseless
matching for characters > 127 is available only if UCP support is
available. */
if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
class_utf8 = true;
/* With UCP support, we can find the other case equivalents of
the relevant characters. There may be several ranges. Optimize how
they fit with the basic range. */
if (options & IgnoreCaseOption) {
int occ, ocd;
int cc = c;
......@@ -829,7 +835,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
if (occ >= c && ocd <= d)
continue; /* Skip embedded ranges */
if (occ < c && ocd >= c - 1) /* Extend the basic range */
{ /* if there is overlap, */
c = occ; /* noting that if occ < c */
......@@ -840,7 +846,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
d = ocd;
continue;
}
if (occ == ocd)
*class_utf8data++ = XCL_SINGLE;
else {
......@@ -850,25 +856,25 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
class_utf8data += encodeUTF8(ocd, class_utf8data);
}
}
/* Now record the original range, possibly modified for UCP caseless
overlapping ranges. */
*class_utf8data++ = XCL_RANGE;
class_utf8data += encodeUTF8(c, class_utf8data);
class_utf8data += encodeUTF8(d, class_utf8data);
/* With UCP support, we are done. Without UCP support, there is no
caseless matching for UTF-8 characters > 127; we can use the bit map
for the smaller ones. */
continue; /* With next character in the class */
}
/* We use the bit map for all cases when not in UTF-8 mode; else
ranges that lie entirely within 0-127 when there is UCP support; else
for partial ranges without UCP support. */
for (; c <= d; c++) {
classbits[c/8] |= (1 << (c&7));
if (options & IgnoreCaseOption) {
......@@ -878,23 +884,23 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
class_charcount++; /* in case a one-char range */
class_lastchar = c;
}
continue; /* Go get the next char in the class */
}
/* Handle a lone single character - we can get here for a normal
non-escape char, or after \ that introduces a single character or for an
apparent range that isn't. */
LONE_SINGLE_CHARACTER:
/* Handle a character that cannot go in the bit map */
if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
class_utf8 = true;
*class_utf8data++ = XCL_SINGLE;
class_utf8data += encodeUTF8(c, class_utf8data);
if (options & IgnoreCaseOption) {
int othercase;
if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {
......@@ -913,26 +919,26 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
class_lastchar = c;
}
}
/* If class_charcount is 1, we saw precisely one character whose value is
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
can optimize the negative case only if there were no characters >= 128
because OP_NOT and the related opcodes like OP_NOTSTAR operate on
single-bytes only. This is an historical hangover. Maybe one day we can
tidy these opcodes to handle multi-byte characters.
The optimization throws away the bit map. We turn the item into a
1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
that OP_NOT does not support multibyte characters. In the positive case, it
can cause firstbyte to be set. Otherwise, there can be no first char if
this item is first, whatever repeat count may follow. In the case of
reqbyte, save the previous value for reinstating. */
if (class_charcount == 1 && (!class_utf8 && (!negate_class || class_lastchar < 128))) {
zeroreqbyte = reqbyte;
/* The OP_NOT opcode works on one-byte characters only. */
if (negate_class) {
if (firstbyte == REQ_UNSET)
firstbyte = REQ_NONE;
......@@ -941,61 +947,61 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
*code++ = class_lastchar;
break;
}
/* For a single, positive character, get the value into c, and
then we can handle this with the normal one-character code. */
c = class_lastchar;
goto NORMAL_CHAR;
} /* End of 1-char optimization */
/* The general case - not the one-char optimization. If this is the first
thing in the branch, there can be no first char setting, whatever the
repeat count. Any reqbyte setting must remain unchanged after any kind of
repeat. */
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
/* If there are characters with values > 255, we have to compile an
extended class, with its own opcode. If there are no characters < 256,
we can omit the bitmap. */
if (class_utf8 && !should_flip_negation) {
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
code += LINK_SIZE;
*code = negate_class? XCL_NOT : 0;
/* If the map is required, install it, and move on to the end of
the extra data */
if (class_charcount > 0) {
*code++ |= XCL_MAP;
memcpy(code, classbits, 32);
code = class_utf8data;
}
/* If the map is not required, slide down the extra data. */
else {
int len = class_utf8data - (code + 33);
memmove(code + 1, code + 33, len);
code += len + 1;
}
/* Now fill in the complete length of the item */
putLinkValue(previous + 1, code - previous);
break; /* End of class handling */
}
/* If there are no characters > 255, negate the 32-byte map if necessary,
and copy it into the code vector. If this is the first thing in the branch,
there can be no first char setting, whatever the repeat count. Any reqbyte
setting must remain unchanged after any kind of repeat. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
if (negate_class)
for (c = 0; c < 32; c++)
......@@ -1005,7 +1011,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
code += 32;
break;
}
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
has been tested above. */
......@@ -1016,68 +1022,68 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
if (*errorcodeptr)
goto FAILED;
goto REPEAT;
case '*':
repeat_min = 0;
repeat_max = -1;
goto REPEAT;
case '+':
repeat_min = 1;
repeat_max = -1;
goto REPEAT;
case '?':
repeat_min = 0;
repeat_max = 1;
REPEAT:
if (!previous) {
*errorcodeptr = ERR9;
goto FAILED;
}
if (repeat_min == 0) {
firstbyte = zerofirstbyte; /* Adjust for zero repeat */
reqbyte = zeroreqbyte; /* Ditto */
}
/* Remember whether this is a variable length repeat */
reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;
op_type = 0; /* Default single-char op codes */
/* Save start of previous item, in case we have to move it up to make space
for an inserted OP_ONCE for the additional '+' extension. */
/* FIXME: Probably don't need this because we don't use OP_ONCE. */
tempcode = previous;
/* If the next character is '+', we have a possessive quantifier. This
implies greediness, whatever the setting of the PCRE_UNGREEDY option.
If the next character is '?' this is a minimizing repeat, by default,
but if PCRE_UNGREEDY is set, it works the other way round. We change the
repeat type to the non-default. */
if (safelyCheckNextChar(ptr, patternEnd, '?')) {
repeat_type = 1;
ptr++;
} else
repeat_type = 0;
/* If previous was a character match, abolish the item and generate a
repeat item instead. If a char item has a minimum of more than one, ensure
that it is set in reqbyte - it might not be if a sequence such as x{3} is
the first thing in a branch because the x will have gone into firstbyte
instead. */
if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
/* Deal with UTF-8 characters that take up more than one byte. It's
easier to write this out separately than try to macrify it. Use c to
hold the length of the character in bytes, plus 0x80 to flag that it's a
length rather than a small character. */
if (code[-1] & 0x80) {
unsigned char *lastchar = code - 1;
while((*lastchar & 0xc0) == 0x80)
......@@ -1091,56 +1097,56 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
if (repeat_min > 1)
reqbyte = c | req_caseopt | cd.req_varyopt;
}
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}
else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
c = previous[1];
if (repeat_min > 1)
reqbyte = c | req_caseopt | cd.req_varyopt;
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a single negated character ([^a] or similar), we use
one of the special opcodes, replacing it. The code is shared with single-
character repeats by setting op_type to add a suitable offset into
repeat_type. OP_NOT is currently used only for single-byte chars. */
else if (*previous == OP_NOT) {
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
c = previous[1];
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by setting op_type to add a suitable offset into repeat_type. */
else if (*previous <= OP_NOT_NEWLINE) {
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
OUTPUT_SINGLE_REPEAT:
int prop_type = -1;
int prop_value = -1;
unsigned char* oldcode = code;
code = previous; /* Usually overwrite previous item */
/* If the maximum is zero then the minimum must also be zero; Perl allows
this case, so we do too - by simply omitting the item altogether. */
if (repeat_max == 0)
goto END_REPEAT;
/* Combine the op_type with the repeat_type */
repeat_type += op_type;
/* A minimum of zero is handled either as the special case * or ?, or as
an UPTO, with the maximum given. */
if (repeat_min == 0) {
if (repeat_max == -1)
*code++ = OP_STAR + repeat_type;
......@@ -1151,12 +1157,12 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
put2ByteValueAndAdvance(code, repeat_max);
}
}
/* A repeat minimum of 1 is optimized into some special cases. If the
maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
left in place and, if the maximum is greater than 1, we use OP_UPTO with
one less than the maximum. */
else if (repeat_min == 1) {
if (repeat_max == -1)
*code++ = OP_PLUS + repeat_type;
......@@ -1168,20 +1174,20 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
put2ByteValueAndAdvance(code, repeat_max - 1);
}
}
/* The case {n,n} is just an EXACT, while the general case {n,m} is
handled as an EXACT followed by an UPTO. */
else {
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
put2ByteValueAndAdvance(code, repeat_min);
/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
we have to insert the character for the previous code. For a repeated
Unicode property match, there are two extra bytes that define the
required property. In UTF-8 mode, long characters have their length in
c, with the 0x80 bit as a flag. */
if (repeat_max < 0) {
if (c >= 128) {
memcpy(code, utf8_char, c & 7);
......@@ -1195,10 +1201,10 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
}
*code++ = OP_STAR + repeat_type;
}
/* Else insert an UPTO if the max is greater than the min, again
preceded by the character, for the previously inserted code. */
else if (repeat_max != repeat_min) {
if (c >= 128) {
memcpy(code, utf8_char, c & 7);
......@@ -1214,27 +1220,27 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
put2ByteValueAndAdvance(code, repeat_max);
}
}
/* The character or character type itself comes last in all cases. */
if (c >= 128) {
memcpy(code, utf8_char, c & 7);
code += c & 7;
} else
*code++ = c;
/* For a repeated Unicode property match, there are two extra bytes that
define the required property. */
if (prop_type >= 0) {
*code++ = prop_type;
*code++ = prop_value;
}
}
/* If previous was a character class or a back reference, we put the repeat
stuff after it, but just skip the item if the repeat was {0,0}. */
else if (*previous == OP_CLASS ||
*previous == OP_NCLASS ||
*previous == OP_XCLASS ||
......@@ -1244,7 +1250,7 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
code = previous;
goto END_REPEAT;
}
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
......@@ -1259,86 +1265,86 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
put2ByteValueAndAdvance(code, repeat_max);
}
}
/* If previous was a bracket group, we may have to replicate it in certain
cases. */
else if (*previous >= OP_BRA) {
int ketoffset = 0;
int len = code - previous;
unsigned char* bralink = NULL;
/* If the maximum repeat count is unlimited, find the end of the bracket
by scanning through from the start, and compute the offset back to it
from the current code pointer. There may be an OP_OPT setting following
the final KET, so we can't find the end just by going back from the code
pointer. */
if (repeat_max == -1) {
const unsigned char* ket = previous;
advanceToEndOfBracket(ket);
ketoffset = code - ket;
}
/* The case of a zero minimum is special because of the need to stick
OP_BRAZERO in front of it, and because the group appears once in the
data, whereas in other cases it appears the minimum number of times. For
this reason, it is simplest to treat this case separately, as otherwise
the code gets far too messy. There are several special subcases when the
minimum is zero. */
if (repeat_min == 0) {
/* If the maximum is also zero, we just omit the group from the output
altogether. */
if (repeat_max == 0) {
code = previous;
goto END_REPEAT;
}
/* If the maximum is 1 or unlimited, we just have to stick in the
BRAZERO and do no more at this point. However, we do need to adjust
any OP_RECURSE calls inside the group that refer to the group itself or
any internal group, because the offset is from the start of the whole
regex. Temporarily terminate the pattern while doing this. */
if (repeat_max <= 1) {
*code = OP_END;
memmove(previous+1, previous, len);
code++;
*previous++ = OP_BRAZERO + repeat_type;
}
/* If the maximum is greater than 1 and limited, we have to replicate
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
The first one has to be handled carefully because it's the original
copy, which has to be moved up. The remainder can be handled by code
that is common with the non-zero minimum case below. We have to
adjust the value of repeat_max, since one less copy is required. */
else {
*code = OP_END;
memmove(previous + 2 + LINK_SIZE, previous, len);
code += 2 + LINK_SIZE;
*previous++ = OP_BRAZERO + repeat_type;
*previous++ = OP_BRA;
/* We chain together the bracket offset fields that have to be
filled in later when the ends of the brackets are reached. */
int offset = (!bralink) ? 0 : previous - bralink;
bralink = previous;
putLinkValueAllowZeroAndAdvance(previous, offset);
}
repeat_max--;
}
/* If the minimum is greater than zero, replicate the group as many
times as necessary, and adjust the maximum to the number of subsequent
copies that we need. If we set a first char from the group, and didn't
set a required char, copy the latter from the former. */
else {
if (repeat_min > 1) {
if (groupsetfirstbyte && reqbyte < 0)
......@@ -1351,34 +1357,34 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
if (repeat_max > 0)
repeat_max -= repeat_min;
}
/* This code is common to both the zero and non-zero minimum cases. If
the maximum is limited, it replicates the group in a nested fashion,
remembering the bracket starts on a stack. In the case of a zero minimum,
the first one was set up above. In all cases the repeat_max now specifies
the number of additional copies needed. */
if (repeat_max >= 0) {
for (int i = repeat_max - 1; i >= 0; i--) {
*code++ = OP_BRAZERO + repeat_type;
/* All but the final copy start a new nesting, maintaining the
chain of brackets outstanding. */
if (i != 0) {
*code++ = OP_BRA;
int offset = (!bralink) ? 0 : code - bralink;
bralink = code;
putLinkValueAllowZeroAndAdvance(code, offset);
}
memcpy(code, previous, len);
code += len;
}
/* Now chain through the pending brackets, and fill in their length
fields (which are holding the chain links pro tem). */
while (bralink) {
int offset = code - bralink + 1;
unsigned char* bra = code - offset;
......@@ -1389,71 +1395,71 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
putLinkValue(bra + 1, offset);
}
}
/* If the maximum is unlimited, set a repeater in the final copy. We
can't just offset backwards from the current code point, because we
don't know if there's been an options resetting after the ket. The
correct offset was computed above. */
else
code[-ketoffset] = OP_KETRMAX + repeat_type;
}
/* Else there's some kind of shambles */
else {
*errorcodeptr = ERR11;
goto FAILED;
}
/* In all cases we no longer have a previous item. We also set the
"follows varying string" flag for subsequently encountered reqbytes if
it isn't already set and we have just passed a varying length item. */
END_REPEAT:
previous = NULL;
cd.req_varyopt |= reqvary;
break;
/* Start of nested bracket sub-expression, or comment or lookahead or
lookbehind or option setting or condition. First deal with special things
that can come after a bracket; all are introduced by ?, and the appearance
of any of them means that this is not a referencing group. They were
checked for validity in the first pass over the string, so we don't have to
check for syntax errors here. */
case '(':
skipbytes = 0;
if (*(++ptr) == '?') {
switch (*(++ptr)) {
case ':': /* Non-extracting bracket */
bravalue = OP_BRA;
ptr++;
break;
case '=': /* Positive lookahead */
bravalue = OP_ASSERT;
ptr++;
break;
case '!': /* Negative lookahead */
bravalue = OP_ASSERT_NOT;
ptr++;
break;
/* Character after (? not specially recognized */
default:
*errorcodeptr = ERR12;
goto FAILED;
}
}
/* Else we have a referencing group; adjust the opcode. If the bracket
number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
arrange for the true number to follow later, in an OP_BRANUMBER item. */
else {
if (++(*brackets) > EXTRACT_BASIC_MAX) {
bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
......@@ -1464,17 +1470,17 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
else
bravalue = OP_BRA + *brackets;
}
/* Process nested bracketed re. Assertions may not be repeated, but other
kinds can be. We copy code into a non-variable in order to be able
to pass its address because some compilers complain otherwise. Pass in a
new setting for the ims options if they have changed. */
previous = (bravalue >= OP_BRAZERO) ? code : 0;
*code = bravalue;
tempcode = code;
tempreqvary = cd.req_varyopt; /* Save value before bracket */
if (!compileBracket(
options,
brackets, /* Extracting bracket count */
......@@ -1487,29 +1493,29 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
&subreqbyte, /* For possible last char */
cd)) /* Tables block */
goto FAILED;
/* At the end of compiling, code is still pointing to the start of the
group, while tempcode has been updated to point past the end of the group
and any option resetting that may follow it. The pattern pointer (ptr)
is on the bracket. */
/* Handle updating of the required and first characters. Update for normal
brackets of all kinds, and conditions with two branches (see code above).
If the bracket is followed by a quantifier with zero repeat, we have to
back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
main loop so that they can be accessed for the back off. */
zeroreqbyte = reqbyte;
zerofirstbyte = firstbyte;
groupsetfirstbyte = false;
if (bravalue >= OP_BRA) {
/* If we have not yet set a firstbyte in this branch, take it from the
subpattern, remembering that it was set here so that a repeat of more
than one can replicate it as reqbyte if necessary. If the subpattern has
no firstbyte, set "none" for the whole branch. In both cases, a zero
repeat forces firstbyte to "none". */
if (firstbyte == REQ_UNSET) {
if (subfirstbyte >= 0) {
firstbyte = subfirstbyte;
......@@ -1519,21 +1525,21 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
firstbyte = REQ_NONE;
zerofirstbyte = REQ_NONE;
}
/* If firstbyte was previously set, convert the subpattern's firstbyte
into reqbyte if there wasn't one, using the vary flag that was in
existence beforehand. */
else if (subfirstbyte >= 0 && subreqbyte < 0)
subreqbyte = subfirstbyte | tempreqvary;
/* If the subpattern set a required byte (or set a first byte that isn't
really the first byte - see above), set it. */
if (subreqbyte >= 0)
reqbyte = subreqbyte;
}
/* For a forward assertion, we take the reqbyte, if set. This can be
helpful if the pattern that follows the assertion doesn't set a different
char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
......@@ -1541,83 +1547,83 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
of a firstbyte. This is overcome by a scan at the end if there's no
firstbyte, looking for an asserted first char. */
else if (bravalue == OP_ASSERT && subreqbyte >= 0)
reqbyte = subreqbyte;
/* Now update the main code pointer to the end of the group. */
code = tempcode;
/* Error if hit end of pattern */
if (ptr >= patternEnd || *ptr != ')') {
*errorcodeptr = ERR14;
goto FAILED;
}
break;
/* Check \ for being a real metacharacter; if not, fall through and handle
it as a data character at the start of a string. Escape items are checked
for validity in the pre-compiling pass. */
case '\\':
tempptr = ptr;
c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingBrackets, false);
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
are arranged to be the negation of the corresponding OP_values. For the
back references, the values are ESC_REF plus the reference number. Only
back references and those types that consume a character may be repeated.
We can test for values between ESC_b and ESC_w for the latter; this may
have to change if any new ones are ever created. */
if (c < 0) {
/* For metasequences that actually match a character, we disable the
setting of a first character if it hasn't already been set. */
if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
firstbyte = REQ_NONE;
/* Set values to reset to if this is followed by a zero repeat. */
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
/* Back references are handled specially */
if (-c >= ESC_REF) {
int number = -c - ESC_REF;
previous = code;
*code++ = OP_REF;
put2ByteValueAndAdvance(code, number);
}
/* For the rest, we can obtain the OP value by negating the escape
value */
else {
previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
*code++ = -c;
}
continue;
}
/* Fall through. */
/* Handle a literal character. It is guaranteed not to be whitespace or #
when the extended flag is set. If we are in UTF-8 mode, it may be a
multi-byte literal character. */
default:
NORMAL_CHAR:
previous = code;
if (c < 128) {
mclength = 1;
mcbuffer[0] = c;
if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
*code++ = OP_ASCII_LETTER_IGNORING_CASE;
*code++ = c | 0x20;
......@@ -1627,24 +1633,24 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
}
} else {
mclength = encodeUTF8(c, mcbuffer);
*code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
for (c = 0; c < mclength; c++)
*code++ = mcbuffer[c];
}
/* Set the first and required bytes appropriately. If no previous first
byte, set it from this character, but revert to none on a zero repeat.
Otherwise, leave the firstbyte value alone, and don't change it on a zero
repeat. */
if (firstbyte == REQ_UNSET) {
zerofirstbyte = REQ_NONE;
zeroreqbyte = reqbyte;
/* If the character is more than one byte long, we can set firstbyte
only if it is not to be matched caselessly. */
if (mclength == 1 || req_caseopt == 0) {
firstbyte = mcbuffer[0] | req_caseopt;
if (mclength != 1)
......@@ -1653,25 +1659,25 @@ compileBranch(int options, int* brackets, unsigned char** codeptr,
else
firstbyte = reqbyte = REQ_NONE;
}
/* firstbyte was previously set; we can set reqbyte only if the length is
1 or the matching is caseful. */
else {
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
if (mclength == 1 || req_caseopt == 0)
reqbyte = code[-1] | req_caseopt | cd.req_varyopt;
}
break; /* End of literal character handling */
}
} /* end of big loop */
/* Control never reaches here by falling through, only by a goto for all the
error states. Pass back the position in the pattern so that it can be displayed
to the user for diagnosing the error. */
FAILED:
*ptrptr = ptr;
return false;
......@@ -1703,28 +1709,29 @@ Argument:
Returns: true on success
*/
template <typename Char>
static bool
compileBracket(int options, int* brackets, unsigned char** codeptr,
const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
const Char** ptrptr, const Char* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
int* firstbyteptr, int* reqbyteptr, CompileData& cd)
{
const UChar* ptr = *ptrptr;
const Char* ptr = *ptrptr;
unsigned char* code = *codeptr;
unsigned char* last_branch = code;
unsigned char* start_bracket = code;
int firstbyte = REQ_UNSET;
int reqbyte = REQ_UNSET;
/* Offset is set zero to mark that this bracket is still open */
putLinkValueAllowZero(code + 1, 0);
code += 1 + LINK_SIZE + skipbytes;
/* Loop for each alternative branch */
while (true) {
/* Now compile the branch */
int branchfirstbyte;
int branchreqbyte;
if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcodeptr,
......@@ -1732,45 +1739,45 @@ compileBracket(int options, int* brackets, unsigned char** codeptr,
*ptrptr = ptr;
return false;
}
/* If this is the first branch, the firstbyte and reqbyte values for the
branch become the values for the regex. */
if (*last_branch != OP_ALT) {
firstbyte = branchfirstbyte;
reqbyte = branchreqbyte;
}
/* If this is not the first branch, the first char and reqbyte have to
match the values from all the previous branches, except that if the previous
value for reqbyte didn't have REQ_VARY set, it can still match, and we set
REQ_VARY for the regex. */
else {
/* If we previously had a firstbyte, but it doesn't match the new branch,
we have to abandon the firstbyte for the regex, but if there was previously
no reqbyte, it takes on the value of the old firstbyte. */
if (firstbyte >= 0 && firstbyte != branchfirstbyte) {
if (reqbyte < 0)
reqbyte = firstbyte;
firstbyte = REQ_NONE;
}
/* If we (now or from before) have no firstbyte, a firstbyte from the
branch becomes a reqbyte if there isn't a branch reqbyte. */
if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
branchreqbyte = branchfirstbyte;
/* Now ensure that the reqbytes match */
if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
reqbyte = REQ_NONE;
else
reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
}
/* Reached end of expression, either ')' or end of pattern. Go back through
the alternative branches and reverse the chain of offsets, with the field in
the BRA item now becoming an offset to the first alternative. If there are
......@@ -1779,7 +1786,7 @@ compileBracket(int options, int* brackets, unsigned char** codeptr,
the ims options were changed inside the group, compile a resetting op-code
following, except at the very end of the pattern. Return leaving the pointer
at the terminating char. */
if (ptr >= patternEnd || *ptr != '|') {
int length = code - last_branch;
do {
......@@ -1788,27 +1795,27 @@ compileBracket(int options, int* brackets, unsigned char** codeptr,
length = prev_length;
last_branch -= length;
} while (length > 0);
/* Fill in the ket */
*code = OP_KET;
putLinkValue(code + 1, code - start_bracket);
code += 1 + LINK_SIZE;
/* Set values to pass back */
*codeptr = code;
*ptrptr = ptr;
*firstbyteptr = firstbyte;
*reqbyteptr = reqbyte;
return true;
}
/* Another branch follows; insert an "or" node. Its length field points back
to the previous branch while the bracket remains open. At the end the chain
is reversed. It's done like this so that the start of the bracket has a
zero offset until it is closed, making it possible to detect recursion. */
*code = OP_ALT;
putLinkValue(code + 1, code - last_branch);
last_branch = code;
......@@ -1844,7 +1851,7 @@ static bool branchIsAnchored(const unsigned char* code)
if (op >= OP_BRA || op == OP_ASSERT)
return bracketIsAnchored(scode);
/* Check for explicit anchoring */
/* Check for explicit anchoring */
return op == OP_CIRC;
}
......@@ -1884,7 +1891,7 @@ static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap,
{
const unsigned char* scode = firstSignificantOpcode(code);
int op = *scode;
/* Capturing brackets */
if (op > OP_BRA) {
int captureNum = op - OP_BRA;
......@@ -1893,14 +1900,14 @@ static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap,
int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
}
/* Other brackets */
if (op == OP_BRA || op == OP_ASSERT)
return bracketNeedsLineStart(scode, captureMap, backrefMap);
/* .* means "start at start or after \n" if it isn't in brackets that
may be referenced. */
if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
......@@ -1942,14 +1949,14 @@ static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inas
{
const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
int op = *scode;
if (op >= OP_BRA)
op = OP_BRA;
switch (op) {
default:
return -1;
case OP_BRA:
case OP_ASSERT:
return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
......@@ -1995,7 +2002,8 @@ static inline int multiplyWithOverflowCheck(int a, int b)
return a * b;
}
static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
template <typename Char>
static int calculateCompiledPatternLength(const Char* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
CompileData& cd, ErrorCode& errorcode)
{
/* Make a pass over the pattern to compute the
......@@ -2014,10 +2022,10 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
int brastack[BRASTACK_SIZE];
unsigned char bralenstack[BRASTACK_SIZE];
int bracount = 0;
const UChar* ptr = (const UChar*)(pattern - 1);
const UChar* patternEnd = (const UChar*)(pattern + patternLength);
const Char* ptr = (const Char*)(pattern - 1);
const Char* patternEnd = (const Char*)(pattern + patternLength);
while (++ptr < patternEnd) {
int minRepeats = 0, maxRepeats = 0;
int c = *ptr;
......@@ -2030,12 +2038,12 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
if (errorcode != 0)
return -1;
lastitemlength = 1; /* Default length of last item for repeats */
if (c >= 0) { /* Data character */
length += 2; /* For a one-byte character */
if (c > 127) {
int i;
for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
......@@ -2043,18 +2051,18 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
length += i;
lastitemlength += i;
}
continue;
}
/* Other escapes need one byte */
length++;
/* A back reference needs an additional 2 bytes, plus either one or 5
bytes for a repeat. We also need to keep the value of the highest
back reference. */
if (c <= -ESC_REF) {
int refnum = -c - ESC_REF;
cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
......@@ -2075,20 +2083,20 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
}
continue;
case '^': /* Single-byte metacharacters */
case '.':
case '$':
length++;
lastitemlength = 1;
continue;
case '*': /* These repeats won't be after brackets; */
case '+': /* those are handled separately */
case '?':
length++;
goto POSSESSIVE;
/* This covers the cases of braced repeats after a single char, metachar,
class, or back reference. */
......@@ -2098,15 +2106,15 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
if (errorcode != 0)
return -1;
/* These special cases just insert one extra opcode */
if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
(minRepeats == 1 && maxRepeats == -1))
length++;
/* These cases might insert additional copies of a preceding character. */
else {
if (minRepeats != 1) {
length -= lastitemlength; /* Uncount the original char or metachar */
......@@ -2115,7 +2123,7 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
}
if (safelyCheckNextChar(ptr, patternEnd, '?'))
ptr++; /* Needs no extra length */
......@@ -2125,18 +2133,18 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
}
continue;
/* An alternation contains an offset to the next branch or ket. If any ims
options changed in the previous branch(es), and/or if we are in a
lookbehind assertion, extra space will be needed at the start of the
branch. This is handled by branch_extra. */
case '|':
if (brastackptr == 0)
cd.needOuterBracket = true;
length += 1 + LINK_SIZE + branch_extra;
continue;
/* A character class uses 33 characters provided that all the character
values are less than 256. Otherwise, it uses a bit map for low valued
characters, and individual items for others. Don't worry about character
......@@ -2144,7 +2152,7 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
compile. A character class that contains only one single-byte character
uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
where we can. (In UTF-8 mode we can do this only for chars < 128.) */
case '[': {
int class_optcount;
if (*(++ptr) == '^') {
......@@ -2153,46 +2161,46 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
else
class_optcount = 0;
bool class_utf8 = false;
for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
/* Check for escapes */
if (*ptr == '\\') {
c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
if (errorcode != 0)
return -1;
/* Handle escapes that turn into characters */
if (c >= 0)
goto NON_SPECIAL_CHARACTER;
/* Escapes that are meta-things. The normal ones just affect the
bit map, but Unicode properties require an XCLASS extended item. */
else
class_optcount = 10; /* \d, \s etc; make sure > 1 */
}
/* Anything else increments the possible optimization count. We have to
detect ranges here so that we can compute the number of extra ranges for
caseless wide characters when UCP support is available. If there are wide
characters, we are going to have to use an XCLASS, even for single
characters. */
else {
c = *ptr;
/* Come here from handling \ above when it escapes to a char value */
NON_SPECIAL_CHARACTER:
class_optcount++;
int d = -1;
if (safelyCheckNextChar(ptr, patternEnd, '-')) {
UChar const *hyptr = ptr++;
Char const *hyptr = ptr++;
if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
ptr++;
d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
......@@ -2204,17 +2212,17 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
if (d < 0)
ptr = hyptr; /* go back to hyphen as data */
}
/* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
127 for caseless matching, we will need to use an XCLASS. */
if (d >= 0) {
class_optcount = 10; /* Ensure > 1 */
if (d < c) {
errorcode = ERR8;
return -1;
}
if ((d > 255 || (ignoreCase && d > 127))) {
unsigned char buffer[6];
if (!class_utf8) /* Allow for XCLASS overhead */
......@@ -2222,13 +2230,13 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
class_utf8 = true;
length += LINK_SIZE + 2;
}
/* If we have UCP support, find out how many extra ranges are
needed to map the other case of characters within this range. We
have to mimic the range optimization here, because extending the
range upwards might push d over a boundary that makes it use
another byte in the UTF-8 representation. */
if (ignoreCase) {
int occ, ocd;
int cc = c;
......@@ -2236,7 +2244,7 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
if (occ >= c && ocd <= d)
continue; /* Skip embedded */
if (occ < c && ocd >= c - 1) /* Extend the basic range */
{ /* if there is overlap, */
c = occ; /* noting that if occ < c */
......@@ -2247,26 +2255,26 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
d = ocd;
continue;
}
/* An extra item is needed */
length += 1 + encodeUTF8(occ, buffer) +
((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
}
}
/* The length of the (possibly extended) range */
length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
}
}
/* We have a single character. There is nothing to be done unless we
are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
support. */
else {
if ((c > 255 || (ignoreCase && c > 127))) {
unsigned char buffer[6];
......@@ -2281,12 +2289,12 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
}
}
if (ptr >= patternEnd) { /* Missing terminating ']' */
errorcode = ERR6;
return -1;
}
/* We can optimize when there was only one optimizable character.
Note that this does not detect the case of a negated single character.
In that case we do an incorrect length computation, but it's not a serious
......@@ -2298,10 +2306,10 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
/* Here, we handle repeats for the class opcodes. */
{
length += 33;
/* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
we also need extra for wrapping the whole thing in a sub-pattern. */
if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
if (errorcode != 0)
......@@ -2322,62 +2330,62 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
/* Brackets may be genuine groups or special things */
case '(': {
int branch_newextra = 0;
int bracket_length = 1 + LINK_SIZE;
bool capturing = false;
/* Handle special forms of bracket, which all start (? */
if (safelyCheckNextChar(ptr, patternEnd, '?')) {
switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
/* Non-referencing groups and lookaheads just move the pointer on, and
then behave like a non-special bracket, except that they don't increment
the count of extracting brackets. Ditto for the "once only" bracket,
which is in Perl from version 5.005. */
case ':':
case '=':
case '!':
ptr += 2;
break;
/* Else loop checking valid options until ) is met. Anything else is an
error. If we are without any brackets, i.e. at top level, the settings
act as if specified in the options, so massage the options immediately.
This is for backward compatibility with Perl 5.004. */
default:
errorcode = ERR12;
return -1;
}
} else
capturing = 1;
/* Capturing brackets must be counted so we can process escapes in a
Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
an additional 3 bytes of memory per capturing bracket. */
if (capturing) {
bracount++;
if (bracount > EXTRACT_BASIC_MAX)
bracket_length += 3;
}
/* Save length for computing whole length at end if there's a repeat that
requires duplication of the group. Also save the current value of
branch_extra, and start the new group with the new value. If non-zero, this
will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
if (brastackptr >= sizeof(brastack)/sizeof(int)) {
errorcode = ERR17;
return -1;
}
bralenstack[brastackptr] = branch_extra;
branch_extra = branch_newextra;
brastack[brastackptr++] = length;
length += bracket_length;
continue;
......@@ -2398,10 +2406,10 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
else
duplength = 0;
/* Leave ptr at the final char; for readRepeatCounts this happens
automatically; for the others we need an increment. */
if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
if (errorcode)
......@@ -2422,12 +2430,12 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
minRepeats = 1;
maxRepeats = 1;
}
/* If the minimum is zero, we have to allow for an OP_BRAZERO before the
group, and if the maximum is greater than zero, we have to replicate
maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
bracket set. */
int repeatsLength;
if (minRepeats == 0) {
length++;
......@@ -2444,13 +2452,13 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
}
}
}
/* When the minimum is greater than zero, we have to replicate up to
minval-1 times, with no additions required in the copies. Then, if there
is a limited maximum we have to replicate up to maxval-1 times allowing
for a BRAZERO item before each optional copy and nesting brackets for all
but one of the optional copies. */
else {
repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
if (repeatsLength < 0) {
......@@ -2471,9 +2479,9 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
return -1;
}
}
/* Allow space for once brackets for "possessive quantifier" */
if (safelyCheckNextChar(ptr, patternEnd, '+')) {
ptr++;
length += 2 + 2 * LINK_SIZE;
......@@ -2484,7 +2492,7 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
/* Non-special character. It won't be space or # in extended mode, so it is
always a genuine character. If we are in a \Q...\E sequence, check for the
end; if not, we have a literal. */
default:
NORMAL_CHAR:
length += 2; /* For a one-byte character */
......@@ -2498,11 +2506,11 @@ static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt
length += i;
lastitemlength += i;
}
continue;
}
}
length += 2 + LINK_SIZE; /* For final KET and END */
cd.numCapturingBrackets = bracount;
......@@ -2537,7 +2545,8 @@ static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)
return 0;
}
JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
template <typename Char>
JSRegExp* jsRegExpCompile(const Char* pattern, int patternLength,
JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
unsigned* numSubpatterns, const char** errorptr,
malloc_t* allocate_function, free_t* free_function)
......@@ -2547,9 +2556,9 @@ JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
if (!errorptr)
return 0;
*errorptr = NULL;
CompileData cd;
ErrorCode errorcode = ERR0;
/* Call this once just to count the brackets. */
calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
......@@ -2557,29 +2566,29 @@ JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
if (errorcode)
return returnError(errorcode, errorptr);
if (length > MAX_PATTERN_SIZE)
return returnError(ERR16, errorptr);
size_t size = length + sizeof(JSRegExp);
JSRegExp* re = reinterpret_cast<JSRegExp*>((*allocate_function)(size));
if (!re)
return returnError(ERR13, errorptr);
re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
/* The starting points of the name/number translation table and of the code are
passed around in the compile data block. */
const unsigned char* codeStart = (const unsigned char*)(re + 1);
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
of the function here. */
const UChar* ptr = (const UChar*)pattern;
const UChar* patternEnd = pattern + patternLength;
const Char* ptr = (const Char*)pattern;
const Char* patternEnd = pattern + patternLength;
unsigned char* code = (unsigned char*)codeStart;
int firstbyte, reqbyte;
int bracketCount = 0;
......@@ -2591,44 +2600,44 @@ JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
}
re->top_bracket = bracketCount;
re->top_backref = cd.top_backref;
/* If not reached end of pattern on success, there's an excess bracket. */
if (errorcode == 0 && ptr < patternEnd)
errorcode = ERR10;
/* Fill in the terminating state and check for disastrous overflow, but
if debugging, leave the test till after things are printed out. */
*code++ = OP_END;
ASSERT(code - codeStart <= length);
if (code - codeStart > length)
errorcode = ERR7;
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
if (re->top_backref > re->top_bracket)
errorcode = ERR15;
/* Failed to compile, or error while post-processing */
if (errorcode != ERR0) {
(*free_function)(reinterpret_cast<void*>(re));
return returnError(errorcode, errorptr);
}
/* If the anchored option was not passed, set the flag if we can determine that
the pattern is anchored by virtue of ^ characters or \A or anything else (such
as starting with .* when DOTALL is set).
Otherwise, if we know what the first character has to be, save it, because that
speeds up unanchored matches no end. If not, see if we can set the
UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
start with ^. and also when all branches start with .* for non-DOTALL matches.
*/
if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
re->options |= IsAnchoredOption;
else {
......@@ -2649,11 +2658,11 @@ JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
re->options |= UseMultiLineFirstByteOptimizationOption;
}
}
/* For an anchored pattern, we use the "required byte" only if it follows a
variable length item in the regex. Remove the caseless flag for non-caseable
bytes. */
if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VARY))) {
int ch = reqbyte & 255;
if (ch < 127) {
......@@ -2661,12 +2670,32 @@ JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
re->options |= UseRequiredByteOptimizationOption;
}
}
if (numSubpatterns)
*numSubpatterns = re->top_bracket;
return re;
}
/* Explicit instantiation of the jsRegExpCompile template for patterns whose
code units are unsigned short. The template is defined in this file, so the
instantiation is requested here; presumably this is the variant used for
two-byte strings by callers in other files -- confirm against the header. */
template
JSRegExp* jsRegExpCompile<unsigned short>(const unsigned short* pattern,
int patternLength,
JSRegExpIgnoreCaseOption ignoreCase,
JSRegExpMultilineOption multiline,
unsigned* numSubpatterns,
const char** errorptr,
malloc_t* allocate_function,
free_t* free_function);
/* Explicit instantiation of the jsRegExpCompile template for patterns whose
code units are char. The template is defined in this file, so the
instantiation is requested here; presumably this is the variant used for
ASCII strings by callers in other files -- confirm against the header. */
template
JSRegExp* jsRegExpCompile<char>(const char* pattern,
int patternLength,
JSRegExpIgnoreCaseOption ignoreCase,
JSRegExpMultilineOption multiline,
unsigned* numSubpatterns,
const char** errorptr,
malloc_t* allocate_function,
free_t* free_function);
void jsRegExpFree(JSRegExp* re, free_t* free_function)
{
(*free_function)(reinterpret_cast<void*>(re));
......
......@@ -69,36 +69,38 @@ typedef void* ReturnLocation;
/* Structure for building a chain of data for holding the values of
the subject pointer at the start of each bracket, used to detect when
an empty string has been matched by a bracket to break infinite loops. */
an empty string has been matched by a bracket to break infinite loops. */
template <typename Char>
struct BracketChainNode {
BracketChainNode* previousBracket;
const UChar* bracketStart;
/* The bracket chain as it was before this node was linked in; startNewGroup
stores the frame's current args.bracketChain here. */
BracketChainNode<Char>* previousBracket;
/* The subject pointer recorded when the group was started; startNewGroup
stores the frame's args.subjectPtr here. */
const Char* bracketStart;
};
template <typename Char>
struct MatchFrame {
ReturnLocation returnLocation;
struct MatchFrame* previousFrame;
struct MatchFrame<Char>* previousFrame;
/* Function arguments that may change */
struct {
const UChar* subjectPtr;
const Char* subjectPtr;
const unsigned char* instructionPtr;
int offsetTop;
BracketChainNode* bracketChain;
BracketChainNode<Char>* bracketChain;
} args;
/* PCRE uses "fake" recursion built off of gotos, thus
stack-based local variables are not safe to use. Instead we have to
store local variables on the current MatchFrame. */
struct {
const unsigned char* data;
const unsigned char* startOfRepeatingBracket;
const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
const Char* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
const unsigned char* instructionPtrAtStartOfOnce;
int repeatOthercase;
int ctype;
int fc;
int fi;
......@@ -109,22 +111,23 @@ struct MatchFrame {
int saveOffset1;
int saveOffset2;
int saveOffset3;
BracketChainNode bracketChainNode;
BracketChainNode<Char> bracketChainNode;
} locals;
};
/* Structure for passing "static" information around between the functions
doing traditional NFA matching, so that they are thread-safe. */
template <typename Char>
struct MatchData {
int* offsetVector; /* Offset vector */
int offsetEnd; /* One past the end */
int offsetMax; /* The maximum usable for return data */
bool offsetOverflow; /* Set if too many extractions */
const UChar* startSubject; /* Start of the subject string */
const UChar* endSubject; /* End of the subject string */
const UChar* endMatchPtr; /* Subject position at end match */
const Char* startSubject; /* Start of the subject string */
const Char* endSubject; /* End of the subject string */
const Char* endMatchPtr; /* Subject position at end match */
int endOffsetTop; /* Highwater mark at end of match */
bool multiline;
bool ignoreCase;
......@@ -155,7 +158,8 @@ Arguments:
md pointer to matching data block, if isSubject is true
*/
static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md)
template <typename Char>
static void pchars(const Char* p, int length, bool isSubject, const MatchData& md)
{
if (isSubject && length > md.endSubject - p)
length = md.endSubject - p;
......@@ -187,10 +191,11 @@ Arguments:
Returns: true if matched
*/
static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
template <typename Char>
static bool matchRef(int offset, const Char* subjectPtr, int length, const MatchData<Char>& md)
{
const UChar* p = md.startSubject + md.offsetVector[offset];
const Char* p = md.startSubject + md.offsetVector[offset];
#ifdef DEBUG
if (subjectPtr >= md.endSubject)
printf("matching subject <null>");
......@@ -202,19 +207,19 @@ static bool matchRef(int offset, const UChar* subjectPtr, int length, const Matc
pchars(p, length, false, md);
printf("\n");
#endif
/* Always fail if not enough characters left */
if (length > md.endSubject - subjectPtr)
return false;
/* Separate the caseless case for speed */
if (md.ignoreCase) {
while (length-- > 0) {
UChar c = *p++;
Char c = *p++;
int othercase = kjs_pcre_ucp_othercase(c);
UChar d = *subjectPtr++;
Char d = *subjectPtr++;
if (c != d && othercase != d)
return false;
}
......@@ -224,7 +229,7 @@ static bool matchRef(int offset, const UChar* subjectPtr, int length, const Matc
if (*p++ != *subjectPtr++)
return false;
}
return true;
}
......@@ -296,6 +301,7 @@ Returns: 1 if matched ) these values are >= 0
static const unsigned FRAMES_ON_STACK = 16;
template <typename Char>
struct MatchStack {
MatchStack()
: framesEnd(frames + FRAMES_ON_STACK)
......@@ -304,27 +310,27 @@ struct MatchStack {
{
ASSERT((sizeof(frames) / sizeof(frames[0])) == FRAMES_ON_STACK);
}
MatchFrame frames[FRAMES_ON_STACK];
MatchFrame* framesEnd;
MatchFrame* currentFrame;
MatchFrame<Char> frames[FRAMES_ON_STACK];
MatchFrame<Char>* framesEnd;
MatchFrame<Char>* currentFrame;
unsigned size;
inline bool canUseStackBufferForNextFrame()
{
return size < FRAMES_ON_STACK;
}
inline MatchFrame* allocateNextFrame()
inline MatchFrame<Char>* allocateNextFrame()
{
if (canUseStackBufferForNextFrame())
return currentFrame + 1;
return new MatchFrame;
return new MatchFrame<Char>;
}
inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation)
inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode<Char>* bracketChain, ReturnLocation returnLocation)
{
MatchFrame* newframe = allocateNextFrame();
MatchFrame<Char>* newframe = allocateNextFrame();
newframe->previousFrame = currentFrame;
newframe->args.subjectPtr = currentFrame->args.subjectPtr;
......@@ -336,10 +342,10 @@ struct MatchStack {
currentFrame = newframe;
}
inline void popCurrentFrame()
{
MatchFrame* oldFrame = currentFrame;
MatchFrame<Char>* oldFrame = currentFrame;
currentFrame = currentFrame->previousFrame;
if (size > FRAMES_ON_STACK)
delete oldFrame;
......@@ -353,7 +359,8 @@ struct MatchStack {
}
};
static int matchError(int errorCode, MatchStack& stack)
template <typename Char>
static int matchError(int errorCode, MatchStack<Char>& stack)
{
stack.popAllFrames();
return errorCode;
......@@ -377,13 +384,14 @@ static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* su
}
}
static inline void startNewGroup(MatchFrame* currentFrame)
template <typename Char>
static inline void startNewGroup(MatchFrame<Char>* currentFrame)
{
/* At the start of a bracketed group, add the current subject pointer to the
stack of such pointers, to be re-instated at the end of the group when we hit
the closing ket. When match() is called in other circumstances, we don't add to
this stack. */
currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain;
currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr;
currentFrame->args.bracketChain = &currentFrame->locals.bracketChainNode;
......@@ -404,14 +412,15 @@ static inline void repeatInformationFromInstructionOffset(short instructionOffse
maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
}
static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
template <typename Char>
static int match(const Char* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData<Char>& md)
{
bool isMatch = false;
int min;
bool minimize = false; /* Initialization not really needed, but some compilers think so. */
unsigned matchCount = 0;
MatchStack stack;
MatchStack<Char> stack;
/* The opcode jump table. */
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
......@@ -419,13 +428,13 @@ static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, i
static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
#undef EMIT_JUMP_TABLE_ENTRY
#endif
/* One-time setup of the opcode jump table. */
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
for (int i = 255; !opcodeJumpTable[i]; i--)
opcodeJumpTable[i] = &&CAPTURING_BRACKET;
#endif
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
// Shark shows this as a hot line
// Using a static const here makes this line disappear, but makes later access hotter (not sure why)
......@@ -438,20 +447,20 @@ static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, i
stack.currentFrame->args.offsetTop = offsetTop;
stack.currentFrame->args.bracketChain = 0;
startNewGroup(stack.currentFrame);
/* This is where control jumps back to to effect "recursion" */
RECURSE:
if (++matchCount > matchLimit)
return matchError(JSRegExpErrorHitLimit, stack);
/* Now start processing the operations. */
#ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
while (true)
#endif
{
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
#define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
#define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
......@@ -459,7 +468,7 @@ RECURSE:
#define BEGIN_OPCODE(opcode) case OP_##opcode
#define NEXT_OPCODE continue
#endif
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
NEXT_OPCODE;
#else
......@@ -467,7 +476,7 @@ RECURSE:
#endif
{
/* Non-capturing bracket: optimized */
BEGIN_OPCODE(BRA):
NON_CAPTURING_BRACKET:
DPRINTF(("start bracket 0\n"));
......@@ -479,27 +488,27 @@ RECURSE:
} while (*stack.currentFrame->args.instructionPtr == OP_ALT);
DPRINTF(("bracket 0 failed\n"));
RRETURN;
/* Skip over large extraction number data if encountered. */
BEGIN_OPCODE(BRANUMBER):
stack.currentFrame->args.instructionPtr += 3;
NEXT_OPCODE;
/* End of the pattern. */
BEGIN_OPCODE(END):
md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
isMatch = true;
RRETURN;
/* Assertion brackets. Check the alternative branches in turn - the
matching won't pass the KET for an assertion. If any one branch matches,
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
start of each branch to move the current point backwards, so the code at
this level is identical to the lookahead case. */
BEGIN_OPCODE(ASSERT):
do {
RECURSIVE_MATCH_STARTNG_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
......@@ -509,17 +518,17 @@ RECURSE:
} while (*stack.currentFrame->args.instructionPtr == OP_ALT);
if (*stack.currentFrame->args.instructionPtr == OP_KET)
RRETURN_NO_MATCH;
/* Continue from after the assertion, updating the offsets high water
mark, since extracts may have been taken during the assertion. */
advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
stack.currentFrame->args.offsetTop = md.endOffsetTop;
NEXT_OPCODE;
/* Negative assertion: all branches must fail to match */
BEGIN_OPCODE(ASSERT_NOT):
do {
RECURSIVE_MATCH_STARTNG_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
......@@ -527,23 +536,23 @@ RECURSE:
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
} while (*stack.currentFrame->args.instructionPtr == OP_ALT);
stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
NEXT_OPCODE;
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group and go to there. */
BEGIN_OPCODE(ALT):
advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
NEXT_OPCODE;
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
that it may occur zero times. It may repeat infinitely, or not at all -
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
repeat limits are compiled as a number of copies, with the optional ones
preceded by BRAZERO or BRAMINZERO. */
BEGIN_OPCODE(BRAZERO): {
stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain);
......@@ -553,7 +562,7 @@ RECURSE:
stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
NEXT_OPCODE;
}
BEGIN_OPCODE(BRAMINZERO): {
stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
......@@ -563,12 +572,12 @@ RECURSE:
stack.currentFrame->args.instructionPtr++;
NEXT_OPCODE;
}
/* End of a group, repeated or non-repeating. If we are at the end of
an assertion "group", stop matching and return 1, but record the
current high water mark for use by positive assertions. Do this also
for the "once" (not-backup up) groups. */
BEGIN_OPCODE(KET):
BEGIN_OPCODE(KETRMIN):
BEGIN_OPCODE(KETRMAX):
......@@ -584,30 +593,30 @@ RECURSE:
isMatch = true;
RRETURN;
}
/* In all other cases except a conditional group we have to check the
group number back at the start and if necessary complete handling an
extraction by setting the offsets and bumping the high water mark. */
stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
/* For extended extraction brackets (large number), we have to fish out
the number from a dummy opcode at the start. */
if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE);
stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
#ifdef DEBUG
printf("end bracket %d", stack.currentFrame->locals.number);
printf("\n");
#endif
/* Test for a numbered group. This includes groups called as a result
of recursion. Note that whole-pattern recursion is coded as a recurse
into group 0, so it won't be picked up here. Instead, we catch it when
the OP_END is reached. */
if (stack.currentFrame->locals.number > 0) {
if (stack.currentFrame->locals.offset >= md.offsetMax)
md.offsetOverflow = true;
......@@ -619,21 +628,21 @@ RECURSE:
stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
}
}
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
NEXT_OPCODE;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. */
if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
if (isMatch)
......@@ -650,7 +659,7 @@ RECURSE:
RRETURN;
}
RRETURN;
/* Start of subject. */
BEGIN_OPCODE(CIRC):
......@@ -682,28 +691,28 @@ RECURSE:
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr++;
NEXT_OPCODE;
/* Word boundary assertions */
BEGIN_OPCODE(NOT_WORD_BOUNDARY):
BEGIN_OPCODE(WORD_BOUNDARY): {
bool currentCharIsWordChar = false;
bool previousCharIsWordChar = false;
if (stack.currentFrame->args.subjectPtr > md.startSubject)
previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
if (stack.currentFrame->args.subjectPtr < md.endSubject)
currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
/* Now see if the situation is what we want */
bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
RRETURN_NO_MATCH;
NEXT_OPCODE;
}
/* Match a single character type; inline for speed */
BEGIN_OPCODE(NOT_NEWLINE):
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -743,7 +752,7 @@ RECURSE:
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr++;
NEXT_OPCODE;
BEGIN_OPCODE(NOT_WORDCHAR):
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -751,7 +760,7 @@ RECURSE:
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr++;
NEXT_OPCODE;
BEGIN_OPCODE(WORDCHAR):
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -759,7 +768,7 @@ RECURSE:
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr++;
NEXT_OPCODE;
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
......@@ -767,23 +776,23 @@ RECURSE:
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
loops). */
BEGIN_OPCODE(REF):
stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
/* If the reference is unset, set the length to be longer than the amount
of subject left; this ensures that every attempt at a match fails. We
can't just fail here, because of the possibility of quantifiers with zero
minima. */
if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
stack.currentFrame->locals.length = 0;
else
stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
/* Set up for repetition, or handle the non-repeated case */
switch (*stack.currentFrame->args.instructionPtr) {
case OP_CRSTAR:
case OP_CRMINSTAR:
......@@ -793,7 +802,7 @@ RECURSE:
case OP_CRMINQUERY:
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
......@@ -803,36 +812,36 @@ RECURSE:
stack.currentFrame->locals.max = INT_MAX;
stack.currentFrame->args.instructionPtr += 5;
break;
default: /* No repeat follows */
if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
RRETURN_NO_MATCH;
stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
NEXT_OPCODE;
}
/* If the length of the reference is zero, just continue with the
main loop. */
if (stack.currentFrame->locals.length == 0)
NEXT_OPCODE;
/* First, ensure the minimum number of matches are present. */
for (int i = 1; i <= min; i++) {
if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
RRETURN_NO_MATCH;
stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
}
/* If min = max, continue at the same level without recursion.
They are not both allowed to be zero. */
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
/* If minimizing, keep trying and advancing the pointer */
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -844,9 +853,9 @@ RECURSE:
}
/* Control never reaches here */
}
/* If maximizing, find the longest string and work backwards */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
for (int i = min; i < stack.currentFrame->locals.max; i++) {
......@@ -863,23 +872,23 @@ RECURSE:
RRETURN_NO_MATCH;
}
/* Control never reaches here */
/* Match a bit-mapped character class, possibly repeatedly. This op code is
used when all the characters in the class have values in the range 0-255,
and either the matching is caseful, or the characters are in the range
0-127 when UTF-8 processing is enabled. The only difference between
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
encountered.
First, look past the end of the item to see if there is repeat information
following. Then obey similar code to character type repeats - written out
again for speed. */
BEGIN_OPCODE(NCLASS):
BEGIN_OPCODE(CLASS):
stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
switch (*stack.currentFrame->args.instructionPtr) {
case OP_CRSTAR:
case OP_CRMINSTAR:
......@@ -889,7 +898,7 @@ RECURSE:
case OP_CRMINQUERY:
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
......@@ -899,14 +908,14 @@ RECURSE:
stack.currentFrame->locals.max = INT_MAX;
stack.currentFrame->args.instructionPtr += 5;
break;
default: /* No repeat follows */
min = stack.currentFrame->locals.max = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
for (int i = 1; i <= min; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -919,13 +928,13 @@ RECURSE:
RRETURN_NO_MATCH;
}
}
/* If max == min we can continue with the main loop without the
need to recurse. */
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
NEXT_OPCODE;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
if (minimize) {
......@@ -949,7 +958,7 @@ RECURSE:
/* If maximizing, find the longest possible run, then work backwards. */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
break;
......@@ -970,17 +979,17 @@ RECURSE:
if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
break; /* Stop if tried at original pos */
}
RRETURN;
}
/* Control never reaches here */
/* Match an extended character class. */
BEGIN_OPCODE(XCLASS):
stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */
stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */
switch (*stack.currentFrame->args.instructionPtr) {
case OP_CRSTAR:
case OP_CRMINSTAR:
......@@ -990,7 +999,7 @@ RECURSE:
case OP_CRMINQUERY:
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
......@@ -1000,13 +1009,13 @@ RECURSE:
stack.currentFrame->locals.max = INT_MAX;
stack.currentFrame->args.instructionPtr += 5;
break;
default: /* No repeat follows */
min = stack.currentFrame->locals.max = 1;
}
/* First, ensure the minimum number of matches are present. */
for (int i = 1; i <= min; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -1014,16 +1023,16 @@ RECURSE:
if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data))
RRETURN_NO_MATCH;
}
/* If max == min we can continue with the main loop without the
need to recurse. */
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -1037,9 +1046,9 @@ RECURSE:
}
/* Control never reaches here */
}
/* If maximizing, find the longest possible run, then work backwards. */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
for (int i = min; i < stack.currentFrame->locals.max; i++) {
......@@ -1059,11 +1068,11 @@ RECURSE:
}
RRETURN;
}
/* Control never reaches here */
/* Match a single character, casefully */
BEGIN_OPCODE(CHAR):
stack.currentFrame->locals.length = 1;
stack.currentFrame->args.instructionPtr++;
......@@ -1074,9 +1083,9 @@ RECURSE:
if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
RRETURN_NO_MATCH;
NEXT_OPCODE;
/* Match a single character, caselessly */
BEGIN_OPCODE(CHAR_IGNORING_CASE): {
stack.currentFrame->locals.length = 1;
stack.currentFrame->args.instructionPtr++;
......@@ -1089,9 +1098,9 @@ RECURSE:
RRETURN_NO_MATCH;
NEXT_OPCODE;
}
/* Match a single ASCII character. */
BEGIN_OPCODE(ASCII_CHAR):
if (md.endSubject == stack.currentFrame->args.subjectPtr)
RRETURN_NO_MATCH;
......@@ -1100,9 +1109,9 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
stack.currentFrame->args.instructionPtr += 2;
NEXT_OPCODE;
/* Match one of two cases of an ASCII letter. */
BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
if (md.endSubject == stack.currentFrame->args.subjectPtr)
RRETURN_NO_MATCH;
......@@ -1111,15 +1120,15 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
stack.currentFrame->args.instructionPtr += 2;
NEXT_OPCODE;
/* Match a single character repeatedly; different opcodes share code. */
BEGIN_OPCODE(EXACT):
min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
minimize = false;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATCHAR;
BEGIN_OPCODE(UPTO):
BEGIN_OPCODE(MINUPTO):
min = 0;
......@@ -1127,7 +1136,7 @@ RECURSE:
minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATCHAR;
BEGIN_OPCODE(STAR):
BEGIN_OPCODE(MINSTAR):
BEGIN_OPCODE(PLUS):
......@@ -1135,31 +1144,31 @@ RECURSE:
BEGIN_OPCODE(QUERY):
BEGIN_OPCODE(MINQUERY):
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
/* Common code for all repeated single-character matches. We can give
up quickly if there are fewer than the minimum number of characters left in
the subject. */
REPEATCHAR:
stack.currentFrame->locals.length = 1;
getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr)
RRETURN_NO_MATCH;
stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
if (stack.currentFrame->locals.fc <= 0xFFFF) {
int othercase = md.ignoreCase ? kjs_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1;
for (int i = 1; i <= min; i++) {
if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
RRETURN_NO_MATCH;
++stack.currentFrame->args.subjectPtr;
}
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
if (minimize) {
stack.currentFrame->locals.repeatOthercase = othercase;
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
......@@ -1193,16 +1202,16 @@ RECURSE:
/* Control never reaches here */
} else {
/* No case on surrogate pairs, so no need to bother with "othercase". */
for (int i = 1; i <= min; i++) {
if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
RRETURN_NO_MATCH;
stack.currentFrame->args.subjectPtr += 2;
}
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -1235,9 +1244,9 @@ RECURSE:
/* Control never reaches here */
}
/* Control never reaches here */
/* Match a negated single one-byte character. */
BEGIN_OPCODE(NOT): {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN_NO_MATCH;
......@@ -1254,20 +1263,20 @@ RECURSE:
}
NEXT_OPCODE;
}
/* Match a negated single one-byte character repeatedly. This is almost a
repeat of the code for a repeated single character, but I haven't found a
nice way of commoning these up that doesn't require a test of the
positive/negative option for each character match. Maybe that wouldn't add
very much to the time taken, but character matching *is* what this is all
about... */
BEGIN_OPCODE(NOTEXACT):
min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
minimize = false;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATNOTCHAR;
BEGIN_OPCODE(NOTUPTO):
BEGIN_OPCODE(NOTMINUPTO):
min = 0;
......@@ -1275,7 +1284,7 @@ RECURSE:
minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATNOTCHAR;
BEGIN_OPCODE(NOTSTAR):
BEGIN_OPCODE(NOTMINSTAR):
BEGIN_OPCODE(NOTPLUS):
......@@ -1283,16 +1292,16 @@ RECURSE:
BEGIN_OPCODE(NOTQUERY):
BEGIN_OPCODE(NOTMINQUERY):
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
/* Common code for all repeated single-byte matches. We can give up quickly
if there are fewer than the minimum number of bytes left in the
subject. */
REPEATNOTCHAR:
if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
RRETURN_NO_MATCH;
stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++;
/* The code is duplicated for the caseless and caseful cases, for speed,
since matching characters is likely to be quite common. First, ensure the
minimum number of matches are present. If min = max, continue at the same
......@@ -1300,13 +1309,13 @@ RECURSE:
the expression and advancing one matching character if failing, up to the
maximum. Alternatively, if maximizing, find the maximum number of
characters and work backwards. */
DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max));
if (md.ignoreCase) {
if (stack.currentFrame->locals.fc < 128)
stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
for (int i = 1; i <= min; i++) {
int d = *stack.currentFrame->args.subjectPtr++;
if (d < 128)
......@@ -1314,10 +1323,10 @@ RECURSE:
if (stack.currentFrame->locals.fc == d)
RRETURN_NO_MATCH;
}
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
NEXT_OPCODE;
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -1331,12 +1340,12 @@ RECURSE:
}
/* Control never reaches here */
}
/* Maximize case */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
break;
......@@ -1354,14 +1363,14 @@ RECURSE:
if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
break; /* Stop if tried at original pos */
}
RRETURN;
}
/* Control never reaches here */
}
/* Caseful comparisons */
else {
for (int i = 1; i <= min; i++) {
int d = *stack.currentFrame->args.subjectPtr++;
......@@ -1371,7 +1380,7 @@ RECURSE:
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -1383,12 +1392,12 @@ RECURSE:
}
/* Control never reaches here */
}
/* Maximize case */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
break;
......@@ -1409,17 +1418,17 @@ RECURSE:
}
}
/* Control never reaches here */
/* Match a single character type repeatedly; several different opcodes
share code. This is very similar to the code for single characters, but we
repeat it in the interests of efficiency. */
BEGIN_OPCODE(TYPEEXACT):
min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
minimize = true;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATTYPE;
BEGIN_OPCODE(TYPEUPTO):
BEGIN_OPCODE(TYPEMINUPTO):
min = 0;
......@@ -1427,7 +1436,7 @@ RECURSE:
minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
stack.currentFrame->args.instructionPtr += 3;
goto REPEATTYPE;
BEGIN_OPCODE(TYPESTAR):
BEGIN_OPCODE(TYPEMINSTAR):
BEGIN_OPCODE(TYPEPLUS):
......@@ -1435,19 +1444,19 @@ RECURSE:
BEGIN_OPCODE(TYPEQUERY):
BEGIN_OPCODE(TYPEMINQUERY):
repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
/* Common code for all repeated single character type matches. Note that
in UTF-8 mode, '.' matches a character of any length, but for the other
character types, the valid characters are all one-byte long. */
REPEATTYPE:
stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
(i.e. keep it out of the loop). Also we can test that there are at least
the minimum number of characters before we start. */
if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
RRETURN_NO_MATCH;
if (min > 0) {
......@@ -1459,7 +1468,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_NOT_DIGIT:
for (int i = 1; i <= min; i++) {
if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
......@@ -1467,7 +1476,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_DIGIT:
for (int i = 1; i <= min; i++) {
if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
......@@ -1475,7 +1484,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_NOT_WHITESPACE:
for (int i = 1; i <= min; i++) {
if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
......@@ -1483,7 +1492,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_WHITESPACE:
for (int i = 1; i <= min; i++) {
if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
......@@ -1491,7 +1500,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_NOT_WORDCHAR:
for (int i = 1; i <= min; i++) {
if (isWordChar(*stack.currentFrame->args.subjectPtr))
......@@ -1499,7 +1508,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_WORDCHAR:
for (int i = 1; i <= min; i++) {
if (!isWordChar(*stack.currentFrame->args.subjectPtr))
......@@ -1507,21 +1516,21 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
default:
ASSERT_NOT_REACHED();
return matchError(JSRegExpErrorInternal, stack);
} /* End switch(stack.currentFrame->locals.ctype) */
}
/* If min = max, continue at the same level without recursing */
if (min == stack.currentFrame->locals.max)
NEXT_OPCODE;
NEXT_OPCODE;
/* If minimizing, we have to test the rest of the pattern before each
subsequent match. */
if (minimize) {
for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
......@@ -1529,44 +1538,44 @@ RECURSE:
RRETURN;
if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
RRETURN;
int c = *stack.currentFrame->args.subjectPtr++;
switch (stack.currentFrame->locals.ctype) {
case OP_NOT_NEWLINE:
if (isNewline(c))
RRETURN;
break;
case OP_NOT_DIGIT:
if (isASCIIDigit(c))
RRETURN;
break;
case OP_DIGIT:
if (!isASCIIDigit(c))
RRETURN;
break;
case OP_NOT_WHITESPACE:
if (isSpaceChar(c))
RRETURN;
break;
case OP_WHITESPACE:
if (!isSpaceChar(c))
RRETURN;
break;
case OP_NOT_WORDCHAR:
if (isWordChar(c))
RRETURN;
break;
case OP_WORDCHAR:
if (!isWordChar(c))
RRETURN;
break;
default:
ASSERT_NOT_REACHED();
return matchError(JSRegExpErrorInternal, stack);
......@@ -1574,13 +1583,13 @@ RECURSE:
}
/* Control never reaches here */
}
/* If maximizing it is worth using inline code for speed, doing the type
test once at the start (i.e. keep it out of the loop). */
else {
stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */
switch (stack.currentFrame->locals.ctype) {
case OP_NOT_NEWLINE:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
......@@ -1589,7 +1598,7 @@ RECURSE:
stack.currentFrame->args.subjectPtr++;
}
break;
case OP_NOT_DIGIT:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1600,7 +1609,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_DIGIT:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1611,7 +1620,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_NOT_WHITESPACE:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1622,7 +1631,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_WHITESPACE:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1633,7 +1642,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_NOT_WORDCHAR:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1644,7 +1653,7 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
case OP_WORDCHAR:
for (int i = min; i < stack.currentFrame->locals.max; i++) {
if (stack.currentFrame->args.subjectPtr >= md.endSubject)
......@@ -1655,14 +1664,14 @@ RECURSE:
++stack.currentFrame->args.subjectPtr;
}
break;
default:
ASSERT_NOT_REACHED();
return matchError(JSRegExpErrorInternal, stack);
}
/* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
for (;;) {
RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
if (isMatch)
......@@ -1670,13 +1679,13 @@ RECURSE:
if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
break; /* Stop if tried at original pos */
}
/* Get here if we can't make it match with any permitted repetitions */
RRETURN;
}
/* Control never reaches here */
BEGIN_OPCODE(CRMINPLUS):
BEGIN_OPCODE(CRMINQUERY):
BEGIN_OPCODE(CRMINRANGE):
......@@ -1687,7 +1696,7 @@ RECURSE:
BEGIN_OPCODE(CRSTAR):
ASSERT_NOT_REACHED();
return matchError(JSRegExpErrorInternal, stack);
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
CAPTURING_BRACKET:
#else
......@@ -1698,71 +1707,71 @@ RECURSE:
mustn't change the current values of the data slot, because they may be set
from a previous iteration of this group, and be referred to by a reference
inside the group.
If the bracket fails to match, we need to restore this value and also the
values of the final offsets, in case they were set by a previous iteration of
the same bracket.
If there isn't enough space in the offset vector, treat this as if it were a
non-capturing bracket. Don't worry about setting the flag for the error case
here; that is handled in the code for KET. */
ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA);
stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA;
/* For extended extraction brackets (large number), we have to fish out the
number from a dummy opcode at the start. */
if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE);
stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
#ifdef DEBUG
printf("start bracket %d subject=", stack.currentFrame->locals.number);
pchars(stack.currentFrame->args.subjectPtr, 16, true, md);
printf("\n");
#endif
if (stack.currentFrame->locals.offset < md.offsetMax) {
stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset];
stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1];
stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3));
md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject;
do {
RECURSIVE_MATCH_STARTNG_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
if (isMatch)
RRETURN;
stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
} while (*stack.currentFrame->args.instructionPtr == OP_ALT);
DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number));
md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1;
md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2;
md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3;
RRETURN;
}
/* Insufficient room for saving captured contents */
goto NON_CAPTURING_BRACKET;
}
/* Do not stick any code in here without much thought; it is assumed
that "continue" in the code above comes out to here to repeat the main
loop. */
} /* End of main loop */
ASSERT_NOT_REACHED();
#ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
RRETURN_SWITCH:
switch (stack.currentFrame->returnLocation) {
case 0: goto RETURN;
......@@ -1793,12 +1802,12 @@ RRETURN_SWITCH:
case 48: goto RRETURN_48;
case 52: goto RRETURN_52;
}
ASSERT_NOT_REACHED();
return matchError(JSRegExpErrorInternal, stack);
#endif
RETURN:
return isMatch;
}
......@@ -1828,12 +1837,13 @@ Returns: > 0 => success; value is the number of elements filled in
< -1 => some kind of unexpected problem
*/
static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
template <typename Char>
static void tryFirstByteOptimization(const Char*& subjectPtr, const Char* endSubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOptimization, const Char* originalSubjectStart)
{
// If first_byte is set, try scanning to the first instance of that byte;
// there is no need to try to match against any earlier part of the subject string.
if (first_byte >= 0) {
UChar first_char = first_byte;
Char first_char = first_byte;
if (first_byte_caseless)
while (subjectPtr < endSubject) {
int c = *subjectPtr;
......@@ -1857,7 +1867,8 @@ static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endS
}
}
static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByte, const UChar*& reqBytePtr)
template <typename Char>
static bool tryRequiredByteOptimization(const Char*& subjectPtr, const Char* endSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByte, const Char*& reqBytePtr)
{
/* If req_byte is set, we know that that character must appear in the subject
for the match to succeed. If the first character is set, req_byte must be
......@@ -1866,7 +1877,7 @@ static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* e
unlimited repeats that aren't going to match. Writing separate code for
cased/caseless versions makes it go faster, as does using an autoincrement
and backing off on a match.
HOWEVER: when the subject string is very, very long, searching to its end can
take a long time, and give bad performance on quite ordinary patterns. This
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
......@@ -1874,7 +1885,7 @@ static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* e
*/
if (req_byte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
const Char* p = subjectPtr + (hasFirstByte ? 1 : 0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
......@@ -1912,30 +1923,31 @@ static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* e
return false;
}
template <typename Char>
int jsRegExpExecute(const JSRegExp* re,
const UChar* subject, int length, int start_offset, int* offsets,
const Char* subject, int length, int start_offset, int* offsets,
int offsetcount)
{
ASSERT(re);
ASSERT(subject);
ASSERT(offsetcount >= 0);
ASSERT(offsets || offsetcount == 0);
MatchData matchBlock;
MatchData<Char> matchBlock;
matchBlock.startSubject = subject;
matchBlock.endSubject = matchBlock.startSubject + length;
const UChar* endSubject = matchBlock.endSubject;
const Char* endSubject = matchBlock.endSubject;
matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption);
matchBlock.ignoreCase = (re->options & IgnoreCaseOption);
/* If the expression has got more back references than the offsets supplied can
hold, we get a temporary chunk of working store to use during the matching.
Otherwise, we can use the vector supplied, rounding down its size to a multiple
of 3. */
int ocount = offsetcount - (offsetcount % 3);
// FIXME: It is lame that we have to second-guess our caller here.
// The API should change either to fail hard when we don't have enough offset space
// or to stop asking our callers to pre-allocate in the first place.
......@@ -1948,36 +1960,36 @@ int jsRegExpExecute(const JSRegExp* re,
using_temporary_offsets = true;
} else
matchBlock.offsetVector = offsets;
matchBlock.offsetEnd = ocount;
matchBlock.offsetMax = (2*ocount)/3;
matchBlock.offsetOverflow = false;
/* Compute the minimum number of offsets that we need to reset each time. Doing
this makes a huge difference to execution time when there aren't many brackets
in the pattern. */
int resetcount = 2 + re->top_bracket * 2;
if (resetcount > offsetcount)
resetcount = ocount;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
initialize them to avoid reading uninitialized locations. */
if (matchBlock.offsetVector) {
int* iptr = matchBlock.offsetVector + ocount;
int* iend = iptr - resetcount/2 + 1;
while (--iptr >= iend)
*iptr = -1;
}
/* Set up the first character to match, if available. The first_byte value is
never set for an anchored regular expression, but the anchoring may be forced
at run time, so we have to test for anchoring. The first char may be unset for
an unanchored pattern, of course. If there's no first char and the pattern was
studied, there may be a bitmap of possible first characters. */
bool first_byte_caseless = false;
int first_byte = -1;
if (re->options & UseFirstByteOptimizationOption) {
......@@ -1985,10 +1997,10 @@ int jsRegExpExecute(const JSRegExp* re,
if ((first_byte_caseless = (re->first_byte & REQ_IGNORE_CASE)))
first_byte = toLowerCase(first_byte);
}
/* For anchored or unanchored matches, there may be a "last known required
character" set. */
bool req_byte_caseless = false;
int req_byte = -1;
int req_byte2 = -1;
......@@ -1997,14 +2009,14 @@ int jsRegExpExecute(const JSRegExp* re,
req_byte_caseless = (re->req_byte & REQ_IGNORE_CASE);
req_byte2 = flipCase(req_byte);
}
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
the loop runs just once. */
const UChar* startMatch = subject + start_offset;
const UChar* reqBytePtr = startMatch - 1;
const Char* startMatch = subject + start_offset;
const Char* reqBytePtr = startMatch - 1;
bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption;
do {
/* Reset the maximum number of extractions we might see. */
if (matchBlock.offsetVector) {
......@@ -2013,23 +2025,23 @@ int jsRegExpExecute(const JSRegExp* re,
while (iptr < iend)
*iptr++ = -1;
}
tryFirstByteOptimization(startMatch, endSubject, first_byte, first_byte_caseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset);
if (tryRequiredByteOptimization(startMatch, endSubject, req_byte, req_byte2, req_byte_caseless, first_byte >= 0, reqBytePtr))
break;
/* When a match occurs, substrings will be set for all internal extractions;
we just need to set up the whole thing as substring 0 before returning. If
there were too many extractions, set the return code to zero. In the case
where we had to get some local store to hold offsets for backreferences, copy
those back references that we can. In this case there need not be overflow
if certain parts of the pattern were not used. */
/* The code starts after the JSRegExp block and the capture name table. */
const unsigned char* start_code = (const unsigned char*)(re + 1);
int returnCode = match(startMatch, start_code, 2, matchBlock);
int returnCode = match<Char>(startMatch, start_code, 2, matchBlock);
/* When the result is no match, advance the pointer to the next character
and continue. */
if (returnCode == 0) {
......@@ -2042,10 +2054,10 @@ int jsRegExpExecute(const JSRegExp* re,
DPRINTF((">>>> error: returning %d\n", returnCode));
return returnCode;
}
/* We have a match! Copy the offset information from temporary store if
necessary */
if (using_temporary_offsets) {
if (offsetcount >= 4) {
memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetcount - 2) * sizeof(int));
......@@ -2053,29 +2065,41 @@ int jsRegExpExecute(const JSRegExp* re,
}
if (matchBlock.endOffsetTop > offsetcount)
matchBlock.offsetOverflow = true;
DPRINTF(("Freeing temporary memory\n"));
delete [] matchBlock.offsetVector;
}
returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2;
if (offsetcount < 2)
returnCode = 0;
else {
offsets[0] = startMatch - matchBlock.startSubject;
offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject;
}
DPRINTF((">>>> returning %d\n", returnCode));
return returnCode;
} while (!(re->options & IsAnchoredOption) && startMatch <= endSubject);
if (using_temporary_offsets) {
DPRINTF(("Freeing temporary memory\n"));
delete [] matchBlock.offsetVector;
}
DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
return JSRegExpErrorNoMatch;
}
// Explicit instantiation of jsRegExpExecute for subjects made of
// unsigned short code units.
// Arguments, in order: re, subject, length, start_offset, offsets, offsetcount.
template int jsRegExpExecute<unsigned short>(
    const JSRegExp*, const unsigned short*, int, int, int*, int);
// Explicit instantiation of jsRegExpExecute for subjects made of
// char code units.
// Arguments, in order: re, subject, length, start_offset, offsets, offsetcount.
template int jsRegExpExecute<char>(
    const JSRegExp*, const char*, int, int, int*, int);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment