Implements ES6 String.prototype.normalize method.

BUG=v8:2943
LOG=Y
TEST=Unit tests for "real life" use cases, edge cases, various types of normalization.

==========================

This is identical to the previous CL
   https://codereview.chromium.org/40133004/
with two differences:
 * Added a dummy implementation of String.prototype.normalize to be used when v8 is compiled without intl support
 * Rebased the test files for webkit. That was the only reason for the previous failure (and revert).

Thank you,
Mihai

R=svenpanne@chromium.org

Review URL: https://codereview.chromium.org/68133016

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18972 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 39d1534d
...@@ -45,6 +45,11 @@ var AVAILABLE_SERVICES = ['collator', ...@@ -45,6 +45,11 @@ var AVAILABLE_SERVICES = ['collator',
'dateformat', 'dateformat',
'breakiterator']; 'breakiterator'];
// Names of the normalization forms accepted by String.prototype.normalize.
// The position of a name in this list is the numeric id handed to
// %StringNormalize, so the order must not change.
var NORMALIZATION_FORMS = ['NFC', 'NFD', 'NFKC', 'NFKD'];
/** /**
* Caches available locales for each service. * Caches available locales for each service.
*/ */
...@@ -1986,6 +1991,40 @@ $Object.defineProperty($String.prototype, 'localeCompare', { ...@@ -1986,6 +1991,40 @@ $Object.defineProperty($String.prototype, 'localeCompare', {
%SetNativeFlag($String.prototype.localeCompare); %SetNativeFlag($String.prototype.localeCompare);
/**
* Unicode normalization. This method is called with one argument that
* specifies the normalization form.
* If none is specified, "NFC" is assumed.
* If the form is not one of "NFC", "NFD", "NFKC", or "NFKD", then throw
* a RangeError Exception.
*/
$Object.defineProperty($String.prototype, 'normalize', {
value: function(that) {
if (%_IsConstructCall()) {
throw new $TypeError(ORDINARY_FUNCTION_CALLED_AS_CONSTRUCTOR);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.normalize");
var form = $String(%_Arguments(0) || 'NFC');
var normalizationForm = NORMALIZATION_FORMS.indexOf(form);
if (normalizationForm === -1) {
throw new $RangeError('The normalization form should be one of '
+ NORMALIZATION_FORMS.join(', ') + '.');
}
return %StringNormalize(this, normalizationForm);
},
writable: true,
configurable: true,
enumerable: false
});
%FunctionSetName($String.prototype.normalize, 'normalize');
%FunctionRemovePrototype($String.prototype.normalize);
%SetNativeFlag($String.prototype.normalize);
/** /**
* Formats a Number object (this) using locale and options values. * Formats a Number object (this) using locale and options values.
* If locale or options are omitted, defaults are used. * If locale or options are omitted, defaults are used.
......
...@@ -13976,6 +13976,35 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_InternalCompare) { ...@@ -13976,6 +13976,35 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_InternalCompare) {
} }
// Runtime backend of String.prototype.normalize.
//   args[0]: the string to normalize.
//   args[1]: index into normalizationForms
//            (0 = NFC, 1 = NFD, 2 = NFKC, 3 = NFKD).
// Returns the normalized string, or undefined if ICU reports a failure.
RUNTIME_FUNCTION(MaybeObject*, Runtime_StringNormalize) {
  HandleScope scope(isolate);
  static const UNormalizationMode normalizationForms[] =
      { UNORM_NFC, UNORM_NFD, UNORM_NFKC, UNORM_NFKD };
  static const int kNormalizationFormCount =
      static_cast<int>(sizeof(normalizationForms) /
                       sizeof(normalizationForms[0]));

  ASSERT(args.length() == 2);

  CONVERT_ARG_HANDLE_CHECKED(String, stringValue, 0);
  CONVERT_NUMBER_CHECKED(int, form_id, Int32, args[1]);
  // Never index the table with an id the caller did not validate.
  RUNTIME_ASSERT(form_id >= 0 && form_id < kNormalizationFormCount);

  v8::String::Value string_value(v8::Utils::ToLocal(stringValue));
  const UChar* u_value = reinterpret_cast<const UChar*>(*string_value);

  // TODO(mnita): check Normalizer2 (not available in ICU 46)
  UErrorCode status = U_ZERO_ERROR;
  // Wrap the buffer with its explicit length. Passing the bare pointer would
  // build the UnicodeString up to the first U+0000 only and silently drop
  // everything after an embedded NUL. string_value outlives this alias.
  icu::UnicodeString input(false, u_value, string_value.length());
  icu::UnicodeString result;
  icu::Normalizer::normalize(input, normalizationForms[form_id], 0,
                             result, status);
  if (U_FAILURE(status)) {
    return isolate->heap()->undefined_value();
  }

  return *isolate->factory()->NewStringFromTwoByte(
      Vector<const uint16_t>(
          reinterpret_cast<const uint16_t*>(result.getBuffer()),
          result.length()));
}
RUNTIME_FUNCTION(MaybeObject*, Runtime_CreateBreakIterator) { RUNTIME_FUNCTION(MaybeObject*, Runtime_CreateBreakIterator) {
HandleScope scope(isolate); HandleScope scope(isolate);
......
...@@ -568,6 +568,9 @@ namespace internal { ...@@ -568,6 +568,9 @@ namespace internal {
F(CreateCollator, 3, 1) \ F(CreateCollator, 3, 1) \
F(InternalCompare, 3, 1) \ F(InternalCompare, 3, 1) \
\ \
/* String.prototype.normalize. */ \
F(StringNormalize, 2, 1) \
\
/* Break iterator. */ \ /* Break iterator. */ \
F(CreateBreakIterator, 3, 1) \ F(CreateBreakIterator, 3, 1) \
F(BreakIteratorAdoptText, 2, 1) \ F(BreakIteratorAdoptText, 2, 1) \
......
...@@ -186,6 +186,28 @@ function StringMatch(regexp) { ...@@ -186,6 +186,28 @@ function StringMatch(regexp) {
} }
// Form names that StringNormalize accepts; anything else is a RangeError.
var NORMALIZATION_FORMS = [
  'NFC',
  'NFD',
  'NFKC',
  'NFKD'
];
// ECMA-262 v6, section 21.1.3.12
//
// For now we do nothing, as proper normalization requires big tables.
// If Intl is enabled, then i18n.js will override it and provide the
// proper functionality.
//
// The form argument is still validated: undefined selects 'NFC', any other
// value is converted to a string and must be one of NORMALIZATION_FORMS,
// otherwise a RangeError is thrown. The receiver is returned converted to a
// string, but otherwise unchanged.
function StringNormalize(form) {
  CHECK_OBJECT_COERCIBLE(this, "String.prototype.normalize");
  // Always hand back a string, even for a non-string receiver reached
  // through call/apply.
  var subject = TO_STRING_INLINE(this);

  // Only undefined selects the default form. Falsy values such as null, ''
  // or 0 must be stringified and then rejected like any other bad form.
  var formName = IS_UNDEFINED(form) ? 'NFC' : TO_STRING_INLINE(form);

  var normalizationForm = NORMALIZATION_FORMS.indexOf(formName);
  if (normalizationForm === -1) {
    throw new $RangeError('The normalization form should be one of '
        + NORMALIZATION_FORMS.join(', ') + '.');
  }

  return subject;
}
// This has the same size as the lastMatchInfo array, and can be used for // This has the same size as the lastMatchInfo array, and can be used for
// functions that expect that structure to be returned. It is used when the // functions that expect that structure to be returned. It is used when the
// needle is a string rather than a regexp. In this case we can't update // needle is a string rather than a regexp. In this case we can't update
...@@ -942,6 +964,7 @@ function SetUpString() { ...@@ -942,6 +964,7 @@ function SetUpString() {
"lastIndexOf", StringLastIndexOf, "lastIndexOf", StringLastIndexOf,
"localeCompare", StringLocaleCompare, "localeCompare", StringLocaleCompare,
"match", StringMatch, "match", StringMatch,
"normalize", StringNormalize,
"replace", StringReplace, "replace", StringReplace,
"search", StringSearch, "search", StringSearch,
"slice", StringSlice, "slice", StringSlice,
......
// Copyright 2013 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Tests the new String.prototype.normalize method.
// "Fuzzy match" scenarios: two spellings of the same text that one might
// encounter in real data must compare equal once both are normalized.
var testRealUseCases = function() {
  // Each entry: [left string, left form, right string, right form].
  var equivalentPairs = [
    // Vietnamese legacy text, old Windows 9x / non-Unicode applications use
    // windows-1258 code page, which is neither precomposed, nor decomposed.
    // The right-hand side is fully precomposed.
    ['ti\u00ea\u0301ng Vi\u00ea\u0323t', 'NFKD',
     'ti\u1ebfng Vi\u1ec7t', 'NFKD'],

    // Various kinds of spaces: normal space on the left, then non-breaking
    // space, en-space, em-space and ideographic space on the right.
    ['Google\u0020Maps', 'NFKD', 'Google\u00a0Maps', 'NFKD'],
    ['Google\u0020Maps', 'NFKD', 'Google\u2002Maps', 'NFKD'],
    ['Google\u0020Maps', 'NFKD', 'Google\u2003Maps', 'NFKD'],
    // Note: this pair normalizes the right-hand side with NFKC.
    ['Google\u0020Maps', 'NFKD', 'Google\u3000Maps', 'NFKC'],

    // Latin small ligature "fi"
    ['fi', 'NFKD', '\ufb01', 'NFKD'],

    // ŀ, Latin small L with middle dot, used in Catalan and often represented
    // as decomposed for non-Unicode environments ( l + ·)
    ['l\u00b7', 'NFKD', '\u0140', 'NFKD'],

    // Legacy text, Japanese narrow Kana (MS-DOS & Win 3.x time):
    // パソコン wide on the left, narrow on the right.
    ['\u30d1\u30bd\u30b3\u30f3', 'NFKD',
     '\uff8a\uff9f\uff7f\uff7a\uff9d', 'NFKD'],

    // Also for Japanese, ASCII vs. Latin fullwidth forms (ABCD)
    ['ABCD', 'NFKD', '\uff21\uff22\uff23\uff24', 'NFKD']
  ];

  for (var index = 0; index < equivalentPairs.length; ++index) {
    var pair = equivalentPairs[index];
    assertEquals(pair[0].normalize(pair[1]), pair[2].normalize(pair[3]));
  }
}();
// Invalid forms and invalid receivers must make the call throw.
var testEdgeCases = function() {
  // Make sure we throw RangeError, as the standard requires.
  [
    '"".normalize(1234)',
    '"".normalize("BAD")'
  ].forEach(function(snippet) {
    assertThrows(snippet, RangeError);
  });

  // The standard does not say what kind of exceptions we should throw, so we
  // will not be specific. But we still test that we throw errors.
  [
    's.normalize()',                     // s is not defined
    'var s = null; s.normalize()',
    'var s = undefined; s.normalize()',
    'var s = 1234; s.normalize()'        // no normalize for non-strings
  ].forEach(function(snippet) {
    assertThrows(snippet);
  });
}();
// Several kinds of mappings. No need to be comprehensive, we don't test
// the ICU functionality, we only test C - JavaScript 'glue'
//
// Every row has five columns: original, NFC, NFD, NFKC, NFKD.
var testData = [
  // Ç : Combining sequence, Latin 1
  ['\u00c7',
   '\u00c7', '\u0043\u0327',
   '\u00c7', '\u0043\u0327'],

  // Ș : Combining sequence, non-Latin 1
  ['\u0218',
   '\u0218', '\u0053\u0326',
   '\u0218', '\u0053\u0326'],

  // 가 : Hangul
  ['\uac00',
   '\uac00', '\u1100\u1161',
   '\uac00', '\u1100\u1161'],

  // カ : Narrow Kana
  ['\uff76',
   '\uff76', '\uff76',
   '\u30ab', '\u30ab'],

  // ¼ : Fractions
  ['\u00bc',
   '\u00bc', '\u00bc',
   '\u0031\u2044\u0034', '\u0031\u2044\u0034'],

  // dž : Latin ligature
  ['\u01c6',
   '\u01c6', '\u01c6',
   '\u0064\u017e', '\u0064\u007a\u030c'],

  // s + dot above + dot below, ordering of combining marks
  ['s\u0307\u0323',
   '\u1e69', 's\u0323\u0307',
   '\u1e69', 's\u0323\u0307'],

  // ㌀ : Squared characters (NFKC and NFKD both read アパート)
  ['\u3300',
   '\u3300', '\u3300',
   '\u30a2\u30d1\u30fc\u30c8',
   '\u30a2\u30cf\u309a\u30fc\u30c8'],

  // ︷ : Vertical forms
  ['\ufe37',
   '\ufe37', '\ufe37',
   '{', '{'],

  // ⁹ : superscript 9
  ['\u2079',
   '\u2079', '\u2079',
   '9', '9'],

  // Arabic forms
  ['\ufee5\ufee6\ufee7\ufee8',
   '\ufee5\ufee6\ufee7\ufee8', '\ufee5\ufee6\ufee7\ufee8',
   '\u0646\u0646\u0646\u0646', '\u0646\u0646\u0646\u0646'],

  // ① : Circled
  ['\u2460',
   '\u2460', '\u2460',
   '1', '1'],

  // ℌ : Font variants
  ['\u210c',
   '\u210c', '\u210c',
   'H', 'H'],

  // Ω : Singleton, OHM sign vs. Greek capital letter OMEGA
  ['\u2126',
   '\u03a9', '\u03a9',
   '\u03a9', '\u03a9'],

  // Long ligature, ARABIC LIGATURE JALLAJALALOUHOU
  ['\ufdfb',
   '\ufdfb', '\ufdfb',
   '\u062C\u0644\u0020\u062C\u0644\u0627\u0644\u0647',
   '\u062C\u0644\u0020\u062C\u0644\u0627\u0644\u0647']
];
// Checks every testData row: the original string, its NFC form and its NFD
// form must each normalize to the expected value for every form.
var testArray = function() {
  // Column of a testData row that holds the expected result for each form.
  var expectedColumn = { NFC: 1, NFD: 2, NFKC: 3, NFKD: 4 };
  var explicitForms = ['NFC', 'NFD', 'NFKC', 'NFKD'];

  testData.forEach(function(row) {
    // the original, NFC and NFD should normalize to the same thing
    [0, 1, 2].forEach(function(inputColumn) {
      var input = row[inputColumn];
      // With no argument the method defaults to NFC.
      assertEquals(input.normalize(), row[expectedColumn.NFC]);
      explicitForms.forEach(function(formName) {
        assertEquals(input.normalize(formName),
                     row[expectedColumn[formName]]);
      });
    });
  });
}();
...@@ -70,7 +70,7 @@ FAIL getSortedOwnPropertyNames(Function.prototype) should be apply,bind,call,con ...@@ -70,7 +70,7 @@ FAIL getSortedOwnPropertyNames(Function.prototype) should be apply,bind,call,con
FAIL getSortedOwnPropertyNames(Array) should be isArray,length,name,prototype. Was arguments,caller,isArray,length,name,prototype. FAIL getSortedOwnPropertyNames(Array) should be isArray,length,name,prototype. Was arguments,caller,isArray,length,name,prototype.
PASS getSortedOwnPropertyNames(Array.prototype) is ['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift'] PASS getSortedOwnPropertyNames(Array.prototype) is ['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift']
FAIL getSortedOwnPropertyNames(String) should be fromCharCode,length,name,prototype. Was arguments,caller,fromCharCode,length,name,prototype. FAIL getSortedOwnPropertyNames(String) should be fromCharCode,length,name,prototype. Was arguments,caller,fromCharCode,length,name,prototype.
PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf'] PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'normalize', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf']
FAIL getSortedOwnPropertyNames(Boolean) should be length,name,prototype. Was arguments,caller,length,name,prototype. FAIL getSortedOwnPropertyNames(Boolean) should be length,name,prototype. Was arguments,caller,length,name,prototype.
PASS getSortedOwnPropertyNames(Boolean.prototype) is ['constructor', 'toString', 'valueOf'] PASS getSortedOwnPropertyNames(Boolean.prototype) is ['constructor', 'toString', 'valueOf']
FAIL getSortedOwnPropertyNames(Number) should be MAX_VALUE,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,length,name,prototype. Was EPSILON,MAX_SAFE_INTEGER,MAX_VALUE,MIN_SAFE_INTEGER,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,arguments,caller,isFinite,isInteger,isNaN,isSafeInteger,length,name,parseFloat,parseInt,prototype. FAIL getSortedOwnPropertyNames(Number) should be MAX_VALUE,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,length,name,prototype. Was EPSILON,MAX_SAFE_INTEGER,MAX_VALUE,MIN_SAFE_INTEGER,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,arguments,caller,isFinite,isInteger,isNaN,isSafeInteger,length,name,parseFloat,parseInt,prototype.
......
...@@ -78,7 +78,7 @@ var expectedPropertyNamesSet = { ...@@ -78,7 +78,7 @@ var expectedPropertyNamesSet = {
"Array": "['isArray', 'length', 'name', 'prototype']", "Array": "['isArray', 'length', 'name', 'prototype']",
"Array.prototype": "['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift']", "Array.prototype": "['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift']",
"String": "['fromCharCode', 'length', 'name', 'prototype']", "String": "['fromCharCode', 'length', 'name', 'prototype']",
"String.prototype": "['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf']", "String.prototype": "['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'normalize', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf']",
"Boolean": "['length', 'name', 'prototype']", "Boolean": "['length', 'name', 'prototype']",
"Boolean.prototype": "['constructor', 'toString', 'valueOf']", "Boolean.prototype": "['constructor', 'toString', 'valueOf']",
"Number": "['MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'NaN', 'POSITIVE_INFINITY', 'length', 'name', 'prototype']", "Number": "['MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'NaN', 'POSITIVE_INFINITY', 'length', 'name', 'prototype']",
......
...@@ -47,6 +47,7 @@ PASS String.prototype.charCodeAt.__proto__ is Function.prototype ...@@ -47,6 +47,7 @@ PASS String.prototype.charCodeAt.__proto__ is Function.prototype
PASS String.prototype.indexOf.__proto__ is Function.prototype PASS String.prototype.indexOf.__proto__ is Function.prototype
PASS String.prototype.lastIndexOf.__proto__ is Function.prototype PASS String.prototype.lastIndexOf.__proto__ is Function.prototype
PASS String.prototype.match.__proto__ is Function.prototype PASS String.prototype.match.__proto__ is Function.prototype
PASS String.prototype.normalize.__proto__ is Function.prototype
PASS String.prototype.replace.__proto__ is Function.prototype PASS String.prototype.replace.__proto__ is Function.prototype
PASS String.prototype.search.__proto__ is Function.prototype PASS String.prototype.search.__proto__ is Function.prototype
PASS String.prototype.slice.__proto__ is Function.prototype PASS String.prototype.slice.__proto__ is Function.prototype
......
...@@ -43,6 +43,7 @@ shouldBe("String.prototype.charCodeAt.__proto__","Function.prototype"); ...@@ -43,6 +43,7 @@ shouldBe("String.prototype.charCodeAt.__proto__","Function.prototype");
shouldBe("String.prototype.indexOf.__proto__","Function.prototype"); shouldBe("String.prototype.indexOf.__proto__","Function.prototype");
shouldBe("String.prototype.lastIndexOf.__proto__","Function.prototype"); shouldBe("String.prototype.lastIndexOf.__proto__","Function.prototype");
shouldBe("String.prototype.match.__proto__","Function.prototype"); shouldBe("String.prototype.match.__proto__","Function.prototype");
shouldBe("String.prototype.normalize.__proto__","Function.prototype");
shouldBe("String.prototype.replace.__proto__","Function.prototype"); shouldBe("String.prototype.replace.__proto__","Function.prototype");
shouldBe("String.prototype.search.__proto__","Function.prototype"); shouldBe("String.prototype.search.__proto__","Function.prototype");
shouldBe("String.prototype.slice.__proto__","Function.prototype"); shouldBe("String.prototype.slice.__proto__","Function.prototype");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment