Commit 1d24f5f5 authored by's avatar

Updated unicode library.

Added Nl category to letters predicate (as required for JS identifiers).
Changed/simplified representation of canonicalization ranges.
Truncated tables to code points in the BMP (all that is used by JS).
Reformatted tables to avoid excessively long lines.
Removed duplicate entries from multi-character mapping result tables.

Review URL:

git-svn-id: ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 84cf5ca5
......@@ -1272,7 +1272,7 @@ static int GetCaseIndependentLetters(uc16 character,
bool ascii_subject,
unibrow::uchar* letters) {
int length = uncanonicalize.get(character, '\0', letters);
// Unibrow returns 0 or 1 for characters where case independependence is
// Unibrow returns 0 or 1 for characters where case independence is
// trivial.
if (length == 0) {
letters[0] = character;
......@@ -4026,74 +4026,48 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
} else if (bottom <= kRangeCanonicalizeMax &&
top <= kRangeCanonicalizeMax) {
} else {
// If this is a range we expand the characters block by block,
// expanding contiguous subranges (blocks) one at a time.
// The approach is as follows. For a given start character we
// look up the block that contains it, for instance 'a' if the
// start character is 'c'. A block is characterized by the property
// that all characters uncanonicalize in the same way as the first
// element, except that each entry in the result is incremented
// by the distance from the first element. So a-z is a block
// because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
// uncanonicalizes to ['a' + k, 'A' + k].
// Once we've found the start point we look up its uncanonicalization
// look up the remainder of the block that contains it (represented
// by the end point), for instance we find 'z' if the character
// is 'c'. A block is characterized by the property
// that all characters uncanonicalize in the same way, except that
// each entry in the result is incremented by the distance from the first
// element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
// the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
// Once we've found the end point we look up its uncanonicalization
// and produce a range for each element. For instance for [c-f]
// we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
// we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
// add a range if it is not already contained in the input, so [c-f]
// will be skipped but [C-F] will be added. If this range is not
// completely contained in a block we do this for all the blocks
// covered by the range.
// covered by the range (handling characters that are not in a block
// as a "singleton block").
unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
// First, look up the block that contains the 'bottom' character.
int length = canonrange.get(bottom, '\0', range);
if (length == 0) {
range[0] = bottom;
} else {
ASSERT_EQ(1, length);
int pos = bottom;
// The start of the current block. Note that except for the first
// iteration 'start' is always equal to 'pos'.
int start;
// If it is not the start point of a block the entry contains the
// offset of the character from the start point.
if ((range[0] & kStartMarker) == 0) {
start = pos - range[0];
} else {
start = pos;
// Then we add the ranges one at a time, incrementing the current
// position to be after the last block each time. The position
// always points to the start of a block.
while (pos < top) {
length = canonrange.get(start, '\0', range);
int length = canonrange.get(pos, '\0', range);
uc16 block_end;
if (length == 0) {
range[0] = start;
block_end = pos;
} else {
ASSERT_EQ(1, length);
block_end = range[0];
ASSERT((range[0] & kStartMarker) != 0);
// The start point of a block contains the distance to the end
// of the range.
int block_end = start + (range[0] & kPayloadMask) - 1;
int end = (block_end > top) ? top : block_end;
length = uncanonicalize.get(start, '\0', range);
length = uncanonicalize.get(block_end, '\0', range);
for (int i = 0; i < length; i++) {
uc32 c = range[i];
uc16 range_from = c + (pos - start);
uc16 range_to = c + (end - start);
uc16 range_from = c - (block_end - pos);
uc16 range_to = c - (block_end - end);
if (!(bottom <= range_from && range_to <= top)) {
ranges->Add(CharacterRange(range_from, range_to));
start = pos = block_end + 1;
pos = end + 1;
} else {
// Unibrow ranges don't work for high characters due to the "2^11 bug".
// Therefore we do something dumber for these ranges.
AddUncanonicals(ranges, bottom, top);
......@@ -4208,20 +4182,14 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
// 0xa800 - 0xfaff
// 0xfc00 - 0xfeff
const int boundary_count = 18;
// The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
// array. This is to split up big ranges and not because they actually denote
// a case-mapping-free-zone.
ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
const int kFirstRealCaselessZoneIndex = 2;
int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
int boundaries[] = {
0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
// Special ASCII rule from spec can save us some work here.
if (bottom == 0x80 && top == 0xffff) return;
// We have optimized support for this range.
if (top <= CharacterRange::kRangeCanonicalizeMax) {
if (top <= boundaries[0]) {
CharacterRange range(bottom, top);
range.AddCaseEquivalents(ranges, false);
......@@ -4238,8 +4206,7 @@ static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
// If we are completely in a zone with no case mappings then we are done.
// We start at 2 so as not to except the ASCII range from mappings.
for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
for (int i = 0; i < boundary_count; i += 2) {
if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
#ifdef DEBUG
for (int j = bottom; j <= top; j++) {
......@@ -316,7 +316,6 @@ class CharacterRange {
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst);
static const int kRangeCanonicalizeMax = 0x346;
static const int kStartMarker = (1 << 24);
static const int kPayloadMask = (1 << 24) - 1;
// Copyright 2007-2008 the V8 project authors. All rights reserved.
// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -211,6 +211,7 @@ class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
struct Uppercase {
static bool Is(uchar c);
......@@ -1399,7 +1399,8 @@ TEST(LatinCanonicalize) {
for (uc32 c = 128; c < (1 << 21); c++)
CHECK_GE(canonicalize(c), 128);
unibrow::Mapping<unibrow::ToUppercase> to_upper;
for (uc32 c = 0; c < (1 << 21); c++) {
// Canonicalization is only defined for the Basic Multilingual Plane.
for (uc32 c = 0; c < (1 << 16); c++) {
unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
int length = to_upper.get(c, '\0', upper);
if (length == 0) {
......@@ -1414,7 +1415,7 @@ TEST(LatinCanonicalize) {
static uc32 CanonRange(uc32 c) {
static uc32 CanonRangeEnd(uc32 c) {
unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
if (count == 0) {
......@@ -1427,47 +1428,29 @@ static uc32 CanonRange(uc32 c) {
TEST(RangeCanonicalization) {
CHECK_NE(CanonRange(0) & CharacterRange::kStartMarker, 0);
// Check that we arrive at the same result when using the basic
// range canonicalization primitives as when using immediate
// canonicalization.
unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
for (int i = 0; i < CharacterRange::kRangeCanonicalizeMax; i++) {
int range = CanonRange(i);
int indirect_length = 0;
unibrow::uchar indirect[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if ((range & CharacterRange::kStartMarker) == 0) {
indirect_length = un_canonicalize.get(i - range, '\0', indirect);
for (int i = 0; i < indirect_length; i++)
indirect[i] += range;
} else {
indirect_length = un_canonicalize.get(i, '\0', indirect);
unibrow::uchar direct[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int direct_length = un_canonicalize.get(i, '\0', direct);
CHECK_EQ(direct_length, indirect_length);
// Check that we arrive at the same results when skipping over
// canonicalization ranges.
int next_block = 0;
while (next_block < CharacterRange::kRangeCanonicalizeMax) {
uc32 start = CanonRange(next_block);
CHECK_NE((start & CharacterRange::kStartMarker), 0);
unsigned dist = start & CharacterRange::kPayloadMask;
unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int first_length = un_canonicalize.get(next_block, '\0', first);
for (unsigned i = 1; i < dist; i++) {
CHECK_EQ(i, CanonRange(next_block + i));
unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int succ_length = un_canonicalize.get(next_block + i, '\0', succ);
CHECK_EQ(first_length, succ_length);
for (int j = 0; j < succ_length; j++) {
int calc = first[j] + i;
int found = succ[j];
CHECK_EQ(calc, found);
int block_start = 0;
while (block_start <= 0xFFFF) {
uc32 block_end = CanonRangeEnd(block_start);
unsigned block_length = block_end - block_start + 1;
if (block_length > 1) {
unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int first_length = un_canonicalize.get(block_start, '\0', first);
for (unsigned i = 1; i < block_length; i++) {
unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
CHECK_EQ(first_length, succ_length);
for (int j = 0; j < succ_length; j++) {
int calc = first[j] + i;
int found = succ[j];
CHECK_EQ(calc, found);
next_block = next_block + dist;
block_start = block_start + block_length;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment