js-collator.cc 17.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_INTL_SUPPORT
#error Internationalization is expected to be enabled.
#endif  // V8_INTL_SUPPORT

#include "src/objects/js-collator.h"

11
#include "src/execution/isolate.h"
12
#include "src/objects/js-collator-inl.h"
13
#include "src/objects/objects-inl.h"
14 15 16 17
#include "unicode/coll.h"
#include "unicode/locid.h"
#include "unicode/strenum.h"
#include "unicode/ucol.h"
18
#include "unicode/udata.h"
19
#include "unicode/uloc.h"
20
#include "unicode/utypes.h"
21 22 23 24 25 26

namespace v8 {
namespace internal {

namespace {

27 28 29 30 31
// The two values accepted for the Intl.Collator "usage" option.
enum class Usage {
  SORT,   // "sort"
  SEARCH  // "search"
};

32 33 34 35 36 37 38 39
// Values of the Intl.Collator "sensitivity" option, plus a marker for the
// case where the option was not supplied.
enum class Sensitivity {
  kBase,      // "base"
  kAccent,    // "accent"
  kCase,      // "case"
  kVariant,   // "variant"
  kUndefined  // option absent
};

40 41 42 43 44 45 46 47 48 49
// Defines the data property |key| on |options| with the string |value|.
//
// |value| must be a non-null, NUL-terminated ASCII string; it is converted to
// a heap string before being stored. The process aborts (CHECK) if |value| is
// null or if the property cannot be created.
//
// TODO(gsathya): Consider internalizing the value strings.
void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
                                  Handle<String> key, const char* value) {
  CHECK_NOT_NULL(value);
  Handle<String> value_str =
      isolate->factory()->NewStringFromAsciiChecked(value);

  // This is a brand new JSObject that shouldn't already have the same
  // key so this shouldn't fail.
  CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_str,
                                       Just(kDontThrow))
            .FromJust());
}

// Defines the data property |key| on |options| with the boolean |value|.
//
// The process aborts (CHECK) if the property cannot be created.
void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
                                  Handle<String> key, bool value) {
  Handle<Object> value_obj = isolate->factory()->ToBoolean(value);

  // This is a brand new JSObject that shouldn't already have the same
  // key so this shouldn't fail.
  CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_obj,
                                       Just(kDontThrow))
            .FromJust());
}

}  // anonymous namespace

// static
// Builds the options object for |collator| by reading every value back from
// the underlying icu::Collator: locale, usage, sensitivity, ignorePunctuation,
// collation, numeric and caseFirst.
//
// Nothing is cached on the JSCollator itself; all attributes are queried from
// ICU on each call. Any ICU failure while reading an attribute aborts the
// process via CHECK.
Handle<JSObject> JSCollator::ResolvedOptions(Isolate* isolate,
                                             Handle<JSCollator> collator) {
  Handle<JSObject> options =
      isolate->factory()->NewJSObject(isolate->object_function());

  icu::Collator* icu_collator = collator->icu_collator().raw();
  CHECK_NOT_NULL(icu_collator);

  UErrorCode status = U_ZERO_ERROR;
  bool numeric =
      icu_collator->getAttribute(UCOL_NUMERIC_COLLATION, status) == UCOL_ON;
  CHECK(U_SUCCESS(status));

  const char* case_first = nullptr;
  status = U_ZERO_ERROR;
  switch (icu_collator->getAttribute(UCOL_CASE_FIRST, status)) {
    case UCOL_LOWER_FIRST:
      case_first = "lower";
      break;
    case UCOL_UPPER_FIRST:
      case_first = "upper";
      break;
    default:
      case_first = "false";
  }
  CHECK(U_SUCCESS(status));

  const char* sensitivity = nullptr;
  status = U_ZERO_ERROR;
  switch (icu_collator->getAttribute(UCOL_STRENGTH, status)) {
    case UCOL_PRIMARY: {
      CHECK(U_SUCCESS(status));
      status = U_ZERO_ERROR;
      // case level: true + s1 -> case, s1 -> base.
      if (UCOL_ON == icu_collator->getAttribute(UCOL_CASE_LEVEL, status)) {
        sensitivity = "case";
      } else {
        sensitivity = "base";
      }
      CHECK(U_SUCCESS(status));
      break;
    }
    case UCOL_SECONDARY:
      sensitivity = "accent";
      break;
    case UCOL_TERTIARY:
      sensitivity = "variant";
      break;
    case UCOL_QUATERNARY:
      // We shouldn't get quaternary and identical from ICU, but if we do
      // put them into variant.
      sensitivity = "variant";
      break;
    default:
      sensitivity = "variant";
  }
  CHECK(U_SUCCESS(status));

  status = U_ZERO_ERROR;
  bool ignore_punctuation = icu_collator->getAttribute(UCOL_ALTERNATE_HANDLING,
                                                       status) == UCOL_SHIFTED;
  CHECK(U_SUCCESS(status));

  status = U_ZERO_ERROR;
  icu::Locale icu_locale(icu_collator->getLocale(ULOC_VALID_LOCALE, status));
  CHECK(U_SUCCESS(status));

  const char* collation = "default";
  const char* usage = "sort";
  const char* collation_key = "co";

  // A failing status here is not fatal: it is treated as "no usable 'co'
  // keyword on the locale" and the defaults above are kept.
  status = U_ZERO_ERROR;
  std::string collation_value =
      icu_locale.getUnicodeKeywordValue<std::string>(collation_key, status);

  std::string locale;
  if (U_SUCCESS(status) && collation_value == "search") {
    usage = "search";

    // Search is disallowed as a collation value per spec, so `collation`
    // stays "default".
    //
    // https://tc39.github.io/ecma402/#sec-properties-of-intl-collator-instances

    // Work on a copy so that icu_locale itself is left untouched when the
    // collation keyword is removed below.
    icu::Locale new_icu_locale = icu_locale;

    // The spec forbids the search as a collation value in the
    // locale tag, so let's filter it out.
    status = U_ZERO_ERROR;
    new_icu_locale.setUnicodeKeywordValue(collation_key, nullptr, status);
    CHECK(U_SUCCESS(status));

    locale = Intl::ToLanguageTag(new_icu_locale).FromJust();
  } else {
    // Only report a collation that ICU actually returned; an empty value is
    // not a valid collation type, so "default" is kept in that case.
    if (U_SUCCESS(status) && !collation_value.empty()) {
      // collation_value outlives every use of this pointer below.
      collation = collation_value.c_str();
    }
    locale = Intl::ToLanguageTag(icu_locale).FromJust();
  }

  // 5. For each row of Table 2, except the header row, in table order, do
  //    ...
  // Table 2: Resolved Options of Collator Instances
  //  Internal Slot            Property               Extension Key
  //    [[Locale]]               "locale"
  //    [[Usage]]                "usage"
  //    [[Sensitivity]]          "sensitivity"
  //    [[IgnorePunctuation]]    "ignorePunctuation"
  //    [[Collation]]            "collation"
  //    [[Numeric]]              "numeric"              kn
  //    [[CaseFirst]]            "caseFirst"            kf
  CreateDataPropertyForOptions(
      isolate, options, isolate->factory()->locale_string(), locale.c_str());
  CreateDataPropertyForOptions(isolate, options,
                               isolate->factory()->usage_string(), usage);
  CreateDataPropertyForOptions(
      isolate, options, isolate->factory()->sensitivity_string(), sensitivity);
  CreateDataPropertyForOptions(isolate, options,
                               isolate->factory()->ignorePunctuation_string(),
                               ignore_punctuation);
  CreateDataPropertyForOptions(
      isolate, options, isolate->factory()->collation_string(), collation);
  CreateDataPropertyForOptions(isolate, options,
                               isolate->factory()->numeric_string(), numeric);
  CreateDataPropertyForOptions(
      isolate, options, isolate->factory()->caseFirst_string(), case_first);
  return options;
}

namespace {

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
// Translates the textual case-first value |str| ("upper", "lower" or "false")
// into the matching Intl::CaseFirst enumerator. Any other string yields
// Intl::CaseFirst::kUndefined. |str| must not be null.
Intl::CaseFirst ToCaseFirst(const char* str) {
  const struct {
    const char* name;
    Intl::CaseFirst value;
  } known_values[] = {
      {"upper", Intl::CaseFirst::kUpper},
      {"lower", Intl::CaseFirst::kLower},
      {"false", Intl::CaseFirst::kFalse},
  };
  for (const auto& candidate : known_values) {
    if (strcmp(str, candidate.name) == 0) return candidate.value;
  }
  return Intl::CaseFirst::kUndefined;
}

// Converts |case_first| into the ICU attribute value used with
// UCOL_CASE_FIRST. Both kFalse and kUndefined map to UCOL_OFF.
UColAttributeValue ToUColAttributeValue(Intl::CaseFirst case_first) {
  // No `default:` label on purpose, so the compiler can still warn when a new
  // enumerator is added without being handled here.
  switch (case_first) {
    case Intl::CaseFirst::kUpper:
      return UCOL_UPPER_FIRST;
    case Intl::CaseFirst::kLower:
      return UCOL_LOWER_FIRST;
    case Intl::CaseFirst::kFalse:
    case Intl::CaseFirst::kUndefined:
      return UCOL_OFF;
  }
  // Only reachable if |case_first| holds a value outside the enumerators
  // above. Returning here avoids flowing off the end of a non-void function,
  // which would be undefined behaviour.
  return UCOL_OFF;
}

224 225 226 227 228 229 230 231
// Switches numeric collation (UCOL_NUMERIC_COLLATION) on |icu_collator| on or
// off according to |numeric|. Aborts via CHECK when |icu_collator| is null or
// ICU reports a failure.
void SetNumericOption(icu::Collator* icu_collator, bool numeric) {
  CHECK_NOT_NULL(icu_collator);
  const auto numeric_mode = numeric ? UCOL_ON : UCOL_OFF;
  UErrorCode error = U_ZERO_ERROR;
  icu_collator->setAttribute(UCOL_NUMERIC_COLLATION, numeric_mode, error);
  CHECK(U_SUCCESS(error));
}

232 233
// Applies |case_first| to |icu_collator| through the UCOL_CASE_FIRST
// attribute, after converting it with ToUColAttributeValue. Aborts via CHECK
// when |icu_collator| is null or ICU reports a failure.
void SetCaseFirstOption(icu::Collator* icu_collator,
                        Intl::CaseFirst case_first) {
  CHECK_NOT_NULL(icu_collator);
  UErrorCode status = U_ZERO_ERROR;
  icu_collator->setAttribute(UCOL_CASE_FIRST, ToUColAttributeValue(case_first),
                             status);
  CHECK(U_SUCCESS(status));
}

}  // anonymous namespace

// static
244 245
MaybeHandle<JSCollator> JSCollator::New(Isolate* isolate, Handle<Map> map,
                                        Handle<Object> locales,
246 247
                                        Handle<Object> options_obj,
                                        const char* service) {
248
  // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
249 250 251 252 253
  Maybe<std::vector<std::string>> maybe_requested_locales =
      Intl::CanonicalizeLocaleList(isolate, locales);
  MAYBE_RETURN(maybe_requested_locales, Handle<JSCollator>());
  std::vector<std::string> requested_locales =
      maybe_requested_locales.FromJust();
254 255 256 257 258 259 260 261

  // 2. If options is undefined, then
  if (options_obj->IsUndefined(isolate)) {
    // 2. a. Let options be ObjectCreate(null).
    options_obj = isolate->factory()->NewJSObjectWithNullProto();
  } else {
    // 3. Else
    // 3. a. Let options be ? ToObject(options).
262 263 264
    ASSIGN_RETURN_ON_EXCEPTION(isolate, options_obj,
                               Object::ToObject(isolate, options_obj, service),
                               JSCollator);
265 266 267 268 269 270 271
  }

  // At this point, options_obj can either be a JSObject or a JSProxy only.
  Handle<JSReceiver> options = Handle<JSReceiver>::cast(options_obj);

  // 4. Let usage be ? GetOption(options, "usage", "string", « "sort",
  // "search" », "sort").
272
  Maybe<Usage> maybe_usage = Intl::GetStringOption<Usage>(
273
      isolate, options, "usage", service, {"sort", "search"},
274 275 276
      {Usage::SORT, Usage::SEARCH}, Usage::SORT);
  MAYBE_RETURN(maybe_usage, MaybeHandle<JSCollator>());
  Usage usage = maybe_usage.FromJust();
277 278 279 280

  // 9. Let matcher be ? GetOption(options, "localeMatcher", "string",
  // « "lookup", "best fit" », "best fit").
  // 10. Set opt.[[localeMatcher]] to matcher.
281
  Maybe<Intl::MatcherOption> maybe_locale_matcher =
282
      Intl::GetLocaleMatcher(isolate, options, service);
283 284
  MAYBE_RETURN(maybe_locale_matcher, MaybeHandle<JSCollator>());
  Intl::MatcherOption matcher = maybe_locale_matcher.FromJust();
285 286 287 288 289 290 291 292 293 294 295 296

  // 11. Let numeric be ? GetOption(options, "numeric", "boolean",
  // undefined, undefined).
  // 12. If numeric is not undefined, then
  //    a. Let numeric be ! ToString(numeric).
  //
  // Note: We omit the ToString(numeric) operation as it's not
  // observable. Intl::GetBoolOption returns a Boolean and
  // ToString(Boolean) is not side-effecting.
  //
  // 13. Set opt.[[kn]] to numeric.
  bool numeric;
297 298
  Maybe<bool> found_numeric =
      Intl::GetBoolOption(isolate, options, "numeric", service, &numeric);
299 300 301 302
  MAYBE_RETURN(found_numeric, MaybeHandle<JSCollator>());

  // 14. Let caseFirst be ? GetOption(options, "caseFirst", "string",
  //     « "upper", "lower", "false" », undefined).
303
  Maybe<Intl::CaseFirst> maybe_case_first =
304
      Intl::GetCaseFirst(isolate, options, service);
305 306
  MAYBE_RETURN(maybe_case_first, MaybeHandle<JSCollator>());
  Intl::CaseFirst case_first = maybe_case_first.FromJust();
307 308 309 310 311

  // The relevant unicode extensions accepted by Collator as specified here:
  // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
  //
  // 16. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
312
  std::set<std::string> relevant_extension_keys{"co", "kn", "kf"};
313 314 315 316

  // 17. Let r be ResolveLocale(%Collator%.[[AvailableLocales]],
  // requestedLocales, opt, %Collator%.[[RelevantExtensionKeys]],
  // localeData).
317
  Intl::ResolvedLocale r =
318 319
      Intl::ResolveLocale(isolate, JSCollator::GetAvailableLocales(),
                          requested_locales, matcher, relevant_extension_keys);
320

321
  // 18. Set collator.[[Locale]] to r.[[locale]].
322
  icu::Locale icu_locale = r.icu_locale;
323 324 325 326
  DCHECK(!icu_locale.isBogus());

  // 19. Let collation be r.[[co]].

327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
  // 5. Set collator.[[Usage]] to usage.
  //
  // 6. If usage is "sort", then
  //    a. Let localeData be %Collator%.[[SortLocaleData]].
  // 7. Else,
  //    a. Let localeData be %Collator%.[[SearchLocaleData]].
  //
  // The Intl spec doesn't allow us to use "search" as an extension
  // value for collation as per:
  // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
  //
  // But the only way to pass the value "search" for collation from
  // the options object to ICU is to use the 'co' extension keyword.
  //
  // This will need to be filtered out when creating the
  // resolvedOptions object.
  if (usage == Usage::SEARCH) {
    UErrorCode status = U_ZERO_ERROR;
345
    icu_locale.setUnicodeKeywordValue("co", "search", status);
346 347 348
    CHECK(U_SUCCESS(status));
  }

349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
  // 20. If collation is null, let collation be "default".
  // 21. Set collator.[[Collation]] to collation.
  //
  // We don't store the collation value as per the above two steps
  // here. The collation value can be looked up from icu::Collator on
  // demand, as part of Intl.Collator.prototype.resolvedOptions.

  UErrorCode status = U_ZERO_ERROR;
  std::unique_ptr<icu::Collator> icu_collator(
      icu::Collator::createInstance(icu_locale, status));
  if (U_FAILURE(status) || icu_collator.get() == nullptr) {
    status = U_ZERO_ERROR;
    // Remove extensions and try again.
    icu::Locale no_extension_locale(icu_locale.getBaseName());
    icu_collator.reset(
        icu::Collator::createInstance(no_extension_locale, status));

    if (U_FAILURE(status) || icu_collator.get() == nullptr) {
      FATAL("Failed to create ICU collator, are ICU data files missing?");
    }
  }
  DCHECK(U_SUCCESS(status));
  CHECK_NOT_NULL(icu_collator.get());

  // 22. If relevantExtensionKeys contains "kn", then
  //     a. Set collator.[[Numeric]] to ! SameValue(r.[[kn]], "true").
  //
  // If the numeric value is passed in through the options object,
  // then we use it. Otherwise, we check if the numeric value is
  // passed in through the unicode extensions.
  status = U_ZERO_ERROR;
  if (found_numeric.FromJust()) {
381
    SetNumericOption(icu_collator.get(), numeric);
382
  } else {
383 384 385
    auto kn_extension_it = r.extensions.find("kn");
    if (kn_extension_it != r.extensions.end()) {
      SetNumericOption(icu_collator.get(), (kn_extension_it->second == "true"));
386
    }
387 388 389 390 391 392 393 394
  }

  // 23. If relevantExtensionKeys contains "kf", then
  //     a. Set collator.[[CaseFirst]] to r.[[kf]].
  //
  // If the caseFirst value is passed in through the options object,
  // then we use it. Otherwise, we check if the caseFirst value is
  // passed in through the unicode extensions.
395 396
  if (case_first != Intl::CaseFirst::kUndefined) {
    SetCaseFirstOption(icu_collator.get(), case_first);
397
  } else {
398 399 400 401
    auto kf_extension_it = r.extensions.find("kf");
    if (kf_extension_it != r.extensions.end()) {
      SetCaseFirstOption(icu_collator.get(),
                         ToCaseFirst(kf_extension_it->second.c_str()));
402
    }
403 404 405 406 407 408 409 410 411 412 413
  }

  // Normalization is always on, by the spec. We are free to optimize
  // if the strings are already normalized (but we don't have a way to tell
  // that right now).
  status = U_ZERO_ERROR;
  icu_collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
  CHECK(U_SUCCESS(status));

  // 24. Let sensitivity be ? GetOption(options, "sensitivity",
  // "string", « "base", "accent", "case", "variant" », undefined).
414
  Maybe<Sensitivity> maybe_sensitivity = Intl::GetStringOption<Sensitivity>(
415
      isolate, options, "sensitivity", service,
416 417 418 419 420 421
      {"base", "accent", "case", "variant"},
      {Sensitivity::kBase, Sensitivity::kAccent, Sensitivity::kCase,
       Sensitivity::kVariant},
      Sensitivity::kUndefined);
  MAYBE_RETURN(maybe_sensitivity, MaybeHandle<JSCollator>());
  Sensitivity sensitivity = maybe_sensitivity.FromJust();
422 423

  // 25. If sensitivity is undefined, then
424
  if (sensitivity == Sensitivity::kUndefined) {
425 426 427
    // 25. a. If usage is "sort", then
    if (usage == Usage::SORT) {
      // 25. a. i. Let sensitivity be "variant".
428
      sensitivity = Sensitivity::kVariant;
429
    }
430 431 432 433
  }
  // 26. Set collator.[[Sensitivity]] to sensitivity.
  switch (sensitivity) {
    case Sensitivity::kBase:
434
      icu_collator->setStrength(icu::Collator::PRIMARY);
435 436
      break;
    case Sensitivity::kAccent:
437
      icu_collator->setStrength(icu::Collator::SECONDARY);
438 439
      break;
    case Sensitivity::kCase:
440 441 442 443
      icu_collator->setStrength(icu::Collator::PRIMARY);
      status = U_ZERO_ERROR;
      icu_collator->setAttribute(UCOL_CASE_LEVEL, UCOL_ON, status);
      CHECK(U_SUCCESS(status));
444 445
      break;
    case Sensitivity::kVariant:
446
      icu_collator->setStrength(icu::Collator::TERTIARY);
447 448 449
      break;
    case Sensitivity::kUndefined:
      break;
450 451 452 453 454
  }

  // 27.Let ignorePunctuation be ? GetOption(options,
  // "ignorePunctuation", "boolean", undefined, false).
  bool ignore_punctuation;
455 456
  Maybe<bool> found_ignore_punctuation = Intl::GetBoolOption(
      isolate, options, "ignorePunctuation", service, &ignore_punctuation);
457 458 459 460 461 462 463 464 465 466 467 468
  MAYBE_RETURN(found_ignore_punctuation, MaybeHandle<JSCollator>());

  // 28. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
  if (found_ignore_punctuation.FromJust() && ignore_punctuation) {
    status = U_ZERO_ERROR;
    icu_collator->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
    CHECK(U_SUCCESS(status));
  }

  Handle<Managed<icu::Collator>> managed_collator =
      Managed<icu::Collator>::FromUniquePtr(isolate, 0,
                                            std::move(icu_collator));
469 470 471 472 473

  // Now all properties are ready, so we can allocate the result object.
  Handle<JSCollator> collator = Handle<JSCollator>::cast(
      isolate->factory()->NewFastOrSlowJSObjectFromMap(map));
  DisallowHeapAllocation no_gc;
474 475 476 477 478 479
  collator->set_icu_collator(*managed_collator);

  // 29. Return collator.
  return collator;
}

480 481 482 483 484 485 486 487 488 489 490
namespace {

// Policy type passed to Intl::AvailableLocales for the collator service: it
// supplies the key and the ICU data path ("coll" tree) to use.
struct CheckColl {
  // No key is supplied for collation data.
  static const char* key() { return nullptr; }

  // ICU data package name joined with the "coll" tree name.
  static const char* path() {
    return U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll";
  }
};

}  // namespace

491
const std::set<std::string>& JSCollator::GetAvailableLocales() {
492 493
  static base::LazyInstance<Intl::AvailableLocales<icu::Collator, CheckColl>>::
      type available_locales = LAZY_INSTANCE_INITIALIZER;
494
  return available_locales.Pointer()->Get();
495 496
}

497 498
}  // namespace internal
}  // namespace v8