// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_STRINGS_UNICODE_INL_H_
#define V8_STRINGS_UNICODE_INL_H_

#include "src/base/logging.h"
#include "src/strings/unicode.h"
#include "src/utils/utils.h"

namespace unibrow {

#ifndef V8_INTL_SUPPORT
// Looks |code_point| up in the cache slot selected by (code_point & kMask).
// If that slot currently holds an entry for the same code point, its stored
// value is returned; otherwise the answer is obtained from CalculateValue().
template <class T, int s>
bool Predicate<T, s>::get(uchar code_point) {
  CacheEntry cached = entries_[code_point & kMask];
  const bool hit = cached.code_point() == code_point;
  return hit ? cached.value() : CalculateValue(code_point);
}

22 23
template <class T, int s>
bool Predicate<T, s>::CalculateValue(uchar code_point) {
24 25 26 27 28
  bool result = T::Is(code_point);
  entries_[code_point & kMask] = CacheEntry(code_point, result);
  return result;
}

29 30
template <class T, int s>
int Mapping<T, s>::get(uchar c, uchar n, uchar* result) {
31 32 33 34 35 36 37 38 39 40 41 42 43
  CacheEntry entry = entries_[c & kMask];
  if (entry.code_point_ == c) {
    if (entry.offset_ == 0) {
      return 0;
    } else {
      result[0] = c + entry.offset_;
      return 1;
    }
  } else {
    return CalculateValue(c, n, result);
  }
}

44 45
template <class T, int s>
int Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) {
46 47 48 49 50 51 52 53 54 55 56 57 58 59
  bool allow_caching = true;
  int length = T::Convert(c, n, result, &allow_caching);
  if (allow_caching) {
    if (length == 1) {
      entries_[c & kMask] = CacheEntry(c, result[0] - c);
      return 1;
    } else {
      entries_[c & kMask] = CacheEntry(c, 0);
      return 0;
    }
  } else {
    return length;
  }
}
#endif  // !V8_INTL_SUPPORT

62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
// when the stream is complete, to ensure incomplete sequences are handled.
uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  State old_state = *state;
  byte next = **cursor;
  *cursor += 1;

  if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(next);
  }

  // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
  // char in that sequence.
  Utf8DfaDecoder::Decode(next, state, buffer);

  switch (*state) {
    case State::kAccept: {
      uchar t = *buffer;
      *buffer = 0;
      return t;
    }

    case State::kReject:
      *state = State::kAccept;
      *buffer = 0;

      // If we hit a bad byte, we need to determine if we were trying to start
      // a sequence or continue one. If we were trying to start a sequence,
      // that means it's just an invalid lead byte and we need to continue to
      // the next (which we already did above). If we were already in a
      // sequence, we need to reprocess this same byte after resetting to the
      // initial state.
      if (old_state != State::kAccept) {
        // We were trying to continue a sequence, so let's reprocess this byte
        // next time.
        *cursor -= 1;
      }
      return kBadChar;

    default:
      return kIncomplete;
  }
}
109

110 111 112 113 114 115 116 117 118 119 120
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  static const int kMask = ~(1 << 6);
  if (c <= kMaxOneByteChar) {
    str[0] = c;
    return 1;
  }
  str[0] = 0xC0 | (c >> 6);
  str[1] = 0x80 | (c & kMask);
  return 2;
}

121 122 123 124
// Encode encodes the UTF-16 code units c and previous into the given str
// buffer, and combines surrogate code units into single code points. If
// replace_invalid is set to true, orphan surrogate code units will be replaced
// with kBadChar.
125
unsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
126 127 128 129 130 131 132 133 134
  static const int kMask = ~(1 << 6);
  if (c <= kMaxOneByteChar) {
    str[0] = c;
    return 1;
  } else if (c <= kMaxTwoByteChar) {
    str[0] = 0xC0 | (c >> 6);
    str[1] = 0x80 | (c & kMask);
    return 2;
  } else if (c <= kMaxThreeByteChar) {
135
    DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
136
    if (Utf16::IsSurrogatePair(previous, c)) {
137 138 139
      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
      return Encode(str - kUnmatchedSize,
                    Utf16::CombineSurrogatePair(previous, c),
140 141
                    Utf16::kNoPreviousCharacter, replace_invalid) -
             kUnmatchedSize;
142
    } else if (replace_invalid &&
143
               (Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c))) {
144
      c = kBadChar;
145
    }
146 147 148 149 150 151 152 153 154 155 156 157 158
    str[0] = 0xE0 | (c >> 12);
    str[1] = 0x80 | ((c >> 6) & kMask);
    str[2] = 0x80 | (c & kMask);
    return 3;
  } else {
    str[0] = 0xF0 | (c >> 18);
    str[1] = 0x80 | ((c >> 12) & kMask);
    str[2] = 0x80 | ((c >> 6) & kMask);
    str[3] = 0x80 | (c & kMask);
    return 4;
  }
}

159
uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
160 161
  if (length <= 0) return kBadChar;
  byte first = bytes[0];
162 163
  // Characters between 0000 and 007F are encoded as a single character
  if (V8_LIKELY(first <= kMaxOneByteChar)) {
164 165 166 167 168 169
    *cursor += 1;
    return first;
  }
  return CalculateValue(bytes, length, cursor);
}

170
unsigned Utf8::Length(uchar c, int previous) {
171 172 173 174 175
  if (c <= kMaxOneByteChar) {
    return 1;
  } else if (c <= kMaxTwoByteChar) {
    return 2;
  } else if (c <= kMaxThreeByteChar) {
176 177
    DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
    if (Utf16::IsSurrogatePair(previous, c)) {
178 179
      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
    }
180 181 182 183 184 185
    return 3;
  } else {
    return 4;
  }
}

clemensh's avatar
clemensh committed
186 187 188 189 190 191
bool Utf8::IsValidCharacter(uchar c) {
  return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
         (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
          c != kBadChar);
}

}  // namespace unibrow

#endif  // V8_STRINGS_UNICODE_INL_H_