// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_UNICODE_DECODER_H_
#define V8_UNICODE_DECODER_H_

#include <sys/types.h>
#include "src/globals.h"
#include "src/utils.h"

namespace unibrow {

14
class V8_EXPORT_PRIVATE Utf8DecoderBase {
15 16 17
 public:
  // Initialization done in subclass.
  inline Utf8DecoderBase();
18 19 20
  inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
                         const uint8_t* stream, size_t stream_length);
  inline size_t Utf16Length() const { return utf16_length_; }
21 22 23 24

 protected:
  // This reads all characters and sets the utf16_length_.
  // The first buffer_length utf16 chars are cached in the buffer.
25 26
  void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
             size_t stream_length);
27 28
  static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
                             uint16_t* data, size_t length);
29
  const uint8_t* unbuffered_start_;
30
  size_t unbuffered_length_;
31
  size_t utf16_length_;
32 33 34 35 36 37
  bool last_byte_of_buffer_unused_;

 private:
  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
};

38
template <size_t kBufferSize>
39 40 41
class Utf8Decoder : public Utf8DecoderBase {
 public:
  inline Utf8Decoder() {}
42 43 44
  inline Utf8Decoder(const char* stream, size_t length);
  inline void Reset(const char* stream, size_t length);
  inline size_t WriteUtf16(uint16_t* data, size_t length) const;
45 46 47 48 49 50 51 52

 private:
  uint16_t buffer_[kBufferSize];
};


// Creates an empty decoder: no unbuffered input, a UTF-16 length of zero and
// the whole cache considered usable.
Utf8DecoderBase::Utf8DecoderBase()
    : unbuffered_start_(NULL),
      unbuffered_length_(0),
      utf16_length_(0),
      last_byte_of_buffer_unused_(false) {}


58 59
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
                                 const uint8_t* stream, size_t stream_length) {
60 61 62 63
  Reset(buffer, buffer_length, stream, stream_length);
}


64 65
template <size_t kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
66 67 68 69
    : Utf8DecoderBase(buffer_, kBufferSize,
                      reinterpret_cast<const uint8_t*>(stream), length) {}


70 71
template <size_t kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
72 73 74 75 76
  Utf8DecoderBase::Reset(buffer_, kBufferSize,
                         reinterpret_cast<const uint8_t*>(stream), length);
}


77 78 79
template <size_t kBufferSize>
size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
                                            size_t length) const {
80 81 82
  DCHECK(length > 0);
  if (length > utf16_length_) length = utf16_length_;
  // memcpy everything in buffer.
83
  size_t buffer_length =
84
      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
85
  size_t memcpy_length = length <= buffer_length ? length : buffer_length;
86 87 88 89
  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
  if (length <= buffer_length) return length;
  DCHECK(unbuffered_start_ != NULL);
  // Copy the rest the slow way.
90
  WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
                 length - buffer_length);
  return length;
}

// Constants and helpers for the Latin-1 range.
class Latin1 {
 public:
  // Largest code unit value that is still a Latin-1 character.
  static const unsigned kMaxChar = 0xFF;
  // Maps a code unit above kMaxChar to Latin-1. Returns 0 if the character
  // does not convert to a single Latin-1 character, or if it does not convert
  // back to Latin-1 via the inverse operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c);
};


// Returns the Latin-1 counterpart of |c|, or 0 when there is none.
// |c| must lie outside Latin-1 (checked with DCHECK).
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  DCHECK(c > Latin1::kMaxChar);
  // These are equivalent characters in Unicode: GREEK CAPITAL LETTER MU
  // (U+039C) and GREEK SMALL LETTER MU (U+03BC) both map to MICRO SIGN
  // (U+00B5).
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 is the uppercase of a Latin-1 character (U+00FF) but lies outside
  // of Latin-1 itself.
  if (c == 0x178) return 0xff;
  return 0;
}


}  // namespace unibrow

#endif  // V8_UNICODE_DECODER_H_