// utf8-decoder.h
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// The remapped transition table is justified at
// https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14

#include <stdint.h>

#ifndef __UTF8_DFA_DECODER_H
#define __UTF8_DFA_DECODER_H

namespace Utf8DfaDecoder {

// States of the UTF-8 decoding automaton.
//
// The numeric value of each state is the offset of its 12-entry row inside
// the flat successor table used by Decode(), which is why the values advance
// in steps of 12.
enum State : uint8_t {
  // An ill-formed sequence was seen. Every transition out of this state
  // leads back to it.
  kReject = 0,
  // No multi-byte sequence is in progress. ASCII and lead bytes are only
  // valid in this state.
  kAccept = 12,
  // One more continuation byte (80-BF) is required.
  kTwoByte = 24,
  // Two more continuation bytes are required; the next may be any of 80-BF.
  kThreeByte = 36,
  // Entered after ED: the next byte is restricted to 80-9F.
  kThreeByteLowMid = 48,
  // Entered after F1-F3: three more bytes, the next may be any of 80-BF.
  kFourByte = 60,
  // Entered after F4: the next byte is restricted to 80-8F.
  kFourByteLow = 72,
  // Entered after E0: the next byte is restricted to A0-BF.
  kThreeByteHigh = 84,
  // Entered after F0: the next byte is restricted to 90-BF.
  kFourByteMidHigh = 96,
};

// Feeds a single byte into the decoder.
//
//   byte    The next byte of the input.
//   state   In/out. Current automaton state; overwritten with the successor.
//   buffer  In/out. Code point accumulator. The previous contents are shifted
//           left by six bits and the payload bits of |byte| are OR-ed in.
//           The accumulator is never cleared here, so the caller has to zero
//           it before the first byte of each new code point.
static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
  // Character class (0..11) of every possible byte value. The class picks
  // the column of the successor table and also drives the payload mask.
  static constexpr uint8_t kByteClass[] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x00-0x0F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x10-0x1F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x20-0x2F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x30-0x3F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x40-0x4F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x50-0x5F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x60-0x6F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    // 0x70-0x7F
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    // 0x80-0x8F
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    // 0x90-0x9F
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    // 0xA0-0xAF
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    // 0xB0-0xBF
      9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,    // 0xC0-0xCF
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,    // 0xD0-0xDF
      10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5,   // 0xE0-0xEF
      11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,   // 0xF0-0xFF
  };

  // Successor state for every (state, class) pair, stored row by row with
  // twelve columns per state. Column legend (class -> bytes):
  //    0: 00-7F           4: C2-DF              8: F4
  //    1: 80-8F           5: E1-EC, EE, EF      9: C0, C1, F5-FF
  //    2: 90-9F           6: ED                10: E0
  //    3: A0-BF           7: F1-F3             11: F0
  static constexpr uint8_t kSuccessor[] = {
      // 0   1   2   3   4   5   6   7   8  9  10  11
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // row  0: kReject
      12, 0,  0,  0,  24, 36, 48, 60, 72, 0, 84, 96,  // row 12: kAccept
      0,  12, 12, 12, 0,  0,  0,  0,  0,  0, 0,  0,   // row 24: kTwoByte
      0,  24, 24, 24, 0,  0,  0,  0,  0,  0, 0,  0,   // row 36: kThreeByte
      0,  24, 24, 0,  0,  0,  0,  0,  0,  0, 0,  0,   // row 48: kThreeByteLowMid
      0,  36, 36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // row 60: kFourByte
      0,  36, 0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // row 72: kFourByteLow
      0,  0,  0,  24, 0,  0,  0,  0,  0,  0, 0,  0,   // row 84: kThreeByteHigh
      0,  0,  36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // row 96: kFourByteMidHigh
  };

  const uint8_t byte_class = kByteClass[byte];

  // The state value is already the row offset, so adding the class (the
  // column) yields the index into the flat table.
  const unsigned cell = static_cast<unsigned>(*state) + byte_class;
  *state = static_cast<State>(kSuccessor[cell]);

  // The classes are numbered so that halving the class gives the number of
  // high bits to strip from 0x7F; for the valid classes the resulting mask
  // removes the length/continuation prefix and keeps the payload bits.
  const uint32_t payload_mask = 0x7Fu >> (byte_class >> 1);
  *buffer = (*buffer << 6) | (byte & payload_mask);
}

}  // namespace Utf8DfaDecoder

#endif /* __UTF8_DFA_DECODER_H */