scanner.cc 16.7 KB
Newer Older
1
// Copyright 2010 the V8 project authors. All rights reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "v8.h"

#include "ast.h"
31
#include "handles.h"
32
#include "scanner.h"
33
#include "unicode-inl.h"
34

35 36
namespace v8 {
namespace internal {
37 38

// ----------------------------------------------------------------------------
39 40 41 42 43 44 45 46 47
// BufferedUC16CharacterStreams

BufferedUC16CharacterStream::BufferedUC16CharacterStream()
    : UC16CharacterStream(),
      pushback_limit_(NULL) {
  // Initialize buffer as being empty. First read will fill the buffer.
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_;
}
lrn@chromium.org's avatar
lrn@chromium.org committed
48

49
BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
50

51 52 53 54 55
void BufferedUC16CharacterStream::PushBack(uc32 character) {
  if (character == kEndOfInput) {
    pos_--;
    return;
  }
56 57
  if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
    // buffer_ is writable, buffer_cursor_ is const pointer.
58
    buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
59 60
    pos_--;
    return;
61
  }
62
  SlowPushBack(static_cast<uc16>(character));
63 64
}

lrn@chromium.org's avatar
lrn@chromium.org committed
65

66 67 68 69 70 71 72 73 74 75 76 77 78
void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
  // In pushback mode, the end of the buffer contains pushback,
  // and the start of the buffer (from buffer start to pushback_limit_)
  // contains valid data that comes just after the pushback.
  // We NULL the pushback_limit_ if pushing all the way back to the
  // start of the buffer.

  if (pushback_limit_ == NULL) {
    // Enter pushback mode.
    pushback_limit_ = buffer_end_;
    buffer_end_ = buffer_ + kBufferSize;
    buffer_cursor_ = buffer_end_;
  }
79 80
  // Ensure that there is room for at least one pushback.
  ASSERT(buffer_cursor_ > buffer_);
81 82 83 84 85 86 87
  ASSERT(pos_ > 0);
  buffer_[--buffer_cursor_ - buffer_] = character;
  if (buffer_cursor_ == buffer_) {
    pushback_limit_ = NULL;
  } else if (buffer_cursor_ < pushback_limit_) {
    pushback_limit_ = buffer_cursor_;
  }
88 89 90
  pos_--;
}

lrn@chromium.org's avatar
lrn@chromium.org committed
91

92
bool BufferedUC16CharacterStream::ReadBlock() {
93
  buffer_cursor_ = buffer_;
94
  if (pushback_limit_ != NULL) {
95
    // Leave pushback mode.
96 97
    buffer_end_ = pushback_limit_;
    pushback_limit_ = NULL;
98 99 100 101
    // If there were any valid characters left at the
    // start of the buffer, use those.
    if (buffer_cursor_ < buffer_end_) return true;
    // Otherwise read a new block.
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
  }
  unsigned length = FillBuffer(pos_, kBufferSize);
  buffer_end_ = buffer_ + length;
  return length > 0;
}


unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
  // Leave pushback mode (i.e., ignore that there might be valid data
  // in the buffer before the pushback_limit_ point).
  pushback_limit_ = NULL;
  return BufferSeekForward(delta);
}

// ----------------------------------------------------------------------------
// GenericStringUC16CharacterStream


GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
    Handle<String> data,
    unsigned start_position,
    unsigned end_position)
    : string_(data),
      length_(end_position) {
  ASSERT(end_position >= start_position);
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_;
  pos_ = start_position;
}


GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }


unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
  unsigned old_pos = pos_;
  pos_ = Min(pos_ + delta, length_);
  ReadBlock();
  return pos_ - old_pos;
}


unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
                                                      unsigned length) {
  if (from_pos >= length_) return 0;
  if (from_pos + length > length_) {
    length = length_ - from_pos;
  }
  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
  return length;
}


// ----------------------------------------------------------------------------
// Utf8ToUC16CharacterStream
Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
                                                     unsigned length)
    : BufferedUC16CharacterStream(),
      raw_data_(data),
      raw_data_length_(length),
      raw_data_pos_(0),
      raw_character_position_(0) {
  ReadBlock();
}


Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }


unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
  unsigned old_pos = pos_;
  unsigned target_pos = pos_ + delta;
  SetRawPosition(target_pos);
  pos_ = raw_character_position_;
  ReadBlock();
  return pos_ - old_pos;
}


unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
                                               unsigned length) {
  static const unibrow::uchar kMaxUC16Character = 0xffff;
  SetRawPosition(char_position);
  if (raw_character_position_ != char_position) {
    // char_position was not a valid position in the stream (hit the end
    // while spooling to it).
    return 0u;
  }
  unsigned i = 0;
  while (i < length) {
    if (raw_data_pos_ == raw_data_length_) break;
    unibrow::uchar c = raw_data_[raw_data_pos_];
    if (c <= unibrow::Utf8::kMaxOneByteChar) {
      raw_data_pos_++;
    } else {
      c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
                                         raw_data_length_ - raw_data_pos_,
                                         &raw_data_pos_);
      // Don't allow characters outside of the BMP.
      if (c > kMaxUC16Character) {
        c = unibrow::Utf8::kBadChar;
      }
    }
    buffer_[i++] = static_cast<uc16>(c);
  }
  raw_character_position_ = char_position + i;
  return i;
}


static const byte kUtf8MultiByteMask = 0xC0;
static const byte kUtf8MultiByteCharStart = 0xC0;
static const byte kUtf8MultiByteCharFollower = 0x80;


#ifdef DEBUG
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
}
#endif


static bool IsUtf8MultiCharacterFollower(byte later_byte) {
  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
}


// Move the cursor back to point at the preceding UTF-8 character start
// in the buffer.
static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
  byte character = buffer[--*cursor];
  if (character > unibrow::Utf8::kMaxOneByteChar) {
    ASSERT(IsUtf8MultiCharacterFollower(character));
    // Last byte of a multi-byte character encoding. Step backwards until
    // pointing to the first byte of the encoding, recognized by having the
    // top two bits set.
    while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
    ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
  }
}


// Move the cursor forward to point at the next following UTF-8 character start
// in the buffer.
static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
  byte character = buffer[(*cursor)++];
  if (character > unibrow::Utf8::kMaxOneByteChar) {
    // First character of a multi-byte character encoding.
    // The number of most-significant one-bits determines the length of the
    // encoding:
    //  110..... - (0xCx, 0xDx) one additional byte (minimum).
    //  1110.... - (0xEx) two additional bytes.
    //  11110... - (0xFx) three additional bytes (maximum).
    ASSERT(IsUtf8MultiCharacterStart(character));
    // Additional bytes is:
    // 1 if value in range 0xC0 .. 0xDF.
    // 2 if value in range 0xE0 .. 0xEF.
    // 3 if value in range 0xF0 .. 0xF7.
    // Encode that in a single value.
    unsigned additional_bytes =
        ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
    *cursor += additional_bytes;
    ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
  }
}


void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
  if (raw_character_position_ > target_position) {
    // Spool backwards in utf8 buffer.
    do {
      Utf8CharacterBack(raw_data_, &raw_data_pos_);
      raw_character_position_--;
    } while (raw_character_position_ > target_position);
    return;
  }
  // Spool forwards in the utf8 buffer.
  while (raw_character_position_ < target_position) {
    if (raw_data_pos_ == raw_data_length_) return;
    Utf8CharacterForward(raw_data_, &raw_data_pos_);
    raw_character_position_++;
283 284 285
  }
}

lrn@chromium.org's avatar
lrn@chromium.org committed
286

287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
// ----------------------------------------------------------------------------
// ExternalTwoByteStringUC16CharacterStream

ExternalTwoByteStringUC16CharacterStream::
    ~ExternalTwoByteStringUC16CharacterStream() { }


ExternalTwoByteStringUC16CharacterStream
    ::ExternalTwoByteStringUC16CharacterStream(
        Handle<ExternalTwoByteString> data,
        int start_position,
        int end_position)
    : UC16CharacterStream(),
      source_(data),
      raw_data_(data->GetTwoByteData(start_position)) {
  buffer_cursor_ = raw_data_,
  buffer_end_ = raw_data_ + (end_position - start_position);
  pos_ = start_position;
305 306
}

lrn@chromium.org's avatar
lrn@chromium.org committed
307

308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
// ----------------------------------------------------------------------------
// Scanner::LiteralScope

Scanner::LiteralScope::LiteralScope(Scanner* self)
    : scanner_(self), complete_(false) {
  self->StartLiteral();
}


Scanner::LiteralScope::~LiteralScope() {
  if (!complete_) scanner_->DropLiteral();
}


void Scanner::LiteralScope::Complete() {
  scanner_->TerminateLiteral();
  complete_ = true;
}

327

328
// ----------------------------------------------------------------------------
329
// V8JavaScriptScanner
330

331
V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }
lrn@chromium.org's avatar
lrn@chromium.org committed
332

333

334
void V8JavaScriptScanner::Initialize(UC16CharacterStream* source) {
335 336 337
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
338 339 340 341 342 343
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
344 345 346
}


347 348
// ----------------------------------------------------------------------------
// JsonScanner
349

350
JsonScanner::JsonScanner() : Scanner() { }
351

352

353 354
void JsonScanner::Initialize(UC16CharacterStream* source) {
  source_ = source;
355 356 357 358 359
  Init();
  // Skip initial whitespace.
  SkipJsonWhiteSpace();
  // Preload first token as look-ahead.
  ScanJson();
360 361
}

lrn@chromium.org's avatar
lrn@chromium.org committed
362

363
Token::Value JsonScanner::Next() {
364 365 366 367 368
  // BUG 1215673: Find a thread safe way to set a stack limit in
  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
  // threads.
  current_ = next_;
  // Check for stack-overflow before returning any tokens.
369
  ScanJson();
370 371 372
  return current_.token;
}

lrn@chromium.org's avatar
lrn@chromium.org committed
373

374
bool JsonScanner::SkipJsonWhiteSpace() {
375 376 377 378 379 380 381 382 383
  int start_position = source_pos();
  // JSON WhiteSpace is tab, carrige-return, newline and space.
  while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
    Advance();
  }
  return source_pos() != start_position;
}


384
void JsonScanner::ScanJson() {
385
  next_.literal_chars = NULL;
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();
    switch (c0_) {
      case '\t':
      case '\r':
      case '\n':
      case ' ':
        Advance();
        token = Token::WHITESPACE;
        break;
      case '{':
        Advance();
        token = Token::LBRACE;
        break;
      case '}':
        Advance();
        token = Token::RBRACE;
        break;
      case '[':
        Advance();
        token = Token::LBRACK;
        break;
      case ']':
        Advance();
        token = Token::RBRACK;
        break;
      case ':':
        Advance();
        token = Token::COLON;
        break;
      case ',':
        Advance();
        token = Token::COMMA;
        break;
      case '"':
        token = ScanJsonString();
        break;
      case '-':
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
        token = ScanJsonNumber();
        break;
      case 't':
        token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
        break;
      case 'f':
        token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
        break;
      case 'n':
        token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
        break;
      default:
        if (c0_ < 0) {
          Advance();
          token = Token::EOS;
        } else {
          Advance();
          token = Select(Token::ILLEGAL);
        }
    }
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}


463
Token::Value JsonScanner::ScanJsonString() {
464 465
  ASSERT_EQ('"', c0_);
  Advance();
466
  LiteralScope literal(this);
467
  while (c0_ != '"') {
468 469 470
    // Check for control character (0x00-0x1f) or unterminated string (<0).
    if (c0_ < 0x20) return Token::ILLEGAL;
    if (c0_ != '\\') {
471
      AddLiteralCharAdvance();
472 473 474 475 476 477
    } else {
      Advance();
      switch (c0_) {
        case '"':
        case '\\':
        case '/':
478
          AddLiteralChar(c0_);
479 480
          break;
        case 'b':
481
          AddLiteralChar('\x08');
482 483
          break;
        case 'f':
484
          AddLiteralChar('\x0c');
485 486
          break;
        case 'n':
487
          AddLiteralChar('\x0a');
488 489
          break;
        case 'r':
490
          AddLiteralChar('\x0d');
491 492
          break;
        case 't':
493
          AddLiteralChar('\x09');
494 495 496 497 498 499
          break;
        case 'u': {
          uc32 value = 0;
          for (int i = 0; i < 4; i++) {
            Advance();
            int digit = HexValue(c0_);
500 501 502
            if (digit < 0) {
              return Token::ILLEGAL;
            }
503 504
            value = value * 16 + digit;
          }
505
          AddLiteralChar(value);
506 507 508 509 510 511 512 513
          break;
        }
        default:
          return Token::ILLEGAL;
      }
      Advance();
    }
  }
514
  literal.Complete();
515 516 517 518 519
  Advance();
  return Token::STRING;
}


520
Token::Value JsonScanner::ScanJsonNumber() {
521
  LiteralScope literal(this);
522 523 524 525 526 527
  bool negative = false;

  if (c0_ == '-') {
    AddLiteralCharAdvance();
    negative = true;
  }
528
  if (c0_ == '0') {
529
    AddLiteralCharAdvance();
530 531 532 533
    // Prefix zero is only allowed if it's the only digit before
    // a decimal point or exponent.
    if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
  } else {
534 535
    int i = 0;
    int digits = 0;
536 537
    if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
    do {
538 539
      i = i * 10 + c0_ - '0';
      digits++;
540
      AddLiteralCharAdvance();
541
    } while (c0_ >= '0' && c0_ <= '9');
542 543 544 545
    if (c0_ != '.' && c0_ != 'e' && c0_ != 'E' && digits < 10) {
      number_ = (negative ? -i : i);
      return Token::NUMBER;
    }
546 547
  }
  if (c0_ == '.') {
548
    AddLiteralCharAdvance();
549 550
    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
    do {
551
      AddLiteralCharAdvance();
552 553
    } while (c0_ >= '0' && c0_ <= '9');
  }
554
  if (AsciiAlphaToLower(c0_) == 'e') {
555 556
    AddLiteralCharAdvance();
    if (c0_ == '-' || c0_ == '+') AddLiteralCharAdvance();
557 558
    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
    do {
559
      AddLiteralCharAdvance();
560 561
    } while (c0_ >= '0' && c0_ <= '9');
  }
562
  literal.Complete();
563 564 565 566
  ASSERT_NOT_NULL(next_.literal_chars);
  number_ = StringToDouble(next_.literal_chars->ascii_literal(),
                           NO_FLAGS,  // Hex, octal or trailing junk.
                           OS::nan_value());
567 568 569 570
  return Token::NUMBER;
}


571 572
Token::Value JsonScanner::ScanJsonIdentifier(const char* text,
                                             Token::Value token) {
573
  LiteralScope literal(this);
574 575 576 577 578
  while (*text != '\0') {
    if (c0_ != *text) return Token::ILLEGAL;
    Advance();
    text++;
  }
579
  if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
580
  literal.Complete();
581 582 583 584
  return token;
}


585

lrn@chromium.org's avatar
lrn@chromium.org committed
586
} }  // namespace v8::internal