asm-scanner.cc 10.6 KB
Newer Older
1 2 3 4 5 6
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/asmjs/asm-scanner.h"

7 8
#include <cinttypes>

9
#include "src/flags/flags.h"
10
#include "src/numbers/conversions.h"
11
#include "src/parsing/scanner.h"
12
#include "src/strings/char-predicates-inl.h"
13 14 15 16 17 18 19

namespace v8 {
namespace internal {

namespace {
// Cap number of identifiers to ensure we can assign both global and
// local ones a token id in the range of an int32_t.
// (The unnamed namespace already gives internal linkage, so no `static`.)
constexpr int kMaxIdentifierCount = 0xF000000;
}  // namespace
22

23
// Constructs a scanner reading from |stream|.
//
// All token/position bookkeeping starts out as kUninitialized / 0. The
// property-name table is seeded from the STDLIB_* X-macro lists and the
// global-name table from KEYWORD_NAME_LIST, each entry mapping the
// stringified name to its kToken_<name> id. Finally the first token is
// scanned so that the scanner is immediately positioned on a token.
AsmJsScanner::AsmJsScanner(Utf16CharacterStream* stream)
    : stream_(stream),
      token_(kUninitialized),
      preceding_token_(kUninitialized),
      next_token_(kUninitialized),
      position_(0),
      preceding_position_(0),
      next_position_(0),
      rewind_(false),
      in_local_scope_(false),
      global_count_(0),
      double_value_(0.0),
      unsigned_value_(0),
      preceded_by_newline_(false) {
  // Names that are only recognized after a '.' (see ConsumeIdentifier).
#define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
  STDLIB_MATH_FUNCTION_LIST(V)
  STDLIB_ARRAY_TYPE_LIST(V)
#undef V
#define V(name, _junk1) property_names_[#name] = kToken_##name;
  STDLIB_MATH_VALUE_LIST(V)
#undef V
#define V(name) property_names_[#name] = kToken_##name;
  STDLIB_OTHER_LIST(V)
#undef V
  // Keywords live in the global name table.
#define V(name) global_names_[#name] = kToken_##name;
  KEYWORD_NAME_LIST(V)
#undef V
  Next();
}

53
void AsmJsScanner::Next() {
54 55
  if (rewind_) {
    preceding_token_ = token_;
56
    preceding_position_ = position_;
57
    token_ = next_token_;
58
    position_ = next_position_;
59
    next_token_ = kUninitialized;
60
    next_position_ = 0;
61 62 63 64 65 66 67 68 69 70 71 72 73
    rewind_ = false;
    return;
  }

  if (token_ == kEndOfInput || token_ == kParseError) {
    return;
  }

#if DEBUG
  if (FLAG_trace_asm_scanner) {
    if (Token() == kDouble) {
      PrintF("%lf ", AsDouble());
    } else if (Token() == kUnsigned) {
74
      PrintF("%" PRIu32 " ", AsUnsigned());
75 76 77 78 79 80 81 82 83
    } else {
      std::string name = Name(Token());
      PrintF("%s ", name.c_str());
    }
  }
#endif

  preceded_by_newline_ = false;
  preceding_token_ = token_;
84 85
  preceding_position_ = position_;

86
  for (;;) {
87 88
    position_ = stream_->pos();
    uc32 ch = stream_->Advance();
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
    switch (ch) {
      case ' ':
      case '\t':
      case '\r':
        // Ignore whitespace.
        break;

      case '\n':
        // Track when we've passed a newline for optional semicolon support,
        // but keep scanning.
        preceded_by_newline_ = true;
        break;

      case kEndOfInput:
        token_ = kEndOfInput;
        return;

      case '\'':
      case '"':
108
        ConsumeString(ch);
109 110 111
        return;

      case '/':
112
        ch = stream_->Advance();
113
        if (ch == '/') {
114
          ConsumeCPPComment();
115
        } else if (ch == '*') {
116
          if (!ConsumeCComment()) {
117 118 119 120
            token_ = kParseError;
            return;
          }
        } else {
121
          stream_->Back();
122 123 124 125 126 127 128 129 130 131 132
          token_ = '/';
          return;
        }
        // Breaks out of switch, but loops again (i.e. the case when we parsed
        // a comment, but need to continue to look for the next token).
        break;

      case '<':
      case '>':
      case '=':
      case '!':
133
        ConsumeCompareOrShift(ch);
134 135 136 137 138 139 140 141 142 143 144
        return;

#define V(single_char_token) case single_char_token:
        SIMPLE_SINGLE_TOKEN_LIST(V)
#undef V
        // Use fixed token IDs for ASCII.
        token_ = ch;
        return;

      default:
        if (IsIdentifierStart(ch)) {
145
          ConsumeIdentifier(ch);
146
        } else if (IsNumberStart(ch)) {
147
          ConsumeNumber(ch);
148 149 150 151 152 153 154 155 156 157
        } else {
          // TODO(bradnelson): Support unicode (probably via UnicodeCache).
          token_ = kParseError;
        }
        return;
    }
  }
}

void AsmJsScanner::Rewind() {
158
  DCHECK_NE(kUninitialized, preceding_token_);
159 160 161
  // TODO(bradnelson): Currently rewinding needs to leave in place the
  // preceding newline state (in case a |0 ends a line).
  // This is weird and stateful, fix me.
162 163
  DCHECK(!rewind_);
  next_token_ = token_;
164
  next_position_ = position_;
165
  token_ = preceding_token_;
166
  position_ = preceding_position_;
167
  preceding_token_ = kUninitialized;
168
  preceding_position_ = 0;
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
  rewind_ = true;
  identifier_string_.clear();
}

// Drops every entry from the local name table.
void AsmJsScanner::ResetLocals() {
  local_names_.clear();
}

#if DEBUG
// Only used for debugging.
//
// Returns a printable name for |token|:
//  - codes in [32, 127) are returned as the single character they encode;
//  - otherwise the local, global and property name tables are searched (in
//    that order) for an entry whose value is |token|;
//  - otherwise the long-symbol and special-token lists are consulted.
// A token matching none of these hits UNREACHABLE().
std::string AsmJsScanner::Name(token_t token) const {
  if (token >= 32 && token < 127) {
    return std::string(1, static_cast<char>(token));
  }
  // The tables map name -> token, so a reverse lookup is a linear scan.
  for (const auto& entry : local_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  for (const auto& entry : global_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  for (const auto& entry : property_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  switch (token) {
#define V(rawname, name) \
  case kToken_##name:    \
    return rawname;
    LONG_SYMBOL_NAME_LIST(V)
#undef V
#define V(name, value, string_name) \
  case name:                        \
    return string_name;
    SPECIAL_TOKEN_LIST(V)
#undef V
    default:
      break;
  }
  UNREACHABLE();
}
#endif

214
void AsmJsScanner::Seek(size_t pos) {
215
  stream_->Seek(pos);
216 217 218
  preceding_token_ = kUninitialized;
  token_ = kUninitialized;
  next_token_ = kUninitialized;
219 220 221
  preceding_position_ = 0;
  position_ = 0;
  next_position_ = 0;
222 223 224 225 226 227 228 229 230
  rewind_ = false;
  Next();
}

// Scans an identifier whose first character |ch| has already been read and
// sets token_ to its id.
//
// The spelling is accumulated in identifier_string_. Resolution depends on
// context:
//  - directly after a '.', only the property name table is consulted;
//  - otherwise the local name table is consulted first and, when not in a
//    local scope, the global name table second.
// An identifier that is not found is registered on the spot: properties and
// globals receive ids counting up from kGlobalsStart (sharing global_count_),
// locals receive ids counting down from kLocalsStart. The number of ids handed
// out is capped by kMaxIdentifierCount via CHECK.
void AsmJsScanner::ConsumeIdentifier(uc32 ch) {
  // Consume characters while still part of the identifier.
  identifier_string_.clear();
  while (IsIdentifierPart(ch)) {
    identifier_string_ += ch;
    ch = stream_->Advance();
  }
  // Go back one for next time.
  stream_->Back();

  // Decode what the identifier means.
  if (preceding_token_ == '.') {
    auto i = property_names_.find(identifier_string_);
    if (i != property_names_.end()) {
      token_ = i->second;
      return;
    }
  } else {
    {
      auto i = local_names_.find(identifier_string_);
      if (i != local_names_.end()) {
        token_ = i->second;
        return;
      }
    }
    if (!in_local_scope_) {
      auto i = global_names_.find(identifier_string_);
      if (i != global_names_.end()) {
        token_ = i->second;
        return;
      }
    }
  }

  // Not seen before: allocate a fresh token id in the matching table.
  if (preceding_token_ == '.') {
    CHECK_LT(global_count_, kMaxIdentifierCount);
    token_ = kGlobalsStart + global_count_++;
    property_names_[identifier_string_] = token_;
  } else if (in_local_scope_) {
    CHECK_LT(local_names_.size(), kMaxIdentifierCount);
    token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
    local_names_[identifier_string_] = token_;
  } else {
    CHECK_LT(global_count_, kMaxIdentifierCount);
    token_ = kGlobalsStart + global_count_++;
    global_names_[identifier_string_] = token_;
  }
}

// Scans a numeric literal (or a lone '.') whose first character |ch| has
// already been read.
//
// Outcome, reported through token_:
//  - '.'          the input was just a dot, or a dot followed by characters
//                 that do not form a number (those characters are un-read);
//  - kUnsigned    a literal without '.', value stored in unsigned_value_;
//  - kDouble      a literal containing '.', value stored in double_value_;
//  - kParseError  the collected text does not convert to a number, or an
//                 integer literal exceeds kMaxUInt32.
void AsmJsScanner::ConsumeNumber(uc32 ch) {
  std::string number;
  number.assign(1, ch);
  bool has_dot = ch == '.';
  bool has_prefix = false;
  // Greedily collect everything that could belong to a number; validation is
  // left to StringToDouble below.
  for (;;) {
    ch = stream_->Advance();
    if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
        (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'b' || ch == 'o' ||
        ch == 'x' ||
        // A sign is only accepted directly after an exponent marker, and
        // never once a b/o/x prefix character has been seen.
        ((ch == '-' || ch == '+') && !has_prefix &&
         (number.back() == 'e' || number.back() == 'E'))) {
      // TODO(bradnelson): Test weird cases ending in -.
      if (ch == '.') {
        has_dot = true;
      }
      if (ch == 'b' || ch == 'o' || ch == 'x') {
        has_prefix = true;
      }
      number.push_back(ch);
    } else {
      break;
    }
  }
  // Un-read the character that terminated the literal.
  stream_->Back();
  // Special case the most common number.
  if (number.size() == 1 && number[0] == '0') {
    unsigned_value_ = 0;
    token_ = kUnsigned;
    return;
  }
  // Pick out dot.
  if (number.size() == 1 && number[0] == '.') {
    token_ = '.';
    return;
  }
  // Decode numbers.
  double_value_ = StringToDouble(
      Vector<const uint8_t>::cast(VectorOf(number)),
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_BINARY | ALLOW_IMPLICIT_OCTAL);
  if (std::isnan(double_value_)) {
    // Check if string to number conversion didn't consume all the characters.
    // This happens if the character filter let through something invalid
    // like: 0123ef for example.
    // TODO(bradnelson): Check if this happens often enough to be a perf
    // problem.
    if (number[0] == '.') {
      // Give back everything after the leading dot so that only the '.'
      // itself has been consumed.
      for (size_t k = 1; k < number.size(); ++k) {
        stream_->Back();
      }
      token_ = '.';
      return;
    }
    // Anything else that doesn't parse is an error.
    token_ = kParseError;
    return;
  }
  if (has_dot) {
    token_ = kDouble;
  } else {
    // Exceeding safe integer range is an error.
    if (double_value_ > static_cast<double>(kMaxUInt32)) {
      token_ = kParseError;
      return;
    }
    unsigned_value_ = static_cast<uint32_t>(double_value_);
    token_ = kUnsigned;
  }
}

bool AsmJsScanner::ConsumeCComment() {
  for (;;) {
347
    uc32 ch = stream_->Advance();
348
    while (ch == '*') {
349
      ch = stream_->Advance();
350 351 352 353
      if (ch == '/') {
        return true;
      }
    }
354 355 356
    if (ch == '\n') {
      preceded_by_newline_ = true;
    }
357 358 359 360 361 362 363 364
    if (ch == kEndOfInput) {
      return false;
    }
  }
}

void AsmJsScanner::ConsumeCPPComment() {
  for (;;) {
365
    uc32 ch = stream_->Advance();
366 367 368 369 370
    if (ch == '\n') {
      preceded_by_newline_ = true;
      return;
    }
    if (ch == kEndOfInput) {
371 372 373 374 375 376 377 378 379
      return;
    }
  }
}

// Scans a string literal whose opening |quote| character has already been
// consumed.
//
// The contents must be exactly "use asm" and the literal must be closed by
// the same quote character that opened it; then token_ becomes kToken_UseAsm.
// Any deviation yields kParseError.
void AsmJsScanner::ConsumeString(uc32 quote) {
  // Only string allowed is 'use asm' / "use asm".
  const char* expected = "use asm";
  for (; *expected != '\0'; ++expected) {
    if (stream_->Advance() != *expected) {
      token_ = kParseError;
      return;
    }
  }
  if (stream_->Advance() != quote) {
    token_ = kParseError;
    return;
  }
  token_ = kToken_UseAsm;
}

// Scans an operator starting with |ch|, which is one of '<', '>', '=' or '!'
// and has already been consumed.
//
// Recognized multi-character forms:
//   <=  kToken_LE     >=  kToken_GE     ==  kToken_EQ     !=  kToken_NE
//   <<  kToken_SHL    >>  kToken_SAR    >>> kToken_SHR
// Anything else produces the single-character token |ch|, with the lookahead
// character returned to the stream.
void AsmJsScanner::ConsumeCompareOrShift(uc32 ch) {
  uc32 next_ch = stream_->Advance();
  if (next_ch == '=') {
    switch (ch) {
      case '<':
        token_ = kToken_LE;
        break;
      case '>':
        token_ = kToken_GE;
        break;
      case '=':
        token_ = kToken_EQ;
        break;
      case '!':
        token_ = kToken_NE;
        break;
      default:
        UNREACHABLE();
    }
  } else if (ch == '<' && next_ch == '<') {
    token_ = kToken_SHL;
  } else if (ch == '>' && next_ch == '>') {
    // Needs one more character to tell ">>>" from ">>".
    if (stream_->Advance() == '>') {
      token_ = kToken_SHR;
    } else {
      token_ = kToken_SAR;
      stream_->Back();
    }
  } else {
    stream_->Back();
    token_ = ch;
  }
}

// Returns whether |ch| may begin an identifier: its AsciiAlphaToLower() image
// lies in 'a'..'z', or it is '_' or '$'.
bool AsmJsScanner::IsIdentifierStart(uc32 ch) {
  return base::IsInRange(AsciiAlphaToLower(ch), 'a', 'z') || ch == '_' ||
         ch == '$';
}

431
// Returns whether |ch| may continue an identifier; this is decided entirely
// by IsAsciiIdentifier().
bool AsmJsScanner::IsIdentifierPart(uc32 ch) {
  return IsAsciiIdentifier(ch);
}
432 433

// Returns whether |ch| may begin a numeric literal: a decimal digit or '.'.
// (A lone '.' is later turned back into the '.' token by ConsumeNumber.)
bool AsmJsScanner::IsNumberStart(uc32 ch) {
  return ch == '.' || IsDecimalDigit(ch);
}

}  // namespace internal
}  // namespace v8