dateparser-inl.h 13.1 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6 7
#ifndef V8_DATEPARSER_INL_H_
#define V8_DATEPARSER_INL_H_

8
#include "src/char-predicates-inl.h"
9
#include "src/dateparser.h"
10
#include "src/unicode-cache-inl.h"
11

12 13
namespace v8 {
namespace internal {
14 15

template <typename Char>
16 17
bool DateParser::Parse(Isolate* isolate, Vector<Char> str, FixedArray* out) {
  UnicodeCache* unicode_cache = isolate->unicode_cache();
18
  DCHECK(out->length() >= OUTPUT_SIZE);
19
  InputReader<Char> in(unicode_cache, str);
20
  DateStringTokenizer<Char> scanner(&in);
21 22 23 24
  TimeZoneComposer tz;
  TimeComposer time;
  DayComposer day;

25
  // Specification:
26
  // Accept ES5 ISO 8601 date-time-strings or legacy dates compatible
27
  // with Safari.
28
  // ES5 ISO 8601 dates:
29 30 31 32 33 34 35 36 37 38 39 40 41 42
  //   [('-'|'+')yy]yyyy[-MM[-DD]][THH:mm[:ss[.sss]][Z|(+|-)hh:mm]]
  //   where yyyy is in the range 0000..9999 and
  //         +/-yyyyyy is in the range -999999..+999999 -
  //           but -000000 is invalid (year zero must be positive),
  //         MM is in the range 01..12,
  //         DD is in the range 01..31,
  //         MM and DD defaults to 01 if missing,,
  //         HH is generally in the range 00..23, but can be 24 if mm, ss
  //           and sss are zero (or missing), representing midnight at the
  //           end of a day,
  //         mm and ss are in the range 00..59,
  //         sss is in the range 000..999,
  //         hh is in the range 00..23,
  //         mm, ss, and sss default to 00 if missing, and
43 44
  //         timezone defaults to Z if missing
  //           (following Safari, ISO actually demands local time).
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
  //  Extensions:
  //   We also allow sss to have more or less than three digits (but at
  //   least one).
  //   We allow hh:mm to be specified as hhmm.
  // Legacy dates:
  //  Any unrecognized word before the first number is ignored.
  //  Parenthesized text is ignored.
  //  An unsigned number followed by ':' is a time value, and is
  //  added to the TimeComposer. A number followed by '::' adds a second
  //  zero as well. A number followed by '.' is also a time and must be
  //  followed by milliseconds.
  //  Any other number is a date component and is added to DayComposer.
  //  A month name (or really: any word having the same first three letters
  //  as a month name) is recorded as a named month in the Day composer.
  //  A word recognizable as a time-zone is recorded as such, as is
  //  '(+|-)(hhmm|hh:)'.
  //  Legacy dates don't allow extra signs ('+' or '-') or umatched ')'
  //  after a number has been read (before the first number, any garbage
  //  is allowed).
  // Intersection of the two:
  //  A string that matches both formats (e.g. 1970-01-01) will be
66 67 68 69
  //  parsed as an ES5 date-time string - which means it will default
  //  to UTC time-zone. That's unavoidable if following the ES5
  //  specification.
  //  After a valid "T" has been read while scanning an ES5 datetime string,
70 71 72
  //  the input can no longer be a valid legacy date, since the "T" is a
  //  garbage string after a number has been read.

73 74
  // First try getting as far as possible with as ES5 Date Time String.
  DateToken next_unhandled_token = ParseES5DateTime(&scanner, &day, &time, &tz);
75 76 77
  if (next_unhandled_token.IsInvalid()) return false;
  bool has_read_number = !day.IsEmpty();
  // If there's anything left, continue with the legacy parser.
78
  bool legacy_parser = false;
79 80 81 82
  for (DateToken token = next_unhandled_token;
       !token.IsEndOfInput();
       token = scanner.Next()) {
    if (token.IsNumber()) {
83
      legacy_parser = true;
84 85 86 87
      has_read_number = true;
      int n = token.number();
      if (scanner.SkipSymbol(':')) {
        if (scanner.SkipSymbol(':')) {
88 89 90 91 92 93 94
          // n + "::"
          if (!time.IsEmpty()) return false;
          time.Add(n);
          time.Add(0);
        } else {
          // n + ":"
          if (!time.Add(n)) return false;
95
          if (scanner.Peek().IsSymbol('.')) scanner.Next();
96
        }
97
      } else if (scanner.SkipSymbol('.') && time.IsExpecting(n)) {
98
        time.Add(n);
99 100 101
        if (!scanner.Peek().IsNumber()) return false;
        int n = ReadMilliseconds(scanner.Next());
        if (n < 0) return false;
102
        time.AddFinal(n);
103 104 105 106
      } else if (tz.IsExpecting(n)) {
        tz.SetAbsoluteMinute(n);
      } else if (time.IsExpecting(n)) {
        time.AddFinal(n);
107 108
        // Require end, white space, "Z", "+" or "-" immediately after
        // finalizing time.
109 110 111 112 113
        DateToken peek = scanner.Peek();
        if (!peek.IsEndOfInput() &&
            !peek.IsWhiteSpace() &&
            !peek.IsKeywordZ() &&
            !peek.IsAsciiSign()) return false;
114 115
      } else {
        if (!day.Add(n)) return false;
116
        scanner.SkipSymbol('-');
117
      }
118
    } else if (token.IsKeyword()) {
119
      legacy_parser = true;
120
      // Parse a "word" (sequence of chars. >= 'A').
121 122
      KeywordType type = token.keyword_type();
      int value = token.keyword_value();
123
      if (type == AM_PM && !time.IsEmpty()) {
124
        time.SetHourOffset(value);
125
      } else if (type == MONTH_NAME) {
126 127 128 129
        day.SetNamedMonth(value);
        scanner.SkipSymbol('-');
      } else if (type == TIME_ZONE_NAME && has_read_number) {
        tz.Set(value);
130 131
      } else {
        // Garbage words are illegal if a number has been read.
132
        if (has_read_number) return false;
133 134 135
        // The first number has to be separated from garbage words by
        // whitespace or other separators.
        if (scanner.Peek().IsNumber()) return false;
136
      }
137
    } else if (token.IsAsciiSign() && (tz.IsUTC() || !time.IsEmpty())) {
138
      legacy_parser = true;
139
      // Parse UTC offset (only after UTC or time).
140 141 142
      tz.SetSign(token.ascii_sign());
      // The following number may be empty.
      int n = 0;
143
      int length = 0;
144
      if (scanner.Peek().IsNumber()) {
145 146 147
        DateToken token = scanner.Next();
        length = token.length();
        n = token.number();
148 149 150 151
      }
      has_read_number = true;

      if (scanner.Peek().IsSymbol(':')) {
152
        tz.SetAbsoluteHour(n);
153
        // TODO(littledan): Use minutes as part of timezone?
154
        tz.SetAbsoluteMinute(kNone);
155 156 157 158 159 160
      } else if (length == 2 || length == 1) {
        // Handle time zones like GMT-8
        tz.SetAbsoluteHour(n);
        tz.SetAbsoluteMinute(0);
      } else if (length == 4 || length == 3) {
        // Looks like the hhmm format
161 162
        tz.SetAbsoluteHour(n / 100);
        tz.SetAbsoluteMinute(n % 100);
163 164 165
      } else {
        // No need to accept time zones like GMT-12345
        return false;
166
      }
167 168
    } else if ((token.IsAsciiSign() || token.IsSymbol(')')) &&
               has_read_number) {
169 170 171
      // Extra sign or ')' is illegal if a number has been read.
      return false;
    } else {
172
      // Ignore other characters and whitespace.
173 174
    }
  }
175

176 177 178 179 180 181 182
  bool success = day.Write(out) && time.Write(out) && tz.Write(out);

  if (legacy_parser && success) {
    isolate->CountUsage(v8::Isolate::kLegacyDateParser);
  }

  return success;
183 184
}

185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200

// Reads one token from the underlying input reader and returns it.
//
// Token kinds are tried in this order: end of input, unsigned number,
// one of the symbols ':', '-', '+', '.', ')', keyword (word), white space,
// parenthesized text. Anything else consumes a single character and yields
// an unknown token.
template<typename CharType>
DateParser::DateToken DateParser::DateStringTokenizer<CharType>::Scan() {
  const int start = in_->position();
  if (in_->IsEnd()) return DateToken::EndOfInput();

  // A number token carries its value and how far the reader advanced.
  if (in_->IsAsciiDigit()) {
    const int value = in_->ReadUnsignedNumeral();
    return DateToken::Number(value, in_->position() - start);
  }

  // Single-character symbols, checked in the order listed in the string.
  for (const char* symbol = ":-+.)"; *symbol != '\0'; ++symbol) {
    if (in_->Skip(*symbol)) return DateToken::Symbol(*symbol);
  }

  // A word: only its first three characters are kept for the table lookup,
  // while the token records the length returned by ReadWord.
  if (in_->IsAsciiAlphaOrAbove()) {
    DCHECK_EQ(KeywordTable::kPrefixLength, 3);
    uint32_t prefix[3] = {0, 0, 0};
    const int word_length = in_->ReadWord(prefix, 3);
    const int entry = KeywordTable::Lookup(prefix, word_length);
    return DateToken::Keyword(KeywordTable::GetType(entry),
                              KeywordTable::GetValue(entry),
                              word_length);
  }

  if (in_->SkipWhiteSpace()) {
    return DateToken::WhiteSpace(in_->position() - start);
  }

  // Parenthesized text and any other single character are both reported as
  // an unknown token; the latter still has to be consumed explicitly.
  if (!in_->SkipParentheses()) in_->Next();
  return DateToken::Unknown();
}


220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
// If the current character is white space or a line terminator, advances
// past it (one character only) and returns true; otherwise leaves the
// reader untouched and returns false.
template <typename Char>
bool DateParser::InputReader<Char>::SkipWhiteSpace() {
  const bool at_white_space =
      unicode_cache_->IsWhiteSpaceOrLineTerminator(ch_);
  if (at_white_space) Next();
  return at_white_space;
}


// Skips a parenthesized group starting at the current character.
//
// Returns false without consuming anything unless the current character is
// '('. Otherwise consumes characters, tracking nesting depth, until the
// opening parenthesis has been matched or the current character becomes
// zero, and returns true.
template <typename Char>
bool DateParser::InputReader<Char>::SkipParentheses() {
  if (ch_ != '(') return false;
  int depth = 0;
  for (;;) {
    if (ch_ == ')') {
      --depth;
    } else if (ch_ == '(') {
      ++depth;
    }
    Next();
    // Stop once the group is closed or there is nothing left to read.
    if (depth <= 0 || !ch_) break;
  }
  return true;
}


243
template <typename Char>
244 245
DateParser::DateToken DateParser::ParseES5DateTime(
    DateStringTokenizer<Char>* scanner, DayComposer* day, TimeComposer* time,
246
    TimeZoneComposer* tz) {
247 248 249
  DCHECK(day->IsEmpty());
  DCHECK(time->IsEmpty());
  DCHECK(tz->IsEmpty());
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279

  // Parse mandatory date string: [('-'|'+')yy]yyyy[':'MM[':'DD]]
  if (scanner->Peek().IsAsciiSign()) {
    // Keep the sign token, so we can pass it back to the legacy
    // parser if we don't use it.
    DateToken sign_token = scanner->Next();
    if (!scanner->Peek().IsFixedLengthNumber(6)) return sign_token;
    int sign = sign_token.ascii_sign();
    int year = scanner->Next().number();
    if (sign < 0 && year == 0) return sign_token;
    day->Add(sign * year);
  } else if (scanner->Peek().IsFixedLengthNumber(4)) {
    day->Add(scanner->Next().number());
  } else {
    return scanner->Next();
  }
  if (scanner->SkipSymbol('-')) {
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !DayComposer::IsMonth(scanner->Peek().number())) return scanner->Next();
    day->Add(scanner->Next().number());
    if (scanner->SkipSymbol('-')) {
      if (!scanner->Peek().IsFixedLengthNumber(2) ||
          !DayComposer::IsDay(scanner->Peek().number())) return scanner->Next();
      day->Add(scanner->Next().number());
    }
  }
  // Check for optional time string: 'T'HH':'mm[':'ss['.'sss]]Z
  if (!scanner->Peek().IsKeywordType(TIME_SEPARATOR)) {
    if (!scanner->Peek().IsEndOfInput()) return scanner->Next();
  } else {
280
    // ES5 Date Time String time part is present.
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
    scanner->Next();
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !Between(scanner->Peek().number(), 0, 24)) {
      return DateToken::Invalid();
    }
    // Allow 24:00[:00[.000]], but no other time starting with 24.
    bool hour_is_24 = (scanner->Peek().number() == 24);
    time->Add(scanner->Next().number());
    if (!scanner->SkipSymbol(':')) return DateToken::Invalid();
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !TimeComposer::IsMinute(scanner->Peek().number()) ||
        (hour_is_24 && scanner->Peek().number() > 0)) {
      return DateToken::Invalid();
    }
    time->Add(scanner->Next().number());
    if (scanner->SkipSymbol(':')) {
      if (!scanner->Peek().IsFixedLengthNumber(2) ||
          !TimeComposer::IsSecond(scanner->Peek().number()) ||
          (hour_is_24 && scanner->Peek().number() > 0)) {
        return DateToken::Invalid();
      }
      time->Add(scanner->Next().number());
      if (scanner->SkipSymbol('.')) {
        if (!scanner->Peek().IsNumber() ||
            (hour_is_24 && scanner->Peek().number() > 0)) {
          return DateToken::Invalid();
        }
        // Allow more or less than the mandated three digits.
        time->Add(ReadMilliseconds(scanner->Next()));
      }
    }
    // Check for optional timezone designation: 'Z' | ('+'|'-')hh':'mm
    if (scanner->Peek().IsKeywordZ()) {
      scanner->Next();
      tz->Set(0);
    } else if (scanner->Peek().IsSymbol('+') ||
               scanner->Peek().IsSymbol('-')) {
      tz->SetSign(scanner->Next().symbol() == '+' ? 1 : -1);
      if (scanner->Peek().IsFixedLengthNumber(4)) {
        // hhmm extension syntax.
        int hourmin = scanner->Next().number();
        int hour = hourmin / 100;
        int min = hourmin % 100;
        if (!TimeComposer::IsHour(hour) || !TimeComposer::IsMinute(min)) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteHour(hour);
        tz->SetAbsoluteMinute(min);
      } else {
        // hh:mm standard syntax.
        if (!scanner->Peek().IsFixedLengthNumber(2) ||
            !TimeComposer::IsHour(scanner->Peek().number())) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteHour(scanner->Next().number());
        if (!scanner->SkipSymbol(':')) return DateToken::Invalid();
        if (!scanner->Peek().IsFixedLengthNumber(2) ||
            !TimeComposer::IsMinute(scanner->Peek().number())) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteMinute(scanner->Next().number());
      }
    }
    if (!scanner->Peek().IsEndOfInput()) return DateToken::Invalid();
  }
346 347 348 349 350 351 352
  // Successfully parsed ES5 Date Time String.
  // ES#sec-date-time-string-format Date Time String Format
  // "When the time zone offset is absent, date-only forms are interpreted
  //  as a UTC time and date-time forms are interpreted as a local time."
  if (tz->IsEmpty() && time->IsEmpty()) {
    tz->Set(0);
  }
353 354 355 356 357
  day->set_iso_date();
  return DateToken::EndOfInput();
}


358 359
}  // namespace internal
}  // namespace v8
360 361

#endif  // V8_DATEPARSER_INL_H_