fromstring.cc 13.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/bigint/bigint-internal.h"
#include "src/bigint/vector-arithmetic.h"

namespace v8 {
namespace bigint {

// The classic algorithm: for every part, multiply the accumulator with
// the appropriate multiplier, and add the part. O(n²) overall.
void ProcessorImpl::FromStringClassic(RWDigits Z,
                                      FromStringAccumulator* accumulator) {
15 16 17
  // We always have at least one part to process.
  DCHECK(accumulator->stack_parts_used_ > 0);  // NOLINT(readability/check)
  Z[0] = accumulator->stack_parts_[0];
18 19
  RWDigits already_set(Z, 0, 1);
  for (int i = 1; i < Z.len(); i++) Z[i] = 0;
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43

  // The {FromStringAccumulator} uses stack-allocated storage for the first
  // few parts; if heap storage is used at all then all parts are copied there.
  int num_stack_parts = accumulator->stack_parts_used_;
  if (num_stack_parts == 1) return;
  const std::vector<digit_t>& heap_parts = accumulator->heap_parts_;
  int num_heap_parts = static_cast<int>(heap_parts.size());
  // All multipliers are the same, except possibly for the last.
  const digit_t max_multiplier = accumulator->max_multiplier_;

  if (num_heap_parts == 0) {
    for (int i = 1; i < num_stack_parts - 1; i++) {
      MultiplySingle(Z, already_set, max_multiplier);
      Add(Z, accumulator->stack_parts_[i]);
      already_set.set_len(already_set.len() + 1);
    }
    MultiplySingle(Z, already_set, accumulator->last_multiplier_);
    Add(Z, accumulator->stack_parts_[num_stack_parts - 1]);
    return;
  }
  // Parts are stored on the heap.
  for (int i = 1; i < num_heap_parts - 1; i++) {
    MultiplySingle(Z, already_set, max_multiplier);
    Add(Z, accumulator->heap_parts_[i]);
44 45
    already_set.set_len(already_set.len() + 1);
  }
46 47
  MultiplySingle(Z, already_set, accumulator->last_multiplier_);
  Add(Z, accumulator->heap_parts_.back());
48 49
}

50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
// The fast algorithm: combine parts in a balanced-binary-tree like order:
// Multiply-and-add neighboring pairs of parts, then loop, until only one
// part is left. The benefit is that the multiplications will have inputs of
// similar sizes, which makes them amenable to fast multiplication algorithms.
// We have to do more multiplications than the classic algorithm though,
// because we also have to multiply the multipliers.
// Optimizations:
// - We can skip the multiplier for the first part, because we never need it.
// - Most multipliers are the same; we can avoid repeated multiplications and
//   just copy the previous result. (In theory we could even de-dupe them, but
//   as the parts/multipliers grow, we'll need most of the memory anyway.)
//   Copied results are marked with a * below.
// - We can re-use memory using a system of three buffers whose usage rotates:
//   - one is considered empty, and is overwritten with the new parts,
//   - one holds the multipliers (and will be "empty" in the next round), and
//   - one initially holds the parts and is overwritten with the new multipliers
//   Parts and multipliers both grow in each iteration, and get fewer, so we
//   use the space of two adjacent old chunks for one new chunk.
//   Since the {heap_parts_} vectors has the right size, and so does the
//   result {Z}, we can use that memory, and only need to allocate one scratch
//   vector. If the final result ends up in the wrong bucket, we have to copy it
//   to the correct one.
// - We don't have to keep track of the positions and sizes of the chunks,
//   because we can deduce their precise placement from the iteration index.
//
// Example, assuming digit_t is 4 bits, fitting one decimal digit:
// Initial state:
// parts_:        1  2  3  4  5  6  7  8  9  0  1  2  3  4  5
// multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
// After the first iteration of the outer loop:
// parts:         12    34    56    78    90    12    34    5
// multipliers:        100  *100  *100  *100  *100  *100   10
// After the second iteration:
// parts:         1234        5678        9012        345
// multipliers:              10000      *10000       1000
// After the third iteration:
// parts:         12345678                9012345
// multipliers:                          10000000
// And then there's an obvious last iteration.
void ProcessorImpl::FromStringLarge(RWDigits Z,
                                    FromStringAccumulator* accumulator) {
  int num_parts = static_cast<int>(accumulator->heap_parts_.size());
  DCHECK(num_parts >= 2);  // NOLINT(readability/check)
  DCHECK(Z.len() >= num_parts);
  RWDigits parts(accumulator->heap_parts_.data(), num_parts);
  Storage multipliers_storage(num_parts);
  RWDigits multipliers(multipliers_storage.get(), num_parts);
  RWDigits temp(Z, 0, num_parts);
  // Unrolled and specialized first iteration: part_len == 1, so instead of
  // Digits sub-vectors we have individual digit_t values, and the multipliers
  // are known up front.
  {
    digit_t max_multiplier = accumulator->max_multiplier_;
    digit_t last_multiplier = accumulator->last_multiplier_;
    RWDigits new_parts = temp;
    RWDigits new_multipliers = parts;
    int i = 0;
    for (; i + 1 < num_parts; i += 2) {
      digit_t p_in = parts[i];
      digit_t p_in2 = parts[i + 1];
      digit_t m_in = max_multiplier;
      digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier;
      // p[j] = p[i] * m[i+1] + p[i+1]
      digit_t p_high;
      digit_t p_low = digit_mul(p_in, m_in2, &p_high);
      digit_t carry;
      new_parts[i] = digit_add2(p_low, p_in2, &carry);
      new_parts[i + 1] = p_high + carry;
      // m[j] = m[i] * m[i+1]
      if (i > 0) {
        if (i > 2 && m_in2 != last_multiplier) {
          new_multipliers[i] = new_multipliers[i - 2];
          new_multipliers[i + 1] = new_multipliers[i - 1];
        } else {
          digit_t m_high;
          new_multipliers[i] = digit_mul(m_in, m_in2, &m_high);
          new_multipliers[i + 1] = m_high;
        }
      }
    }
    // Trailing last part (if {num_parts} was odd).
    if (i < num_parts) {
      new_parts[i] = parts[i];
      new_multipliers[i] = last_multiplier;
      i += 2;
    }
    num_parts = i >> 1;
    RWDigits new_temp = multipliers;
    parts = new_parts;
    multipliers = new_multipliers;
    temp = new_temp;
    AddWorkEstimate(num_parts);
  }
  int part_len = 2;

  // Remaining iterations.
  while (num_parts > 1) {
    RWDigits new_parts = temp;
    RWDigits new_multipliers = parts;
    int new_part_len = part_len * 2;
    int i = 0;
    for (; i + 1 < num_parts; i += 2) {
      int start = i * part_len;
      Digits p_in(parts, start, part_len);
      Digits p_in2(parts, start + part_len, part_len);
      Digits m_in(multipliers, start, part_len);
      Digits m_in2(multipliers, start + part_len, part_len);
      RWDigits p_out(new_parts, start, new_part_len);
      RWDigits m_out(new_multipliers, start, new_part_len);
      // p[j] = p[i] * m[i+1] + p[i+1]
      Multiply(p_out, p_in, m_in2);
      if (should_terminate()) return;
      digit_t overflow = AddAndReturnOverflow(p_out, p_in2);
      DCHECK(overflow == 0);  // NOLINT(readability/check)
      USE(overflow);
      // m[j] = m[i] * m[i+1]
      if (i > 0) {
        bool copied = false;
        if (i > 2) {
          int prev_start = (i - 2) * part_len;
          Digits m_in_prev(multipliers, prev_start, part_len);
          Digits m_in2_prev(multipliers, prev_start + part_len, part_len);
          if (Compare(m_in, m_in_prev) == 0 &&
              Compare(m_in2, m_in2_prev) == 0) {
            copied = true;
            Digits m_out_prev(new_multipliers, prev_start, new_part_len);
            for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k];
          }
        }
        if (!copied) {
          Multiply(m_out, m_in, m_in2);
          if (should_terminate()) return;
        }
      }
    }
    // Trailing last part (if {num_parts} was odd).
    if (i < num_parts) {
      Digits p_in(parts, i * part_len, part_len);
      Digits m_in(multipliers, i * part_len, part_len);
      RWDigits p_out(new_parts, i * part_len, new_part_len);
      RWDigits m_out(new_multipliers, i * part_len, new_part_len);
      int k = 0;
      for (; k < p_in.len(); k++) p_out[k] = p_in[k];
      for (; k < p_out.len(); k++) p_out[k] = 0;
      k = 0;
      for (; k < m_in.len(); k++) m_out[k] = m_in[k];
      for (; k < m_out.len(); k++) m_out[k] = 0;
      i += 2;
    }
    num_parts = i >> 1;
    part_len = new_part_len;
    RWDigits new_temp = multipliers;
    parts = new_parts;
    multipliers = new_multipliers;
    temp = new_temp;
  }
  // Copy the result to Z, if it doesn't happen to be there already.
  if (parts.digits() != Z.digits()) {
    int i = 0;
    for (; i < parts.len(); i++) Z[i] = parts[i];
    // Z might be bigger than we requested; be robust towards that.
    for (; i < Z.len(); i++) Z[i] = 0;
  }
}

215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
// Specialized algorithms for power-of-two radixes. Designed to work with
// {ParsePowerTwo}: {max_multiplier_} isn't saved, but {radix_} is, and
// {last_multiplier_} has special meaning, namely the number of unpopulated bits
// in the last part.
// For these radixes, {parts} already is a list of correct bit sequences, we
// just have to put them together in the right way:
// - The parts are currently in reversed order. The highest-index parts[i]
//   will go into Z[0].
// - All parts, possibly except for the last, are maximally populated.
// - A maximally populated part stores a non-fractional number of characters,
//   i.e. the largest fitting multiple of {char_bits} of it is populated.
// - The populated bits in a part are at the low end.
// - The number of unused bits in the last part is stored in
//   {accumulator->last_multiplier_}.
//
// Example: Given the following parts vector, where letters are used to
// label bits, bit order is big endian (i.e. [00000101] encodes "5"),
// 'x' means "unpopulated", kDigitBits == 8, radix == 8, and char_bits == 3:
//
//     parts[0] -> [xxABCDEF][xxGHIJKL][xxMNOPQR][xxxxxSTU] <- parts[3]
//
// We have to assemble the following result:
//
//         Z[0] -> [NOPQRSTU][FGHIJKLM][xxxABCDE] <- Z[2]
//
void ProcessorImpl::FromStringBasePowerOfTwo(
    RWDigits Z, FromStringAccumulator* accumulator) {
  const int num_parts = accumulator->ResultLength();
  DCHECK(num_parts >= 1);  // NOLINT(readability/check)
  DCHECK(Z.len() >= num_parts);
  Digits parts(accumulator->heap_parts_.size() > 0
                   ? accumulator->heap_parts_.data()
                   : accumulator->stack_parts_,
               num_parts);
  uint8_t radix = accumulator->radix_;
  DCHECK(radix == 2 || radix == 4 || radix == 8 || radix == 16 || radix == 32);
  const int char_bits = BitLength(radix - 1);
  const int unused_last_part_bits =
      static_cast<int>(accumulator->last_multiplier_);
  const int unused_part_bits = kDigitBits % char_bits;
  const int max_part_bits = kDigitBits - unused_part_bits;
  int z_index = 0;
  int part_index = num_parts - 1;

  // If the last part is fully populated, then all parts must be, and we can
  // simply copy them (in reversed order).
  if (unused_last_part_bits == 0) {
    DCHECK(kDigitBits % char_bits == 0);  // NOLINT(readability/check)
    while (part_index >= 0) {
      Z[z_index++] = parts[part_index--];
    }
    for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
    return;
  }

  // Otherwise we have to shift parts contents around as needed.
  // Holds the next Z digit that we want to store...
  digit_t digit = parts[part_index--];
  // ...and the number of bits (at the right end) we already know.
  int digit_bits = kDigitBits - unused_last_part_bits;
  while (part_index >= 0) {
    // Holds the last part that we read from {parts}...
    digit_t part;
    // ...and the number of bits (at the right end) that we haven't used yet.
    int part_bits;
    while (digit_bits < kDigitBits) {
      part = parts[part_index--];
      part_bits = max_part_bits;
      digit |= part << digit_bits;
      int part_shift = kDigitBits - digit_bits;
      if (part_shift > part_bits) {
        digit_bits += part_bits;
        part = 0;
        part_bits = 0;
        if (part_index < 0) break;
      } else {
        digit_bits = kDigitBits;
        part >>= part_shift;
        part_bits -= part_shift;
      }
    }
    Z[z_index++] = digit;
    digit = part;
    digit_bits = part_bits;
  }
  if (digit_bits > 0) {
    Z[z_index++] = digit;
  }
  for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
}

306
void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
307 308 309 310 311 312 313 314
  if (accumulator->inline_everything_) {
    int i = 0;
    for (; i < accumulator->stack_parts_used_; i++) {
      Z[i] = accumulator->stack_parts_[i];
    }
    for (; i < Z.len(); i++) Z[i] = 0;
  } else if (accumulator->stack_parts_used_ == 0) {
    for (int i = 0; i < Z.len(); i++) Z[i] = 0;
315 316
  } else if (IsPowerOfTwo(accumulator->radix_)) {
    FromStringBasePowerOfTwo(Z, accumulator);
317
  } else if (accumulator->ResultLength() < kFromStringLargeThreshold) {
318
    FromStringClassic(Z, accumulator);
319 320
  } else {
    FromStringLarge(Z, accumulator);
321 322 323 324 325 326 327 328 329 330 331
  }
}

Status Processor::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
  ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
  impl->FromString(Z, accumulator);
  return impl->get_and_clear_status();
}

}  // namespace bigint
}  // namespace v8