div-barrett.cc 13.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Barrett division, finding the inverse with Newton's method.
// Reference: "Fast Division of Large Integers" by Karl Hasselström,
// found at https://treskal.com/s/masters-thesis.pdf

// Many thanks to Karl Wiberg, k@w5.se, for both writing up an
// understandable theoretical description of the algorithm and privately
// providing a demo implementation, on which the implementation in this file is
// based.

#include <algorithm>

#include "src/bigint/bigint-internal.h"
#include "src/bigint/digit-arithmetic.h"
#include "src/bigint/div-helpers.h"
#include "src/bigint/vector-arithmetic.h"

namespace v8 {
namespace bigint {

namespace {

void DcheckIntegerPartRange(Digits X, digit_t min, digit_t max) {
#if DEBUG
  digit_t integer_part = X.msd();
  DCHECK(integer_part >= min);
  DCHECK(integer_part <= max);
#else
  USE(X);
  USE(min);
  USE(max);
#endif
}

}  // namespace

// Z := (the fractional part of) 1/V, via naive division.
// See comments at {Invert} and {InvertNewton} below for details.
void ProcessorImpl::InvertBasecase(RWDigits Z, Digits V, RWDigits scratch) {
  DCHECK(Z.len() > V.len());
  DCHECK(V.len() > 0);  // NOLINT(readability/check)
  DCHECK(scratch.len() >= 2 * V.len());
  int n = V.len();
  RWDigits X(scratch, 0, 2 * n);
  digit_t borrow = 0;
  int i = 0;
  for (; i < n; i++) X[i] = 0;
  for (; i < 2 * n; i++) X[i] = digit_sub2(0, V[i - n], borrow, &borrow);
  DCHECK(borrow == 1);     // NOLINT(readability/check)
  RWDigits R(nullptr, 0);  // We don't need the remainder.
  if (n < kBurnikelThreshold) {
    DivideSchoolbook(Z, R, X, V);
  } else {
    DivideBurnikelZiegler(Z, R, X, V);
  }
}

// This is Algorithm 4.2 from the paper.
// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
// V.len+1 digits. The V.len low digits of the result digits will be written
// to Z, plus there is an implicit top digit with value 1.
// Needs InvertNewtonScratchSpace(V.len) of scratch space.
// The result is either correct or off by one (about half the time it is
// correct, half the time it is one too much, and in the corner case where V is
// minimal and the implicit top digit would have to be 2 it is one too little).
// Barrett's division algorithm can handle that, so we don't care.
void ProcessorImpl::InvertNewton(RWDigits Z, Digits V, RWDigits scratch) {
  const int vn = V.len();
  DCHECK(Z.len() >= vn);
  DCHECK(scratch.len() >= InvertNewtonScratchSpace(vn));
  const int kSOffset = 0;
  const int kWOffset = 0;  // S and W can share their scratch space.
  const int kUOffset = vn + kInvertNewtonExtraSpace;

  // The base case won't work otherwise.
  DCHECK(V.len() >= 3);  // NOLINT(readability/check)

  constexpr int kBasecasePrecision = kNewtonInversionThreshold - 1;
  // V must have more digits than the basecase.
  DCHECK(V.len() > kBasecasePrecision);
  DCHECK(IsBitNormalized(V));

  // Step (1): Setup.
  // Calculate precision required at each step.
  // {k} is the number of fraction bits for the current iteration.
  int k = vn * kDigitBits;
  int target_fraction_bits[8 * sizeof(vn)];  // "k_i" in the paper.
  int iteration = -1;  // "i" in the paper, except inverted to run downwards.
  while (k > kBasecasePrecision * kDigitBits) {
    iteration++;
    target_fraction_bits[iteration] = k;
    k = DIV_CEIL(k, 2);
  }
  // At this point, k <= kBasecasePrecision*kDigitBits is the number of
  // fraction bits to use in the base case. {iteration} is the highest index
  // in use for f[].

  // Step (2): Initial approximation.
  int initial_digits = DIV_CEIL(k + 1, kDigitBits);
  Digits top_part_of_v(V, vn - initial_digits, initial_digits);
  InvertBasecase(Z, top_part_of_v, scratch);
  Z[initial_digits] = Z[initial_digits] + 1;  // Implicit top digit.
  // From now on, we'll keep Z.len updated to the part that's already computed.
  Z.set_len(initial_digits + 1);

  // Step (3): Precision doubling loop.
  while (true) {
    DcheckIntegerPartRange(Z, 1, 2);

    // (3b): S = Z^2
    RWDigits S(scratch, kSOffset, 2 * Z.len());
    Multiply(S, Z, Z);
    if (should_terminate()) return;
    S.TrimOne();  // Top digit of S is unused.
    DcheckIntegerPartRange(S, 1, 4);

    // (3c): T = V, truncated so that at least 2k+3 fraction bits remain.
    int fraction_digits = DIV_CEIL(2 * k + 3, kDigitBits);
    int t_len = std::min(V.len(), fraction_digits);
    Digits T(V, V.len() - t_len, t_len);

    // (3d): U = T * S, truncated so that at least 2k+1 fraction bits remain
    // (U has one integer digit, which might be zero).
127
    fraction_digits = DIV_CEIL(2 * k + 1, kDigitBits);
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
    RWDigits U(scratch, kUOffset, S.len() + T.len());
    DCHECK(U.len() > fraction_digits);
    Multiply(U, S, T);
    if (should_terminate()) return;
    U = U + (U.len() - (1 + fraction_digits));
    DcheckIntegerPartRange(U, 0, 3);

    // (3e): W = 2 * Z, padded with "0" fraction bits so that it has the
    // same number of fraction bits as U.
    DCHECK(U.len() >= Z.len());
    RWDigits W(scratch, kWOffset, U.len());
    int padding_digits = U.len() - Z.len();
    for (int i = 0; i < padding_digits; i++) W[i] = 0;
    LeftShift(W + padding_digits, Z, 1);
    DcheckIntegerPartRange(W, 2, 4);

    // (3f): Z = W - U.
    // This check is '<=' instead of '<' because U's top digit is its
    // integer part, and we want vn fraction digits.
    if (U.len() <= vn) {
      // Normal subtraction.
      // This is not the last iteration.
      DCHECK(iteration > 0);  // NOLINT(readability/check)
      Z.set_len(U.len());
      digit_t borrow = SubtractAndReturnBorrow(Z, W, U);
      DCHECK(borrow == 0);  // NOLINT(readability/check)
      USE(borrow);
      DcheckIntegerPartRange(Z, 1, 2);
    } else {
      // Truncate some least significant digits so that we get vn
      // fraction digits, and compute the integer digit separately.
      // This is the last iteration.
      DCHECK(iteration == 0);  // NOLINT(readability/check)
      Z.set_len(vn);
      Digits W_part(W, W.len() - vn - 1, vn);
      Digits U_part(U, U.len() - vn - 1, vn);
      digit_t borrow = SubtractAndReturnBorrow(Z, W_part, U_part);
      digit_t integer_part = W.msd() - U.msd() - borrow;
      DCHECK(integer_part == 1 || integer_part == 2);
      if (integer_part == 2) {
        // This is the rare case where the correct result would be 2.0, but
        // since we can't express that by returning only the fractional part
        // with an implicit 1-digit, we have to return [1.]9999... instead.
        for (int i = 0; i < Z.len(); i++) Z[i] = ~digit_t{0};
      }
      break;
    }
    // (3g, 3h): Update local variables and loop.
    k = target_fraction_bits[iteration];
    iteration--;
  }
}

// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
// V.len+1 digits. The V.len low digits of the result digits will be written
// to Z, plus there is an implicit top digit with value 1.
// (Corner case: if V is minimal, the implicit digit should be 2; in that case
// we return one less than the correct answer. DivideBarrett can handle that.)
// Needs InvertScratchSpace(V.len) digits of scratch space.
void ProcessorImpl::Invert(RWDigits Z, Digits V, RWDigits scratch) {
  DCHECK(Z.len() > V.len());
  DCHECK(V.len() >= 1);  // NOLINT(readability/check)
  DCHECK(IsBitNormalized(V));
  DCHECK(scratch.len() >= InvertScratchSpace(V.len()));

  int vn = V.len();
  if (vn >= kNewtonInversionThreshold) {
    return InvertNewton(Z, V, scratch);
  }
  if (vn == 1) {
    digit_t d = V[0];
    digit_t dummy_remainder;
    Z[0] = digit_div(~d, ~digit_t{0}, d, &dummy_remainder);
    Z[1] = 0;
  } else {
    InvertBasecase(Z, V, scratch);
    if (Z[vn] == 1) {
      for (int i = 0; i < vn; i++) Z[i] = ~digit_t{0};
206
      Z[vn] = 0;
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
    }
  }
}

// This is algorithm 3.5 from the paper.
// Computes Q(uotient) and R(emainder) for A/B using I, which is a
// precomputed approximation of 1/B (e.g. with Invert() above).
// Needs DivideBarrettScratchSpace(A.len) scratch space.
void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B,
                                  Digits I, RWDigits scratch) {
  DCHECK(Q.len() > A.len() - B.len());
  DCHECK(R.len() >= B.len());
  DCHECK(A.len() > B.len());  // Careful: This is *not* '>=' !
  DCHECK(A.len() <= 2 * B.len());
  DCHECK(B.len() > 0);  // NOLINT(readability/check)
  DCHECK(IsBitNormalized(B));
  DCHECK(I.len() == A.len() - B.len());
  DCHECK(scratch.len() >= DivideBarrettScratchSpace(A.len()));

  int orig_q_len = Q.len();

  // (1): A1 = A with B.len fewer digits.
  Digits A1 = A + B.len();
  DCHECK(A1.len() == I.len());

  // (2): Q = A1*I with I.len fewer digits.
  // {I} has an implicit high digit with value 1, so we add {A1} to the high
  // part of the multiplication result.
  RWDigits K(scratch, 0, 2 * I.len());
  Multiply(K, A1, I);
  if (should_terminate()) return;
  Q.set_len(I.len() + 1);
  Add(Q, K + I.len(), A1);
  // K is no longer used, can re-use {scratch} for P.

  // (3): R = A - B*Q (approximate remainder).
  RWDigits P(scratch, 0, A.len() + 1);
  Multiply(P, B, Q);
  if (should_terminate()) return;
  digit_t borrow = SubtractAndReturnBorrow(R, A, Digits(P, 0, B.len()));
  // R may be allocated wider than B, zero out any extra digits if so.
  for (int i = B.len(); i < R.len(); i++) R[i] = 0;
  digit_t r_high = A[B.len()] - P[B.len()] - borrow;

  // Adjust R and Q so that they become the correct remainder and quotient.
  // The number of iterations is guaranteed to be at most some very small
  // constant, unless the caller gave us a bad approximate quotient.
  if (r_high >> (kDigitBits - 1) == 1) {
    // (5b): R < 0, so R += B
    digit_t q_sub = 0;
    do {
      r_high += AddAndReturnCarry(R, R, B);
      q_sub++;
      DCHECK(q_sub <= 5);  // NOLINT(readability/check)
    } while (r_high != 0);
    Subtract(Q, q_sub);
  } else {
    digit_t q_add = 0;
    while (r_high != 0 || GreaterThanOrEqual(R, B)) {
      // (5c): R >= B, so R -= B
      r_high -= SubtractAndReturnBorrow(R, R, B);
      q_add++;
      DCHECK(q_add <= 5);  // NOLINT(readability/check)
    }
    Add(Q, q_add);
  }
  // (5a): Return.
  int final_q_len = Q.len();
  Q.set_len(orig_q_len);
  for (int i = final_q_len; i < orig_q_len; i++) Q[i] = 0;
}

// Computes Q(uotient) and R(emainder) for A/B, using Barrett division.
void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B) {
  DCHECK(Q.len() > A.len() - B.len());
  DCHECK(R.len() >= B.len());
  DCHECK(A.len() > B.len());  // Careful: This is *not* '>=' !
  DCHECK(B.len() > 0);        // NOLINT(readability/check)

  // Normalize B, and shift A by the same amount.
  ShiftedDigits b_normalized(B);
  ShiftedDigits a_normalized(A, b_normalized.shift());
  // Keep the code below more concise.
  B = b_normalized;
  A = a_normalized;

  // The core DivideBarrett function above only supports A having at most
  // twice as many digits as B. We generalize this to arbitrary inputs
  // similar to Burnikel-Ziegler division by performing a t-by-1 division
  // of B-sized chunks. It's easy to special-case the situation where we
  // don't need to bother.
  int barrett_dividend_length = A.len() <= 2 * B.len() ? A.len() : 2 * B.len();
  int i_len = barrett_dividend_length - B.len();
  ScratchDigits I(i_len + 1);  // +1 is for temporary use by Invert().
  int scratch_len =
      std::max(InvertScratchSpace(i_len),
               DivideBarrettScratchSpace(barrett_dividend_length));
  ScratchDigits scratch(scratch_len);
  Invert(I, Digits(B, B.len() - i_len, i_len), scratch);
  if (should_terminate()) return;
  I.TrimOne();
  DCHECK(I.len() == i_len);
  if (A.len() > 2 * B.len()) {
    // This follows the variable names and and algorithmic steps of
    // DivideBurnikelZiegler().
    int n = B.len();  // Chunk length.
    // (5): {t} is the number of B-sized chunks of A.
    int t = DIV_CEIL(A.len(), n);
    DCHECK(t >= 3);  // NOLINT(readability/check)
    // (6)/(7): Z is used for the current 2-chunk block to be divided by B,
    // initialized to the two topmost chunks of A.
    int z_len = n * 2;
    ScratchDigits Z(z_len);
    PutAt(Z, A + n * (t - 2), z_len);
    // (8): For i from t-2 downto 0 do
    int qi_len = n + 1;
    ScratchDigits Qi(qi_len);
    ScratchDigits Ri(n);
    // First iteration unrolled and specialized.
    {
      int i = t - 2;
      DivideBarrett(Qi, Ri, Z, B, I, scratch);
      if (should_terminate()) return;
      RWDigits target = Q + n * i;
      // In the first iteration, all qi_len = n + 1 digits may be used.
      int to_copy = std::min(qi_len, target.len());
      for (int j = 0; j < to_copy; j++) target[j] = Qi[j];
      for (int j = to_copy; j < target.len(); j++) target[j] = 0;
#if DEBUG
      for (int j = to_copy; j < Qi.len(); j++) {
        DCHECK(Qi[j] == 0);  // NOLINT(readability/check)
      }
#endif
    }
    // Now loop over any remaining iterations.
    for (int i = t - 3; i >= 0; i--) {
      // (8b): If i > 0, set Z_(i-1) = [Ri, A_(i-1)].
      // (De-duped with unrolled first iteration, hence reading A_(i).)
      PutAt(Z + n, Ri, n);
      PutAt(Z, A + n * i, n);
      // (8a): Compute Qi, Ri such that Zi = B*Qi + Ri.
      DivideBarrett(Qi, Ri, Z, B, I, scratch);
      DCHECK(Qi[qi_len - 1] == 0);  // NOLINT(readability/check)
      if (should_terminate()) return;
      // (9): Return Q = [Q_(t-2), ..., Q_0]...
      PutAt(Q + n * i, Qi, n);
    }
    Ri.Normalize();
    DCHECK(Ri.len() <= R.len());
    // (9): ...and R = R_0 * 2^(-leading_zeros).
    RightShift(R, Ri, b_normalized.shift());
  } else {
    DivideBarrett(Q, R, A, B, I, scratch);
    if (should_terminate()) return;
    RightShift(R, R, b_normalized.shift());
  }
}

}  // namespace bigint
}  // namespace v8