jfdctfst.c 9.4 KB
Newer Older
Fabrice Bellard's avatar
Fabrice Bellard committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
/*
 * jfdctfst.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a fast, not so accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
 * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
 * JPEG textbook (see REFERENCES section in file README).  The following code
 * is based directly on figure 4-8 in P&M.
 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
 * possible to arrange the computation so that many of the multiplies are
 * simple scalings of the final outputs.  These multiplies can then be
 * folded into the multiplications or divisions by the JPEG quantization
 * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
 * to be done in the DCT itself.
 * The primary disadvantage of this method is that with fixed-point math,
 * accuracy is lost due to imprecise representation of the scaled
 * quantization values.  The smaller the quantization table entry, the less
 * precise the scaled value, so this implementation does worse with high-
 * quality-setting files than with low-quality ones.
 */

Michael Niedermayer's avatar
Michael Niedermayer committed
33 34 35 36
/**
 * @file jfdctfst.c
 * Independent JPEG Group's fast AAN dct.
 */
37

Fabrice Bellard's avatar
Fabrice Bellard committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#include "dsputil.h"

#define DCTSIZE 8
#define GLOBAL(x) x
#define RIGHT_SHIFT(x, n) ((x) >> (n))
#define SHIFT_TEMPS

/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/* Scaling decisions are generally the same as in the LL&M algorithm;
 * see jfdctint.c for more details.  However, we choose to descale
 * (right shift) multiplication products as soon as they are formed,
 * rather than carrying additional fractional bits into subsequent additions.
 * This compromises accuracy slightly, but it lets us save a few shifts.
 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
 * everywhere except in the multiplications proper; this saves a good deal
 * of work on 16-bit-int machines.
 *
 * Again to save a few shifts, the intermediate results between pass 1 and
 * pass 2 are not upscaled, but are represented only to integral precision.
 *
 * A final compromise is to represent the multiplicative constants to only
 * 8 fractional bits, rather than 13.  This saves some shifting work on some
 * machines, and may also reduce the cost of multiplication (since there
 * are fewer one-bits in the constants).
 */

#define CONST_BITS  8


/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

#if CONST_BITS == 8
86 87 88 89
#define FIX_0_382683433  ((int32_t)   98)       /* FIX(0.382683433) */
#define FIX_0_541196100  ((int32_t)  139)       /* FIX(0.541196100) */
#define FIX_0_707106781  ((int32_t)  181)       /* FIX(0.707106781) */
#define FIX_1_306562965  ((int32_t)  334)       /* FIX(1.306562965) */
Fabrice Bellard's avatar
Fabrice Bellard committed
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
#else
#define FIX_0_382683433  FIX(0.382683433)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_707106781  FIX(0.707106781)
#define FIX_1_306562965  FIX(1.306562965)
#endif


/* We can gain a little more speed, with a further compromise in accuracy,
 * by omitting the addition in a descaling shift.  This yields an incorrectly
 * rounded result half the time...
 */

#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
#endif


109
/* Multiply a DCTELEM variable by an int32_t constant, and immediately
Fabrice Bellard's avatar
Fabrice Bellard committed
110 111 112 113 114
 * descale to yield a DCTELEM result.
 */

#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))

115 116 117 118
static always_inline void row_fdct(DCTELEM * data){
  int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int_fast16_t tmp10, tmp11, tmp12, tmp13;
  int_fast16_t z1, z2, z3, z4, z5, z11, z13;
Fabrice Bellard's avatar
Fabrice Bellard committed
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
  DCTELEM *dataptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];
135

Fabrice Bellard's avatar
Fabrice Bellard committed
136
    /* Even part */
137

138
    tmp10 = tmp0 + tmp3;        /* phase 2 */
Fabrice Bellard's avatar
Fabrice Bellard committed
139 140 141
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
142

Fabrice Bellard's avatar
Fabrice Bellard committed
143 144
    dataptr[0] = tmp10 + tmp11; /* phase 3 */
    dataptr[4] = tmp10 - tmp11;
145

Fabrice Bellard's avatar
Fabrice Bellard committed
146
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
147
    dataptr[2] = tmp13 + z1;    /* phase 5 */
Fabrice Bellard's avatar
Fabrice Bellard committed
148
    dataptr[6] = tmp13 - z1;
149

Fabrice Bellard's avatar
Fabrice Bellard committed
150 151
    /* Odd part */

152
    tmp10 = tmp4 + tmp5;        /* phase 2 */
Fabrice Bellard's avatar
Fabrice Bellard committed
153 154 155 156 157
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
158 159 160
    z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5;    /* c2-c6 */
    z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5;    /* c2+c6 */
    z3 = MULTIPLY(tmp11, FIX_0_707106781);         /* c4 */
Fabrice Bellard's avatar
Fabrice Bellard committed
161

162
    z11 = tmp7 + z3;            /* phase 5 */
Fabrice Bellard's avatar
Fabrice Bellard committed
163 164
    z13 = tmp7 - z3;

165
    dataptr[5] = z13 + z2;      /* phase 6 */
Fabrice Bellard's avatar
Fabrice Bellard committed
166 167 168 169
    dataptr[3] = z13 - z2;
    dataptr[1] = z11 + z4;
    dataptr[7] = z11 - z4;

170
    dataptr += DCTSIZE;         /* advance pointer to next row */
Fabrice Bellard's avatar
Fabrice Bellard committed
171
  }
172
}
Fabrice Bellard's avatar
Fabrice Bellard committed
173

174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
/*
 * Perform the forward DCT on one block of samples.
 */

GLOBAL(void)
fdct_ifast (DCTELEM * data)
{
  int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int_fast16_t tmp10, tmp11, tmp12, tmp13;
  int_fast16_t z1, z2, z3, z4, z5, z11, z13;
  DCTELEM *dataptr;
  int ctr;
  SHIFT_TEMPS

  row_fdct(data);
189

Fabrice Bellard's avatar
Fabrice Bellard committed
190 191 192 193 194 195 196 197 198 199 200 201
  /* Pass 2: process columns. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
202

Fabrice Bellard's avatar
Fabrice Bellard committed
203
    /* Even part */
204

205
    tmp10 = tmp0 + tmp3;        /* phase 2 */
Fabrice Bellard's avatar
Fabrice Bellard committed
206 207 208
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
209

Fabrice Bellard's avatar
Fabrice Bellard committed
210 211
    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
    dataptr[DCTSIZE*4] = tmp10 - tmp11;
212

Fabrice Bellard's avatar
Fabrice Bellard committed
213 214 215
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
    dataptr[DCTSIZE*6] = tmp13 - z1;
216

Fabrice Bellard's avatar
Fabrice Bellard committed
217 218
    /* Odd part */

219
    tmp10 = tmp4 + tmp5;        /* phase 2 */
Fabrice Bellard's avatar
Fabrice Bellard committed
220 221 222 223 224 225 226 227 228
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
    z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
    z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
    z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */

229
    z11 = tmp7 + z3;            /* phase 5 */
Fabrice Bellard's avatar
Fabrice Bellard committed
230 231 232 233 234 235 236
    z13 = tmp7 - z3;

    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
    dataptr[DCTSIZE*3] = z13 - z2;
    dataptr[DCTSIZE*1] = z11 + z4;
    dataptr[DCTSIZE*7] = z11 - z4;

237
    dataptr++;                  /* advance pointer to next column */
Fabrice Bellard's avatar
Fabrice Bellard committed
238 239
  }
}
240

241 242 243 244 245 246 247
/*
 * Perform the forward 2-4-8 DCT on one block of samples.
 */

GLOBAL(void)
fdct_ifast248 (DCTELEM * data)
{
248 249 250
  int_fast16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int_fast16_t tmp10, tmp11, tmp12, tmp13;
  int_fast16_t z1;
251 252 253 254
  DCTELEM *dataptr;
  int ctr;
  SHIFT_TEMPS

255
  row_fdct(data);
256

257 258 259 260 261 262 263 264 265 266 267 268 269 270
  /* Pass 2: process columns. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1];
    tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
    tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
    tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1];
    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
    tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
    tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];

    /* Even part */
271

272 273 274 275
    tmp10 = tmp0 + tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    tmp13 = tmp0 - tmp3;
276

277 278
    dataptr[DCTSIZE*0] = tmp10 + tmp11;
    dataptr[DCTSIZE*4] = tmp10 - tmp11;
279

280 281 282 283 284 285 286 287
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
    dataptr[DCTSIZE*2] = tmp13 + z1;
    dataptr[DCTSIZE*6] = tmp13 - z1;

    tmp10 = tmp4 + tmp7;
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp5 - tmp6;
    tmp13 = tmp4 - tmp7;
288

289 290
    dataptr[DCTSIZE*1] = tmp10 + tmp11;
    dataptr[DCTSIZE*5] = tmp10 - tmp11;
291

292 293 294
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
    dataptr[DCTSIZE*3] = tmp13 + z1;
    dataptr[DCTSIZE*7] = tmp13 - z1;
295

296
    dataptr++;                        /* advance pointer to next column */
297 298 299
  }
}

300 301 302 303 304 305

#undef GLOBAL
#undef CONST_BITS
#undef DESCALE
#undef FIX_0_541196100
#undef FIX_1_306562965