/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale_internal.h"
#include "swscale.h"

38
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[8][8] = {
39 40 41 42 43 44 45 46 47 48
    {  36, 68,  60, 92,  34, 66,  58, 90, },
    { 100,  4, 124, 28,  98,  2, 122, 26, },
    {  52, 84,  44, 76,  50, 82,  42, 74, },
    { 116, 20, 108, 12, 114, 18, 106, 10, },
    {  32, 64,  56, 88,  38, 70,  62, 94, },
    {  96,  0, 120, 24, 102,  6, 126, 30, },
    {  48, 80,  40, 72,  54, 86,  46, 78, },
    { 112, 16, 104,  8, 118, 22, 110, 14, },
};

49
DECLARE_ALIGNED(8, static const uint8_t, sws_pb_64)[8] = {
50
    64, 64, 64, 64, 64, 64, 64, 64
51
};
52

53 54
static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
                                       int height, int y, uint8_t val)
Ramiro Polla's avatar
Ramiro Polla committed
55
{
56
    int i;
57 58
    uint8_t *ptr = plane + stride * y;
    for (i = 0; i < height; i++) {
59 60 61 62 63
        memset(ptr, val, width);
        ptr += stride;
    }
}

/**
 * Fill a rectangular region of a 16-bit-per-sample plane with an 8-bit value
 * expanded to dst_depth bits.
 *
 * The 8-bit value is widened by shifting it left and replicating its top
 * bits into the newly created low bits, so 255 maps to the all-ones value
 * of the target depth.
 *
 * @param plane      base pointer of the plane
 * @param stride     byte distance between consecutive rows
 * @param width      number of 16-bit samples written per row
 * @param height     number of rows to fill
 * @param y          index of the first row to fill
 * @param val        8-bit value to expand and store
 * @param dst_depth  bit depth of the stored samples
 * @param big_endian non-zero to store samples big-endian, else little-endian
 */
static void fill_plane9or10(uint8_t *plane, int stride, int width,
                            int height, int y, uint8_t val,
                            const int dst_depth, const int big_endian)
{
    int i, j;
    uint16_t *dst = (uint16_t *) (plane + stride * y);
#define FILL8TO9_OR_10(wfunc) \
    for (i = 0; i < height; i++) { \
        for (j = 0; j < width; j++) { \
            wfunc(&dst[j], (val << (dst_depth - 8)) |  \
                               (val >> (16 - dst_depth))); \
        } \
        dst += stride / 2; \
    }
    if (big_endian) {
        FILL8TO9_OR_10(AV_WB16);
    } else {
        FILL8TO9_OR_10(AV_WL16);
    }
#undef FILL8TO9_OR_10
}


86 87
static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
                           const uint8_t *_src, const int16_t *filter,
88
                           const int32_t *filterPos, int filterSize)
89
{
90
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
91
    int i;
92
    int32_t *dst        = (int32_t *) _dst;
93
    const uint16_t *src = (const uint16_t *) _src;
94
    int bits            = desc->comp[0].depth - 1;
95
    int sh              = bits - 4;
96 97 98 99

    for (i = 0; i < dstW; i++) {
        int j;
        int srcPos = filterPos[i];
100
        int val    = 0;
101 102 103 104 105

        for (j = 0; j < filterSize; j++) {
            val += src[srcPos + j] * filter[filterSize * i + j];
        }
        // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
106
        dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
107 108 109
    }
}

110 111
static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
                           const uint8_t *_src, const int16_t *filter,
112
                           const int32_t *filterPos, int filterSize)
113
{
114
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
115 116
    int i;
    const uint16_t *src = (const uint16_t *) _src;
117
    int sh              = desc->comp[0].depth - 1;
118 119 120 121

    for (i = 0; i < dstW; i++) {
        int j;
        int srcPos = filterPos[i];
122
        int val    = 0;
123 124 125 126 127 128 129 130 131

        for (j = 0; j < filterSize; j++) {
            val += src[srcPos + j] * filter[filterSize * i + j];
        }
        // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
        dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
    }
}

132
// bilinear / bicubic scaling
133 134 135
static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
                          const uint8_t *src, const int16_t *filter,
                          const int32_t *filterPos, int filterSize)
136 137
{
    int i;
138
    for (i = 0; i < dstW; i++) {
139
        int j;
140 141 142 143
        int srcPos = filterPos[i];
        int val    = 0;
        for (j = 0; j < filterSize; j++) {
            val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
144
        }
145
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
146 147 148
    }
}

149 150 151
static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW,
                          const uint8_t *src, const int16_t *filter,
                          const int32_t *filterPos, int filterSize)
152 153 154
{
    int i;
    int32_t *dst = (int32_t *) _dst;
155
    for (i = 0; i < dstW; i++) {
156
        int j;
157 158 159 160
        int srcPos = filterPos[i];
        int val    = 0;
        for (j = 0; j < filterSize; j++) {
            val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
161
        }
162
        dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
163 164 165
    }
}

// FIXME all pal and rgb srcFormats could do this conversion as well
// FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place range conversion of 15-bit intermediate chroma rows.
 * Installed by sws_init_swscale() when source and destination ranges differ
 * and c->srcRange is zero. Inputs are clamped to 30775 before scaling.
 *
 * @param dstU  U row, converted in place
 * @param dstV  V row, converted in place
 * @param width number of samples in each row
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
        dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
    }
}

/**
 * In-place range conversion of 15-bit intermediate chroma rows.
 * Installed by sws_init_swscale() when source and destination ranges differ
 * and c->srcRange is non-zero.
 *
 * @param dstU  U row, converted in place
 * @param dstV  V row, converted in place
 * @param width number of samples in each row
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
        dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
    }
}

/**
 * In-place range conversion of a 15-bit intermediate luma row.
 * Installed by sws_init_swscale() when source and destination ranges differ
 * and c->srcRange is zero. Inputs are clamped to 30189 before scaling.
 *
 * @param dst   luma row, converted in place
 * @param width number of samples in the row
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
}

/**
 * In-place range conversion of a 15-bit intermediate luma row.
 * Installed by sws_init_swscale() when source and destination ranges differ
 * and c->srcRange is non-zero.
 *
 * @param dst   luma row, converted in place
 * @param width number of samples in the row
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i] * 14071 + 33561947) >> 14;
}

/**
 * In-place range conversion of 19-bit intermediate chroma rows
 * (counterpart of chrRangeToJpeg_c() for the 32-bit sample buffers).
 *
 * The product is evaluated in 64 bits: the clamped maximum (30775 << 4)
 * times 4663 exceeds INT32_MAX, even though the final result fits in 32 bits.
 *
 * @param _dstU U row, accessed as int32_t samples, converted in place
 * @param _dstV V row, accessed as int32_t samples, converted in place
 * @param width number of samples in each row
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = (FFMIN(dstU[i], 30775 << 4) * (int64_t)4663 - (9289992 << 4)) >> 12; // -264
        dstV[i] = (FFMIN(dstV[i], 30775 << 4) * (int64_t)4663 - (9289992 << 4)) >> 12; // -264
    }
}

/**
 * In-place range conversion of 19-bit intermediate chroma rows
 * (counterpart of chrRangeFromJpeg_c() for the 32-bit sample buffers).
 *
 * @param _dstU U row, accessed as int32_t samples, converted in place
 * @param _dstV V row, accessed as int32_t samples, converted in place
 * @param width number of samples in each row
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
        dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
    }
}

/**
 * In-place range conversion of a 19-bit intermediate luma row
 * (counterpart of lumRangeToJpeg_c() for the 32-bit sample buffers).
 *
 * The product is evaluated in 64 bits: the clamped maximum (30189 << 4)
 * times 4769 exceeds INT32_MAX, even though the final result fits in 32 bits.
 *
 * @param _dst  luma row, accessed as int32_t samples, converted in place
 * @param width number of samples in the row
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i], 30189 << 4) * (int64_t)4769 - (39057361 << 2)) >> 12;
}

/**
 * In-place range conversion of a 19-bit intermediate luma row
 * (counterpart of lumRangeFromJpeg_c() for the 32-bit sample buffers).
 *
 * The multiply-add is evaluated in 64 bits: dst[i] * 14071 no longer fits
 * in a 32-bit int once dst[i] exceeds roughly 152617, which is well inside
 * the 19-bit sample range.
 *
 * @param _dst  luma row, accessed as int32_t samples, converted in place
 * @param width number of samples in the row
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i] * (int64_t)14071 + (33561947 << 4)) >> 14;
}

Ronald S. Bultje's avatar
Ronald S. Bultje committed
238 239
static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
                           const uint8_t *src, int srcW, int xInc)
240 241
{
    int i;
242 243 244 245 246 247
    unsigned int xpos = 0;
    for (i = 0; i < dstWidth; i++) {
        register unsigned int xx     = xpos >> 16;
        register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
        dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
        xpos  += xInc;
248 249 250
    }
}

Ronald S. Bultje's avatar
Ronald S. Bultje committed
251
// *** horizontal scale Y line to temp buffer
252
static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
253 254
                                     const uint8_t *src_in[4],
                                     int srcW, int xInc,
255
                                     const int16_t *hLumFilter,
256 257
                                     const int32_t *hLumFilterPos,
                                     int hLumFilterSize,
258 259
                                     uint8_t *formatConvBuffer,
                                     uint32_t *pal, int isAlpha)
260
{
261 262
    void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) =
        isAlpha ? c->alpToYV12 : c->lumToYV12;
263
    void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
264
    const uint8_t *src = src_in[isAlpha ? 3 : 0];
265 266 267

    if (toYV12) {
        toYV12(formatConvBuffer, src, srcW, pal);
268
        src = formatConvBuffer;
269 270 271
    } else if (c->readLumPlanar && !isAlpha) {
        c->readLumPlanar(formatConvBuffer, src_in, srcW);
        src = formatConvBuffer;
272 273 274
    } else if (c->readAlpPlanar && isAlpha) {
        c->readAlpPlanar(formatConvBuffer, src_in, srcW);
        src = formatConvBuffer;
275 276 277
    }

    if (!c->hyscale_fast) {
278 279
        c->hyScale(c, dst, dstWidth, src, hLumFilter,
                   hLumFilterPos, hLumFilterSize);
280 281 282 283 284 285 286 287
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}

Ronald S. Bultje's avatar
Ronald S. Bultje committed
288 289 290
static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
                           int dstWidth, const uint8_t *src1,
                           const uint8_t *src2, int srcW, int xInc)
291 292
{
    int i;
293 294 295 296 297 298 299
    unsigned int xpos = 0;
    for (i = 0; i < dstWidth; i++) {
        register unsigned int xx     = xpos >> 16;
        register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
        dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
        dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
        xpos   += xInc;
300 301 302
    }
}

303 304
static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
                                     int16_t *dst2, int dstWidth,
305
                                     const uint8_t *src_in[4],
306 307 308 309
                                     int srcW, int xInc,
                                     const int16_t *hChrFilter,
                                     const int32_t *hChrFilterPos,
                                     int hChrFilterSize,
310
                                     uint8_t *formatConvBuffer, uint32_t *pal)
311
{
312
    const uint8_t *src1 = src_in[1], *src2 = src_in[2];
313
    if (c->chrToYV12) {
314 315
        uint8_t *buf2 = formatConvBuffer +
                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
316
        c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
317 318
        src1 = formatConvBuffer;
        src2 = buf2;
319
    } else if (c->readChrPlanar) {
320 321
        uint8_t *buf2 = formatConvBuffer +
                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
322
        c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
323 324
        src1 = formatConvBuffer;
        src2 = buf2;
325 326 327
    }

    if (!c->hcscale_fast) {
328 329
        c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
330 331 332 333 334 335 336 337 338
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst1, dst2, dstWidth);
}

/* Set to 1 to log ring-buffer bookkeeping from swscale(). */
#define DEBUG_SWSCALE_BUFFERS 0
/* Logs through av_log() using a variable named `c` that must be in scope at
 * the expansion site. Wrapped in do/while(0) so it behaves as one statement
 * and cannot capture a following `else`. */
#define DEBUG_BUFFERS(...)                          \
    do {                                            \
        if (DEBUG_SWSCALE_BUFFERS)                  \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);   \
    } while (0)

343
static int swscale(SwsContext *c, const uint8_t *src[],
Ronald S. Bultje's avatar
Ronald S. Bultje committed
344
                   int srcStride[], int srcSliceY,
345
                   int srcSliceH, uint8_t *dst[], int dstStride[])
346
{
347 348 349 350 351 352 353 354 355
    /* load a few things into local vars to make the code more readable?
     * and faster */
    const int srcW                   = c->srcW;
    const int dstW                   = c->dstW;
    const int dstH                   = c->dstH;
    const int chrDstW                = c->chrDstW;
    const int chrSrcW                = c->chrSrcW;
    const int lumXInc                = c->lumXInc;
    const int chrXInc                = c->chrXInc;
356
    const enum AVPixelFormat dstFormat = c->dstFormat;
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
    const int flags                  = c->flags;
    int32_t *vLumFilterPos           = c->vLumFilterPos;
    int32_t *vChrFilterPos           = c->vChrFilterPos;
    int32_t *hLumFilterPos           = c->hLumFilterPos;
    int32_t *hChrFilterPos           = c->hChrFilterPos;
    int16_t *vLumFilter              = c->vLumFilter;
    int16_t *vChrFilter              = c->vChrFilter;
    int16_t *hLumFilter              = c->hLumFilter;
    int16_t *hChrFilter              = c->hChrFilter;
    int32_t *lumMmxFilter            = c->lumMmxFilter;
    int32_t *chrMmxFilter            = c->chrMmxFilter;
    const int vLumFilterSize         = c->vLumFilterSize;
    const int vChrFilterSize         = c->vChrFilterSize;
    const int hLumFilterSize         = c->hLumFilterSize;
    const int hChrFilterSize         = c->hChrFilterSize;
    int16_t **lumPixBuf              = c->lumPixBuf;
    int16_t **chrUPixBuf             = c->chrUPixBuf;
    int16_t **chrVPixBuf             = c->chrVPixBuf;
    int16_t **alpPixBuf              = c->alpPixBuf;
    const int vLumBufSize            = c->vLumBufSize;
    const int vChrBufSize            = c->vChrBufSize;
    uint8_t *formatConvBuffer        = c->formatConvBuffer;
    uint32_t *pal                    = c->pal_yuv;
    yuv2planar1_fn yuv2plane1        = c->yuv2plane1;
    yuv2planarX_fn yuv2planeX        = c->yuv2planeX;
    yuv2interleavedX_fn yuv2nv12cX   = c->yuv2nv12cX;
    yuv2packed1_fn yuv2packed1       = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2       = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX       = c->yuv2packedX;
386
    yuv2anyX_fn yuv2anyX             = c->yuv2anyX;
387 388
    const int chrSrcSliceY           =                srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH           = AV_CEIL_RSHIFT(srcSliceH,   c->chrSrcVSubSample);
389
    int should_dither                = is9_15BPS(c->srcFormat) ||
390
                                       is16BPS(c->srcFormat);
391 392 393
    int lastDstY;

    /* vars which will change and which we need to store back in the context */
394 395 396 397 398
    int dstY         = c->dstY;
    int lumBufIndex  = c->lumBufIndex;
    int chrBufIndex  = c->chrBufIndex;
    int lastInLumBuf = c->lastInLumBuf;
    int lastInChrBuf = c->lastInChrBuf;
399 400

    if (isPacked(c->srcFormat)) {
401 402 403 404 405 406 407 408
        src[0] =
        src[1] =
        src[2] =
        src[3] = src[0];
        srcStride[0] =
        srcStride[1] =
        srcStride[2] =
        srcStride[3] = srcStride[0];
409
    }
410 411
    srcStride[1] <<= c->vChrDrop;
    srcStride[2] <<= c->vChrDrop;
412

413
    DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
414 415 416 417
                  src[0], srcStride[0], src[1], srcStride[1],
                  src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1],
                  dst[2], dstStride[2], dst[3], dstStride[3]);
418
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
419
                  srcSliceY, srcSliceH, dstY, dstH);
420
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
421
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
422

423 424 425
    if (dstStride[0] % 8 != 0 || dstStride[1] % 8 != 0 ||
        dstStride[2] % 8 != 0 || dstStride[3] % 8 != 0) {
        static int warnedAlready = 0; // FIXME maybe move this into the context
426
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
427 428
            av_log(c, AV_LOG_WARNING,
                   "Warning: dstStride is not aligned!\n"
429
                   "         ->cannot do aligned memory accesses anymore\n");
430
            warnedAlready = 1;
431 432 433 434
        }
    }

    /* Note the user might start scaling the picture in the middle so this
435 436 437 438 439 440 441 442
     * will not get executed. This is not really intended but works
     * currently, so people might do it. */
    if (srcSliceY == 0) {
        lumBufIndex  = -1;
        chrBufIndex  = -1;
        dstY         = 0;
        lastInLumBuf = -1;
        lastInChrBuf = -1;
443 444
    }

445
    if (!should_dither) {
446
        c->chrDither8 = c->lumDither8 = sws_pb_64;
447
    }
448
    lastDstY = dstY;
449

450 451 452
    for (; dstY < dstH; dstY++) {
        const int chrDstY = dstY >> c->chrDstVSubSample;
        uint8_t *dest[4]  = {
453 454 455 456 457
            dst[0] + dstStride[0] * dstY,
            dst[1] + dstStride[1] * chrDstY,
            dst[2] + dstStride[2] * chrDstY,
            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
        };
458

459 460 461 462 463
        // First line needed as input
        const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
        const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
        // First line needed as input
        const int firstChrSrcY  = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);
464 465 466 467 468

        // Last line needed as input
        int lastLumSrcY  = FFMIN(c->srcH,    firstLumSrcY  + vLumFilterSize) - 1;
        int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
        int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
469 470
        int enough_lines;

471 472 473 474 475
        // handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf)
            lastInLumBuf = firstLumSrcY - 1;
        if (firstChrSrcY > lastInChrBuf)
            lastInChrBuf = firstChrSrcY - 1;
476 477 478 479 480
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
481
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
482
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
483
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);
484 485

        // Do we have enough lines in this slice to output the dstY line
486
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH &&
487
                       lastChrSrcY < AV_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample);
488 489 490 491 492

        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
493
                          lastLumSrcY, lastChrSrcY);
494 495
        }

496 497
        // Do horizontal scaling
        while (lastInLumBuf < lastLumSrcY) {
498 499 500 501 502 503
            const uint8_t *src1[4] = {
                src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
                src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
                src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
                src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
            };
504
            lumBufIndex++;
505
            assert(lumBufIndex < 2 * vLumBufSize);
506 507
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
508
            hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
Ronald S. Bultje's avatar
Ronald S. Bultje committed
509
                    hLumFilter, hLumFilterPos, hLumFilterSize,
510
                    formatConvBuffer, pal, 0);
511
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
512
                hyscale(c, alpPixBuf[lumBufIndex], dstW, src1, srcW,
Ronald S. Bultje's avatar
Ronald S. Bultje committed
513
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
514
                        formatConvBuffer, pal, 1);
515 516
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
517
                          lumBufIndex, lastInLumBuf);
518
        }
519
        while (lastInChrBuf < lastChrSrcY) {
520 521 522 523 524 525
            const uint8_t *src1[4] = {
                src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
                src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
                src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
                src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
            };
526
            chrBufIndex++;
527
            assert(chrBufIndex < 2 * vChrBufSize);
528 529
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
530
            // FIXME replace parameters through context struct (some at least)
531 532

            if (c->needs_hcscale)
Ronald S. Bultje's avatar
Ronald S. Bultje committed
533
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
534 535 536
                        chrDstW, src1, chrSrcW, chrXInc,
                        hChrFilter, hChrFilterPos, hChrFilterSize,
                        formatConvBuffer, pal);
537 538
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
539
                          chrBufIndex, lastInChrBuf);
540
        }
541 542 543 544 545
        // wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize)
            lumBufIndex -= vLumBufSize;
        if (chrBufIndex >= vChrBufSize)
            chrBufIndex -= vChrBufSize;
546
        if (!enough_lines)
547
            break;  // we can't output a dstY line so let's try with the next slice
548

549
#if HAVE_MMX_INLINE
550 551
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
                              lastInLumBuf, lastInChrBuf);
552
#endif
553
        if (should_dither) {
554 555
            c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
            c->lumDither8 = ff_dither_8x8_128[dstY    & 7];
556
        }
557 558 559 560
        if (dstY >= dstH - 2) {
            /* hmm looks like we can't use MMX here without overwriting
             * this array's tail */
            ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
561
                                     &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
562 563 564
        }

        {
565 566 567 568 569
            const int16_t **lumSrcPtr  = (const int16_t **)lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr = (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr = (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
                                         (const int16_t **)alpPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
570 571

            if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
572 573 574 575 576
                const int16_t **tmpY = (const int16_t **)lumPixBuf +
                                       2 * vLumBufSize;
                int neg = -firstLumSrcY, i;
                int end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
                for (i = 0; i < neg; i++)
577
                    tmpY[i] = lumSrcPtr[neg];
578
                for (; i < end; i++)
579
                    tmpY[i] = lumSrcPtr[i];
580 581
                for (; i < vLumFilterSize; i++)
                    tmpY[i] = tmpY[i - 1];
582 583 584
                lumSrcPtr = tmpY;

                if (alpSrcPtr) {
585 586 587
                    const int16_t **tmpA = (const int16_t **)alpPixBuf +
                                           2 * vLumBufSize;
                    for (i = 0; i < neg; i++)
588
                        tmpA[i] = alpSrcPtr[neg];
589
                    for (; i < end; i++)
590
                        tmpA[i] = alpSrcPtr[i];
591
                    for (; i < vLumFilterSize; i++)
592 593 594 595
                        tmpA[i] = tmpA[i - 1];
                    alpSrcPtr = tmpA;
                }
            }
596 597 598 599 600 601 602
            if (firstChrSrcY < 0 ||
                firstChrSrcY + vChrFilterSize > c->chrSrcH) {
                const int16_t **tmpU = (const int16_t **)chrUPixBuf + 2 * vChrBufSize,
                **tmpV               = (const int16_t **)chrVPixBuf + 2 * vChrBufSize;
                int neg = -firstChrSrcY, i;
                int end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
                for (i = 0; i < neg; i++) {
603 604 605
                    tmpU[i] = chrUSrcPtr[neg];
                    tmpV[i] = chrVSrcPtr[neg];
                }
606
                for (; i < end; i++) {
607 608 609
                    tmpU[i] = chrUSrcPtr[i];
                    tmpV[i] = chrVSrcPtr[i];
                }
610
                for (; i < vChrFilterSize; i++) {
611 612 613 614 615 616 617
                    tmpU[i] = tmpU[i - 1];
                    tmpV[i] = tmpV[i - 1];
                }
                chrUSrcPtr = tmpU;
                chrVSrcPtr = tmpV;
            }

618 619 620
            if (isPlanarYUV(dstFormat) ||
                (isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like
                const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
621

622 623 624
                if (vLumFilterSize == 1) {
                    yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
                } else {
625 626 627
                    yuv2planeX(vLumFilter + dstY * vLumFilterSize,
                               vLumFilterSize, lumSrcPtr, dest[0],
                               dstW, c->lumDither8, 0);
628
                }
629

630
                if (!((dstY & chrSkipMask) || isGray(dstFormat))) {
631
                    if (yuv2nv12cX) {
632 633 634
                        yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize,
                                   vChrFilterSize, chrUSrcPtr, chrVSrcPtr,
                                   dest[1], chrDstW);
635 636 637 638
                    } else if (vChrFilterSize == 1) {
                        yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
                        yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
                    } else {
639 640 641 642 643 644
                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
                                   vChrFilterSize, chrUSrcPtr, dest[1],
                                   chrDstW, c->chrDither8, 0);
                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
                                   vChrFilterSize, chrVSrcPtr, dest[2],
                                   chrDstW, c->chrDither8, 3);
645
                    }
646
                }
647

648
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
649
                    if (vLumFilterSize == 1) {
650 651
                        yuv2plane1(alpSrcPtr[0], dest[3], dstW,
                                   c->lumDither8, 0);
652
                    } else {
653 654 655
                        yuv2planeX(vLumFilter + dstY * vLumFilterSize,
                                   vLumFilterSize, alpSrcPtr, dest[3],
                                   dstW, c->lumDither8, 0);
656
                    }
657
                }
658
            } else if (yuv2packedX) {
659 660
                if (c->yuv2packed1 && vLumFilterSize == 1 &&
                    vChrFilterSize <= 2) { // unscaled RGB
661
                    int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
662 663 664
                    yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? *alpSrcPtr : NULL,
                                dest[0], dstW, chrAlpha, dstY);
665 666
                } else if (c->yuv2packed2 && vLumFilterSize == 2 &&
                           vChrFilterSize == 2) { // bilinear upscale RGB
667 668 669
                    int lumAlpha = vLumFilter[2 * dstY + 1];
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    lumMmxFilter[2] =
670
                    lumMmxFilter[3] = vLumFilter[2 * dstY]    * 0x10001;
671 672 673 674 675
                    chrMmxFilter[2] =
                    chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
                    yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? alpSrcPtr : NULL,
                                dest[0], dstW, lumAlpha, chrAlpha, dstY);
676
                } else { // general RGB
677 678 679 680 681
                    yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
                                lumSrcPtr, vLumFilterSize,
                                vChrFilter + dstY * vChrFilterSize,
                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest[0], dstW, dstY);
682
                }
683 684 685 686 687 688
            } else {
                yuv2anyX(c, vLumFilter + dstY * vLumFilterSize,
                         lumSrcPtr, vLumFilterSize,
                         vChrFilter + dstY * vChrFilterSize,
                         chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                         alpSrcPtr, dest, dstW, dstY);
689 690 691 692
            }
        }
    }

693 694 695 696 697 698
    if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
        int length = dstW;
        int height = dstY - lastDstY;
        if (is16BPS(c->dstFormat))
            length *= 2;

699
        if (is9_15BPS(dstFormat)) {
700 701
            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
            fill_plane9or10(dst[3], dstStride[3], length, height, lastDstY,
702
                            255, desc->comp[3].depth, isBE(dstFormat));
703 704 705
        } else
            fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
    }
706

707
#if HAVE_MMXEXT_INLINE
708
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
709
        __asm__ volatile ("sfence" ::: "memory");
710 711 712 713
#endif
    emms_c();

    /* store changed local vars back in the context */
714 715 716 717 718
    c->dstY         = dstY;
    c->lumBufIndex  = lumBufIndex;
    c->chrBufIndex  = chrBufIndex;
    c->lastInLumBuf = lastInLumBuf;
    c->lastInChrBuf = lastInChrBuf;
719 720 721 722

    return dstY - lastDstY;
}

723
static av_cold void sws_init_swscale(SwsContext *c)
724
{
725
    enum AVPixelFormat srcFormat = c->srcFormat;
726

727 728
    ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
                             &c->yuv2nv12cX, &c->yuv2packed1,
729
                             &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
730

731
    ff_sws_init_input_funcs(c);
732

733
    if (c->srcBpc == 8) {
734
        if (c->dstBpc <= 15) {
735
            c->hyScale = c->hcScale = hScale8To15_c;
736 737 738 739
            if (c->flags & SWS_FAST_BILINEAR) {
                c->hyscale_fast = hyscale_fast_c;
                c->hcscale_fast = hcscale_fast_c;
            }
740
        } else {
741
            c->hyScale = c->hcScale = hScale8To19_c;
742
        }
743
    } else {
744
        c->hyScale = c->hcScale = c->dstBpc > 15 ? hScale16To19_c
745
                                                 : hScale16To15_c;
746
    }
747

748
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
749
        if (c->dstBpc <= 15) {
750 751 752 753 754 755 756 757
            if (c->srcRange) {
                c->lumConvertRange = lumRangeFromJpeg_c;
                c->chrConvertRange = chrRangeFromJpeg_c;
            } else {
                c->lumConvertRange = lumRangeToJpeg_c;
                c->chrConvertRange = chrRangeToJpeg_c;
            }
        } else {
758 759 760 761 762 763 764 765 766
            if (c->srcRange) {
                c->lumConvertRange = lumRangeFromJpeg16_c;
                c->chrConvertRange = chrRangeFromJpeg16_c;
            } else {
                c->lumConvertRange = lumRangeToJpeg16_c;
                c->chrConvertRange = chrRangeToJpeg16_c;
            }
        }
    }
767 768

    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
769
          srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
770 771
        c->needs_hcscale = 1;
}
Michael Niedermayer's avatar
Michael Niedermayer committed
772

773
SwsFunc ff_getSwsFunc(SwsContext *c)
774
{
775
    sws_init_swscale(c);
776

777 778
    if (ARCH_PPC)
        ff_sws_init_swscale_ppc(c);
779 780
    if (ARCH_X86)
        ff_sws_init_swscale_x86(c);
781

782
    return swscale;
783
}