/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

21
#include <inttypes.h>
22
#include <math.h>
23
#include <stdio.h>
24 25
#include <string.h>

26
#include "libavutil/avassert.h"
27
#include "libavutil/avutil.h"
28
#include "libavutil/bswap.h"
29
#include "libavutil/cpu.h"
30
#include "libavutil/imgutils.h"
31 32
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
33
#include "libavutil/pixdesc.h"
34 35 36 37
#include "config.h"
#include "rgb2rgb.h"
#include "swscale_internal.h"
#include "swscale.h"
Arpi's avatar
Arpi committed
38

39
/**
 * Dither table with 9 rows of 8 entries; every entry is an even value in
 * the range 0..126. Row 8 is a copy of row 0.
 * swscale() in this file picks a row with "line & 7".
 * NOTE(review): the duplicated ninth row presumably lets other users read
 * "row + 1" without wrapping -- confirm against the callers outside this file.
 */
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[9][8] = {
    {  36, 68,  60, 92,  34, 66,  58, 90, },
    { 100,  4, 124, 28,  98,  2, 122, 26, },
    {  52, 84,  44, 76,  50, 82,  42, 74, },
    { 116, 20, 108, 12, 114, 18, 106, 10, },
    {  32, 64,  56, 88,  38, 70,  62, 94, },
    {  96,  0, 120, 24, 102,  6, 126, 30, },
    {  48, 80,  40, 72,  54, 86,  46, 78, },
    { 112, 16, 104,  8, 118, 22, 110, 14, },
    {  36, 68,  60, 92,  34, 66,  58, 90, }, /* same as row 0 */
};
50

51
/* Constant dither row (eight times 64); swscale() installs it as both the
 * luma and chroma dither row when should_dither is false. */
DECLARE_ALIGNED(8, static const uint8_t, sws_pb_64)[8] = {
    64, 64, 64, 64, 64, 64, 64, 64
};
54

55 56
/**
 * Fill a rectangle of a byte plane with a constant value.
 *
 * @param plane  start of the plane
 * @param stride byte distance between the starts of two consecutive rows
 * @param width  number of bytes written in each row
 * @param height number of rows written
 * @param y      index of the first row written
 * @param val    byte stored in every written position
 */
static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
                                       int height, int y, uint8_t val)
{
    uint8_t *row = plane + stride * y;
    int remaining;

    for (remaining = height; remaining > 0; remaining--) {
        memset(row, val, width);
        row += stride;
    }
}

66 67
/**
 * Horizontal filter for input read as uint16_t samples; writes int32_t
 * results clipped from above to 19 bits.
 *
 * @param c          context; only srcFormat is read
 * @param _dst       output line, accessed as int32_t[dstW]
 * @param dstW       number of output samples
 * @param _src       input line, accessed as uint16_t[]
 * @param filter     coefficients, filterSize entries per output sample
 * @param filterPos  index of the first input sample for each output sample
 * @param filterSize number of taps per output sample
 *
 * The right shift applied to each accumulated sum depends on the source
 * format: (depth - 1) - 4 by default, 9 for RGB/PAL8 input with a depth below
 * 16, and 11 for float input. Only the upper bound is clipped.
 */
static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
                           const uint8_t *_src, const int16_t *filter,
                           const int32_t *filterPos, int filterSize)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
    int i;
    int32_t *dst        = (int32_t *) _dst;
    const uint16_t *src = (const uint16_t *) _src;
    int bits            = desc->comp[0].depth - 1;
    int sh              = bits - 4;

    if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
        sh = 9;
    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
        sh = 16 - 1 - 4;
    }

    for (i = 0; i < dstW; i++) {
        int j;
        int srcPos = filterPos[i];
        int val    = 0;

        for (j = 0; j < filterSize; j++) {
            val += src[srcPos + j] * filter[filterSize * i + j];
        }
        // filter=14 bit, input=16 bit, output=30 bit; for 16-bit input
        // sh is 11, which leaves 19 bit
        dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
    }
}

96 97
/**
 * Horizontal filter for input read as uint16_t samples; writes int16_t
 * results clipped from above to 15 bits.
 *
 * @param c          context; only srcFormat is read
 * @param dst        output line, dstW entries
 * @param dstW       number of output samples
 * @param _src       input line, accessed as uint16_t[]
 * @param filter     coefficients, filterSize entries per output sample
 * @param filterPos  index of the first input sample for each output sample
 * @param filterSize number of taps per output sample
 *
 * The right shift starts as depth - 1. When that is below 15 it becomes 13
 * for RGB/PAL8 input and stays depth - 1 otherwise; when it is 15 or more and
 * the format is float it is set to 15. Only the upper bound is clipped.
 */
static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
                           const uint8_t *_src, const int16_t *filter,
                           const int32_t *filterPos, int filterSize)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
    int i;
    const uint16_t *src = (const uint16_t *) _src;
    int sh              = desc->comp[0].depth - 1;

    if (sh<15) {
        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
        sh = 16 - 1;
    }

    for (i = 0; i < dstW; i++) {
        int j;
        int srcPos = filterPos[i];
        int val    = 0;

        for (j = 0; j < filterSize; j++) {
            val += src[srcPos + j] * filter[filterSize * i + j];
        }
        // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
        dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
    }
}

124
// bilinear / bicubic scaling
/**
 * Horizontal filter for 8-bit input; writes int16_t results clipped from
 * above to 15 bits.
 *
 * @param c          unused
 * @param dst        output line, dstW entries
 * @param dstW       number of output samples
 * @param src        input line of bytes
 * @param filter     coefficients, filterSize entries per output sample
 * @param filterPos  index of the first input byte for each output sample
 * @param filterSize number of taps per output sample
 */
static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
                          const uint8_t *src, const int16_t *filter,
                          const int32_t *filterPos, int filterSize)
{
    int i;
    for (i = 0; i < dstW; i++) {
        const int first = filterPos[i];
        const int coeff = filterSize * i;
        int acc = 0;
        int j;

        for (j = 0; j < filterSize; j++)
            acc += ((int)src[first + j]) * filter[coeff + j];

        dst[i] = FFMIN(acc >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
    }
}
140

141 142 143
/**
 * Horizontal filter for 8-bit input; writes int32_t results clipped from
 * above to 19 bits.
 *
 * @param c          unused
 * @param _dst       output line, accessed as int32_t[dstW]
 * @param dstW       number of output samples
 * @param src        input line of bytes
 * @param filter     coefficients, filterSize entries per output sample
 * @param filterPos  index of the first input byte for each output sample
 * @param filterSize number of taps per output sample
 */
static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW,
                          const uint8_t *src, const int16_t *filter,
                          const int32_t *filterPos, int filterSize)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < dstW; i++) {
        const int first = filterPos[i];
        const int coeff = filterSize * i;
        int acc = 0;
        int j;

        for (j = 0; j < filterSize; j++)
            acc += ((int)src[first + j]) * filter[coeff + j];

        dst[i] = FFMIN(acc >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
    }
}

Diego Biurrun's avatar
Diego Biurrun committed
158
// FIXME all pal and rgb srcFormats could do this conversion as well
// FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range conversion on int16_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is 0 and the
 * output is at most 14 bits.
 *
 * Each sample is clamped to at most 30775 and then mapped with the fixed-point
 * expression (x * 4663 - 9289992) >> 12; the clamp makes the largest result
 * exactly 32767.
 *
 * @param dstU  first chroma line, width entries, updated in place
 * @param dstV  second chroma line, width entries, updated in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
        dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
    }
}
168

169
/**
 * In-place chroma range conversion on int16_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is set and the
 * output is at most 14 bits.
 *
 * Each sample becomes (x * 1799 + 4081085) >> 11.
 *
 * @param dstU  first chroma line, width entries, updated in place
 * @param dstV  second chroma line, width entries, updated in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
        dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
    }
}
177

178
/**
 * In-place luma range conversion on int16_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is 0 and the
 * output is at most 14 bits.
 *
 * Each sample is clamped to at most 30189 and then mapped with
 * (x * 19077 - 39057361) >> 14; the clamp makes the largest result 32767.
 *
 * @param dst   luma line, width entries, updated in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
}
184

185
/**
 * In-place luma range conversion on int16_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is set and the
 * output is at most 14 bits.
 *
 * Each sample becomes (x * 14071 + 33561947) >> 14.
 *
 * @param dst   luma line, width entries, updated in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i] * 14071 + 33561947) >> 14;
}

192 193 194 195 196 197
/**
 * In-place chroma range conversion on int32_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is 0 and the
 * output is deeper than 14 bits.
 *
 * The buffers are declared int16_t * to match the callback type but are
 * accessed as int32_t arrays. Each sample is clamped to at most 30775 << 4
 * and mapped with (x * 4663 - (9289992 << 4)) >> 12.
 *
 * The product is formed in unsigned arithmetic: (30775 << 4) * 4663 is
 * 2296061200, which does not fit in a signed 32-bit int, although the value
 * after the subtraction does. lumRangeToJpeg16_c() uses the same technique.
 *
 * @param _dstU first chroma line, accessed as int32_t[width]
 * @param _dstV second chroma line, accessed as int32_t[width]
 * @param width number of samples per line
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = ((int)(FFMIN(dstU[i], 30775 << 4) * 4663U - (9289992 << 4))) >> 12; // -264
        dstV[i] = ((int)(FFMIN(dstV[i], 30775 << 4) * 4663U - (9289992 << 4))) >> 12; // -264
    }
}
202

203 204 205 206 207 208
/**
 * In-place chroma range conversion on int32_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is set and the
 * output is deeper than 14 bits.
 *
 * The buffers are declared int16_t * to match the callback type but are
 * accessed as int32_t arrays. Each sample becomes
 * (x * 1799 + (4081085 << 4)) >> 11.
 *
 * @param _dstU first chroma line, accessed as int32_t[width]
 * @param _dstV second chroma line, accessed as int32_t[width]
 * @param width number of samples per line
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
        dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
    }
}
213

214 215 216 217
/**
 * In-place luma range conversion on int32_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is 0 and the
 * output is deeper than 14 bits.
 *
 * The buffer is declared int16_t * to match the callback type but is accessed
 * as an int32_t array. Each sample is clamped to at most 30189 << 4 and mapped
 * with (x * 4769 - (39057361 << 2)) >> 12. The product is formed in unsigned
 * arithmetic (4769U) because (30189 << 4) * 4769 exceeds the signed 32-bit
 * range.
 *
 * @param _dst  luma line, accessed as int32_t[width]
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++) {
        dst[i] = ((int)(FFMIN(dst[i], 30189 << 4) * 4769U - (39057361 << 2))) >> 12;
    }
}
222

223 224 225 226 227
/**
 * In-place luma range conversion on int32_t intermediates. This is the
 * variant ff_sws_init_range_convert() installs when srcRange is set and the
 * output is deeper than 14 bits.
 *
 * The buffer is declared int16_t * to match the callback type but is accessed
 * as an int32_t array. Multiplier and offset are both divided by 4 (integer
 * division) and the final shift is 12 rather than 14.
 *
 * @param _dst  luma line, accessed as int32_t[width]
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
}

231

232
/* Set to 1 to make DEBUG_BUFFERS() log through av_log() at debug level. */
#define DEBUG_SWSCALE_BUFFERS 0
/* Expects a variable named c (the logging context) in scope at the call site.
 * Wrapped in do/while(0) so the macro is a single statement and cannot
 * capture an else that follows it. */
#define DEBUG_BUFFERS(...)                          \
    do {                                            \
        if (DEBUG_SWSCALE_BUFFERS)                  \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);   \
    } while (0)
236

237
/**
 * Scale one source slice and emit every destination line that can be
 * produced from the source lines seen so far.
 *
 * Progress (dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf) is
 * loaded from the context at entry and stored back before returning, so that
 * consecutive calls can process consecutive slices. A slice starting at
 * srcSliceY == 0 resets that progress.
 *
 * Note that the caller's arrays are modified: for packed source formats
 * src[1..3] and srcStride[1..3] are overwritten with entry 0, and
 * srcStride[1] and srcStride[2] are multiplied by 1 << c->vChrDrop.
 *
 * @param c         scaler context
 * @param src       source plane pointers (4 entries)
 * @param srcStride source strides (4 entries)
 * @param srcSliceY first source line of this slice
 * @param srcSliceH number of source lines in this slice
 * @param dst       destination plane pointers (4 entries)
 * @param dstStride destination strides (4 entries)
 * @return number of destination lines produced by this call
 */
static int swscale(SwsContext *c, const uint8_t *src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t *dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable?
     * and faster */
    const int dstW                   = c->dstW;
    const int dstH                   = c->dstH;

    const enum AVPixelFormat dstFormat = c->dstFormat;
    const int flags                  = c->flags;
    int32_t *vLumFilterPos           = c->vLumFilterPos;
    int32_t *vChrFilterPos           = c->vChrFilterPos;

    const int vLumFilterSize         = c->vLumFilterSize;
    const int vChrFilterSize         = c->vChrFilterSize;

    yuv2planar1_fn yuv2plane1        = c->yuv2plane1;
    yuv2planarX_fn yuv2planeX        = c->yuv2planeX;
    yuv2interleavedX_fn yuv2nv12cX   = c->yuv2nv12cX;
    yuv2packed1_fn yuv2packed1       = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2       = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX       = c->yuv2packedX;
    yuv2anyX_fn yuv2anyX             = c->yuv2anyX;
    const int chrSrcSliceY           =                srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH           = AV_CEIL_RSHIFT(srcSliceH,   c->chrSrcVSubSample);
    int should_dither                = isNBPS(c->srcFormat) ||
                                       is16BPS(c->srcFormat);
    int lastDstY;

    /* vars which will change and which we need to store back in the context */
    int dstY         = c->dstY;
    int lumBufIndex  = c->lumBufIndex;
    int chrBufIndex  = c->chrBufIndex;
    int lastInLumBuf = c->lastInLumBuf;
    int lastInChrBuf = c->lastInChrBuf;

    /* c->desc is split into three consecutive index ranges:
     * [lumStart, lumEnd), [chrStart, chrEnd) and [vStart, vEnd) */
    int lumStart = 0;
    int lumEnd = c->descIndex[0];
    int chrStart = lumEnd;
    int chrEnd = c->descIndex[1];
    int vStart = chrEnd;
    int vEnd = c->numDesc;
    SwsSlice *src_slice = &c->slice[lumStart];
    SwsSlice *hout_slice = &c->slice[c->numSlice-2];
    SwsSlice *vout_slice = &c->slice[c->numSlice-1];
    SwsFilterDescriptor *desc = c->desc;

    int needAlpha = c->needAlpha;

    int hasLumHoles = 1;
    int hasChrHoles = 1;

    if (isPacked(c->srcFormat)) {
        /* packed input: every plane slot refers to plane 0 */
        src[1] =
        src[2] =
        src[3] = src[0];
        srcStride[1] =
        srcStride[2] =
        srcStride[3] = srcStride[0];
    }
    srcStride[1] *= 1 << c->vChrDrop;
    srcStride[2] *= 1 << c->vChrDrop;

    DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1],
                  src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1],
                  dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vChrFilterSize: %d\n",
                  vLumFilterSize, vChrFilterSize);

    if (dstStride[0]&15 || dstStride[1]&15 ||
        dstStride[2]&15 || dstStride[3]&15) {
        /* function-static, i.e. the warning is printed once per process */
        static int warnedAlready = 0; // FIXME maybe move this into the context
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING,
                   "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready = 1;
        }
    }

    if (   (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
        || (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
        || dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
        || srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
    ) {
        /* separate once-per-process flag for the data alignment warning */
        static int warnedAlready=0;
        int cpu_flags = av_get_cpu_flags();
        if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speed loss\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
     * will not get executed. This is not really intended but works
     * currently, so people might do it. */
    if (srcSliceY == 0) {
        lumBufIndex  = -1;
        chrBufIndex  = -1;
        dstY         = 0;
        lastInLumBuf = -1;
        lastInChrBuf = -1;
    }

    if (!should_dither) {
        c->chrDither8 = c->lumDither8 = sws_pb_64;
    }
    /* remembered so the number of lines written by this call can be derived */
    lastDstY = dstY;

    ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
                   yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);

    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
            srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);

    ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
            dstY, dstH, dstY >> c->chrDstVSubSample,
            AV_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
    if (srcSliceY == 0) {
        /* first slice: mark the horizontal-scaler output slice as empty */
        hout_slice->plane[0].sliceY = lastInLumBuf + 1;
        hout_slice->plane[1].sliceY = lastInChrBuf + 1;
        hout_slice->plane[2].sliceY = lastInChrBuf + 1;
        hout_slice->plane[3].sliceY = lastInLumBuf + 1;

        hout_slice->plane[0].sliceH =
        hout_slice->plane[1].sliceH =
        hout_slice->plane[2].sliceH =
        hout_slice->plane[3].sliceH = 0;
        hout_slice->width = dstW;
    }

    for (; dstY < dstH; dstY++) {
        const int chrDstY = dstY >> c->chrDstVSubSample;
        int use_mmx_vfilter= c->use_mmx_vfilter;

        // First line needed as input
        const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
        const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
        // First line needed as input
        const int firstChrSrcY  = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);

        // Last line needed as input
        int lastLumSrcY  = FFMIN(c->srcH,    firstLumSrcY  + vLumFilterSize) - 1;
        int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
        int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
        int enough_lines;

        int i;
        int posY, cPosY, firstPosY, lastPosY, firstCPosY, lastCPosY;

        // handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) {

            hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
            if (hasLumHoles) {
                /* restart the buffered luma/alpha window at the first needed line */
                hout_slice->plane[0].sliceY = firstLumSrcY;
                hout_slice->plane[3].sliceY = firstLumSrcY;
                hout_slice->plane[0].sliceH =
                hout_slice->plane[3].sliceH = 0;
            }

            lastInLumBuf = firstLumSrcY - 1;
        }
        if (firstChrSrcY > lastInChrBuf) {

            hasChrHoles = lastInChrBuf != firstChrSrcY - 1;
            if (hasChrHoles) {
                /* restart the buffered chroma window at the first needed line */
                hout_slice->plane[1].sliceY = firstChrSrcY;
                hout_slice->plane[2].sliceY = firstChrSrcY;
                hout_slice->plane[1].sliceH =
                hout_slice->plane[2].sliceH = 0;
            }

            lastInChrBuf = firstChrSrcY - 1;
        }

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH &&
                       lastChrSrcY < AV_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample);

        if (!enough_lines) {
            /* buffer whatever is left of this slice, then stop (see break below) */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        }

        av_assert0((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->plane[0].available_lines);
        av_assert0((lastChrSrcY - firstChrSrcY + 1) <= hout_slice->plane[1].available_lines);


        /* first luma line not yet present in the horizontal output slice */
        posY = hout_slice->plane[0].sliceY + hout_slice->plane[0].sliceH;
        if (posY <= lastLumSrcY && !hasLumHoles) {
            firstPosY = FFMAX(firstLumSrcY, posY);
            lastPosY = FFMIN(firstLumSrcY + hout_slice->plane[0].available_lines - 1, srcSliceY + srcSliceH - 1);
        } else {
            firstPosY = posY;
            lastPosY = lastLumSrcY;
        }

        /* first chroma line not yet present in the horizontal output slice */
        cPosY = hout_slice->plane[1].sliceY + hout_slice->plane[1].sliceH;
        if (cPosY <= lastChrSrcY && !hasChrHoles) {
            firstCPosY = FFMAX(firstChrSrcY, cPosY);
            lastCPosY = FFMIN(firstChrSrcY + hout_slice->plane[1].available_lines - 1, AV_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample) - 1);
        } else {
            firstCPosY = cPosY;
            lastCPosY = lastChrSrcY;
        }

        ff_rotate_slice(hout_slice, lastPosY, lastCPosY);

        if (posY < lastLumSrcY + 1) {
            for (i = lumStart; i < lumEnd; ++i)
                desc[i].process(c, &desc[i], firstPosY, lastPosY - firstPosY + 1);
        }

        lumBufIndex += lastLumSrcY - lastInLumBuf;
        lastInLumBuf = lastLumSrcY;

        if (cPosY < lastChrSrcY + 1) {
            for (i = chrStart; i < chrEnd; ++i)
                desc[i].process(c, &desc[i], firstCPosY, lastCPosY - firstCPosY + 1);
        }

        chrBufIndex += lastChrSrcY - lastInChrBuf;
        lastInChrBuf = lastChrSrcY;

        // wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumFilterSize)
            lumBufIndex -= vLumFilterSize;
        if (chrBufIndex >= vChrFilterSize)
            chrBufIndex -= vChrFilterSize;
        if (!enough_lines)
            break;  // we can't output a dstY line so let's try with the next slice

#if HAVE_MMX_INLINE
        ff_updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
                              lastInLumBuf, lastInChrBuf);
#endif
        if (should_dither) {
            c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
            c->lumDither8 = ff_dither_8x8_128[dstY    & 7];
        }
        if (dstY >= dstH - 2) {
            /* hmm looks like we can't use MMX here without overwriting
             * this array's tail */
            ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                     &yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
            use_mmx_vfilter= 0;
            ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
                           yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
        }

        {
            /* run the remaining descriptors for exactly one output line */
            for (i = vStart; i < vEnd; ++i)
                desc[i].process(c, &desc[i], dstY, 1);
        }
    }
    if (isPlanar(dstFormat) && isALPHA(dstFormat) && !needAlpha) {
        /* destination has an alpha plane that was not produced above:
         * fill the lines written by this call */
        int length = dstW;
        int height = dstY - lastDstY;

        if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
            /* named dst_desc so it does not shadow the filter descriptor array */
            const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dstFormat);
            fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
                    1, dst_desc->comp[3].depth,
                    isBE(dstFormat));
        } else
            fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
    }

#if HAVE_MMXEXT_INLINE
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
        __asm__ volatile ("sfence" ::: "memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY         = dstY;
    c->lumBufIndex  = lumBufIndex;
    c->chrBufIndex  = chrBufIndex;
    c->lastInLumBuf = lastInLumBuf;
    c->lastInChrBuf = lastInChrBuf;

    return dstY - lastDstY;
}

535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559
/**
 * Choose the luma/chroma range conversion callbacks for a context.
 *
 * Both callbacks are cleared first. They are only installed when the source
 * and destination ranges differ and the destination format is not RGB; the
 * 16-bit variants are used when dstBpc is above 14, and the direction
 * (FromJpeg vs. ToJpeg) follows c->srcRange.
 *
 * @param c context whose lumConvertRange / chrConvertRange are set
 */
av_cold void ff_sws_init_range_convert(SwsContext *c)
{
    c->lumConvertRange = NULL;
    c->chrConvertRange = NULL;

    /* nothing to convert: same range, or RGB output */
    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat))
        return;

    if (c->dstBpc > 14) {
        c->lumConvertRange = c->srcRange ? lumRangeFromJpeg16_c
                                         : lumRangeToJpeg16_c;
        c->chrConvertRange = c->srcRange ? chrRangeFromJpeg16_c
                                         : chrRangeToJpeg16_c;
    } else {
        c->lumConvertRange = c->srcRange ? lumRangeFromJpeg_c
                                         : lumRangeToJpeg_c;
        c->chrConvertRange = c->srcRange ? chrRangeFromJpeg_c
                                         : chrRangeToJpeg_c;
    }
}

560
/**
 * Install the C implementations of the per-context function pointers:
 * output functions, input functions, the horizontal scalers and the range
 * converters. Also sets c->needs_hcscale unless the source or destination is
 * gray or the source is monochrome.
 *
 * @param c context to initialise
 */
static av_cold void sws_init_swscale(SwsContext *c)
{
    enum AVPixelFormat srcFormat = c->srcFormat;

    ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
                             &c->yuv2nv12cX, &c->yuv2packed1,
                             &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);

    ff_sws_init_input_funcs(c);

    /* horizontal scaler: chosen by source bit depth (8 vs. more) and by
     * whether the destination is deeper than 14 bits */
    if (c->srcBpc == 8) {
        if (c->dstBpc <= 14) {
            c->hyScale = c->hcScale = hScale8To15_c;
            if (c->flags & SWS_FAST_BILINEAR) {
                c->hyscale_fast = ff_hyscale_fast_c;
                c->hcscale_fast = ff_hcscale_fast_c;
            }
        } else {
            c->hyScale = c->hcScale = hScale8To19_c;
        }
    } else {
        c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c
                                                 : hScale16To15_c;
    }

    ff_sws_init_range_convert(c);

    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}
591

592
/**
 * Initialise the context's function pointers and return the scaling entry
 * point.
 *
 * The C defaults are installed first; the architecture-specific init
 * functions then run for whichever ARCH_* constants are non-zero.
 *
 * @param c context to initialise
 * @return the swscale() function defined in this file
 */
SwsFunc ff_getSwsFunc(SwsContext *c)
{
    sws_init_swscale(c);

    if (ARCH_PPC)
        ff_sws_init_swscale_ppc(c);
    if (ARCH_X86)
        ff_sws_init_swscale_x86(c);
    if (ARCH_AARCH64)
        ff_sws_init_swscale_aarch64(c);
    if (ARCH_ARM)
        ff_sws_init_swscale_arm(c);

    return swscale;
}
607

608
/**
 * Set the plane pointers that a pixel format does not use to NULL.
 *
 * src[3] is cleared when the format has no alpha. For non-planar formats
 * src[2] and src[3] are cleared, and src[1] as well unless usePal() is true
 * for the format.
 *
 * @param src    array of 4 plane pointers, modified in place
 * @param format pixel format the pointers belong to
 */
static void reset_ptr(const uint8_t *src[], enum AVPixelFormat format)
{
    if (!isALPHA(format))
        src[3] = NULL;
    if (!isPlanar(format)) {
        src[3] = src[2] = NULL;

        if (!usePal(format))
            src[1] = NULL;
    }
}

620
static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt,
621 622
                                const int linesizes[4])
{
623
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
624 625
    int i;

626 627
    av_assert2(desc);

628 629 630 631 632 633 634 635 636
    for (i = 0; i < 4; i++) {
        int plane = desc->comp[i].plane;
        if (!data[plane] || !linesizes[plane])
            return 0;
    }

    return 1;
}

637 638 639 640 641 642 643 644 645 646
static void xyz12Torgb48(struct SwsContext *c, uint16_t *dst,
                         const uint16_t *src, int stride, int h)
{
    int xp,yp;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);

    for (yp=0; yp<h; yp++) {
        for (xp=0; xp+2<stride; xp+=3) {
            int x, y, z, r, g, b;

647
            if (desc->flags & AV_PIX_FMT_FLAG_BE) {
648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
                x = AV_RB16(src + xp + 0);
                y = AV_RB16(src + xp + 1);
                z = AV_RB16(src + xp + 2);
            } else {
                x = AV_RL16(src + xp + 0);
                y = AV_RL16(src + xp + 1);
                z = AV_RL16(src + xp + 2);
            }

            x = c->xyzgamma[x>>4];
            y = c->xyzgamma[y>>4];
            z = c->xyzgamma[z>>4];

            // convert from XYZlinear to sRGBlinear
            r = c->xyz2rgb_matrix[0][0] * x +
                c->xyz2rgb_matrix[0][1] * y +
                c->xyz2rgb_matrix[0][2] * z >> 12;
            g = c->xyz2rgb_matrix[1][0] * x +
                c->xyz2rgb_matrix[1][1] * y +
                c->xyz2rgb_matrix[1][2] * z >> 12;
            b = c->xyz2rgb_matrix[2][0] * x +
669
                c->xyz2rgb_matrix[2][1] * y +
670 671 672
                c->xyz2rgb_matrix[2][2] * z >> 12;

            // limit values to 12-bit depth
673 674 675
            r = av_clip_uintp2(r, 12);
            g = av_clip_uintp2(g, 12);
            b = av_clip_uintp2(b, 12);
676 677

            // convert from sRGBlinear to RGB and scale from 12bit to 16bit
678
            if (desc->flags & AV_PIX_FMT_FLAG_BE) {
679 680 681 682 683 684 685 686 687 688 689 690 691 692
                AV_WB16(dst + xp + 0, c->rgbgamma[r] << 4);
                AV_WB16(dst + xp + 1, c->rgbgamma[g] << 4);
                AV_WB16(dst + xp + 2, c->rgbgamma[b] << 4);
            } else {
                AV_WL16(dst + xp + 0, c->rgbgamma[r] << 4);
                AV_WL16(dst + xp + 1, c->rgbgamma[g] << 4);
                AV_WL16(dst + xp + 2, c->rgbgamma[b] << 4);
            }
        }
        src += stride;
        dst += stride;
    }
}

693 694 695 696
static void rgb48Toxyz12(struct SwsContext *c, uint16_t *dst,
                         const uint16_t *src, int stride, int h)
{
    int xp,yp;
697
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728

    for (yp=0; yp<h; yp++) {
        for (xp=0; xp+2<stride; xp+=3) {
            int x, y, z, r, g, b;

            if (desc->flags & AV_PIX_FMT_FLAG_BE) {
                r = AV_RB16(src + xp + 0);
                g = AV_RB16(src + xp + 1);
                b = AV_RB16(src + xp + 2);
            } else {
                r = AV_RL16(src + xp + 0);
                g = AV_RL16(src + xp + 1);
                b = AV_RL16(src + xp + 2);
            }

            r = c->rgbgammainv[r>>4];
            g = c->rgbgammainv[g>>4];
            b = c->rgbgammainv[b>>4];

            // convert from sRGBlinear to XYZlinear
            x = c->rgb2xyz_matrix[0][0] * r +
                c->rgb2xyz_matrix[0][1] * g +
                c->rgb2xyz_matrix[0][2] * b >> 12;
            y = c->rgb2xyz_matrix[1][0] * r +
                c->rgb2xyz_matrix[1][1] * g +
                c->rgb2xyz_matrix[1][2] * b >> 12;
            z = c->rgb2xyz_matrix[2][0] * r +
                c->rgb2xyz_matrix[2][1] * g +
                c->rgb2xyz_matrix[2][2] * b >> 12;

            // limit values to 12-bit depth
729 730 731
            x = av_clip_uintp2(x, 12);
            y = av_clip_uintp2(y, 12);
            z = av_clip_uintp2(z, 12);
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748

            // convert from XYZlinear to X'Y'Z' and scale from 12bit to 16bit
            if (desc->flags & AV_PIX_FMT_FLAG_BE) {
                AV_WB16(dst + xp + 0, c->xyzgammainv[x] << 4);
                AV_WB16(dst + xp + 1, c->xyzgammainv[y] << 4);
                AV_WB16(dst + xp + 2, c->xyzgammainv[z] << 4);
            } else {
                AV_WL16(dst + xp + 0, c->xyzgammainv[x] << 4);
                AV_WL16(dst + xp + 1, c->xyzgammainv[y] << 4);
                AV_WL16(dst + xp + 2, c->xyzgammainv[z] << 4);
            }
        }
        src += stride;
        dst += stride;
    }
}

749 750 751 752 753 754 755 756 757 758 759
/**
 * swscale wrapper, so we don't need to export the SwsContext.
 * Assumes planar YUV to be in YUV order instead of YVU.
 */
int attribute_align_arg sws_scale(struct SwsContext *c,
                                  const uint8_t * const srcSlice[],
                                  const int srcStride[], int srcSliceY,
                                  int srcSliceH, uint8_t *const dst[],
                                  const int dstStride[])
{
    int i, ret;
760 761
    const uint8_t *src2[4];
    uint8_t *dst2[4];
762
    uint8_t *rgb0_tmp = NULL;
763
    int macro_height = isBayer(c->srcFormat) ? 2 : (1 << c->chrSrcVSubSample);
764
    // copy strides, so they can safely be modified
765 766
    int srcStride2[4];
    int dstStride2[4];
767
    int srcSliceY_internal = srcSliceY;
768

769
    if (!srcStride || !dstStride || !dst || !srcSlice) {
770 771 772
        av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n");
        return 0;
    }
773

774 775 776 777 778
    for (i=0; i<4; i++) {
        srcStride2[i] = srcStride[i];
        dstStride2[i] = dstStride[i];
    }

779 780 781 782 783 784 785
    if ((srcSliceY & (macro_height-1)) ||
        ((srcSliceH& (macro_height-1)) && srcSliceY + srcSliceH != c->srcH) ||
        srcSliceY + srcSliceH > c->srcH) {
        av_log(c, AV_LOG_ERROR, "Slice parameters %d, %d are invalid\n", srcSliceY, srcSliceH);
        return AVERROR(EINVAL);
    }

786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
    if (c->gamma_flag && c->cascaded_context[0]) {
        ret = sws_scale(c->cascaded_context[0],
                    srcSlice, srcStride, srcSliceY, srcSliceH,
                    c->cascaded_tmp, c->cascaded_tmpStride);

        if (ret < 0)
            return ret;

        if (c->cascaded_context[2])
            ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, c->cascaded1_tmp, c->cascaded1_tmpStride);
        else
            ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, dst, dstStride);

        if (ret < 0)
            return ret;

        if (c->cascaded_context[2]) {
            ret = sws_scale(c->cascaded_context[2],
                        (const uint8_t * const *)c->cascaded1_tmp, c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret, c->cascaded_context[1]->dstY,
                        dst, dstStride);
        }
        return ret;
    }

810 811 812 813 814 815 816 817 818 819 820 821
    if (c->cascaded_context[0] && srcSliceY == 0 && srcSliceH == c->cascaded_context[0]->srcH) {
        ret = sws_scale(c->cascaded_context[0],
                        srcSlice, srcStride, srcSliceY, srcSliceH,
                        c->cascaded_tmp, c->cascaded_tmpStride);
        if (ret < 0)
            return ret;
        ret = sws_scale(c->cascaded_context[1],
                        (const uint8_t * const * )c->cascaded_tmp, c->cascaded_tmpStride, 0, c->cascaded_context[0]->dstH,
                        dst, dstStride);
        return ret;
    }

822 823 824
    memcpy(src2, srcSlice, sizeof(src2));
    memcpy(dst2, dst, sizeof(dst2));

825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847
    // do not mess up sliceDir if we have a "trailing" 0-size slice
    if (srcSliceH == 0)
        return 0;

    if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
        av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
        return 0;
    }
    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
        av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
        return 0;
    }

    if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
        av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
        return 0;
    }
    if (c->sliceDir == 0) {
        if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
    }

    if (usePal(c->srcFormat)) {
        for (i = 0; i < 256; i++) {
848
            int r, g, b, y, u, v, a = 0xff;
849
            if (c->srcFormat == AV_PIX_FMT_PAL8) {
850
                uint32_t p = ((const uint32_t *)(srcSlice[1]))[i];
851 852 853 854
                a = (p >> 24) & 0xFF;
                r = (p >> 16) & 0xFF;
                g = (p >>  8) & 0xFF;
                b =  p        & 0xFF;
855
            } else if (c->srcFormat == AV_PIX_FMT_RGB8) {
856 857 858
                r = ( i >> 5     ) * 36;
                g = ((i >> 2) & 7) * 36;
                b = ( i       & 3) * 85;
859
            } else if (c->srcFormat == AV_PIX_FMT_BGR8) {
860 861 862
                b = ( i >> 6     ) * 85;
                g = ((i >> 3) & 7) * 36;
                r = ( i       & 7) * 36;
863
            } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
864 865 866
                r = ( i >> 3     ) * 255;
                g = ((i >> 1) & 3) * 85;
                b = ( i       & 1) * 255;
867
            } else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) {
868 869
                r = g = b = i;
            } else {
870
                av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888
                b = ( i >> 3     ) * 255;
                g = ((i >> 1) & 3) * 85;
                r = ( i       & 1) * 255;
            }
#define RGB2YUV_SHIFT 15
#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))

            y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
            u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
            v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
889
            c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24);
890 891

            switch (c->dstFormat) {
892
            case AV_PIX_FMT_BGR32:
893
#if !HAVE_BIGENDIAN
894
            case AV_PIX_FMT_RGB24:
895
#endif
896
                c->pal_rgb[i]=  r + (g<<8) + (b<<16) + ((unsigned)a<<24);
897
                break;
898
            case AV_PIX_FMT_BGR32_1:
899
#if HAVE_BIGENDIAN
900
            case AV_PIX_FMT_BGR24:
901
#endif
902
                c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24);
903
                break;
904
            case AV_PIX_FMT_RGB32_1:
905
#if HAVE_BIGENDIAN
906
            case AV_PIX_FMT_RGB24:
907
#endif
908
                c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24);
909
                break;
910
            case AV_PIX_FMT_RGB32:
911
#if !HAVE_BIGENDIAN
912
            case AV_PIX_FMT_BGR24:
913 914
#endif
            default:
915
                c->pal_rgb[i]=  b + (g<<8) + (r<<16) + ((unsigned)a<<24);
916 917 918 919 920 921 922 923
            }
        }
    }

    if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
        uint8_t *base;
        int x,y;
        rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
924 925 926
        if (!rgb0_tmp)
            return AVERROR(ENOMEM);

927 928 929 930 931 932 933 934 935 936
        base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
        for (y=0; y<srcSliceH; y++){
            memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
            for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
                base[ srcStride[0]*y + x] = 0xFF;
            }
        }
        src2[0] = base;
    }

937 938 939
    if (c->srcXYZ && !(c->dstXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
        uint8_t *base;
        rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
940 941 942
        if (!rgb0_tmp)
            return AVERROR(ENOMEM);

943 944
        base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;

945
        xyz12Torgb48(c, (uint16_t*)base, (const uint16_t*)src2[0], srcStride[0]/2, srcSliceH);
946 947 948
        src2[0] = base;
    }

949
    if (!srcSliceY && (c->flags & SWS_BITEXACT) && c->dither == SWS_DITHER_ED && c->dither_error[0])
950 951 952
        for (i = 0; i < 4; i++)
            memset(c->dither_error[i], 0, sizeof(c->dither_error[0][0]) * (c->dstW+2));

953
    if (c->sliceDir != 1) {
954
        // slices go from bottom to top => we flip the image internally
955 956 957 958
        for (i=0; i<4; i++) {
            srcStride2[i] *= -1;
            dstStride2[i] *= -1;
        }
959 960 961 962 963 964 965 966 967 968 969

        src2[0] += (srcSliceH - 1) * srcStride[0];
        if (!usePal(c->srcFormat))
            src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
        src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
        src2[3] += (srcSliceH - 1) * srcStride[3];
        dst2[0] += ( c->dstH                         - 1) * dstStride[0];
        dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
        dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
        dst2[3] += ( c->dstH                         - 1) * dstStride[3];

970
        srcSliceY_internal = c->srcH-srcSliceY-srcSliceH;
971
    }
972 973 974 975 976 977 978
    reset_ptr(src2, c->srcFormat);
    reset_ptr((void*)dst2, c->dstFormat);

    /* reset slice direction at end of frame */
    if (srcSliceY_internal + srcSliceH == c->srcH)
        c->sliceDir = 0;
    ret = c->swscale(c, src2, srcStride2, srcSliceY_internal, srcSliceH, dst2, dstStride2);
979

980
    if (c->dstXYZ && !(c->srcXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
981
        int dstY = c->dstY ? c->dstY : srcSliceY + srcSliceH;
982
        uint16_t *dst16 = (uint16_t*)(dst2[0] + (dstY - ret) * dstStride2[0]);
983 984 985 986
        av_assert0(dstY >= ret);
        av_assert0(ret >= 0);
        av_assert0(c->dstH >= dstY);

987
        /* replace on the same data */
988
        rgb48Toxyz12(c, dst16, dst16, dstStride2[0]/2, ret);
989 990
    }

991 992 993
    av_free(rgb0_tmp);
    return ret;
}