/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

21 22 23 24 25
#include <stdint.h>

#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"

26 27
#undef REAL_MOVNTQ
#undef MOVNTQ
28
#undef MOVNTQ2
29 30 31
#undef PREFETCH


32
#if COMPILE_TEMPLATE_MMXEXT
33
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34
#define MOVNTQ2 "movntq "
35 36
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
37
#define MOVNTQ2 "movq "
38 39 40
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

41
#if !COMPILE_TEMPLATE_MMXEXT
42
static av_always_inline void
43
dither_8to16(const uint8_t *srcDither, int rot)
44 45 46 47 48 49 50 51 52 53 54
{
    if (rot) {
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
                         "movq       (%0), %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "psrlq       $24, %%mm3\n\t"
                         "psllq       $40, %%mm4\n\t"
                         "por       %%mm4, %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
55
                         :: "r"(srcDither)
56 57 58 59 60 61 62
                         );
    } else {
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
                         "movq       (%0), %%mm3\n\t"
                         "movq      %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
63
                         :: "r"(srcDither)
64 65 66 67 68
                         );
    }
}
#endif

69 70 71
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset)
72
{
73
    dither_8to16(dither, offset);
74 75 76 77 78 79 80 81
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw        $3, %%mm1\n\t"
        "paddw     %%mm1, %%mm3\n\t"
        "paddw     %%mm1, %%mm4\n\t"
82 83
        "psraw        $4, %%mm3\n\t"
        "psraw        $4, %%mm4\n\t"
84 85 86 87
        ::"m"(filterSize)
     );

    __asm__ volatile(\
88 89
        "movq    %%mm3, %%mm6\n\t"
        "movq    %%mm4, %%mm7\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
90
        "movl %3, %%ecx\n\t"
91 92 93 94 95 96 97 98 99 100
        "mov                                 %0, %%"FF_REG_d"       \n\t"\
        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
        ".p2align                             4                     \n\t" /* FIXME Unroll? */\
        "1:                                                         \n\t"\
        "movq                      8(%%"FF_REG_d"), %%mm0           \n\t" /* filterCoeff */\
        "movq                (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
        "movq               8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
        "add                                $16, %%"FF_REG_d"       \n\t"\
        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t"\
        "test                         %%"FF_REG_S", %%"FF_REG_S"    \n\t"\
101 102 103 104 105 106 107 108
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"
109 110 111
        MOVNTQ2 "                         %%mm3, (%1, %%"FF_REG_c")\n\t"
        "add                          $8, %%"FF_REG_c"      \n\t"\
        "cmp                          %2, %%"FF_REG_c"      \n\t"\
112 113
        "movq    %%mm6, %%mm3\n\t"
        "movq    %%mm7, %%mm4\n\t"
114 115 116
        "mov                                 %0, %%"FF_REG_d"     \n\t"\
        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
        "jb                                  1b                   \n\t"\
117 118
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
119
        : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
120
    );
121 122
}

123 124
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
125
        "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\
126 127 128
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
129 130
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
131 132 133 134
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
135 136 137 138 139 140
        "movq            8(%%"FF_REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* UsrcData */\
        "add                          %6, %%"FF_REG_S"  \n\t" \
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm5      \n\t" /* VsrcData */\
        "add                         $16, %%"FF_REG_d"  \n\t"\
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
141 142 143 144
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
145
        "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
146 147 148
        " jnz                         2b                \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
149 150
    "lea                "offset"(%0), %%"FF_REG_d"  \n\t"\
    "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
151 152 153 154
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
155 156 157 158 159
    "movq            8(%%"FF_REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add                         $16, %%"FF_REG_d"  \n\t"\
    "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
160 161 162 163
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
164
    "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
165 166 167 168 169 170 171 172 173
    " jnz                         2b                \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
174
            "r" (dest), "m" (dstW_reg), "m"(uv_off) \
175
            NAMED_CONSTRAINTS_ADD(bF8,bFC) \
176
        : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S            \
177 178 179 180
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
181
        "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t"\
182 183 184
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
185 186
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
187 188 189 190 191 192
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
193 194 195 196 197
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm0      \n\t" /* UsrcData */\
        "add                          %6, %%"FF_REG_S"  \n\t" \
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm1      \n\t" /* UsrcData */\
198 199 200
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
201
        "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1      \n\t" /* filterCoeff */\
202 203 204 205
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
206 207 208 209 210
        "add                          %6, %%"FF_REG_S"  \n\t" \
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm3      \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
        "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\
        "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
232 233
    "lea                "offset"(%0), %%"FF_REG_d"      \n\t"\
    "mov                 (%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\
234 235 236 237 238 239
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
240 241 242 243
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0       \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2       \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4       \n\t" /* Y1srcData */\
244 245 246
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
247
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4     \n\t" /* filterCoeff */\
248 249 250 251
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
252 253 254 255
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3   \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\
    "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\

316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
336
    "cmp  "dstw", "#index"      \n\t"\
337 338
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
339

340 341 342 343 344 345
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
346 347 348
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
349
    x86_reg uv_off = c->uv_offx2;
350

351
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
352 353 354 355 356
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
        "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
        "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
357
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
358 359 360 361
        "movq               "Y_TEMP"(%0), %%mm5         \n\t"
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
362
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
363 364 365 366 367
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
368
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
369 370 371
        YSCALEYUV2PACKEDX_END
    }
}
372

373 374 375 376 377 378
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
379 380 381
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
382
    x86_reg uv_off = c->uv_offx2;
383

384
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
385 386
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
387
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
388 389 390
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
391
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
392 393 394 395 396
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
397
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
398 399 400
        YSCALEYUV2PACKEDX_END
    }
}
401

Kevin Coyle's avatar
Kevin Coyle committed
402 403 404 405 406 407 408 409 410 411 412
static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

413
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
Kevin Coyle's avatar
Kevin Coyle committed
414 415 416 417 418 419
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
420
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
Kevin Coyle's avatar
Kevin Coyle committed
421 422 423 424 425
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
426
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
Kevin Coyle's avatar
Kevin Coyle committed
427 428 429 430
        YSCALEYUV2PACKEDX_END
    }
}

431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
455
    "cmp         "dstw", "#index"   \n\t"\
456 457
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
458

459 460 461 462 463 464
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
465 466 467
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
468
    x86_reg uv_off = c->uv_offx2;
469

470 471 472 473 474 475 476 477 478
    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
479
    WRITERGB16(%4, "%5", %%FF_REGa)
480 481
    YSCALEYUV2PACKEDX_END
}
482

483 484 485 486 487 488
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
489 490 491
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
492
    x86_reg uv_off = c->uv_offx2;
493

494 495 496 497 498 499 500 501 502
    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif
503
    WRITERGB16(%4, "%5", %%FF_REGa)
504 505
    YSCALEYUV2PACKEDX_END
}
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
532
    "cmp         "dstw", "#index"   \n\t"\
533 534 535
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

536 537 538 539 540 541
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
542 543 544
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
545
    x86_reg uv_off = c->uv_offx2;
546 547 548 549 550 551 552 553 554 555

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
556
    WRITERGB15(%4, "%5", %%FF_REGa)
557 558 559
    YSCALEYUV2PACKEDX_END
}

560 561 562 563 564 565
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
566 567 568
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
569
    x86_reg uv_off = c->uv_offx2;
570 571 572 573 574 575 576 577 578 579

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif
580
    WRITERGB15(%4, "%5", %%FF_REGa)
581 582 583
    YSCALEYUV2PACKEDX_END
}

584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
634
    "cmp      "dstw", "#index"  \n\t"\
635 636
    " jb          1b            \n\t"

/**
 * Store 8 pixels as 24 bytes of interleaved 3-byte triplets and close the
 * pixel loop. This variant relies on pshufw and the ff_M24A / ff_M24B /
 * ff_M24C byte masks.
 *
 * Register use visible in this macro:
 *  - inputs: mm2 = B, mm4 = G, mm5 = R (one byte per pixel, 8 pixels)
 *  - mm0 and mm7 are loaded with ff_M24A and ff_M24C, so their previous
 *    contents are lost
 *  - mm1, mm3 and mm6 are scratch; mm4 is shifted right by 8 bits in place
 *
 * @param dst   register holding the output pointer; three quadwords are
 *              written at dst, dst+8 and dst+16, then dst is advanced by 24
 * @param dstw  already-quoted asm operand that index is compared against
 * @param index register holding the pixel counter; advanced by 8
 *
 * The expansion ends with "jb 1b", so a local label "1:" has to precede it
 * inside the same asm statement.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp   "dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/*
 * Bind WRITEBGR24 to the 24-bit writer matching the instruction set this
 * template instance is compiled for. Any definition left over from a
 * previous inclusion of the template is dropped first.
 */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMXEXT
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

#if HAVE_6REGS
/**
 * Multi-tap vertical scale to packed 24-bit output, built on
 * YSCALEYUV2PACKEDX_ACCURATE.
 *
 * The code visible here zeroes mm7, computes the output pointer as
 * dest + 3 * index (index lives in FF_REG_a, the pointer in FF_REG_c) and
 * hands both to WRITEBGR24, which also closes the loop against dstW_reg
 * (operand %5).
 *
 * NOTE(review): the opening of the asm statement and the use of the filter
 * and source arguments come from YSCALEYUV2PACKEDX_ACCURATE and
 * YSCALEYUV2RGBX, which are defined outside this part of the file; only the
 * operand list below is visible here.
 *
 * @param c    context; &c->redDither is operand %0 and c->uv_offx2 is passed
 *             as a memory operand
 * @param dest output row (operand %4)
 * @param dstW row width in pixels, widened to x86_reg for the "m" operand
 */
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* FF_REG_c = 3 * index: three output bytes per pixel */
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
    /* FF_REG_c += dest */
    "add %4, %%"FF_REG_c"                        \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}

/**
 * Multi-tap vertical scale to packed 24-bit output, built on
 * YSCALEYUV2PACKEDX.
 *
 * Apart from the scaling macro, the visible body is the same as
 * yuv2bgr24_X_ar: mm7 is zeroed, the output pointer dest + 3 * index is
 * formed in FF_REG_c, and WRITEBGR24 stores 8 pixels per iteration and
 * loops until index reaches dstW_reg (operand %5).
 *
 * NOTE(review): YSCALEYUV2PACKEDX and YSCALEYUV2RGBX are defined outside
 * this part of the file; how they consume the filter arguments is not
 * visible here.
 *
 * @param c    context; &c->redDither is operand %0 and c->uv_offx2 is passed
 *             as a memory operand
 * @param dest output row (operand %4)
 * @param dstW row width in pixels, widened to x86_reg for the "m" operand
 */
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor                    %%mm7, %%mm7              \n\t"
    /* FF_REG_c = 3 * index: three output bytes per pixel */
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
    /* FF_REG_c += dest */
    "add                        %4, %%"FF_REG_c"       \n\t"
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest),  "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
#endif /* HAVE_6REGS */

/**
 * Pack 8 pixels of 16-bit samples into 16 bytes and close the pixel loop.
 *
 * The words in mm3, mm4, mm1 and mm7 are saturated to unsigned bytes, mm3
 * and mm4 are byte-interleaved, and that result is interleaved with the
 * mm1/mm7 bytes, which end up in the even byte positions. Two quadwords are
 * stored at dst + 2 * index and dst + 2 * index + 8. All four registers are
 * overwritten.
 *
 * @param dst   register holding the base of the output row
 * @param dstw  already-quoted asm operand that index is compared against
 * @param index register holding the pixel counter; advanced by 8
 *
 * The expansion ends with "jb 1b", so a local label "1:" has to precede it
 * inside the same asm statement.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp      "dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Extra expansion level so macro arguments are expanded before being stringified. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

762 763 764 765 766 767
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
768 769 770
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
771
    x86_reg uv_off = c->uv_offx2;
772

773 774 775 776 777 778
    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3    \n\t"
    "psraw $3, %%mm4    \n\t"
    "psraw $3, %%mm1    \n\t"
    "psraw $3, %%mm7    \n\t"
779
    WRITEYUY2(%4, "%5", %%FF_REGa)
780
    YSCALEYUV2PACKEDX_END
781 782
}

783 784 785 786 787 788
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
789 790 791
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
792
    x86_reg uv_off = c->uv_offx2;
793

794 795 796 797 798 799
    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3    \n\t"
    "psraw $3, %%mm4    \n\t"
    "psraw $3, %%mm1    \n\t"
    "psraw $3, %%mm7    \n\t"
800
    WRITEYUY2(%4, "%5", %%FF_REGa)
801
    YSCALEYUV2PACKEDX_END
802 803
}

/**
 * Loop head and chroma stage of the two-row RGB converters.
 *
 * Zeroes index, emits the loop label "1:", blends the two chroma rows
 * addressed by operands %2 and %3 (once at index and once at
 * index + UV_OFF_BYTE(c)), subtracts the U/V offsets and starts the green
 * products.
 *
 * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 * mm0 holds the coefficient read from CHR_MMX_FILTER_OFFSET+8.
 *
 * @param index register used as loop counter and byte offset into chroma
 * @param c     register holding the context base address
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+uv_off] - uvbuf1[eax+uv_off]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+uv_off] - uvbuf1[eax+uv_off])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+uv_off] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf1[eax]>>4) + scaled difference*/\
    "paddw             %%mm5, %%mm4     \n\t" /* (uvbuf1[eax+uv_off]>>4) + scaled difference*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/**
 * Blend 8 samples of two 16-bit rows with the coefficient stored at
 * LUM_MMX_FILTER_OFFSET+8 in the context. It is used for luma and, with
 * different row pointers, for alpha.
 *
 * Per lane the result is (b2 >> 4) + (((b1 - b2) * coeff) >> 16); lanes 0-3
 * end up in mm1 and lanes 4-7 in mm7. mm0 and mm6 are scratch.
 *
 * @param index register holding the sample index (scaled by 2 for addressing)
 * @param c     register holding the context base address
 * @param b1    register or operand holding the first row pointer
 * @param b2    register or operand holding the second row pointer
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax+4]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax+4]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax+4] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* (buf1[eax]>>4) + scaled difference*/\
    "paddw             %%mm6, %%mm7     \n\t" /* (buf1[eax+4]>>4) + scaled difference*/

/**
 * Colour-matrix stage: combine the luma in mm1/mm7 with the chroma terms in
 * mm2..mm5 and pack the result to bytes.
 *
 * Expects mm1/mm7 = luma (lanes 0-3 / 4-7), mm2 = U term, mm5 = V term,
 * mm3 = ug, mm4 = vg. Each chroma word is duplicated into two adjacent
 * lanes (punpck?wd with itself) before being added to luma. On exit
 * mm2, mm4 and mm5 hold the packed B, G and R bytes; mm0, mm3 and mm6 are
 * scratch.
 *
 * @param c register holding the context base address the coefficients
 *          (UB_COEFF, VR_COEFF, Y_OFFSET, Y_COEFF) are read from
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Extra expansion level so macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/**
 * Complete two-row blend and colour conversion for 8 pixels: chroma stage,
 * luma blend of the rows in operands %0 and %1, then the colour matrix.
 * Opens the loop (label "1:") but does not close it; a WRITE* macro has to
 * follow.
 */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

881 882 883
/**
 * vertical bilinear scale YV12 to RGB
 */
884 885 886
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
887
                                int dstW, int yalpha, int uvalpha, int y)
888
{
889 890 891
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

892
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
893
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
894 895
#if ARCH_X86_64
        __asm__ volatile(
896
            YSCALEYUV2RGB(%%r8, %5)
897 898 899 900
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb            %%mm7, %%mm1       \n\t"
901
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
902
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
903
               "a" (&c->redDither),
904
               "r" (abuf0), "r" (abuf1)
905 906 907 908 909 910
            : "%r8"
        );
#else
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
911 912 913 914
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov        %4, %%"FF_REG_b"            \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
915 916 917 918
            "push                   %0              \n\t"
            "push                   %1              \n\t"
            "mov          "U_TEMP"(%5), %0          \n\t"
            "mov          "V_TEMP"(%5), %1          \n\t"
919
            YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
920 921 922 923 924
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb            %%mm7, %%mm1       \n\t"
            "pop                    %1              \n\t"
            "pop                    %0              \n\t"
925 926 927
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
928
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
929
               "a" (&c->redDither)
930 931 932 933
        );
#endif
    } else {
        __asm__ volatile(
934 935 936 937
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov        %4, %%"FF_REG_b"            \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
938
            "pcmpeqd %%mm7, %%mm7                   \n\t"
939 940 941
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
942
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
943
               "a" (&c->redDither)
944 945
        );
    }
946 947
}

948 949 950
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
951
                                int dstW, int yalpha, int uvalpha, int y)
952
{
953 954 955
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

956
    __asm__ volatile(
957 958 959 960
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
        "mov           %4, %%"FF_REG_b"         \n\t"
        "push %%"FF_REG_BP"                     \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
961
        "pxor    %%mm7, %%mm7                   \n\t"
962 963 964
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP"                      \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
965
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
966
           "a" (&c->redDither)
967
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
968
    );
969 970
}

971 972 973
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
974
                                 int dstW, int yalpha, int uvalpha, int y)
975
{
976 977 978
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

979
    __asm__ volatile(
980 981 982 983
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
        "mov        %4, %%"FF_REG_b"            \n\t"
        "push %%"FF_REG_BP"                     \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
984 985 986
        "pxor    %%mm7, %%mm7                   \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
987
        "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
988
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
989
        "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
990
#endif
991 992 993
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP"                      \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
994
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
995
           "a" (&c->redDither)
996
           NAMED_CONSTRAINTS_ADD(bF8)
997
    );
998 999
}

1000 1001 1002
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
1003
                                 int dstW, int yalpha, int uvalpha, int y)
1004
{
1005 1006 1007
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

1008
    __asm__ volatile(
1009 1010 1011 1012
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
        "mov           %4, %%"FF_REG_b"         \n\t"
        "push %%"FF_REG_BP"                     \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
1013 1014
        "pxor    %%mm7, %%mm7                   \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1015
#ifdef DITHER1XBPP
1016
        "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1017
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1018
        "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1019
#endif
1020 1021 1022
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP"                      \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1023
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1024
           "a" (&c->redDither)
1025
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
1026
    );
1027 1028
}

/**
 * Loop head and two-row blend for packed YUV output (no colour conversion).
 *
 * Before the loop, the coefficients at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 are shifted right by 3 and WRITTEN BACK to the
 * context, so expanding this macro modifies the context in memory.
 *
 * Inside the loop (label "1:") the chroma rows in operands %2/%3 and the
 * luma rows in operands %0/%1 are blended per lane as
 * (row1 >> 7) + (((row0 - row1) * coeff) >> 16).
 *
 * On exit: mm3 = chroma read at index, mm4 = chroma read at
 * index + UV_OFF_BYTE(c), mm1 / mm7 = luma lanes 0-3 / 4-7.
 * mm0, mm2, mm5 and mm6 are scratch. The loop is left open; a WRITE* macro
 * has to follow.
 *
 * @param index register used as loop counter
 * @param c     register holding the context base address
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+uv_off] - uvbuf1[eax+uv_off]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+uv_off] - uvbuf1[eax+uv_off])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+uv_off] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf1[eax]>>7) + scaled difference*/\
    "paddw             %%mm5, %%mm4     \n\t" /* (uvbuf1[eax+uv_off]>>7) + scaled difference*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax+4]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax+4]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax+4] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* (buf1[eax]>>7) + scaled difference*/\
    "paddw             %%mm6, %%mm7     \n\t" /* (buf1[eax+4]>>7) + scaled difference*/

/* Extra expansion level so macro arguments are expanded before being stringified. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

1069 1070 1071
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
1072
                                  int dstW, int yalpha, int uvalpha, int y)
1073
{
1074 1075 1076
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

1077
    __asm__ volatile(
1078 1079 1080 1081 1082 1083 1084
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
        "mov           %4, %%"FF_REG_b"         \n\t"
        "push %%"FF_REG_BP"                     \n\t"
        YSCALEYUV2PACKED(%%FF_REGBP, %5)
        WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP"                      \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1085
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1086
           "a" (&c->redDither)
1087
    );
1088 1089
}

/**
 * Single-row colour conversion for 8 pixels: no vertical blending, and
 * chroma is taken from the row in operand %2 only.
 *
 * Zeroes index, emits the loop label "1:", loads chroma at index and at
 * index + UV_OFF_BYTE(c) and luma from operand %0, shifts each right by 4,
 * and applies the colour matrix read from the context.
 *
 * On exit mm2, mm4 and mm5 hold the packed B, G and R bytes; mm0, mm1, mm3,
 * mm6 and mm7 are scratch. The loop is left open; a WRITE* macro has to
 * follow.
 *
 * @param index register used as loop counter
 * @param c     register holding the context base address
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+uv_off] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Extra expansion level so macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

// do vertical chrominance interpolation
/**
 * Single-luma-row colour conversion for 8 pixels with chroma taken as the
 * sum of the two rows in operands %2 and %3, logically shifted right by 5.
 *
 * Zeroes index, emits the loop label "1:", and otherwise follows the same
 * colour-matrix sequence as the non-interpolating variant. Luma comes from
 * operand %0 only.
 *
 * On exit mm2, mm4 and mm5 hold the packed B, G and R bytes; mm0, mm1, mm3,
 * mm6 and mm7 are scratch. The loop is left open; a WRITE* macro has to
 * follow.
 *
 * @param index register used as loop counter
 * @param c     register holding the context base address
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+uv_off] + uvbuf1[eax+uv_off]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Extra expansion level so macro arguments are expanded before being stringified. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
1196 1197 1198 1199 1200 1201 1202 1203

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1204 1205

/**
 * YV12 to RGB without scaling or interpolating
 *
 * Packed 32-bit output from a single input line, using the WRITEBGR32 store
 * macro.
 *
 * @param c       scaler context; &c->redDither is handed to the asm as
 *                operand %5 and all context fields used by the asm
 *                (ESP_OFFSET, DSTW_OFFSET, coefficients) are addressed
 *                relative to it
 * @param buf0    source line passed as asm operand %0 (the luma line,
 *                judging by the Y_OFFSET/Y_COEFF handling in the
 *                YSCALEYUV2RGB1* macros)
 * @param ubuf    chroma line pointers; ubuf[0] is always operand %2,
 *                operand %3 is ubuf[0] or ubuf[1] depending on uvalpha
 * @param vbuf    not referenced by this function
 * @param abuf0   alpha line, only passed to the asm (operand %1) when
 *                CONFIG_SWSCALE_ALPHA && c->needAlpha
 * @param dest    output pointer, loaded into REG_b inside the asm
 * @param dstW    not referenced directly; the asm reads the width from
 *                DSTW_OFFSET(%5) instead
 * @param uvalpha selects the chroma path: < 2048 uses YSCALEYUV2RGB1 with
 *                ubuf[0] only, otherwise YSCALEYUV2RGB1b with ubuf[0] and
 *                ubuf[1]
 * @param y       not referenced by this function
 *
 * Every asm statement saves REG_b into the context at ESP_OFFSET(%5) and
 * pushes REG_BP before using them as destination pointer and loop index,
 * then restores both at the end; neither register is declared as a clobber.
 */
1208
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1209
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
1210 1211
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
1212
{
1213
    const int16_t *ubuf0 = ubuf[0];
1214
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1215

1216
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1217
        const int16_t *ubuf1 = ubuf[0];
1218
        /* With alpha: operand %1 is abuf0 and YSCALEYUV2RGB1_ALPHA fills mm7
         * from it. */
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1219
            __asm__ volatile(
1220 1221 1222 1223 1224 1225 1226 1227
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
                "mov           %4, %%"FF_REG_b"         \n\t"
                "push %%"FF_REG_BP"                     \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP"                      \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1228
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1229
                   "a" (&c->redDither)
1230
            );
1231
        } else {
1232
            /* Without alpha: operand %1 is buf1 (== buf0) and mm7 is set to
             * all ones by pcmpeqd before the store macro. */
            __asm__ volatile(
1233 1234 1235 1236
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
                "mov           %4, %%"FF_REG_b"         \n\t"
                "push %%"FF_REG_BP"                     \n\t"
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
1237
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1238 1239 1240
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP"                      \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1241
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1242
                   "a" (&c->redDither)
1243
            );
1244 1245
        }
    } else {
1246
        /* Two-line chroma path: operand %3 is the second chroma line and the
         * "b" macro variant reads both %2 and %3. */
        const int16_t *ubuf1 = ubuf[1];
1247
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1248
            __asm__ volatile(
1249 1250 1251 1252 1253 1254 1255 1256
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
                "mov           %4, %%"FF_REG_b"         \n\t"
                "push %%"FF_REG_BP"                     \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP"                      \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1257
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1258
                   "a" (&c->redDither)
1259
            );
1260
        } else {
1261
            __asm__ volatile(
1262 1263 1264 1265
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
                "mov           %4, %%"FF_REG_b"         \n\t"
                "push %%"FF_REG_BP"                     \n\t"
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1266
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1267 1268 1269
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"FF_REG_BP"                      \n\t"
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1270
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1271
                   "a" (&c->redDither)
1272
            );
1273
        }
1274
    }
1275 1276
}

1277
/**
 * Single-input-line variant producing packed 24-bit output through the
 * WRITEBGR24 store macro.
 *
 * Same operand layout as the other *_1 writers in this file: %0 = buf0,
 * %1 = buf1 (an alias of buf0), %2 = ubuf[0], %3 = ubuf[0] or ubuf[1],
 * %4 = dest (memory operand), %5 = &c->redDither.
 *
 * @param uvalpha < 2048 selects YSCALEYUV2RGB1 with ubuf[0] only, otherwise
 *                YSCALEYUV2RGB1b with ubuf[0] and ubuf[1]
 *
 * vbuf, abuf0, dstW and y are not referenced by this function; the asm reads
 * the width from DSTW_OFFSET(%5).
 * mm7 is zeroed (pxor) before WRITEBGR24 is expanded.
 * NOTE(review): NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) presumably
 * exposes those constants to the asm as extra operands - confirm against its
 * definition.
 */
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1278
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
1279 1280
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
1281
{
1282
    const int16_t *ubuf0 = ubuf[0];
1283
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1284 1285

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1286
        const int16_t *ubuf1 = ubuf[0];
1287
        /* REG_b is saved in the context and REG_BP on the stack, then both
         * are restored after the store macro. */
        __asm__ volatile(
1288 1289 1290 1291
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1292
            "pxor    %%mm7, %%mm7                   \n\t"
1293 1294 1295
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1296
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1297
               "a" (&c->redDither)
1298
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1299 1300
        );
    } else {
1301
        const int16_t *ubuf1 = ubuf[1];
1302
        __asm__ volatile(
1303 1304 1305 1306
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1307
            "pxor    %%mm7, %%mm7                   \n\t"
1308 1309 1310
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1311
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1312
               "a" (&c->redDither)
1313
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1314 1315
        );
    }
1316 1317
}

1318
/**
 * Single-input-line variant producing output through the WRITERGB15 store
 * macro.
 *
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf[0],
 * %3 = ubuf[0] or ubuf[1], %4 = dest (memory operand), %5 = &c->redDither.
 *
 * @param uvalpha < 2048 selects YSCALEYUV2RGB1 with ubuf[0] only, otherwise
 *                YSCALEYUV2RGB1b with ubuf[0] and ubuf[1]
 *
 * vbuf, abuf0, dstW and y are not referenced by this function; the asm reads
 * the width from DSTW_OFFSET(%5).
 * When DITHER1XBPP is defined, the per-channel dither values stored in the
 * context (BLUE_DITHER/GREEN_DITHER/RED_DITHER relative to %5) are added to
 * the B/G/R bytes with unsigned saturation before the store macro runs.
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1319
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1320 1321
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
1322
{
1323
    const int16_t *ubuf0 = ubuf[0];
1324
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1325 1326

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1327
        const int16_t *ubuf1 = ubuf[0];
1328
        /* REG_b is saved in the context and REG_BP on the stack, then both
         * are restored after the store macro. */
        __asm__ volatile(
1329 1330 1331 1332
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1333 1334
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1335
#ifdef DITHER1XBPP
1336
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1337
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1338
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1339
#endif
1340 1341 1342
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1343
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1344
               "a" (&c->redDither)
1345
               NAMED_CONSTRAINTS_ADD(bF8)
1346 1347
        );
    } else {
1348
        const int16_t *ubuf1 = ubuf[1];
1349
        __asm__ volatile(
1350 1351 1352 1353
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1354 1355
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1356
#ifdef DITHER1XBPP
1357
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1358
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1359
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1360
#endif
1361 1362 1363
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1364
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1365
               "a" (&c->redDither)
1366
               NAMED_CONSTRAINTS_ADD(bF8)
1367 1368
        );
    }
1369
}
1370

1371
/**
 * Single-input-line variant producing output through the WRITERGB16 store
 * macro.
 *
 * Structurally identical to yuv2rgb555_1 above; it differs only in the store
 * macro (WRITERGB16) and in the NAMED_CONSTRAINTS_ADD list (bF8 and bFC).
 *
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf[0],
 * %3 = ubuf[0] or ubuf[1], %4 = dest (memory operand), %5 = &c->redDither.
 *
 * @param uvalpha < 2048 selects YSCALEYUV2RGB1 with ubuf[0] only, otherwise
 *                YSCALEYUV2RGB1b with ubuf[0] and ubuf[1]
 *
 * vbuf, abuf0, dstW and y are not referenced by this function; the asm reads
 * the width from DSTW_OFFSET(%5).
 * When DITHER1XBPP is defined, the context's per-channel dither values are
 * added to the B/G/R bytes with unsigned saturation before the store.
 */
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1372
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1373 1374
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
1375
{
1376
    const int16_t *ubuf0 = ubuf[0];
1377
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1378 1379

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1380
        const int16_t *ubuf1 = ubuf[0];
1381
        /* REG_b is saved in the context and REG_BP on the stack, then both
         * are restored after the store macro. */
        __asm__ volatile(
1382 1383 1384 1385
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1386 1387
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1388
#ifdef DITHER1XBPP
1389
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1390
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1391
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1392
#endif
1393 1394 1395
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1396
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1397
               "a" (&c->redDither)
1398
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
1399 1400
        );
    } else {
1401
        const int16_t *ubuf1 = ubuf[1];
1402
        __asm__ volatile(
1403 1404 1405 1406
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1407 1408
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1409
#ifdef DITHER1XBPP
1410
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1411
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1412
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1413
#endif
1414 1415 1416
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1417
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1418
               "a" (&c->redDither)
1419
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
1420 1421 1422
        );
    }
}
1423

1424
/*
 * Asm fragment: loop head and sample loads for the single-chroma-line packed
 * writers.
 *
 * Zeroes the `index` register and opens local label "1:"; the matching loop
 * branch is not part of this macro (presumably emitted by the WRITE* macro
 * that follows it at the call sites).
 *
 * Per iteration it loads:
 *   mm3 = 8 bytes at (%2 + index),
 *   mm4 = 8 bytes at (%2 + index + UV_OFF_BYTE(c)); index is temporarily
 *         advanced by that byte offset and then restored,
 *   mm1 = 8 bytes at (%0 + index*2),
 *   mm7 = 8 bytes at (%0 + index*2 + 8),
 * and arithmetic-shifts every 16-bit lane of all four registers right by 7.
 *
 * `c` is pasted into the asm text as the base of the UV_OFF_BYTE memory
 * operand. YSCALEYUV2PACKED1() is the forwarding wrapper that lets both
 * arguments be macro-expanded before they are stringified.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
1425 1426 1427 1428
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1429
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1430
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
1431
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1432 1433 1434 1435 1436 1437
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \
1438

1439
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
1440

1441
/*
 * Asm fragment: loop head and sample loads for the two-chroma-line packed
 * writers.
 *
 * Same shape as REAL_YSCALEYUV2PACKED1, but chroma is read from both operand
 * %2 and operand %3: the two lines are added lane-wise and the 16-bit sums
 * are shifted right by 8 with a logical shift (psrlw), leaving the results in
 * mm3 (offset 0) and mm4 (offset UV_OFF_BYTE(c)). mm2 and mm5 are used as
 * temporaries.
 *
 * The two quadwords at (%0 + index*2) and (%0 + index*2 + 8) are loaded into
 * mm1/mm7 and arithmetic-shifted right by 7, exactly as in the non-"b"
 * variant.
 *
 * Zeroes `index` and opens local label "1:"; the loop branch is not part of
 * this macro. YSCALEYUV2PACKED1b() is the forwarding wrapper that lets the
 * arguments be macro-expanded before they are stringified.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
1442 1443 1444 1445 1446
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1447
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1448 1449
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1450
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1451 1452 1453 1454 1455 1456 1457 1458
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
1459
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
1460

1461
/**
 * Single-input-line variant producing output through the WRITEYUY2 store
 * macro; no colour conversion macro is involved, only the
 * YSCALEYUV2PACKED1 / YSCALEYUV2PACKED1b load-and-shift fragments.
 *
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf[0],
 * %3 = ubuf[0] or ubuf[1], %4 = dest (memory operand), %5 = &c->redDither.
 *
 * @param uvalpha < 2048 selects YSCALEYUV2PACKED1 (reads operand %2 only),
 *                otherwise YSCALEYUV2PACKED1b (sums operands %2 and %3)
 *
 * vbuf, abuf0, dstW and y are not referenced by this function; the asm reads
 * the width from DSTW_OFFSET(%5).
 */
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1462
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
1463 1464
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
1465
{
1466
    const int16_t *ubuf0 = ubuf[0];
1467
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1468 1469

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1470
        const int16_t *ubuf1 = ubuf[0];
1471
        /* REG_b is saved in the context and REG_BP on the stack, then both
         * are restored after the store macro. */
        __asm__ volatile(
1472 1473 1474 1475 1476 1477 1478
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2PACKED1(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1479
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1480
               "a" (&c->redDither)
1481 1482
        );
    } else {
1483
        const int16_t *ubuf1 = ubuf[1];
1484
        __asm__ volatile(
1485 1486 1487 1488 1489 1490 1491
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
            "mov           %4, %%"FF_REG_b"         \n\t"
            "push %%"FF_REG_BP"                     \n\t"
            YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
            "pop %%"FF_REG_BP"                      \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1492
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1493
               "a" (&c->redDither)
1494 1495
        );
    }
1496
}
1497
/**
 * Hook the routines generated by this template into a scaler context.
 *
 * Depending on the destination format and the SWS_* flags this fills in
 * use_mmx_vfilter, yuv2planeX, yuv2packedX, yuv2packed1 and yuv2packed2.
 * Independently of that, for 8-bit sources with a destination depth of at
 * most 14 bits it sets hyscale_fast/hcscale_fast, either to the MMXEXT fast
 * bilinear scalers or to NULL.
 *
 * Fields for which no matching routine exists are left untouched.
 *
 * @param c scaler context to update
 */
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    const enum AVPixelFormat dstFormat = c->dstFormat;
    /* The packed writers here are only valid without full horizontal
     * chroma interpolation. */
    const int packed_allowed = !(c->flags & SWS_FULL_CHR_H_INT);

    c->use_mmx_vfilter = 0;

    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
        !(c->flags & SWS_BITEXACT)) {

        if (!(c->flags & SWS_ACCURATE_RND)) {
            /* Fast-rounding vertical filter and packed writers. */
            c->use_mmx_vfilter = 1;
            c->yuv2planeX      = RENAME(yuv2yuvX);

            if (packed_allowed) {
                switch (dstFormat) {
                case AV_PIX_FMT_RGB32:
                    c->yuv2packedX = RENAME(yuv2rgb32_X);
                    break;
                case AV_PIX_FMT_BGR32:
                    c->yuv2packedX = RENAME(yuv2bgr32_X);
                    break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:
                    c->yuv2packedX = RENAME(yuv2bgr24_X);
                    break;
#endif
                case AV_PIX_FMT_RGB555:
                    c->yuv2packedX = RENAME(yuv2rgb555_X);
                    break;
                case AV_PIX_FMT_RGB565:
                    c->yuv2packedX = RENAME(yuv2rgb565_X);
                    break;
                case AV_PIX_FMT_YUYV422:
                    c->yuv2packedX = RENAME(yuv2yuyv422_X);
                    break;
                default:
                    break;
                }
            }
        } else if (packed_allowed) {
            /* Accurate-rounding packed writers; the planar vertical filter
             * is not replaced in this mode. */
            switch (dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packedX = RENAME(yuv2rgb32_X_ar);
                break;
#if HAVE_6REGS
            case AV_PIX_FMT_BGR24:
                c->yuv2packedX = RENAME(yuv2bgr24_X_ar);
                break;
#endif
            case AV_PIX_FMT_RGB555:
                c->yuv2packedX = RENAME(yuv2rgb555_X_ar);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packedX = RENAME(yuv2rgb565_X_ar);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packedX = RENAME(yuv2yuyv422_X_ar);
                break;
            default:
                break;
            }
        }

        /* One- and two-line packed writers are shared by both rounding
         * modes. */
        if (packed_allowed) {
            switch (dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if ((c->flags & SWS_FAST_BILINEAR) && c->canMMXEXTBeUsed) {
            c->hyscale_fast = ff_hyscale_fast_mmxext;
            c->hcscale_fast = ff_hcscale_fast_mmxext;
        } else
#endif /* COMPILE_TEMPLATE_MMXEXT */
        {
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
        }
    }
}