vp8dsp-init.c 19 KB
Newer Older
1 2 3 4 5
/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

23
#include "libavutil/cpu.h"
24 25 26
#include "libavutil/x86_cpu.h"
#include "libavcodec/vp8dsp.h"

27 28
#if HAVE_YASM

29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
/*
 * MC functions
 *
 * Prototypes for the subpel ("epel") motion-compensation primitives that are
 * defined outside this file (only declared when HAVE_YASM is set).
 * Naming: epel<W>_<h|v><T>_<isa> is the W-pixel-wide, horizontal or vertical,
 * T-tap filter for the given instruction set.
 *
 * All of them share one signature: destination pointer and stride, source
 * pointer and stride, number of rows, and the mx/my arguments that are passed
 * through unchanged by every wrapper in this file.
 * NOTE(review): mx/my presumably select the fractional position / filter
 * coefficients -- confirm against the assembly.
 */
extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, int dststride,
                                       uint8_t *src, int srcstride,
                                       int height, int mx, int my);

83 84 85 86 87 88
/*
 * Bilinear MC primitives defined outside this file: horizontal (_h_) and
 * vertical (_v_) passes, 4 pixels wide for mmxext/ssse3 and 8 pixels wide
 * for sse2/ssse3. They use the same signature as the epel functions.
 */
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);

extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, int dststride,
                                          uint8_t *src, int srcstride,
                                          int height, int mx, int my);

109

110 111 112 113 114 115 116 117 118 119
/*
 * Whole-pixel "put" routines, 8 and 16 pixels wide, defined outside this
 * file. They carry the common MC signature so they can be stored in the
 * [0][0] slots of the epel and bilinear function tables.
 * NOTE(review): presumably plain block copies that ignore mx/my -- confirm.
 */
extern void ff_put_vp8_pixels8_mmx(uint8_t *dst, int dststride, uint8_t *src,
                                   int srcstride, int height, int mx, int my);
extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, int dststride, uint8_t *src,
                                    int srcstride, int height, int mx, int my);
extern void ff_put_vp8_pixels16_sse(uint8_t *dst, int dststride, uint8_t *src,
                                    int srcstride, int height, int mx, int my);

120 121 122 123
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>,
 * a 16-pixel-wide MC routine built from the matching 8-pixel-wide routine:
 * it is called once for the left half and once more with dst and src
 * advanced by 8 bytes for the right half. Strides, height, mx and my are
 * forwarded unchanged to both calls.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
130 131 132 133
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>,
 * an 8-pixel-wide MC routine built from the matching 4-pixel-wide routine:
 * it is called once for the left half and once more with dst and src
 * advanced by 4 bytes for the right half. Strides, height, mx and my are
 * forwarded unchanged to both calls.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}

141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
/* mmxext: only 4-pixel-wide primitives are declared above, so the 8-wide
 * wrappers are generated first and the 16-wide wrappers are stacked on top
 * of them. The order matters: each TAP_W16 line calls the 8-wide function
 * produced by an earlier TAP_W8 line. No 16-wide 4-tap epel wrappers are
 * generated. */
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)

/* sse2: 8-wide primitives are declared externally; only 16-wide wrappers
 * are generated here. */
TAP_W16(sse2,   epel, h6)
TAP_W16(sse2,   epel, v6)
TAP_W16(sse2,   bilinear, h)
TAP_W16(sse2,   bilinear, v)

/* ssse3: same scheme as sse2. */
TAP_W16(ssse3,  epel, h6)
TAP_W16(ssse3,  epel, v6)
TAP_W16(ssse3,  bilinear, h)
TAP_W16(ssse3,  bilinear, v)
161 162

/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Defines the static two-dimensional epel function
 * ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT> as two separable passes:
 *
 *  1. The horizontal TAPNUMX-tap filter writes into the on-stack buffer
 *     `tmp` (row pitch SIZE, aligned to ALIGN). It starts
 *     (TAPNUMY / 2 - 1) source rows above the block and produces
 *     height + TAPNUMY - 1 rows, i.e. the extra rows the vertical filter
 *     reads above and below the block.
 *  2. The vertical TAPNUMY-tap filter reads from `tmpptr`, which skips the
 *     (TAPNUMY / 2 - 1) leading rows of `tmp`, and writes `height` rows
 *     to dst.
 *
 * `tmp` holds SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes, so callers must not
 * pass a height larger than MAXHEIGHT.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE,      src,    srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE,      height,               mx, my); \
}

/* mmxext: generate the x-tap by y-tap 2-D filter for widths 4 (up to 8 rows)
 * and 8 (up to 16 rows), using an 8-byte aligned scratch buffer. */
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y,  4,  8) \
HVTAP(mmxext, 8, x, y,  8, 16)

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
/* Width 16 is only generated for the 6x6-tap case. */
HVTAP(mmxext, 8, 6, 6, 16, 16)

/* sse2 and ssse3: generate the x-tap by y-tap 2-D filter for width w (up to
 * 16 rows), using a 16-byte aligned scratch buffer. */
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2,  16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

/* ssse3 additionally has 4-pixel-wide primitives (sse2 does not), so the
 * width-4 2-D filters are generated for ssse3 only. */
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)

201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Defines the static 2-D bilinear function ff_put_vp8_bilinear<SIZE>_hv_<OPT>.
 * The horizontal pass filters height + 1 source rows into an on-stack
 * scratch buffer (row pitch SIZE, aligned to ALIGN, room for MAXHEIGHT + 2
 * rows); the vertical pass then reads that buffer and writes `height` rows
 * to dst. mx and my are forwarded unchanged to both passes.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, int dststride, uint8_t *src, \
    int srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, hpass)[SIZE * (MAXHEIGHT + 2)]; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT(hpass, SIZE, src, srcstride, \
                                              height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT(dst, dststride, hpass, SIZE, \
                                              height, mx, my); \
}

/* 2-D bilinear wrappers: widths 4/8/16 for mmxext and ssse3, widths 8/16 for
 * sse2 (no 4-wide sse2 primitives exist). All use an 8-byte aligned scratch
 * buffer; width 4 is limited to 8 rows, widths 8 and 16 to 16 rows. */
HVBILIN(mmxext, 8,  4,  8)
HVBILIN(mmxext, 8,  8, 16)
HVBILIN(mmxext, 8, 16, 16)
HVBILIN(sse2,   8,  8, 16)
HVBILIN(sse2,   8, 16, 16)
HVBILIN(ssse3,  8,  4,  8)
HVBILIN(ssse3,  8,  8, 16)
HVBILIN(ssse3,  8, 16, 16)
221 222 223

/*
 * Inverse-transform primitives defined outside this file.
 *  - idct_dc_add:     takes one 16-coefficient block
 *  - idct_dc_add4y:   takes four 16-coefficient blocks
 *  - idct_dc_add4uv:  takes two 16-coefficient blocks
 *  - luma_dc_wht:     takes a 4x4 array of blocks plus a 16-entry dc array
 *  - idct_add:        takes one 16-coefficient block
 * The functions taking dst also take its stride.
 */
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
231

232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
/*
 * DECLARE_LOOP_FILTER(NAME)
 *
 * Declares the full set of externally defined loop-filter entry points for
 * instruction set NAME, in vertical (v) and horizontal (h) flavours:
 *  - simple:           (dst, stride, flim)
 *  - 16y inner/mbedge: (dst, stride, e, i, hvt)
 *  - 8uv inner/mbedge: (dstU, dstV, s, e, i, hvt)
 * NOTE(review): e / i / hvt presumably are the edge, interior and
 * high-edge-variance thresholds and s the chroma stride -- confirm against
 * vp8dsp.h.
 */
#define DECLARE_LOOP_FILTER(NAME) \
extern void ff_vp8_v_loop_filter_simple_ ## NAME( \
    uint8_t *dst, int stride, int flim); \
extern void ff_vp8_h_loop_filter_simple_ ## NAME( \
    uint8_t *dst, int stride, int flim); \
extern void ff_vp8_v_loop_filter16y_inner_ ## NAME( \
    uint8_t *dst, int stride, int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_inner_ ## NAME( \
    uint8_t *dst, int stride, int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME( \
    uint8_t *dstU, uint8_t *dstV, int s, int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME( \
    uint8_t *dstU, uint8_t *dstV, int s, int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME( \
    uint8_t *dst, int stride, int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME( \
    uint8_t *dst, int stride, int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME( \
    uint8_t *dstU, uint8_t *dstV, int s, int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME( \
    uint8_t *dstU, uint8_t *dstV, int s, int e, int i, int hvt);

/* Declare the loop-filter entry points for every supported instruction set. */
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
257

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
#endif

/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the six-tap epel functions of width SIZE for instruction set OPT
 * into row IDX of c->put_vp8_epel_pixels_tab: [0][2] gets the horizontal
 * filter, [2][0] the vertical one and [2][2] the combined h6v6 filter.
 * Expects a pointer named `c` in scope.
 *
 * Wrapped in do { } while (0) so the three assignments behave as a single
 * statement, e.g. under an unbraced `if`. Invoke with a trailing semicolon.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)

/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every epel function of width SIZE for instruction set OPT into
 * row IDX of c->put_vp8_epel_pixels_tab: the four-tap and mixed four/six-tap
 * entries set here, plus the six-tap entries set by VP8_LUMA_MC_FUNC.
 * In the table the second index corresponds to the vertical filter and the
 * third to the horizontal one (1 = four-tap, 2 = six-tap), as the function
 * names assigned to each slot show. Expects a pointer named `c` in scope.
 *
 * Wrapped in do { } while (0) so the whole group behaves as a single
 * statement, e.g. under an unbraced `if`. Invoke with a trailing semicolon.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)

/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills row IDX of c->put_vp8_bilinear_pixels_tab (all entries except
 * [0][0]) with the bilinear functions of width SIZE for instruction set OPT.
 * Indices 1 and 2 map to the same function: entries with only the third
 * index non-zero get the horizontal filter, entries with only the second
 * index non-zero get the vertical filter, and the rest get the combined hv
 * filter. Expects a pointer named `c` in scope.
 *
 * Wrapped in do { } while (0) so the eight assignments behave as a single
 * statement, e.g. under an unbraced `if`. Invoke with a trailing semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)

283 284 285

/**
 * Install the x86-optimized VP8 DSP routines into a VP8DSPContext.
 *
 * The CPU feature flags are queried once; each supported instruction set
 * then overwrites the function pointers it provides. The checks run from
 * the oldest extension (MMX) to the newest (SSE4), so when several flags
 * are set the assignment made by the last matching block wins. Pointers
 * for which no block matches are left untouched.
 *
 * When HAVE_YASM is not set the function does nothing.
 *
 * @param c DSP context whose function pointers are updated in place
 */
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
        c->vp8_idct_add       = ff_vp8_idct_add_mmx;
        c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;
        /* The [0][0] slot of both MC tables shares the same pixels routine
         * (16 wide for table row 0, 8 wide for row 1). */
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx;
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (mm_flags & AV_CPU_FLAG_MMX2) {
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_MC_FUNC(2, 4, mmxext);
        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmxext;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
    }

    if (mm_flags & AV_CPU_FLAG_SSE) {
        c->vp8_idct_add                         = ff_vp8_idct_add_sse;
        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    /* These SSE2 routines are installed when either SSE2 or SSE2SLOW is
     * reported: the MC functions (widths 16 and 8 only) and the vertical
     * loop filters. */
    if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_sse2;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_sse2;
    }

    /* These SSE2 routines require the plain SSE2 flag (SSE2SLOW alone is
     * not enough): idct_dc_add4y and the horizontal loop filters. */
    if (mm_flags & AV_CPU_FLAG_SSE2) {
        c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;

        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
    }

    if (mm_flags & AV_CPU_FLAG_SSSE3) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;

        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
    }

    /* SSE4 only replaces idct_dc_add and the horizontal simple / mbedge
     * loop filters; everything else keeps the earlier assignment. */
    if (mm_flags & AV_CPU_FLAG_SSE4) {
        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;

        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4;
    }
#endif
}