h264_qpel.c 29.4 KB
Newer Older
1 2
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3
 * Copyright (c) 2011 Daniel Kang
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

22
#include "libavutil/attributes.h"
23 24
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
25
#include "libavutil/x86/cpu.h"
26
#include "libavcodec/h264qpel.h"
27
#include "libavcodec/mpegvideo.h"
28
#include "dsputil_x86.h"
29

30
#if HAVE_YASM
31 32 33 34
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext

52 53 54
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)

55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);

DEF_QPEL(avg)
DEF_QPEL(put)
76 77

#define QPEL_H264(OPNAME, OP, MMX)\
78
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
79 80 81
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
82
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
83
        tmp += 4;\
84
        src += 4;\
85 86
    }\
    tmp -= 3*4;\
87
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
88 89
}\
\
90
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
91
    src -= 2*srcStride;\
92 93 94 95
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
96
}\
97
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
98 99 100
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
101
        ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
102
        tmp += 4;\
103
        src += 4;\
104 105
    }\
}\
106
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
107 108
    int w = size>>4;\
    do{\
109 110 111
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
    tmp += 8;\
    dst += 8;\
112 113 114
    }while(w--);\
}\
\
115 116
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
117
}\
118 119 120
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
121 122
}\
\
123 124 125
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
126 127
    src += 8*srcStride;\
    dst += 8*dstStride;\
128 129
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
130 131
}\
\
132 133 134
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
135 136 137
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
138 139
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
140 141
}\
\
142 143 144
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
145
}\
146 147
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
148 149
}\
\
150 151
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
152 153
}\
\
154
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
155
{\
156 157
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
158 159 160 161 162
}\


#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
163 164 165 166

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);

167 168
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
169 170 171
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
172 173 174
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
175 176
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
177 178 179 180 181
}
#endif // ARCH_X86_64

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
182 183 184
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
185 186
    src += 8*srcStride;\
    dst += 8*dstStride;\
187 188
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
189 190 191
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
192 193
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
194
}\
195 196 197
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
198 199
}

200 201 202 203 204 205
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
                                                                 uint8_t *src,
                                                                 int tmpStride,
                                                                 int srcStride,
                                                                 int size)
{
206 207 208
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
209
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
210
        tmp += 8;
211
        src += 8;
212 213 214 215
    }
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
216
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
217
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
218
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
219
}\
220 221
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
222
}\
223 224
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
225 226
}\

227 228 229 230
#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
231

232 233 234 235
#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
236

237 238
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
239 240 241 242 243 244 245

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

246 247 248
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride)
{
249
    ff_put_pixels16_sse2(dst, src, stride, 16);
250
}
251 252 253
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
                                       ptrdiff_t stride)
{
254
    ff_avg_pixels16_sse2(dst, src, stride, 16);
255
}
256 257
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
258 259

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
260 261
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
262
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
263 264 265
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
266 267
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
268
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
269 270
}\
\
271 272
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
273
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
274 275
}\
\
276 277
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
278
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
279 280 281
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
282 283
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
284
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
285 286
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
287 288
}\
\
289 290
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
291
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
292 293
}\
\
294 295
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
296
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
297 298
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
299 300 301
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
302 303
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
304
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
305 306
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
307 308
}\
\
309 310
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
311
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
312 313
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
314 315
}\
\
316 317
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
318
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
319 320
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
321 322
}\
\
323 324
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
325
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
326 327
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
328 329
}\
\
330 331
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
332
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
333
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
334 335
}\
\
336 337
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
338 339 340
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
341
    av_assert2(((int)temp & 7) == 0);\
342 343
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
344 345
}\
\
346 347
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
348 349 350
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
351
    av_assert2(((int)temp & 7) == 0);\
352 353
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
354 355
}\
\
356 357
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
358 359 360
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
361
    av_assert2(((int)temp & 7) == 0);\
362 363
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
364 365
}\
\
366 367
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
368 369 370
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
371
    av_assert2(((int)temp & 7) == 0);\
372 373
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

390 391
QPEL_H264(put_,        PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
392
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
393
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
394
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
395
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
396
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
397
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
398
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
399
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
400

401
H264_MC_4816(mmxext)
402 403 404 405
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
406 407 408 409 410


//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
411
    (uint8_t *dst, uint8_t *src, ptrdiff_t stride);
412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

#define QPEL16_OPMC(OP, MC, MMX)\
468
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\
469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
    src += 8*stride;\
    dst += 8*stride;\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

499
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
500 501
QPEL16(mmxext)
#endif
502 503

#endif /* HAVE_YASM */
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)

541
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
542
{
543
#if HAVE_YASM
544 545 546
    int high_bit_depth = bit_depth > 8;
    int mm_flags = av_get_cpu_flags();

547 548 549 550 551 552 553 554 555
    if (EXTERNAL_MMXEXT(mm_flags)) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
556
#if ARCH_X86_32
557 558 559 560
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
561
#endif
562 563 564
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
        }
565 566
    }

567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
    if (EXTERNAL_SSE2(mm_flags)) {
        if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) {
            // these functions are slower than mmx on AMD, but faster on Intel
            H264_QPEL_FUNCS(0, 0, sse2);
        }

        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }

        if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
597 598
    }

599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
    if (EXTERNAL_SSSE3(mm_flags)) {
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }

        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
        }
620 621
    }

622
    if (EXTERNAL_AVX(mm_flags)) {
623 624 625 626
        /* AVX implies 64 byte cache lines without the need to avoid unaligned
         * memory accesses that cross the boundary between two cache lines.
         * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
         * having to treat SSE2 functions with such properties as AVX. */
627 628 629 630 631
        if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }
632 633 634
    }
#endif
}