/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
27
#include "dsputil_mmx.h"
28

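/* per-element rounding constants: round_tab[1] is added before the >>1 in
 * the 2-tap (halfpel) averages, round_tab[2] before the >>2 in the 4-tap
 * (diagonal halfpel) case */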
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};

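/* all-ones byte pattern; subtracted before the chained pavgb in
 * sad8_4_mmx2() below, apparently to roughly compensate pavgb's
 * round-up bias */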
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;

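/* SAD of one 8-pixel-wide block, plain MMX: |a-b| per byte is built from
 * two saturating subtractions or'ed together, widened to words against the
 * zero in mm7 and accumulated in mm6. The caller must zero mm6/mm7; two
 * rows are handled per iteration, so h must be even. */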
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

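/* same as sad8_1_mmx(), but letting the MMX2 psadbw instruction do the
 * byte-wise absolute difference and horizontal sum in a single step */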
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

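/* 16-pixel-wide SAD using SSE2; unlike the MMX helpers this one is
 * self-contained and returns the sum itself */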
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
    int ret;
    __asm__ volatile(
        "pxor %%xmm2, %%xmm2            \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
        : "r" ((x86_reg)stride)
    );
    return ret;
}

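/* SAD against the average of each pixel with its right neighbour
 * (horizontal halfpel position); pavgb computes (a+b+1)>>1 */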
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

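/* SAD against the average of each row with the row below (vertical
 * halfpel position); the previous row is carried over in mm0 */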
static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

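/* approximate SAD at the diagonal halfpel position: the 4-pixel average is
 * built from chained pavgb ops with a bone correction, which is not
 * bit-exact; it is therefore only installed when CODEC_FLAG_BITEXACT is
 * unset */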
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

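/* SAD against the rounded average of two reference blocks; the caller must
 * load round_tab[1] into mm5. This is the exact halfpel kernel used by the
 * plain-MMX x2/y2 wrappers further down. */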
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

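/* exact SAD at the diagonal halfpel position: horizontal pixel-pair sums
 * are kept as words, the previous row's sums are carried in mm0/mm1, and
 * round_tab[2] (at byte offset 16) is added before the >>2 */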
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

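/* fold the four per-word partial sums in mm6 into a single 16-bit result */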
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}

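/* with psadbw the running total already sits in the low word of mm6 */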
static inline int sum_mmx2(void)
{
    int ret;
    __asm__ volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}

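/* exact halfpel variants for plain MMX, expressed via sad8_2_mmx() */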
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}


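/* instantiate the 8x8 and 16x16 SAD entry points (full-pel, x2, y2 and xy2
 * halfpel) for one instruction-set suffix: each zeroes mm7 (zero register)
 * and mm6 (accumulator), loads the rounding constant where needed, runs
 * the core loop(s) and fetches the result with sum_<suffix>() */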
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)

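/* pick the fastest SAD implementations available on this CPU; the
 * non-bit-exact MMX2 halfpel variants are only installed when
 * CODEC_FLAG_BITEXACT is unset */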
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & AV_CPU_FLAG_MMX2) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
    if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != CODEC_ID_SNOW) {
        c->sad[0]= sad16_sse2;
    }
}