/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil.h"
#include "x86_cpu.h"
#include <assert.h>

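/* word-wise rounding constants (0, 1 and 2 in each 16-bit lane), added
   before the >>1 / >>2 of the half-pel averaging code below */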
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};

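/* all-ones bytes; subtracted before the second pavgb in sad8_4_mmx2,
   which appears intended to compensate for both cascaded averages
   rounding upward */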
static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;

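/* SAD of an 8-pixel-wide block over h rows, plain MMX: |a-b| per byte via
   two saturating subtractions + por, then widened and accumulated in %mm6;
   the caller must clear %mm6 (accumulator) and %mm7 (zero) beforehand */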
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}

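/* same as above using the MMX2 psadbw instruction, two rows per iteration */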
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((long)stride)
    );
}

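/* 16-pixel-wide SAD with SSE2: unaligned loads for blk1, psadbw against
   blk2, then the two 64-bit halves of %xmm6 are folded at the end */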
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
    int ret;
    asm volatile(
        "pxor %%xmm6, %%xmm6            \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %3), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %3), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm6           \n\t"
        "paddw %%xmm1, %%xmm6           \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((long)stride)
    );
    asm volatile(
        "movhlps %%xmm6, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm6         \n\t"
        "movd    %%xmm6, %0             \n\t"
        : "=r"(ret)
    );
    return ret;
}

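/* horizontal (x2) half-pel SAD: pavgb averages each pixel with its right
   neighbour before psadbw */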
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((long)stride)
    );
}

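/* vertical (y2) half-pel SAD: keeps the previous row in %mm0 and averages
   it with the current one */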
static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    asm volatile(
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((long)stride)
    );
}

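/* 2D (xy2) half-pel SAD: horizontal pavgb per row, then a vertical pavgb
   between consecutive rows; subtracting bone reduces the rounding bias,
   but the result only approximates the exact (a+b+c+d+2)>>2, which is why
   this variant is skipped in bit-exact mode */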
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    asm volatile(
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((long)stride)
    );
}

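/* SAD against the rounded average of two source blocks; implements x2/y2
   half-pel for plain MMX, %mm5 must hold round_tab[1] */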
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((long)stride)
    );
}

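/* exact 2D half-pel SAD in plain MMX: (a+b+c+d+2)>>2 computed in 16-bit
   lanes, bit-exact unlike the pavgb-based MMX2 variant above */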
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}

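/* fold the four 16-bit partial sums in %mm6 into a scalar; the word-wise
   adds can leave garbage in the upper lanes, hence the & 0xFFFF */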
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}

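/* with psadbw the accumulator already holds a single sum in the low word
   of %mm6, so a plain movd suffices */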
static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}

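/* x2/y2 half-pel entry points for plain MMX, expressed via the
   two-block-average SAD above */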
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}


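/* instantiate the full pix_abs/sad function set (plain, x2, y2, xy2 at 8-
   and 16-pixel widths) for a given suffix; each wrapper clears the %mm6
   accumulator and the %mm7 zero register, loads the rounding constant
   where needed, and reduces the result with sum_<suffix>() */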
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)

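/* wire the SAD implementations into the DSPContext according to the
   detected CPU capabilities */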
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & MM_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        /* the pavgb-based half-pel functions round differently from the
           bit-exact C code, so only use them when that is acceptable */
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
    /* the SSE2 version relies on unaligned loads, which tend to be slower
       than the MMX2 code on CPUs that also report 3DNow */
    if ((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)) {
        c->sad[0]= sad16_sse2;
    }
}