/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
#include "../dsputil.h"
#include <assert.h> /* assert() in PIX_SAD(); may already be pulled in via dsputil.h */

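/*
 * Per-lane rounding constants for the half-pel averaging in the plain-MMX
 * SAD routines: round_tab[1] adds 1 to each 16-bit lane before the >>1 of
 * the two-tap average, round_tab[2] adds 2 before the >>2 of the four-tap
 * average.
 */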
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

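/* all-ones byte vector: sad8_4_mmx2() subtracts it before its second pavgb
   to partly compensate for pavgb's round-to-plus-infinity bias */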
static __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL;

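/*
 * Register protocol shared by the sad8_* helpers below: the PIX_SAD()
 * wrappers at the bottom of this file zero %mm7 (the zero operand for the
 * punpck unpacking) and %mm6 (the running SAD accumulator), and preload
 * the rounding constant into %mm5 where needed, before calling in.  The
 * helpers walk the block with a negative offset in %eax so the loop can
 * terminate on the sign flag ("js 1b").
 */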
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm2	\n\t"
        "movq (%2, %%eax), %%mm4	\n\t"
        "addl %3, %%eax			\n\t"
        "psubusb %%mm0, %%mm2		\n\t"
        "psubusb %%mm4, %%mm0		\n\t"
        "movq (%1, %%eax), %%mm1	\n\t"
        "movq (%2, %%eax), %%mm3	\n\t"
        "movq (%2, %%eax), %%mm5	\n\t"
        "psubusb %%mm1, %%mm3		\n\t"
        "psubusb %%mm5, %%mm1		\n\t"
        "por %%mm2, %%mm0		\n\t"
        "por %%mm1, %%mm3		\n\t"
        "movq %%mm0, %%mm1		\n\t"
        "movq %%mm3, %%mm2		\n\t"
        "punpcklbw %%mm7, %%mm0		\n\t"
        "punpckhbw %%mm7, %%mm1		\n\t"
        "punpcklbw %%mm7, %%mm3		\n\t"
        "punpckhbw %%mm7, %%mm2		\n\t"
        "paddw %%mm1, %%mm0		\n\t"
        "paddw %%mm3, %%mm2		\n\t"
        "paddw %%mm2, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %3, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}
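
/*
 * Plain-C sketch of what the sad8_1_* loops compute: the sum of absolute
 * differences over an 8-pixel-wide, h-row block.  Illustrative only (the
 * name sad8_1_ref is ours, nothing below uses it); the asm versions
 * additionally accumulate into %mm6 across calls so the sad16_* wrappers
 * can chain two 8-pixel halves, and they unroll two rows per iteration,
 * so h must be even.
 */
static __attribute__ ((unused)) inline int sad8_1_ref(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int sum=0, x, y;
    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            int d= blk1[x] - blk2[x];
            sum += d < 0 ? -d : d;
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}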

static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm2	\n\t"
        "psadbw %%mm2, %%mm0		\n\t"
        "addl %3, %%eax			\n\t"
        "movq (%1, %%eax), %%mm1	\n\t"
        "movq (%2, %%eax), %%mm3	\n\t"
        "psadbw %%mm1, %%mm3		\n\t"
        "paddw %%mm3, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %3, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
    );
}

static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm2	\n\t"
        "pavgb %%mm2, %%mm0		\n\t"
        "movq (%3, %%eax), %%mm2	\n\t"
        "psadbw %%mm2, %%mm0		\n\t"
        "addl %4, %%eax			\n\t"
        "movq (%1, %%eax), %%mm1	\n\t"
        "movq (%2, %%eax), %%mm3	\n\t"
        "pavgb %%mm1, %%mm3		\n\t"
        "movq (%3, %%eax), %%mm1	\n\t"
        "psadbw %%mm1, %%mm3		\n\t"
        "paddw %%mm3, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %4, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
    );
}
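
/*
 * Plain-C sketch of the sad8_2_* computation: blk1a and blk1b are the two
 * neighbouring source blocks being interpolated, so every pixel compared
 * against blk2 is the rounded average (a+b+1)>>1 -- what pavgb produces in
 * the mmx2 version and what the round_tab[1]/psrlw $1 sequence produces in
 * the plain-MMX version.  Illustrative only; the name sad8_2_ref is ours.
 */
static __attribute__ ((unused)) inline int sad8_2_ref(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    int sum=0, x, y;
    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            int d= ((blk1a[x] + blk1b[x] + 1)>>1) - blk2[x];
            sum += d < 0 ? -d : d;
        }
        blk1a += stride;
        blk1b += stride;
        blk2  += stride;
    }
    return sum;
}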

static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "movq "MANGLE(bone)", %%mm5	\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm2	\n\t"
        "movq 1(%1, %%eax), %%mm1	\n\t"
        "movq 1(%2, %%eax), %%mm3	\n\t"
        "pavgb %%mm2, %%mm0		\n\t"
        "pavgb %%mm1, %%mm3		\n\t"
        "psubusb %%mm5, %%mm3		\n\t"
        "pavgb %%mm3, %%mm0		\n\t"
        "movq (%3, %%eax), %%mm2	\n\t"
        "psadbw %%mm2, %%mm0		\n\t"
        "addl %4, %%eax			\n\t"
        "movq (%1, %%eax), %%mm1	\n\t"
        "movq (%2, %%eax), %%mm3	\n\t"
        "movq 1(%1, %%eax), %%mm2	\n\t"
        "movq 1(%2, %%eax), %%mm4	\n\t"
        "pavgb %%mm3, %%mm1		\n\t"
        "pavgb %%mm4, %%mm2		\n\t"
        "psubusb %%mm5, %%mm2		\n\t"
        "pavgb %%mm1, %%mm2		\n\t"
        "movq (%3, %%eax), %%mm1	\n\t"
        "psadbw %%mm1, %%mm2		\n\t"
        "paddw %%mm2, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %4, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
    );
}
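
/*
 * Plain-C sketch of the sad8_4_* computation (half-pel in both x and y):
 * each interpolated pixel is the four-tap average (a+b+c+d+2)>>2 of a 2x2
 * neighbourhood, which is exactly what sad8_4_mmx computes via
 * round_tab[2].  sad8_4_mmx2 only approximates it with chained pavgb plus
 * the bone correction, one reason the mmx2 half-pel variants are only
 * installed when bitexact output is not required.  Illustrative only; the
 * name sad8_4_ref is ours.
 */
static __attribute__ ((unused)) inline int sad8_4_ref(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int sum=0, x, y;
    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            int a= blk1[x]        + blk1[x+1];
            int b= blk1[x+stride] + blk1[x+stride+1];
            int d= ((a + b + 2)>>2) - blk2[x];
            sum += d < 0 ? -d : d;
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}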

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm1	\n\t"
        "movq (%1, %%eax), %%mm2	\n\t"
        "movq (%2, %%eax), %%mm3	\n\t"
        "punpcklbw %%mm7, %%mm0		\n\t"
        "punpcklbw %%mm7, %%mm1		\n\t"
        "punpckhbw %%mm7, %%mm2		\n\t"
        "punpckhbw %%mm7, %%mm3		\n\t"
        "paddw %%mm0, %%mm1		\n\t"
        "paddw %%mm2, %%mm3		\n\t"
        "movq (%3, %%eax), %%mm4	\n\t"
        "movq (%3, %%eax), %%mm2	\n\t"
        "paddw %%mm5, %%mm1		\n\t"
        "paddw %%mm5, %%mm3		\n\t"
        "psrlw $1, %%mm1		\n\t"
        "psrlw $1, %%mm3		\n\t"
        "packuswb %%mm3, %%mm1		\n\t"
        "psubusb %%mm1, %%mm4		\n\t"
        "psubusb %%mm2, %%mm1		\n\t"
        "por %%mm4, %%mm1		\n\t"
        "movq %%mm1, %%mm0		\n\t"
        "punpcklbw %%mm7, %%mm0		\n\t"
        "punpckhbw %%mm7, %%mm1		\n\t"
        "paddw %%mm1, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %4, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
    );
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    int len= -(stride*h);
    asm volatile(
        ".balign 16			\n\t"
        "1:				\n\t"
        "movq (%1, %%eax), %%mm0	\n\t"
        "movq (%2, %%eax), %%mm1	\n\t"
        "movq %%mm0, %%mm4		\n\t"
        "movq %%mm1, %%mm2		\n\t"
        "punpcklbw %%mm7, %%mm0		\n\t"
        "punpcklbw %%mm7, %%mm1		\n\t"
        "punpckhbw %%mm7, %%mm4		\n\t"
        "punpckhbw %%mm7, %%mm2		\n\t"
        "paddw %%mm1, %%mm0		\n\t"
        "paddw %%mm2, %%mm4		\n\t"
        "movq 1(%1, %%eax), %%mm2	\n\t"
        "movq 1(%2, %%eax), %%mm3	\n\t"
        "movq %%mm2, %%mm1		\n\t"
        "punpcklbw %%mm7, %%mm2		\n\t"
        "punpckhbw %%mm7, %%mm1		\n\t"
        "paddw %%mm0, %%mm2		\n\t"
        "paddw %%mm4, %%mm1		\n\t"
        "movq %%mm3, %%mm4		\n\t"
        "punpcklbw %%mm7, %%mm3		\n\t"
        "punpckhbw %%mm7, %%mm4		\n\t"
        "paddw %%mm3, %%mm2		\n\t"
        "paddw %%mm4, %%mm1		\n\t"
        "movq (%3, %%eax), %%mm3	\n\t"
        "movq (%3, %%eax), %%mm4	\n\t"
        "paddw %%mm5, %%mm2		\n\t"
        "paddw %%mm5, %%mm1		\n\t"
        "psrlw $2, %%mm2		\n\t"
        "psrlw $2, %%mm1		\n\t"
        "packuswb %%mm1, %%mm2		\n\t"
        "psubusb %%mm2, %%mm3		\n\t"
        "psubusb %%mm4, %%mm2		\n\t"
        "por %%mm3, %%mm2		\n\t"
        "movq %%mm2, %%mm0		\n\t"
        "punpcklbw %%mm7, %%mm0		\n\t"
        "punpckhbw %%mm7, %%mm2		\n\t"
        "paddw %%mm2, %%mm0		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "addl %4, %%eax			\n\t"
        " js 1b				\n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
    );
}

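/*
 * sum_mmx() folds the four 16-bit partial sums that the plain-MMX helpers
 * accumulate in %mm6 into one scalar; sum_mmx2() just reads %mm6 directly,
 * since psadbw already produces a single 16-bit sum per row pair.
 */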
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0		\n\t"
        "psrlq $32, %%mm6		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "movq %%mm6, %%mm0		\n\t"
        "psrlq $16, %%mm6		\n\t"
        "paddw %%mm0, %%mm6		\n\t"
        "movd %%mm6, %0			\n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}

static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0			\n\t"
        : "=r" (ret)
    );
    return ret;
}

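/*
 * PIX_SAD(suf) instantiates the eight entry points for one CPU flavour
 * (full-pel, x, y and x+y half-pel, each at 16- and 8-pixel width).  Every
 * wrapper zeroes %mm7 (the unpack zero) and %mm6 (the accumulator), loads
 * the appropriate rounding constant into %mm5 where needed, then calls the
 * sad8_* helpers and collects the result with sum_ ## suf().
 */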
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7		\n\t"\
                 "pxor %%mm6, %%mm6		\n\t"\
                 "movq %0, %%mm5		\n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)

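/*
 * pix_abs[size][mode]: size 0 = 16-pixel-wide blocks, 1 = 8-pixel-wide;
 * mode 0/1/2/3 = full-pel / x half-pel / y half-pel / x+y half-pel.  The
 * mmx2 half-pel variants are only installed when bitexact output is not
 * requested (the xy2 one is approximate).
 */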
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & MM_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
}