;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
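; each table holds 16 entries of 4 words; the rv40 code selects an entry with
; rnd_bias = ((my & ~1) * 4 + mx) >> 1, i.e. one rounding constant per (mx, my)
; chroma subpel position.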

cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
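    ; mx == my == 0 fast path: no interpolation, just copy (or average, in the
    ; avg variants via CHROMAMC_AVG) 8 bytes per row, four rows per iteration.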
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
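;
; The 2-D path evaluates the usual bilinear chroma blend
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+s] + D*src[i+s+1] + rnd) >> 6
; with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and s = stride;
; when mx or my is zero this collapses to the two-tap 1-D filter handled first.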
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
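; exactly one of mx/my is nonzero past this point: r6 already holds the
; second-tap offset (1 for horizontal filtering, stride for vertical), so the
; single loop below serves both orientations.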
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
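    ; A is built as x*y - (8x + 8y) + 64, which is just (8-x)*(8-y) expanded;
    ; with B = 8x - xy, C = 8y - xy and D = xy the four weights sum to 64,
    ; hence the final >> 6.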
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0
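    ; m6 = (8-x)*src[0..3] + x*src[1..4] for the current row; each loop
    ; iteration below computes the same horizontal pass for the next row and
    ; reuses the previous result for the vertical (8-y)/y blend, so each
    ; source row is loaded only once.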

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
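    ; r4d/r5d now pack the bilinear weights as 16-bit pairs, {(8-x)*(8-y),
    ; x*(8-y)} and {(8-x)*y, x*y}; with the source expanded to [0,1,1,2],
    ; one pmaddwd per row yields both output pixels' taps at once.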

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro
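; CHROMAMC_AVG / CHROMAMC_AVG4 are bound below: NOTHING for the put variants,
; PAVGB-based averaging with the existing destination for the avg variants.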

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6
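    ; m7 = {(8-x)*(8-y), x*(8-y)} and m6 = {(8-x)*y, x*y} broadcast as byte
    ; pairs; with adjacent source bytes interleaved by punpcklbw, each
    ; pmaddubsw below adds one row's full contribution to the bilinear sum in
    ; a single multiply-add.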

.next2rows:
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264