;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max pw_1023 ; max 10-bit pixel value, (1<<10)-1
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pd_17:        times 4 dd 17

SECTION .text

; PRED4x4_LOWPASS dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; NOTE: %2 (left) is clobbered as scratch.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3          ; %2 = left + right
    psrlw       %2, 1           ; %2 = (left + right) >> 1
    pavgw       %1, %4, %2      ; %1 = (src + %2 + 1) >> 1
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2            ; r0 -> row above the block
    lea       r1, [r0+r2*2]
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    ; store the filtered diagonal, shifting one pixel per row
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2              ; r0 -> row above the block
    lea     r1, [r0+r2*2]
    movq    m5, [r0]            ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw   m5, m0
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2           ; r0 -> row above the block
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2               ; r0 -> row above the block
    lea    r1, [r0+r2*2]
    ; sum the four left-edge pixels (one word per row, in the top word)
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48
    ; add the four top pixels, round, and divide by 8
    movq   m0, [r0]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0            ; broadcast DC value to all 4 lanes
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2           ; r0 -> row above the block
    movq       m0, [r0]         ; top 4 pixels
    movhps     m0, [r1]         ; topright 4 pixels in the high half
    psrldq     m2, m0, 2
    pslldq     m3, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]
    movhps     [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2           ; r0 -> row above the block
    movu       m1, [r0]         ; top 4 pixels
    movhps     m1, [r1]         ; topright 4 pixels in the high half
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1       ; 2-tap average for even rows
    PRED4x4_LOWPASS m0, m1, m2, m0 ; 3-tap filter for odd rows
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4
    movq       [r0+r2*2], m0
    psrldq     m4, 2
    psrldq     m0, 2
    movq       [r1+r2*1], m4
    movq       [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride (per prototype above)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2            ; r0 -> row above the block
    lea       r1, [r0+r2*2]
    ; gather the four left-edge pixels into m0
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1
    pshufw    m1, m1, 0xFF      ; replicate last left pixel
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET



;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
; r0 = src, r1 = stride; copies the row above into all 8 rows
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1                 ; r0 -> row above the block
    mova m0, [r0]               ; the 8 top pixels (16 bytes)
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
; r0 = src, r1 = stride; fills each row with its left-edge pixel
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov         r2d, 4          ; 4 iterations x 2 rows = 8 rows
.loop:
    movq         m0, [r0+r1*0-8]
    movq         m1, [r0+r1*1-8]
    pshuflw      m0, m0, 0xff   ; broadcast left pixel within low half
    pshuflw      m1, m1, 0xff
    punpcklqdq   m0, m0         ; then to the full register
    punpcklqdq   m1, m1
    mova  [r0+r1*0], m0
    mova  [r0+r1*1], m1
    lea          r0, [r0+r1*2]
    dec          r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; MOV8 addr, reg1[, reg2] — store one 8-pixel (16-byte) row:
; two 8-byte stores under MMX, one 16-byte store under SSE2+.
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq    [%1+0], %2
    movq    [%1+8], %3
%else
    movdqa    [%1], %2
%endif
%endmacro

; PRED8x8_DC shuffle_insn — %1 is pshufw (MMX) or pshuflw (SSE2)
; r0 = src, r1 = stride
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1          ; r0 -> row above the block
    pxor        m4, m4
    ; sum the two 4-pixel halves of the top row -> s0, s1
    movq        m0, [r0+0]
    movq        m1, [r0+8]
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2

    ; sum the upper and lower 4 left-edge pixels -> s2, s3
    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %1          m3, m0, 11110110b ; s2, s1, s3, s3
    %1          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    ; top half uses m1/m2, bottom half m3/m4
    MOV8   r0+r1*1, m1, m2
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
; r0 = src, r1 = stride; DC per 4-pixel half of the top row only
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub         r0, r1          ; r0 -> row above the block
    mova        m0, [r0]
    ; horizontal add within each 4-word half
    pshuflw     m1, m0, 0x4e
    pshufhw     m1, m1, 0x4e
    paddw       m0, m1
    pshuflw     m1, m0, 0xb1
    pshufhw     m1, m1, 0xb1
    paddw       m0, m1
    lea         r2, [r1*3]
    lea         r3, [r0+r1*4]
    paddw       m0, [pw_2]      ; round
    psrlw       m0, 2           ; /4
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET
;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
; r0 = src, r1 = stride; plane (gradient) prediction from top/left edges
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1            ; r0 -> row above the block
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0               ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    ; V = weighted sum of left-edge pixel differences
    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d              ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]   ; b
    pmullw    m5, m4, [pw_m3]      ; c
    paddw     m5, [pw_16]
    mov      r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3           ; clamp to [0, pw_pixel_max]
    mova    [r0], m6
    paddw     m5, m4
    add       r0, r1
    dec r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
; r0 = src, r3 = stride; fills all 8 rows with the mid-grey constant
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3          ; r0 -> row above the block
    mova        m0, [r0]
    ; r1/r2 become 0 or an offset selecting the edge-replication source,
    ; depending on has_topleft/has_topright (flags arrive shifted <<14/<<13)
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3           ; DC = (sum + 4) >> 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3          ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; gather the 8 left-edge pixels into m3
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    ; filter top row (edge replication driven by has_topleft/has_topright)
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    not         r1
    and         r1, r3
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3
    PRED4x4_LOWPASS m0, m2, m1, m0
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4           ; DC = (top_sum + left_sum + 8) >> 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3          ; r0 -> row above the block
    mova        m0, [r0]
    ; edge replication driven by has_topleft/has_topright flags
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    ; copy the filtered top row into all 8 rows
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3          ; r1 = -stride if has_topleft else 0
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1          ; all 8 left-edge pixels in one register
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4
    punpcklwd   m4, m4
    ; broadcast one filtered left pixel per row
    pshufd      m0, m3, 0xff
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3          ; r0 -> row above the block
    mova        m3, [r0]
    shr        r1d, 14
    neg         r1
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]     ; topright 8 pixels
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6,  2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1,  2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    ; write the diagonal, shifting one pixel per row from bottom to top
    mova [r2+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova [r0+r3*1], m1
    RET
.fix_tr: ; no topright: replicate the last top pixel
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft (unused, see note), r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3          ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; gather the 8 left-edge pixels into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    ; filter the top row (edge replication via has_topright)
    mova        m3, [r0]
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR     m2, m3, m6,  2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    ; write the diagonal, shifting one pixel per row
    mova [r4+r3*4], m6
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft (unused, see note), r2 = has_topright, r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3          ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; gather the 8 left-edge pixels into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    ; filter the top row (edge replication via has_topright)
    mova        m2, [r0]
    shr        r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw       m2, m5
    mova [r0+r3*2], m0
    mova [r0+r3*1], m2
    ; remaining rows: shift the left column in one pixel at a time
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright (unused), r3 = stride
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3          ; r1 = -stride if has_topleft else 0
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    ; gather the 8 left-edge pixels into m0
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1      ; 2-tap averages
    PRED4x4_LOWPASS m1, m2, m0, m1 ; 3-tap filtered values
    punpckhwd   m5, m4, m1
    punpcklwd   m4, m1          ; interleave avg/filtered pairs
    mova [r2+r3*0], m5
    mova [r0+r3*0], m4
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2
    mova [r0+r3*2], m3
    mova [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Store one 16-pixel (32-byte) row: two mmsize chunks suffice for XMM,
; MMX needs all four.
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+    16], %4
    mova [%1+    24], %5
%endif
%endmacro

; Vertical prediction: replicate the row above the block into all 16 rows.
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1                     ; point at the row above the block
    mov  r2d, 8                      ; 8 iterations x 2 rows = 16 rows
    mova  m0, [r0+ 0]                ; load the 16 top pixels (32 bytes)
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL


;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Horizontal prediction: fill each row with the pixel immediately to its left.
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov   r2d, 8                     ; 8 iterations x 2 rows = 16 rows
.vloop:
    movd   m0, [r0+r1*0-4]           ; two pixels left of the row; word 1 is col -1
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1                 ; broadcast the column -1 pixel
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL


;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC prediction: fill the block with the rounded mean of the 16 top and
; 16 left neighbour pixels: dc = (sum_top + sum_left + 16) >> 5.
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0                ; keep src; r0 is used for neighbour loads
    sub        r0, r1                ; row above the block
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2                ; m0 = sum of the 16 top pixels

    lea        r0, [r0+r1-2]         ; column -1 of row 0
    movzx     r3d, word [r0]         ; r3d accumulates even rows
    movzx     r4d, word [r0+r1]      ; r4d accumulates odd rows
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]        ; left sum plus rounding constant

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5                 ; dc = (sum_top + sum_left + 16) >> 5
    SPLATW     m0, m0                ; broadcast dc to every word
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC


;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC prediction from the top edge only: dc = (sum_top + 8) >> 4.
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1                ; row above the block
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2                ; m0 = sum of the 16 top pixels

    SPLATW     m0, m0
    paddw      m0, [pw_8]
    psrlw      m0, 4                 ; dc = (sum + 8) >> 4 in every word
    mov       r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC


;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC prediction from the left edge only: dc = (sum_left + 8) >> 4.
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0                ; keep src; r0 walks the left edge

    sub        r0, 2                 ; column -1 of row 0
    movzx     r3d, word [r0]         ; r3d accumulates even rows
    movzx     r4d, word [r0+r1]      ; r4d accumulates odd rows
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]         ; sum of 16 left pixels + rounding
    shr       r3d, 4                 ; dc = (sum + 8) >> 4

    movd       m0, r3d
    SPLATW     m0, m0                ; broadcast dc to every word
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC


;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Flat DC prediction with no available neighbours: fill with mid-grey,
; 512 = 1 << (bit_depth - 1) for 10-bit.
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova       m0, [pw_512]
    mov       r2d, 8                 ; 8 iterations x 2 rows = 16 rows
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC