;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_512:       times 8 dw 512 ; 1 << (BIT_DEPTH-1), 10-bit mid-grey
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro
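; note: with l = %2, r = %3, s = %4 the macro computes
;   %2 = (l + r) >> 1         (truncating halve)
;   %1 = (s + %2 + 1) >> 1    (pavgw rounds upwards)
; which for 10-bit inputs is exactly (l + 2*s + r + 2) >> 2: when l+r is
; odd, the bit dropped by the halve is given back by pavgw's rounding.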

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movq    m5, [r0]            ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw   m5, m0
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
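; note: computes dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3 and splats it
; over the 4x4 block; the left sum accumulates in the high word lane of
; m2 across the four [row-8] loads and is extracted with psrlq 48.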

INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48
    movq   m0, [r0]
    HADDW  m0, m1
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
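; note: the top and top-right pixels are lowpass-filtered once into m0;
; each of the four output rows then advances the source window by one
; pixel (the psrldq 2 steps).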
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2
    movq       m0, [r0]
    movhps     m0, [r1]
    psrldq     m2, m0, 2
    pslldq     m3, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]
    movhps     [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
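; note: even output rows are pavgw of adjacent top pixels, odd rows the
; 3-tap lowpass; the second pair of rows advances the source window one
; pixel to the right (psrldq 2).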
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2
    movu       m1, [r0]
    movhps     m1, [r1]
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m0, m1, m2, m0
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4
    movq       [r0+r2*2], m0
    psrldq     m4, 2
    psrldq     m0, 2
    movq       [r1+r2*1], m4
    movq       [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
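; note: MMX is sufficient here since a 4x4 row of 10-bit pixels is only
; 8 bytes; samples past the left column are l3 replicated, hence the
; pshufw 0xFF splat written to the bottom rows first.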
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1
    pshufw    m1, m1, 0xFF
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET



;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov         r2d, 4
.loop:
    movq         m0, [r0+r1*0-8]
    movq         m1, [r0+r1*1-8]
    pshuflw      m0, m0, 0xff
    pshuflw      m1, m1, 0xff
    punpcklqdq   m0, m0
    punpcklqdq   m1, m1
    mova  [r0+r1*0], m0
    mova  [r0+r1*1], m1
    lea          r0, [r0+r1*2]
    dec          r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq    [%1+0], %2
    movq    [%1+8], %3
%else
    movdqa    [%1], %2
%endif
%endmacro
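; note: MOV8 writes one 8-pixel (16-byte) row, either as two mmx
; registers (%2/%3) with movq or as a single xmm register with movdqa,
; so the DC code below works unchanged for both instruction sets.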

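; note: four 4-sample edge sums s0 (top-left), s1 (top-right), s2 (upper
; half of the left column), s3 (lower half) give the quadrant DCs
;   (s0+s2+4)>>3, (s1+2)>>2, (s3+2)>>2, (s1+s3+4)>>3
; as the spec requires; the pavgw against zero is the final rounded shift.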
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1
    pxor        m4, m4
    movq        m0, [r0+0]
    movq        m1, [r0+8]
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %1          m3, m0, 11110110b ; s2, s1, s3, s3
    %1          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8   r0+r1*1, m1, m2
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub         r0, r1
    mova        m0, [r0]
    pshuflw     m1, m0, 0x4e
    pshufhw     m1, m1, 0x4e
    paddw       m0, m1
    pshuflw     m1, m0, 0xb1
    pshufhw     m1, m1, 0xb1
    paddw       m0, m1
    lea         r2, [r1*3]
    lea         r3, [r0+r1*4]
    paddw       m0, [pw_2]
    psrlw       m0, 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
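; note (our reading of the code below, cf. the 8x8 plane mode in the
; H.264 spec):
;   H = sum((x+1) * (top[4+x] - top[2-x])),   x = 0..3, top[-1]  = topleft
;   V = sum((y+1) * (left[4+y] - left[2-y])), y = 0..3, left[-1] = topleft
;   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;   a = 16 * (left[7] + top[7])
;   pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5, 0, 1023)
; H falls out of the pmaddwd by pw_m32101234 minus 4*topleft; V is
; summed in scalar registers from the left column.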
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0               ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d              ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]   ; b
    pmullw    m5, m4, [pw_m3]      ; c
    paddw     m5, [pw_16]
    mov      r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3
    mova    [r0], m6
    paddw     m5, m4
    add       r0, r1
    dec r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
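; note: the availability flags in r1/r2 steer the two pinsrw fixups
; below: each picks either the true top-left/top-right neighbour or a
; replicated edge sample, so the lowpass filter never reads a missing
; pixel.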
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-------------------------------------------------------------------------------
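; note: the punpckhwd/punpckhdq/punpckhqdq ladder below gathers the last
; word of eight left-column loads into a single register (in effect a
; partial transpose of the left edge) before filtering and summing.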
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    not         r1
    and         r1, r3
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3
    PRED4x4_LOWPASS m0, m2, m1, m0
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
;                           int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
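; note: the filtered left column is widened word-to-dword with
; punpck{l,h}wd, after which every output row is a single pshufd
; broadcast of one of its lanes.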
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
;                            int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3
    mova        m3, [r0]
    shr        r1d, 14
    neg         r1
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6,  2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1,  2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    mova [r2+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova [r0+r3*1], m1
    RET
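; note: reached when has_topright is zero; the missing top-right half is
; synthesized by broadcasting t7 (the lowpass of a constant run is the
; constant itself).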
.fix_tr:
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova        m3, [r0]
    shr        r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR     m2, m3, m6,  2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova [r4+r3*4], m6
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
;                                 int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; as in pred8x8l_down_right, has_topleft is guaranteed by the standard
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    mova        m2, [r0]
    shr        r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw       m2, m5
    mova [r0+r3*2], m0
    mova [r0+r3*1], m2
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
;                                int has_topright, int stride)
;-----------------------------------------------------------------------------
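; note: rows 4-7 of horizontal-up index past the end of the interpolated
; left column; the pshufhw masks and the final pshufd/PALIGNR stores
; replicate l7 there, as this mode requires.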
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd   m5, m4, m1
    punpcklwd   m4, m1
    mova [r2+r3*0], m5
    mova [r0+r3*0], m4
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2
    mova [r0+r3*2], m3
    mova [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+    16], %4
    mova [%1+    24], %5
%endif
%endmacro
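; note: MOV16 writes one 16-pixel (32-byte) row: two xmm stores on SSE2,
; or four mmx stores on MMX (the optional %4/%5 operands).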

%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1
    mov  r2d, 8
    mova  m0, [r0+ 0]
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov   r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
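; note: dc = (sum of the 16 top + sum of the 16 left pixels + 16) >> 5;
; HADDW reduces the top row while the %rep loop below accumulates the
; left column into r3d/r4d.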
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    lea        r0, [r0+r1-2]
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    SPLATW     m0, m0
    paddw      m0, [pw_8]
    psrlw      m0, 4
    mov       r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0

    sub        r0, 2
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]
    shr       r3d, 4

    movd       m0, r3d
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova       m0, [pw_512]
    mov       r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC