vp8dsp.asm 78.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
20
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 22
;******************************************************************************

23 24
%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46

SECTION_RODATA

; Horizontal 4-tap coefficients as 16-bit word pairs (two adjacent taps per
; pair) so that one pmaddwd multiplies two neighbouring pixels at once.
; Every row is 16 bytes (4 copies of one pair); a filter is two consecutive
; rows (taps 0/1, then taps 2/3).  put_vp8_epel4_h4_mmxext reads the rows at
; mx*16-16 and mx*16, which lands on the first row of a filter only for odd
; mx -- presumably the only values routed to the 4-tap code (TODO confirm
; against the C-side dispatch).
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; Horizontal 6-tap coefficients, same word-pair layout: three 16-byte rows
; per filter (taps 0/1, 2/3, 4/5).  put_vp8_epel4_h6_mmxext reads the rows at
; mx*24-48, mx*24-32 and mx*24-16, i.e. filter (mx/2 - 1) for even mx
; (presumably mx is 2, 4 or 6 here -- TODO confirm against the caller).
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

47 48 49 50 51 52 53 54
; 4-tap coefficients as signed byte pairs for pmaddubsw (unsigned pixel
; bytes times signed coefficient bytes).  Each row is 16 bytes (8 copies of
; one pair); a filter is two consecutive rows (taps 0/1, then taps 2/3).
; The SSSE3 h4 and v4 functions read the rows at idx*16-16 and idx*16, where
; idx is mx or my.
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101

; 6-tap coefficients as signed byte pairs for pmaddubsw.  Note the pairing:
; the three 16-byte rows of a filter hold (tap0, tap5), (tap1, tap2) and
; (tap3, tap4) -- compare with the tap order in sixtap_filter_v_m below.
; This matches the pixel pairing produced by filter_h6_shuf1/2/3 and by the
; row interleaving in the SSSE3 v6 function.
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap coefficients for pmullw: one 16-byte row per tap, the tap broadcast
; to 8 words.  A filter is four consecutive rows (64 bytes); the users index
; it with idx*32-32 (idx = mx or my), which is a filter boundary only for
; odd idx.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap coefficients for pmullw: one 16-byte row per tap, six rows
; (96 bytes) per filter, taps in natural order 0..5.  The users index it
; with idx*48-96 (idx = mx or my).
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
; Bilinear weights as words: row n-1 (16 bytes) holds the value n broadcast
; to 8 words, for n = 1..7.  The MMXEXT/SSE2 bilinear functions fetch the
; row for weight w with [table + w*16 - 16], once for w = frac and once for
; w = 8 - frac.
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs for pmaddubsw: row n-1 holds the pair
; (8-n, n) repeated 8 times, so a single row already carries both weights.
; The SSSE3 bilinear functions read the row at frac*16-16.
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

118
; Table base aliases used in effective addresses such as
; [sixtap_filter_hb+r5*8-48].
;  - PIC builds: every alias expands to r11.  Each function first executes
;    "lea r11, [<table>_m]" and then indexes off r11, because a RIP-relative
;    table address cannot be combined with an index register.  All aliases
;    share the one register, so only one table base is live at a time.
;  - non-PIC builds: the alias is simply the table label itself.
%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif

138
; pshufb control masks (16 bytes each) that turn a run of source bytes into
; the byte pairs consumed by pmaddubsw:
;   filter_h2_shuf  : (i,   i+1)  -- 4-tap taps 0/1, also the bilinear h pair
;   filter_h4_shuf  : (i+2, i+3)  -- 4-tap taps 2/3
;   filter_h6_shuf1 : (i,   i+5)  -- 6-tap taps 0/5
;   filter_h6_shuf2 : (i+1, i+2)  -- 6-tap taps 1/2
;   filter_h6_shuf3 : (i+3, i+4)  -- 6-tap taps 3/4
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144

Ronald S. Bultje's avatar
Ronald S. Bultje committed
145 146 147
; IDCT multipliers for pmulhw (see VP8_MULTIPLY_SUMSUB).  17734 is half of
; 35468: that macro doubles its input before multiplying by 17734.
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; Byte pairs (w, 63) repeated 8 times.  Their users are not in this part of
; the file; NOTE(review): presumably pmaddubsw weights paired with a
; rounding term -- confirm at the point of use.
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

152
cextern pb_1
153
cextern pw_3
154
cextern pb_3
155
cextern pw_4
156
cextern pb_4
157 158 159 160
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
161
cextern pw_64
162 163 164
cextern pb_80
cextern pb_F8
cextern pb_FE
165 166 167 168 169 170 171 172 173 174 175

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
%macro FILTER_SSSE3 3
; Horizontal 6-tap subpel filter, SSSE3.
; Arguments as in the prototype above: r0 = dst, r1 = dst stride, r2 = src,
; r3 = src stride, r4d = height, r5d = mx.  One output row per iteration.
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]                   ; mx*3, scaled by 8 below: 24 bytes per mx step
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32] ; m5 = taps 0/5, m6 = taps 1/2,
    mova      m7, [sixtap_filter_hb+r5*8-16] ; m7 = taps 3/4 (table row order)

.nextrow:
    movu      m0, [r2-2]                   ; source bytes starting 2 left of the output pixel
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]                   ; pairs (src[i-2], src[i+3]) = taps 0/5
%else
    pshufb    m0, [filter_h6_shuf1]        ; pairs (i, i+5)
%endif
    pshufb    m1, m3                       ; pairs (i+1, i+2)
    pshufb    m2, m4                       ; pairs (i+3, i+4)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]                  ; rounding for the >>7 below
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

; Horizontal 4-tap subpel filter, SSSE3.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r5d = mx.  One output row per iteration.
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4                        ; mx*16: byte offset of a 16-byte table row
    mova      m2, [pw_64]                  ; rounding constant, kept in a register
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]    ; m5 = taps 0/1, m6 = taps 2/3

.nextrow:
    movu      m0, [r2-1]                   ; source bytes starting 1 left of the output pixel
    mova      m1, m0
    pshufb    m0, m3                       ; pairs (i,   i+1)
    pshufb    m1, m4                       ; pairs (i+2, i+3)
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0                       ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

; Vertical 4-tap subpel filter, SSSE3.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r6d = my.  Three source rows stay in m0..m2 and are rotated each
; iteration, so only one new row is loaded per output row.
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4                        ; my*16: byte offset of a 16-byte table row
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16] ; taps 0/1
    mova      m6, [fourtap_filter_hb+r6]    ; taps 2/3
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]                     ; row -1
    movh      m1, [r2+  r3]                ; row  0
    movh      m2, [r2+2*r3]                ; row +1
    add       r2, r3                       ; r2 is back at row 0

.nextrow:
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1                       ; rotate: old row 0 becomes row -1
    punpcklbw m4, m1                       ; interleave rows -1/0 for taps 0/1
    mova      m1, m2
    punpcklbw m2, m3                       ; interleave rows +1/+2 for taps 2/3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7                       ; rounding for the >>7 below
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

; Vertical 6-tap subpel filter, SSSE3.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r6d = my.  Five source rows stay in m0..m4; the coefficients are used
; straight from memory because all eight vector registers are taken.
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]  ; r6 = table + my*24; rows at r6-48/-32/-16

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]                     ; row -2
    movh      m1, [r2+r3]                  ; row -1
    movh      m2, [r2+r3*2]                ; row  0
    lea       r2, [r2+r3*2]
    add       r2, r3                       ; r2 now points at row +1
    movh      m3, [r2]                     ; row +1
    movh      m4, [r2+r3]                  ; row +2

.nextrow:
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5                       ; rows -2/+3 for taps 0/5
    mova      m0, m1
    punpcklbw m1, m2                       ; rows -1/0  for taps 1/2
    mova      m7, m3
    punpcklbw m7, m4                       ; rows +1/+2 for taps 3/4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                       ; rotate the row window down by one
    paddsw    m6, [pw_64]
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7

341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
; 4-pixel-wide block, H-only 4-tap filter (height comes in r4d)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r5d = mx
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

; 4-pixel-wide block, H-only 6-tap filter (height comes in r4d)
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r5d = mx
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; mm3 was used as scratch above; zero it again
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

INIT_XMM
446 447
; 8-pixel-wide block, H-only 4-tap filter, SSE2.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r5d = mx.  Uses the one-tap-per-row word table (fourtap_filter_v) with
; pmullw on four shifted copies of the source row.
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
    shl      r5d, 5                        ; mx*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r5, [fourtap_filter_v+r5-32] ; r5 = first of this filter's four rows
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m4, [pw_64]
    mova      m5, [r5+ 0]
    mova      m6, [r5+16]
%ifdef m8
    ; m8 and up exist only where 16 vector registers are available; otherwise
    ; taps 2 and 3 are read from memory inside the loop
    mova      m8, [r5+32]
    mova      m9, [r5+48]
%endif

.nextrow:
    movq      m0, [r2-1]
    movq      m1, [r2-0]
    movq      m2, [r2+1]
    movq      m3, [r2+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4                       ; rounding for the >>7 below
    psraw     m0, 7
    packuswb  m0, m7                       ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

493
; 8-pixel-wide block, H-only 6-tap filter, SSE2.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r5d = mx.  Uses the one-tap-per-row word table (sixtap_filter_v) with
; pmullw on six shifted copies of the source row.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
    lea      r5d, [r5*3]
    shl      r5d, 4                        ; mx*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r5, [sixtap_filter_v+r5-96]  ; r5 = first of this filter's six rows
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m6, [pw_64]
%ifdef m8
    ; with 16 vector registers available, keep all six taps in registers
    mova      m8, [r5+ 0]
    mova      m9, [r5+16]
    mova     m10, [r5+32]
    mova     m11, [r5+48]
    mova     m12, [r5+64]
    mova     m13, [r5+80]
%endif

.nextrow:
    movq      m0, [r2-2]
    movq      m1, [r2-1]
    movq      m2, [r2-0]
    movq      m3, [r2+1]
    movq      m4, [r2+2]
    movq      m5, [r2+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [r5+ 0]
    pmullw    m1, [r5+16]
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
    pmullw    m4, [r5+64]
    pmullw    m5, [r5+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                       ; rounding for the >>7 below
    psraw     m0, 7
    packuswb  m0, m7                       ; clip and word->bytes
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 3
; V-only 4-tap filter; block width is the second macro argument (the macro
; is instantiated for widths 4 and 8), height comes in r4d.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r6d = my
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                        ; my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32] ; r6 = first of this filter's four rows
    mova      m6, [pw_64]
    pxor      m7, m7                       ; zero, for byte->word unpacking
    mova      m5, [r6+48]                  ; tap 3 stays in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]                     ; row -1
    movh      m1, [r2+  r3]                ; row  0
    movh      m2, [r2+2*r3]                ; row +1
    add       r2, r3                       ; r2 is back at row 0
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep the unmultiplied new row for the rotation
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET


; V-only 6-tap filter; block width is the second macro argument (the macro
; is instantiated for widths 4 and 8), height comes in r4d.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r6d = my
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                   ; my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]  ; r6 = first of this filter's six rows
    pxor      m7, m7                       ; zero, for byte->word unpacking

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]                     ; row -2
    movh      m1, [r2+r3]                  ; row -1
    movh      m2, [r2+r3*2]                ; row  0
    lea       r2, [r2+r3*2]
    add       r2, r3                       ; r2 now points at row +1
    movh      m3, [r2]                     ; row +1
    movh      m4, [r2+r3]                  ; row +2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1                       ; rotate the row window while accumulating
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8

679 680 681 682 683 684 685 686 687
%macro FILTER_BILINEAR 3
; Vertical bilinear filter; two output rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r6d = my.  r5 is reused as scratch for the complementary weight offset.
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                        ; my*16
    sub      r5d, r6d                      ; (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                       ; zero: unpack source and pavgw operand
    mova      m4, [bilinear_filter_vw+r5-16] ; weight 8-my (upper row)
    mova      m5, [bilinear_filter_vw+r6-16] ; weight my   (lower row)

.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row is the upper row of the 2nd output
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; with m6 = 0 this is (x + 1) >> 1, so
    pavgw     m2, m6                       ; overall ((sum >> 2) + 1) >> 1
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

; Horizontal bilinear filter; two output rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r5d = mx.  r6 is overwritten and used as scratch for the complementary
; weight offset.
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                        ; mx*16
    sub      r6d, r5d                      ; (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                       ; zero: unpack source and pavgw operand
    mova      m4, [bilinear_filter_vw+r6-16] ; weight 8-mx (left pixel)
    mova      m5, [bilinear_filter_vw+r5-16] ; weight mx   (right pixel)

.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                       ; with m6 = 0 this is (x + 1) >> 1, so
    pavgw     m2, m6                       ; overall ((sum >> 2) + 1) >> 1
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

777 778
%macro FILTER_BILINEAR_SSSE3 1
; Vertical bilinear filter, SSSE3; two output rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r6d = my.  One table row holds both weights as a byte pair, so a single
; pmaddubsw on interleaved rows does the whole weighted sum.
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                        ; my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                       ; zero operand for pavgw
    mova      m3, [bilinear_filter_vb+r6-16] ; byte pairs (8-my, my)

.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                       ; rows 0/1 interleaved
    punpcklbw m1, m2                       ; rows 1/2 interleaved
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; with m4 = 0 this is (x + 1) >> 1
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

814
; Horizontal bilinear filter, SSSE3; two output rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = height,
; r5d = mx.  filter_h2_shuf pairs each byte with its right neighbour so a
; single pmaddubsw against the (8-mx, mx) byte pairs does the weighted sum.
cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                        ; mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                       ; zero operand for pavgw
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16] ; byte pairs (8-mx, mx)

.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2                       ; pairs (i, i+1)
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                       ; with m4 = 0 this is (x + 1) >> 1
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1                       ; both rows in one register
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
849 850 851 852 853 854
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
855

856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895
; Plain copy of an 8-byte-wide block, two rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count.
cglobal put_vp8_pixels8_mmx, 5,5
.copy_rows:
    movq    mm0, [r2]                      ; first source row
    movq    mm1, [r2+r3]                   ; second source row
    lea     r2,  [r2+r3*2]                 ; src += 2 rows
    movq    [r0],    mm0
    movq    [r0+r1], mm1
    lea     r0,  [r0+r1*2]                 ; dst += 2 rows
    sub     r4d, 2
    jg      .copy_rows
    REP_RET

; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
; iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count.
cglobal put_vp8_pixels16_mmx, 5,5
.copy_rows:
    movq    mm0, [r2]                      ; row 0, bytes 0..7
    movq    mm1, [r2+8]                    ; row 0, bytes 8..15
    movq    mm2, [r2+r3]                   ; row 1, bytes 0..7
    movq    mm3, [r2+r3+8]                 ; row 1, bytes 8..15
    lea     r2,  [r2+r3*2]                 ; src += 2 rows
    movq    [r0],      mm0
    movq    [r0+8],    mm1
    movq    [r0+r1],   mm2
    movq    [r0+r1+8], mm3
    lea     r0,  [r0+r1*2]                 ; dst += 2 rows
    sub     r4d, 2
    jg      .copy_rows
    REP_RET

; Plain copy of a 16-byte-wide block, two rows per iteration.
; r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count.
; Loads are unaligned (movups); stores use movaps, so every dst row must be
; 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_rows:
    movups  xmm0, [r2]                     ; first source row
    movups  xmm1, [r2+r3]                  ; second source row
    lea     r2,   [r2+r3*2]                ; src += 2 rows
    movaps  [r0],    xmm0
    movaps  [r0+r1], xmm1
    lea     r0,   [r0+r1*2]                ; dst += 2 rows
    sub     r4d,  2
    jg      .copy_rows
    REP_RET

896 897 898 899
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919
; ADD_DC pos, neg, offset, mov_op
;   Applies a DC value to four pixel rows with unsigned saturation.
;   %1 = register with the amount to add, %2 = register with the amount to
;   subtract, %3 = byte offset inside each row, %4 = load/store instruction.
;   Rows are addressed as r0, r0+r2, r1, r1+r2, so the caller sets
;   r1 = r0 + 2*r2 beforehand.  Uses m2..m5 as scratch.
%macro ADD_DC 4
    ; load four rows
    %4          m2, [r0+%3]
    %4          m3, [r0+r2+%3]
    %4          m4, [r1+%3]
    %4          m5, [r1+r2+%3]
    ; add the positive part, saturating at 255
    paddusb     m2, %1
    paddusb     m3, %1
    paddusb     m4, %1
    paddusb     m5, %1
    ; subtract the negative part, saturating at 0
    psubusb     m2, %2
    psubusb     m3, %2
    psubusb     m4, %2
    psubusb     m5, %2
    ; write the rows back
    %4          [r0+%3], m2
    %4          [r0+r2+%3], m3
    %4          [r1+%3], m4
    %4          [r1+r2+%3], m5
%endmacro

INIT_MMX
920 921
; DC-only inverse transform for one 4x4 block, MMX.
; r0 = dst, r1 = block (coefficients), r2 = stride.
; The rounded DC is split into a non-negative part (m0) and a negated part
; (m1), each clamped to unsigned bytes, so ADD_DC can apply it with
; saturating unsigned add/subtract.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       m0, [r1]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3                       ; (dc + 4) >> 3
    movd      [r1], m1                     ; clear the coefficient in the block (32-bit zero store)
    psubw      m1, m0                      ; m1 = -dc
    packuswb   m0, m0                      ; negative values clamp to 0 here
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0                      ; low 4 bytes now all hold the DC byte
    punpcklwd  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]               ; r1 = third row, as ADD_DC expects
    ADD_DC     m0, m1, 0, movh
    RET

942
INIT_XMM
943 944
; DC-only inverse transform for one 4x4 block, SSE4.
; r0 = dst, r1 = block (coefficients), r2 = stride.
; All 16 pixels are widened to words in two registers, the signed DC is
; added directly, and packuswb does the clipping.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       m0, [r1]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd     [r1], m1                      ; clear the coefficient in the block (32-bit zero store)
    lea        r1, [r0+r2*2]               ; r1 = third row of dst
    movd       m2, [r0]
    movd       m3, [r0+r2]
    movd       m4, [r1]
    movd       m5, [r1+r2]
    psraw      m0, 3                       ; (dc + 4) >> 3
    pshuflw    m0, m0, 0                   ; broadcast word 0 across the low 4 words
    punpcklqdq m0, m0                      ; ... and across all 8 words
    punpckldq  m2, m3                      ; rows 0/1 side by side
    punpckldq  m4, m5                      ; rows 2/3 side by side
    punpcklbw  m2, m1                      ; byte->word
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                      ; clip; m2 = rows 0..3, 4 bytes each
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET

;-----------------------------------------------------------------------------
973
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
974 975 976
;-----------------------------------------------------------------------------

INIT_MMX
977
; DC-only inverse transform for four horizontally adjacent 4x4 blocks
; (16 pixels wide, 4 rows), MMX.
; r0 = dst, r1 = block[4][16] (blocks are 32 bytes apart), r2 = stride.
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6      ; clear the four DC coefficients (32-bit zero stores)
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3         ; (dc + 4) >> 3 for each block
    psubw     m6, m0        ; m6 = -dc
    packuswb  m0, m0        ; positive parts, negatives clamp to 0
    packuswb  m6, m6        ; negated parts, clamped the same way
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2] ; r1 = third row, as ADD_DC expects
    ADD_DC    m0, m6, 0, mova ; left 8 pixels (blocks A, B)
    ADD_DC    m1, m7, 8, mova ; right 8 pixels (blocks C, D)
    RET

INIT_XMM
1012
; DC-only inverse transform for four horizontally adjacent 4x4 blocks
; (16 pixels wide, 4 rows), SSE2.
; r0 = dst, r1 = block[4][16] (blocks are 32 bytes apart), r2 = stride.
; Rows are accessed with mova, so dst rows must be 16-byte aligned.
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m1      ; clear the four DC coefficients (32-bit zero stores)
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw     m0, 3         ; (dc + 4) >> 3 for each block
    psubw     m1, m0        ; m1 = -dc
    packuswb  m0, m0        ; positive parts, negatives clamp to 0
    packuswb  m1, m1        ; negated parts, clamped the same way
    punpcklbw m0, m0        ; AABBCCDD..
    punpcklbw m1, m1
    punpcklbw m0, m0        ; AAAABBBBCCCCDDDD
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2] ; r1 = third row, as ADD_DC expects
    ADD_DC    m0, m1, 0, mova
    RET
1040

1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
; DC-only inverse transform for four 4x4 blocks laid out 2x2 (8 pixels
; wide, 8 rows): A and B cover the top four rows, C and D the next four.
; r0 = dst, r1 = block[4][16] (blocks are 32 bytes apart), r2 = stride.
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; gather the four DC words into one register
    movd        m0, [r1+32*0]              ; A
    movd        m1, [r1+32*2]              ; C
    punpcklwd   m0, [r1+32*1]              ; A B
    punpcklwd   m1, [r1+32*3]              ; C D
    punpckldq   m0, m1                     ; A B C D
    pxor        m6, m6

    ; round the DCs, clear them in the block, and split each one into a
    ; non-negative part (m0/m1) and a negated part (m6/m7)
    paddw       m0, [pw_4]
    movd        [r1+32*0], m6
    movd        [r1+32*1], m6
    movd        [r1+32*2], m6
    movd        [r1+32*3], m6
    psraw       m0, 3
    psubw       m6, m0
    packuswb    m0, m0
    packuswb    m6, m6
    punpcklbw   m0, m0                     ; AABBCCDD
    punpcklbw   m6, m6                     ; AABBCCDD
    movq        m1, m0
    movq        m7, m6
    punpcklbw   m0, m0                     ; AAAABBBB
    punpckhbw   m1, m1                     ; CCCCDDDD
    punpcklbw   m6, m6                     ; AAAABBBB
    punpckhbw   m7, m7                     ; CCCCDDDD

    ; top four rows get A/B, the four rows below get C/D
    lea         r1, [r0+r2*2]
    ADD_DC      m0, m6, 0, mova
    lea         r0, [r0+r2*4]
    lea         r1, [r1+r2*4]
    ADD_DC      m1, m7, 0, mova
    RET

Ronald S. Bultje's avatar
Ronald S. Bultje committed
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
;           %3/%4 are temporary registers and are clobbered
; mul_20091(x) is computed as x + pmulhw(x, 20091)
; mul_35468(x) is computed as pmulhw(2*x, 17734), since 35468 = 2*17734 does
; not fit in a signed 16-bit word
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1 ; double the inputs before multiplying by 17734
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %1-%4 are mm register indexes holding the four input rows
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
;           (SUMSUB_BA and SWAP are defined outside this file; the two SWAPs
;           rename registers so that the results land in %1-%4 in order)
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

INIT_MMX
; VP8_IDCT_ADD <opt>
; emits vp8_idct_add_<opt>; r0 = dst, r1 = block, r2 = stride
; <opt> = sse only changes how the coefficient block is cleared (two 16-byte
; stores from xmm0 instead of four 8-byte MMX stores); the transform itself
; always runs on MMX registers
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
    ; clear the 32-byte coefficient block now that it has been loaded
%ifidn %1, sse
    ; xmm0 is a different register from mm0 (= m0 under INIT_MMX), so the
    ; loaded coefficients are untouched; movaps needs r1 16-byte aligned
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]  ; rounding bias for the >>3 done while storing
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    ; STORE_DIFFx2 is defined outside this file; it is called with a zero
    ; register (m4), a shift of 3 and the row pointers r0 / r1 = r0+2*stride
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse
Ronald S. Bultje's avatar
Ronald S. Bultje committed
1157

1158 1159 1160 1161
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------

1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180
; SCATTER_WHT a, b, base
; a/b are mm register indexes holding four words each, base is a block index.
; Word i of m<a> is stored to the first coefficient of block (4*i + base) and
; word i of m<b> to the first coefficient of block (4*i + base + 1), where
; blocks start at r0 and are 2*16 bytes apart.
; Clobbers r1, r2, m<a> and m<b>.
%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16           ; bring word 1 into the low half
    shr   r2d, 16
    psrlq m%1, 32           ; bring words 2/3 into the low dword
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16           ; bring word 3 into the low half
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro

; HADAMARD4_1D a, b, c, d
; one 1-D pass of the 4-point Hadamard transform over mm registers a-d,
; built from two sum/difference stages (SUMSUB_BADC is defined outside this
; file); the trailing SWAP renames registers so the outputs are back in %1-%4
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

1189 1190
; VP8_DC_WHT <opt>
; emits vp8_luma_dc_wht_<opt>; r0 = block, r1 = dc, r2 is scratch
; <opt> = sse only changes how the dc[] input is cleared (two 16-byte stores
; from xmm0 instead of four 8-byte MMX stores)
%macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_%1, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
    ; clear the 32 bytes of dc[] now that they have been loaded
%ifidn %1, sse
    ; xmm0 is a different register from mm0 (= m0 under INIT_MMX);
    ; movaps needs r1 16-byte aligned
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]  ; rounding bias for the >>3 below
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    ; write each result word to the first coefficient of its block
    ; (this clobbers r1 and r2)
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

INIT_MMX
VP8_DC_WHT mmx
VP8_DC_WHT sse
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------

; READ_8x4_INTERLEAVED takes 7 mm register indexes followed by 4 regular
; registers:
;   %1-%4   receive the byte-interleaved row pairs A/B, C/D, E/F and G/H
;   %5-%7   scratch (a single one would do; three give an out-of-order CPU
;           more freedom to overlap the loads)
;   %8/%9   buf+4*stride and buf+5*stride
;   %10/%11 -stride and +stride
%macro READ_8x4_INTERLEAVED 11
    ; rows A and B
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    punpcklbw     m%1, m%5          ; A/B interleaved

    ; rows C and D
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    punpcklbw     m%2, m%6          ; C/D interleaved

    ; rows E and F
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    punpcklbw     m%3, m%7          ; E/F interleaved

    ; rows G and H (the first scratch register is free again by now)
    movd          m%4, [%9+%11]     ; G0-3
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
;
; on exit m%1 = A/B/I/J, m%2 = C/D/K/L, m%3 = E/F/M/N and m%4 = G/H/O/P
; interleaved; m%5-m%7 are clobbered
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    ; address the lower 8 rows through the macro arguments rather than
    ; through fixed registers, so any register assignment works
    lea           %12, [%8+8*%11]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11          ; now %9 + 8*stride
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; WRITE_4x2D: store the two dwords of each of four mm registers as 8 rows
;   %1-%4  mm register indexes with the data; their low dword goes to rows
;          0/2/4/6 and their high dword to rows 1/3/5/7 respectively
;   %5/%6  buf+4*stride and buf+5*stride
;   %7/%8  -stride and +stride
; each source register ends up with its high dword in both halves
%macro WRITE_4x2D 8
    ; low dwords; each register is shuffled as soon as it has been stored
    movd    [%5+%7*4], m%1
    punpckhdq     m%1, m%1
    movd    [%5+%7*2], m%2
    punpckhdq     m%2, m%2
    movd         [%5], m%3
    punpckhdq     m%3, m%3
    movd      [%6+%8], m%4
    punpckhdq     m%4, m%4

    ; high dwords
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
;
; dword i of m%1/m%2/m%3/m%4 goes to row i / 4+i / 8+i / 12+i respectively
; all four xmm registers are shifted out; in the 8 case %5 is clobbered too
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    ; row 9 can be reached from %6 only when both halves share one buffer
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    ; park dword 1 of m%3 (row 9) in %5 until %7 has been advanced; %5 is
    ; not used as a pointer after this
    movd          %5d, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d          ; the parked row 9
    movd    [%6+%8*2], m%1          ; row 3, addressed via %6 (%5 is gone)
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

1373 1374 1375 1376 1377 1378 1379 1380 1381
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; (WRITE_2x4W, WRITE_8W_SSE2 and WRITE_8W_SSE4 all do this)
;
; WRITE_2x4W arguments:
; 1 and 2 are the registers to write, 4 words each (1 covers lines 0-3,
;   2 covers lines 4-7)
; 3 is a general-purpose register that we will clobber
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
;
; %4 is moved around while storing but holds its original value again at
; the end; %1 and %2 are clobbered
%macro WRITE_2x4W 6
    ; words 0/1 of %1 -> lines 0/1
    movd            %3d, %1
    punpckhdq        %1, %1
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    ; words 2/3 of %1 -> lines 2/3
    movd            %3d, %1
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    ; words 0/1 of %2 -> lines 4/5
    movd            %3d, %2
    punpckhdq        %2, %2
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    ; words 2/3 of %2 -> lines 6/7
    movd            %3d, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
    add              %4, %5         ; undo the add above
%endmacro

; WRITE_8W_SSE2: write the 8 words of an xmm register as 8 lines
; 1 is the register to write (shifted out in the process)
; 2 is a general-purpose register that we will clobber
; 3 is a pointer to the destination's 4th line; it is left pointing at the
;   5th line
; 4/5 is -stride and +stride
%macro WRITE_8W_SSE2 5
    ; words 0/1 -> lines 0/1
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3+%4*4], %2w
    shr              %2, 16
    add              %3, %5
    mov       [%3+%4*4], %2w

    ; words 2/3 -> lines 2/3
    movd            %2d, %1
    psrldq           %1, 4
    add              %3, %4
    mov       [%3+%4*2], %2w
    shr              %2, 16
    mov       [%3+%4  ], %2w

    ; words 4/5 -> lines 4/5
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3     ], %2w
    shr              %2, 16
    mov       [%3+%5  ], %2w

    ; words 6/7 -> lines 6/7
    movd            %2d, %1
    add              %3, %5
    mov       [%3+%5  ], %2w
    shr              %2, 16
    mov       [%3+%5*2], %2w
%endmacro

; WRITE_8W_SSE4: write the 8 words of an xmm register as 8 lines, storing
; each word straight to memory with pextrw
; 1 is the register to write (left unmodified)
; 2 is a pointer to the destination's 5th line
; 3 is a pointer to the destination's 4th line
; 4/5 is -stride and +stride
%macro WRITE_8W_SSE4 5
    pextrw    [%3+%4*4], %1, 0      ; line 0
    pextrw    [%2+%4*4], %1, 1      ; line 1
    pextrw    [%3+%4*2], %1, 2      ; line 2
    pextrw    [%3+%4  ], %1, 3      ; line 3
    pextrw    [%3     ], %1, 4      ; line 4
    pextrw    [%2     ], %1, 5      ; line 5
    pextrw    [%2+%5  ], %1, 6      ; line 6
    pextrw    [%2+%5*2], %1, 7      ; line 7
%endmacro

1448
; SPLATB_REG_<opt> dst, src[, zero]
; copy the low byte of general-purpose register src into every byte of the
; mm/xmm register dst; all variants accept the same argument list so that
; callers can select one through a SPLATB_REG define
; the third argument is only used by the SSSE3 variant

; MMX: double the filled width with each unpack (byte -> word -> dword -> qword)
%macro SPLATB_REG_MMX 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    punpcklwd      %1, %1
    punpckldq      %1, %1
%endmacro

; MMXEXT: duplicate the byte into a word, then broadcast word 0 with pshufw
%macro SPLATB_REG_MMXEXT 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshufw         %1, %1, 0x0
%endmacro

; SSE2: broadcast word 0 over the low qword, then copy it to the high qword
%macro SPLATB_REG_SSE2 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%endmacro

; SSSE3: %3 is the pshufb control; the callers in this file pass a register
; they have zeroed, which makes every destination byte select byte 0
%macro SPLATB_REG_SSSE3 3
    movd           %1, %2d
    pshufb         %1, %3
%endmacro

1473 1474
; SIMPLE_LOOPFILTER <opt>, <dir>, <gprs>, <xmmregs>
; emits vp8_<dir>_loop_filter_simple_<opt>
;   %1 instruction set name (only compared against sse2 / sse4 below)
;   %2 direction, v or h
;   %3 number of general-purpose registers handed to cglobal
;   %4 number of xmm registers handed to cglobal
; on entry r0 = dst, r1 = stride, r2 = flim
; SPLATB_REG (and WRITE_8W for the 16-byte h variants) must be %defined to
; the matching implementation before the macro is invoked
; the mmx/mmxext variants handle 8 pixels per pass and run two passes
%macro SIMPLE_LOOPFILTER 4
cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2            ; pass counter
%endif
%ifnidn %1, sse2
%if mmsize == 16
    pxor           m0, m0           ; zero register handed to SPLATB_REG
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1           ; r2 = +stride
    neg            r1               ; r1 = -stride
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0           ; flip the top bit: unsigned -> signed
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]      ; drop bit 0 so the wide shift cannot
                                    ; leak bits into the neighbouring byte
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    ; there is no per-byte arithmetic shift, so positive and negative values
    ; are shifted as magnitudes and applied with opposite saturating ops
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    inc           r0                ; only the two middle columns are written
    SBUTTERFLY    bw, 6, 4, 0

%if mmsize == 16 ; sse2
%ifidn %1, sse4
    inc            r4               ; sse4 uses r4 as a pointer, not as scratch
%endif
    WRITE_8W       m6, r4, r0, r1, r2
    lea            r4, [r3+r1+1]
%ifidn %1, sse4
    inc            r3
%endif
    WRITE_8W       m4, r3, r4, r1, r2
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, r4, r0, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

; instantiate the simple loop filter for every supported instruction set;
; SPLATB_REG / WRITE_8W select the helper macros used inside the body
INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
SIMPLE_LOOPFILTER mmx,    v, 4, 0
SIMPLE_LOOPFILTER mmx,    h, 5, 0
%define SPLATB_REG SPLATB_REG_MMXEXT
SIMPLE_LOOPFILTER mmxext, v, 4, 0
SIMPLE_LOOPFILTER mmxext, h, 5, 0
INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
SIMPLE_LOOPFILTER sse2,   v, 3, 8
SIMPLE_LOOPFILTER sse2,   h, 5, 8
%define SPLATB_REG SPLATB_REG_SSSE3
SIMPLE_LOOPFILTER ssse3,  v, 3, 8
SIMPLE_LOOPFILTER ssse3,  h, 5, 8
; sse4 only changes WRITE_8W, which only the h variant uses
%define WRITE_8W   WRITE_8W_SSE4
SIMPLE_LOOPFILTER sse4,   h, 5, 8
1630 1631

;-----------------------------------------------------------------------------
1632
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1633 1634 1635
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

1636 1637 1638 1639 1640 1641 1642 1643 1644 1645
%macro INNER_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
1657 1658
%endif
%define dst_reg     r0
1659 1660 1661 1662 1663 1664
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

1665 1666
%ifnidn %1, sse2
%if mmsize == 16
1667 1668
    pxor             m7, m7
%endif
1669
%endif
1670

1671 1672
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
1673 1674 1675
    SPLATB_REG       m0, E_reg, m7   ; E
    SPLATB_REG       m1, I_reg, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh
1676 1677

    ; align stack
1678
    mov       stack_reg, rsp         ; backup stack pointer
1679 1680 1681 1682 1683
    and             rsp, ~(mmsize-1) ; align stack
%ifidn %2, v
    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
%else ; h
1684
    sub             rsp, mmsize * 5  ; extra storage space for transposes
1685 1686 1687 1688 1689 1690
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
1691 1692
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
1704 1705
%define p0backup m12
%define q0backup m8
1706 1707

    ; splat function arguments
1708 1709 1710
    SPLATB_REG   flim_E, E_reg, m7   ; E
    SPLATB_REG   flim_I, I_reg, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
1711 1712
%endif

1713
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
1714
    mov         cnt_reg, 2
1715
%endif
1716 1717
    mov      stride_reg, mstride_reg
    neg     mstride_reg
1718
%ifidn %2, h
1719
    lea         dst_reg, [dst_reg + stride_reg*4-4]
1720 1721 1722
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
1723 1724 1725 1726 1727 1728
%endif

%if mmsize == 8
.next8px
%endif
    ; read
1729
    lea        dst2_reg, [dst_reg + stride_reg]
1730
%ifidn %2, v
1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
1752 1753
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
1754 1755 1756 1757 1758 1759 1760
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]
1761 1762 1763

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1764
    mova       q0backup, m1
1765
    movu             m7, [dst2_reg+ stride_reg*2]
1766 1767 1768 1769
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1770 1771
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
1772
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1773
    mova       p0backup, m5          ; store p0
1774 1775 1776 1777 1778
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
1779
%if %4 == 16
1780
    lea        dst8_reg, [dst_reg + stride_reg*8]
1781
%endif
1782 1783

    ; read 16 rows of 8px each, interleave
1784 1785 1786 1787 1788 1789 1790 1791
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
1792 1793 1794 1795 1796
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

1797 1798 1799 1800 1801
    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
1802 1803
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
1804 1805
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
1806 1807 1808 1809
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
1810
%ifdef m8
1811
    SWAP              1, 8
1812
%else
1813
    mova       q0backup, m1
1814
%endif
1815 1816
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
1817 1818 1819 1820 1821
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
1822
%ifdef m8
1823 1824
    SWAP              1, 8
    SWAP              2, 8
1825
%else
1826 1827
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
1828 1829
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
1830
%ifdef m12
1831
    SWAP              5, 12
1832
%else
1833
    mova       p0backup, m5          ; store p0
1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
1867
    mova             m4, flim_I
1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
1889 1890 1891 1892 1893
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
1894
    SWAP              3, 12
1895
%else
1896
    mova             m3, p0backup
1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
1913
    mova       mask_res, m6
1914 1915 1916 1917 1918 1919 1920
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
1921 1922 1923 1924 1925
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
1926
    SWAP              4, 8
1927
%else
1928
    mova             m4, q0backup
1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
1944
    mova             m6, mask_res
1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
1959
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

%ifdef m12
    SWAP              6, 12
%else
2043
    mova             m6, mask_res
2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a

    ; store
%ifidn %2, v
2071 2072 2073 2074 2075 2076 2077 2078 2079 2080
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow [dst_reg + stride_reg  ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg  ], m3
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
%endif
2081
%else ; h
2082 2083
    add         dst_reg, 2
    add        dst2_reg, 2
2084 2085 2086 2087 2088

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
2089
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
2090
%else ; sse2 (h)
2091
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
2092
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2093 2094 2095 2096
%endif
%endif

%if mmsize == 8
2097 2098 2099 2100 2101 2102 2103 2104
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 2
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
2105
%ifidn %2, h
2106
    lea         dst_reg, [dst_reg + stride_reg*8-2]
2107
%else ; v
2108
    add         dst_reg, 8
2109
%endif
2110
    dec         cnt_reg
2111 2112
    jg .next8px
%endif
2113
%endif
2114 2115

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
2116
    mov             rsp, stack_reg   ; restore stack pointer
2117
%endif
2118
    RET
2119 2120 2121
%endmacro

; Instantiate the inner-edge loop filter for every supported instruction set.
; Arguments: opt suffix, direction (v/h), GPR count, size (16 = luma,
; 8 = chroma), XMM register count. SPLATB_REG must be %defined to the
; matching byte-splat helper before each group is expanded.

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
INNER_LOOPFILTER mmx,    v, 6, 16, 0
INNER_LOOPFILTER mmx,    h, 6, 16, 0
INNER_LOOPFILTER mmx,    v, 6,  8, 0
INNER_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6,  8, 0
INNER_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
INNER_LOOPFILTER sse2,   v, 5, 16, 13
%ifdef m8 ; m8 exists: the luma h variant gets by with 5 GPRs
INNER_LOOPFILTER sse2,   h, 5, 16, 13
%else     ; no m8: the luma h variant needs a 6th GPR
INNER_LOOPFILTER sse2,   h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13

%define SPLATB_REG SPLATB_REG_SSSE3
INNER_LOOPFILTER ssse3,  v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
INNER_LOOPFILTER ssse3,  h, 6,  8, 13

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;
; MBEDGE_LOOPFILTER opt, dir, gprs, size, xmmregs
;   %1 opt     - suffix of the emitted symbol (mmx, mmxext, sse2, ssse3, sse4);
;                also selects the instruction-set specific code paths below
;   %2 dir     - v: rows around dst are loaded/stored directly
;                h: 8 pixels per row are loaded and transposed before
;                   filtering, then transposed back on store
;   %3 gprs    - forwarded to cglobal (second numeric argument)
;   %4 size    - 16 = luma (loop_filter16y), 8 = chroma (loop_filter8uv, which
;                takes the extra second-plane pointer in dst8_reg)
;   %5 xmmregs - forwarded to cglobal (third numeric argument)
;
; SPLATB_REG (and WRITE_8W for the mmsize == 16 h variants) must be %defined
; before the macro is instantiated.
;
; NOTE: the code tracks values through x86inc SWAP renames; the order of
; instructions and SWAPs is significant.
;-----------------------------------------------------------------------------

%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
; E_reg/I_reg/hev_thr_reg are reused once their values have been splatted:
; they become the positive stride, a second row pointer and the saved rsp.
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

; ssse3_or_higher = 1 for every 16-byte-register variant other than sse2
%define ssse3_or_higher 0
%ifnidn %1, sse2
%if mmsize == 16
%define ssse3_or_higher 1
%endif
%endif

%if ssse3_or_higher
    pxor             m7, m7          ; zero register handed to SPLATB_REG below
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, m7   ; E
    SPLATB_REG       m1, I_reg, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
    sub             rsp, mmsize * 7
%else
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
                                     ;               [4]=filter tmp result
                                     ;               [5]/[6] = p2/q2 backup
                                     ;               [7]=lim_res sign result
%endif

; Several names deliberately share a slot (mask_res/p0backup,
; lim_res/q0backup, and for mmsize == 16 lim_sign/flim_E).
%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%if mmsize == 16
%define lim_sign [rsp]
%else
%define lim_sign [rsp+mmsize*7]
%endif

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

; Same aliasing as the stack layout above, but held in m8-m14.
%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12
%define q0backup m8
%define p2backup m13
%define q2backup m14
%define lim_sign m9

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, m7   ; E
    SPLATB_REG   flim_I, I_reg, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2           ; luma is processed as two 8px passes
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    ; second chroma plane goes into the high halves
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova       p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova       q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]     ; clear bit 0 so the qword shift can't leak between bytes
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var   ; xor with 0x80: unsigned -> signed bytes
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
    ; Three stages follow, computing (27*w+63)>>7, (18*w+63)>>7 and
    ; (9*w+63)>>7 and applying them to p0/q0, p1/q1 and p2/q2 respectively.
%if ssse3_or_higher
    mova             m7, [pb_1]
%else
    mova             m7, [pw_63]
%endif
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
%if ssse3_or_higher
    punpcklbw        m6, m7         ; interleave with "1" for rounding
    punpckhbw        m1, m7
%else
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
%endif
    mova       lim_sign, m0
%if ssse3_or_higher
    mova             m7, [pb_27_63]
%ifndef m8
    mova        lim_res, m1
%endif
%ifdef m10
    SWAP              0, 10         ; don't lose lim_sign copy
%endif
    mova             m0, m7
    pmaddubsw        m7, m6
    SWAP              6, 7
    pmaddubsw        m0, m1
    SWAP              1, 0
%ifdef m10
    SWAP              0, 10
%else
    mova             m0, lim_sign
%endif
%else
    mova       mask_res, m6         ; backup for later in filter
    mova        lim_res, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
%if ssse3_or_higher
    mova            m6, [pb_18_63]  ; pipelining
%endif
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%ifdef m10
    SWAP             0, 10
%endif
    ; lim_sign is (re)loaded into m0 for both paths right after the %endif
%else
    mova            m6, mask_res
    mova            m1, lim_res
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
%endif
    mova            m0, lim_sign
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
%if ssse3_or_higher
    mova            m6, [pb_9_63]
%endif
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%else
%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, mask_res
    mova            m1, lim_res
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%endif
%ifdef m9
    SWAP             7, 9
%else
    mova            m7, lim_sign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a2
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a2
    pandn           m7, m6          ; +a2
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, p2backup
    mova            m6, q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p2+a2
    psubusb         m6, m7          ; q2-a2

    ; store
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow   [dst2_reg], m5
    movrow [dst2_reg+ stride_reg  ], m6
%if mmsize == 16 && %4 == 8
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg  ], m2
    movhps   [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg
    inc        dst2_reg

    ; 4x8/16 transpose
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
%ifidn %1, sse4
    add        dst2_reg, 4
%endif
    WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
%ifidn %1, sse4
    lea        dst2_reg, [dst8_reg+ stride_reg]
%endif
    WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5           ; undo the inc + add 4 from the store above
%endif
    cmp         dst_reg, dst8_reg    ; equal once the second plane has been done
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

; Instantiate the macroblock-edge loop filter for every supported instruction
; set. Arguments: opt suffix, direction (v/h), GPR count, size (16 = luma,
; 8 = chroma), XMM register count. SPLATB_REG (and WRITE_8W for the XMM
; variants) must be %defined before each group is expanded.

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 15
%ifdef m8 ; m8 exists: the luma h variant gets by with 5 GPRs
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 15
%else     ; no m8: the luma h variant needs a 6th GPR
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 15
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 15

%define SPLATB_REG SPLATB_REG_SSSE3
MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 15
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 15
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 15
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 15

; sse4 differs only in the WRITE_8W helper, which the macro uses solely in
; its h store path, so only h variants are emitted.
%define WRITE_8W   WRITE_8W_SSE4
%ifdef m8
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 15
%else
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse4,   h, 6,  8, 15