;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
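
; Layout note (added summary): the *_hw tables store the coefficients as
; interleaved word pairs for pmaddwd, the *_hb tables as interleaved byte
; pairs for pmaddubsw (the 6-tap byte variant pairs taps 0/5, 1/2 and 3/4 to
; match the filter_h6_shuf* shuffles below), and the *_v/vw/vb tables splat
; each tap across a whole register for the vertical filters.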

%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

pw_27:    times 8 dw 27
pw_63:    times 8 dw 63
pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_4:     times 16 db 4
pb_F8:    times 16 db 0xF8
pb_FE:    times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pw_9
cextern pw_18
cextern pw_64
cextern pb_80

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------
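;
; Rough C-style sketch of the per-pixel math (an illustrative reference, not
; FFmpeg's actual C code; F[] holds the signed VP8 subpel coefficients from
; the tables above, which sum to 128):
;
;   dst[x] = av_clip_uint8((F[0]*src[x-2] + F[1]*src[x-1] + F[2]*src[x+0] +
;                           F[3]*src[x+1] + F[4]*src[x+2] + F[5]*src[x+3]
;                           + 64) >> 7);
;
; The 4-tap variants drop the outermost taps and read src[x-1]..src[x+2]; the
; vertical filters apply the same weights along the stride axis. The SSSE3
; versions get the identical (x+64)>>7 rounding from pmulhrsw with pw_256.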

%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]      ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd   [dstq], mm3                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd   [dstq], mm1                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
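
; The bilinear filters below compute, per pixel (illustrative C-style sketch;
; mx/my is the 1/8th-pel offset 1..7, a/b the two source neighbours):
;
;   dst[x] = (a*(8-mx) + b*mx + 4) >> 3;
;
; The SIMD code splits the rounded shift into "psraw 2" followed by pavgw
; against a zero register, i.e. ((v >> 2) + 1) >> 1, which gives the same
; result for the non-negative sums that occur here.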

%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8

%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8

INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
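;
; Illustrative C-style sketch of the operation (not FFmpeg's actual C code):
;
;   int dc = (block[0] + 4) >> 3;          // block[0] is cleared afterwards
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < 4; x++)
;           dst[y*stride + x] = av_clip_uint8(dst[y*stride + x] + dc);
;
; The MMX version splats the positive and negative parts of dc into separate
; registers and applies them with paddusb/psubusb (see ADD_DC below), which
; adds the signed DC with clipping without unpacking the pixels to words.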

%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
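;
; Same per-block math as vp8_idct_dc_add above, but the DCs of four
; horizontally adjacent luma blocks are computed at once and applied across
; the full 16-pixel row width.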

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
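;
; As add4y above, but for the 2x2 chroma block arrangement: the first two DCs
; are applied to rows 0-3 of the 8-pixel-wide area, the other two to rows 4-7.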

INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
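;           (added note: 20091/65536 approximates sqrt(2)*cos(pi/8)-1 and
;           35468/65536 approximates sqrt(2)*sin(pi/8); mul_20091(x) is
;           computed as x + (x*20091 >> 16), and since 35468 does not fit in
;           a signed word, mul_35468(x) is computed as (2*x)*17734 >> 16)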
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------
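;
; Illustrative sketch (not FFmpeg's actual C code): a 4x4 inverse
; Walsh-Hadamard transform of dc[16], whose 16 results are scattered into
; element 0 (the DC slot) of the 16 int16_t[16] luma coefficient blocks.
; Each 1D pass is the butterfly
;
;   t0 = d0 + d3;  t1 = d1 + d2;  t2 = d1 - d2;  t3 = d0 - d3;
;   o0 = t0 + t1;  o1 = t3 + t2;  o2 = t0 - t1;  o3 = t3 - t2;
;
; with a +3 rounding bias added between the passes and >>3 after the second.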

%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
%if cpuflag(sse)
    xorps      xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor         m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
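;
; Per-edge math (illustrative C-style sketch; p1/p0 are the pixels before the
; edge, q0/q1 the ones after it):
;
;   if (2*abs(p0-q0) + abs(p1-q1)/2 <= flim) {
;       int a = clamp_int8(3*(q0-p0) + clamp_int8(p1-q1));
;       q0 = av_clip_uint8(q0 - (clamp_int8(a + 4) >> 3));
;       p0 = av_clip_uint8(p0 + (clamp_int8(a + 3) >> 3));
;   }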

; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd          %5d, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd            %3d, %1
    punpckhdq        %1, %1
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd            %3d, %1
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd            %3d, %2
    punpckhdq        %2, %2
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd            %3d, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
    add              %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw    [%3+%4*4], %1, 0
    pextrw    [%2+%4*4], %1, 1
    pextrw    [%3+%4*2], %1, 2
    pextrw    [%3+%4  ], %1, 3
    pextrw    [%3     ], %1, 4
    pextrw    [%2     ], %1, 5
    pextrw    [%2+%5  ], %1, 6
    pextrw    [%2+%5*2], %1, 7
%else
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3+%4*4], %2w
    shr              %2, 16
    add              %3, %5
    mov       [%3+%4*4], %2w

    movd            %2d, %1
    psrldq           %1, 4
    add              %3, %4
    mov       [%3+%4*2], %2w
    shr              %2, 16
    mov       [%3+%4  ], %2w

    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3     ], %2w
    shr              %2, 16
    mov       [%3+%5  ], %2w

    movd            %2d, %1
    add              %3, %5
    mov       [%3+%5  ], %2w
    shr              %2, 16
    mov       [%3+%5*2], %2w
%endif
%endmacro

%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov         cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor           m0, m0
%endif
    SPLATB_REG     m7, flim, m0     ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov       strideq, mstrideq
    neg      mstrideq
%ifidn %1, h
    lea         dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova           m0, [dst1q+mstrideq*2]    ; p1
    mova           m1, [dst1q+mstrideq]      ; p0
    mova           m2, [dst1q]               ; q0
    mova           m3, [dst1q+ strideq]      ; q1
%else ; h
    lea         dst2q, [dst1q+ strideq]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %1, v
    mova      [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc        dst1q
    SBUTTERFLY    bw, 6, 4, 0

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc         dst2q
%endif
    WRITE_8W       m6, dst2q, dst1q, mstrideq, strideq
    lea         dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc         dst3q
%endif
    WRITE_8W       m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add         dst1q, 8            ; advance 8 cols = pixels
%else ; h
    lea         dst1q, [dst1q+strideq*8-1]  ; advance 8 rows = lines
%endif
    dec         cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif

INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
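;
; Mask logic (illustrative sketch of VP8's normal loop filter): an edge is
; filtered when 2*abs(p0-q0) + abs(p1-q1)/2 <= flimE and every neighbouring
; difference abs(p3-p2) .. abs(q3-q2) is <= flimI. "hev" (high edge variance,
; abs(p1-p0) > hev_thr || abs(q1-q0) > hev_thr) selects between adjusting
; only p0/q0 or additionally nudging p1/q1 by half the filter value.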

%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;               [3]=hev() result
%define stack_size mmsize * -4
%else ; h    ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor             m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG       m0, flimEq, m7   ; E
    SPLATB_REG       m1, flimIq, m7   ; I
    SPLATB_REG       m2, hevthrq, m7  ; hev_thresh

%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

    mova        m_flimE, m0
    mova        m_flimI, m1
    mova       m_hevthr, m2
%else
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_p0backup m12
%define m_q0backup m8

    ; splat function arguments
    SPLATB_REG  m_flimE, flimEq, m7   ; E
    SPLATB_REG  m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7  ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov           cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov         strideq, mstrideq
    neg        mstrideq
%ifidn %1, h
    lea           dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea           dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea           dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
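    ; (for sse2 chroma, movrow fetches 8px of the U row into the low
    ;  half of the register; the matching V row from dst8q is filled
    ;  into the high half with movhps below)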
    movrow           m0, [dst1q+mstrideq*4] ; p3
    movrow           m1, [dst2q+mstrideq*4] ; p2
    movrow           m2, [dst1q+mstrideq*2] ; p1
    movrow           m5, [dst2q]            ; q1
    movrow           m6, [dst2q+ strideq*1] ; q2
    movrow           m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps           m0, [dst8q+mstrideq*4]
    movhps           m2, [dst8q+mstrideq*2]
    add           dst8q, strideq
    movhps           m1, [dst8q+mstrideq*4]
    movhps           m5, [dst8q]
    movhps           m6, [dst8q+ strideq  ]
    movhps           m7, [dst8q+ strideq*2]
    add           dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst1q+mstrideq*4]
    movu             m1, [dst2q+mstrideq*4]
    movu             m2, [dst1q+mstrideq*2]
    movu             m3, [dst1q+mstrideq  ]
    movu             m4, [dst1q]
    movu             m5, [dst2q]
    movu             m6, [dst2q+ strideq  ]

    ; 8x8 transpose
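    ; (done as two 4x4 byte transposes stitched with dword SBUTTERFLYs;
    ;  q0 and p0 are spilled to their stack slots since mmx only has
    ;  8 registers)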
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova     m_q0backup, m1
    movu             m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, m_q0backup
    mova     m_q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova     m_p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %2 == 16
    lea           dst8q, [dst1q+ strideq*8]
%endif

    ; read 16 rows of 8px each, interleave
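    ; (each row i of the first 8 is interleaved with row i+8, so one
    ;  8-register byte transpose yields full 16-row columns)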
    movh             m0, [dst1q+mstrideq*4]
    movh             m1, [dst8q+mstrideq*4]
    movh             m2, [dst1q+mstrideq*2]
    movh             m5, [dst8q+mstrideq*2]
    movh             m3, [dst1q+mstrideq  ]
    movh             m6, [dst8q+mstrideq  ]
    movh             m4, [dst1q]
    movh             m7, [dst8q]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add           dst8q, strideq
    movh             m1, [dst2q+mstrideq*4]
    movh             m6, [dst8q+mstrideq*4]
    movh             m5, [dst2q]
    movh             m7, [dst8q]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2q+ strideq  ]
    movh             m7, [dst8q+ strideq  ]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova     m_q0backup, m1
%endif
    movh             m7, [dst2q+ strideq*2]
    movh             m1, [dst8q+ strideq*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, m_q0backup
    mova     m_q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova     m_p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
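    ; (each |a-b| below is built branchlessly as psubusb(a,b)|psubusb(b,a);
    ;  the <= I tests then become pcmpeqb against zero after a psubusb by
    ;  I, or collapse into a pmaxub chain where mmxext/sse2 provide it)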
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova             m4, m_flimI
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
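    ; (hev = abs(p1-p0) > hev_thr || abs(q1-q0) > hev_thr; m_maskres ends
    ;  up holding the inverse of that condition)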
    SWAP              7, 3           ; now m7 is zero
%ifidn %1, v
    movrow           m3, [dst1q+mstrideq  ] ; p0
%if mmsize == 16 && %2 == 8
    movhps           m3, [dst8q+mstrideq  ]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, m_p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, m_hevthr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova      m_maskres, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %1, v
    movrow           m4, [dst1q]     ; q0
%if mmsize == 16 && %2 == 8
    movhps           m4, [dst8q]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, m_q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, m_hevthr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, m_maskres
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, m_flimI
    psubusb          m6, m_hevthr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova      m_maskres, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
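    ; (the edge-limit half of the mask: 2*abs(p0-q0) + abs(p1-q1)/2 <= E,
    ;  the same test the simple loop filter uses)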
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, m_flimE
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
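    ; (the signed math is done in a biased domain - pxor with 0x80 maps
    ;  unsigned to signed bytes - then f2 = clamp(f+3)>>3 is added to p0
    ;  and f1 = clamp(f+4)>>3 is subtracted from q0)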
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, m_pb_80
    pxor             m7, m_pb_80
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, m_pb_80
    pxor             m7, m_pb_80
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, m_maskres
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, m_maskres
%endif
%if notcpuflag(mmxext)
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%if notcpuflag(mmxext)
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a

    ; store
%ifidn %1, v
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq  ], m3
    movrow      [dst1q], m4
    movrow [dst1q+ strideq  ], m5
%if mmsize == 16 && %2 == 8
    movhps [dst8q+mstrideq*2], m2
    movhps [dst8q+mstrideq  ], m3
    movhps      [dst8q], m4
    movhps [dst8q+ strideq  ], m5
%endif
%else ; h
    add           dst1q, 2
    add           dst2q, 2

    ; 4x8/16 transpose
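    ; (only the 4 modified columns p1..q1 need to go back to memory)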
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea           dst8q, [dst8q+mstrideq  +2]
    WRITE_4x4D        2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub           dst1q, 2
%endif
    cmp           dst1q, dst8q
    mov           dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea           dst1q, [dst1q+ strideq*8-2]
%else ; v
    add           dst1q, 8
%endif
    dec           cntrq
    jg .next8px
%endif
    REP_RET
%else ; mmsize == 16
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8
%endif

INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
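;
; Reference sketch for the macroblock-edge variant (RFC 6386 mbfilter
; semantics; illustrative names): the filter mask and hev test match the
; inner filter above, and hev pixels still take the common f1/f2
; adjustment of p0/q0, but non-hev pixels get a wider three-tap fix-up:
;
;   w  = clamp(clamp(p1-q1) + 3*clamp(q0-p0));
;   a0 = (27*w + 63) >> 7;   p0 += a0;  q0 -= a0;
;   a1 = (18*w + 63) >> 7;   p1 += a1;  q1 -= a1;
;   a2 = ( 9*w + 63) >> 7;   p2 += a2;  q2 -= a2;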

%macro MBEDGE_LOOPFILTER 2
%define stack_size 0
%ifndef m8       ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ;               [3]=hev() result
                 ;               [4]=filter tmp result
                 ;               [5]/[6] = p2/q2 backup
                 ;               [7]=lim_res sign result
%define stack_size mmsize * -7
%else ; 8        ; extra storage space for transposes
%define stack_size mmsize * -8
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor             m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG       m0, flimEq, m7   ; E
    SPLATB_REG       m1, flimIq, m7   ; I
    SPLATB_REG       m2, hevthrq, m7  ; hev_thresh

%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_limres   [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign  [rsp]
%else
%define m_limsign  [rsp+mmsize*7]
%endif

    mova        m_flimE, m0
    mova        m_flimI, m1
    mova       m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_limres   m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign  m9

    ; splat function arguments
    SPLATB_REG  m_flimE, flimEq, m7   ; E
    SPLATB_REG  m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7  ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov           cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov         strideq, mstrideq
    neg        mstrideq
%ifidn %1, h
    lea           dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea           dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea           dst2q, [dst1q+ strideq  ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst1q+mstrideq*4] ; p3
    movrow           m1, [dst2q+mstrideq*4] ; p2
    movrow           m2, [dst1q+mstrideq*2] ; p1
    movrow           m5, [dst2q]            ; q1
    movrow           m6, [dst2q+ strideq  ] ; q2
    movrow           m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps           m0, [dst8q+mstrideq*4]
    movhps           m2, [dst8q+mstrideq*2]
    add           dst8q, strideq
    movhps           m1, [dst8q+mstrideq*4]
    movhps           m5, [dst8q]
    movhps           m6, [dst8q+ strideq  ]
    movhps           m7, [dst8q+ strideq*2]
    add           dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst1q+mstrideq*4]
    movu             m1, [dst2q+mstrideq*4]
    movu             m2, [dst1q+mstrideq*2]
    movu             m3, [dst1q+mstrideq  ]
    movu             m4, [dst1q]
    movu             m5, [dst2q]
    movu             m6, [dst2q+ strideq  ]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova     m_q0backup, m1
    movu             m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, m_q0backup
    mova     m_q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova     m_p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %2 == 16
    lea           dst8q, [dst1q+ strideq*8  ]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst1q+mstrideq*4]
    movh             m1, [dst8q+mstrideq*4]
    movh             m2, [dst1q+mstrideq*2]
    movh             m5, [dst8q+mstrideq*2]
    movh             m3, [dst1q+mstrideq  ]
    movh             m6, [dst8q+mstrideq  ]
    movh             m4, [dst1q]
    movh             m7, [dst8q]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add           dst8q, strideq
    movh             m1, [dst2q+mstrideq*4]
    movh             m6, [dst8q+mstrideq*4]
    movh             m5, [dst2q]
    movh             m7, [dst8q]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2q+ strideq  ]
    movh             m7, [dst8q+ strideq  ]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova     m_q0backup, m1
%endif
    movh             m7, [dst2q+ strideq*2]
    movh             m1, [dst8q+ strideq*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, m_q0backup
    mova     m_q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova     m_p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova     m_p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova     m_q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova             m4, m_flimI
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %1, v
    movrow           m3, [dst1q+mstrideq  ] ; p0
%if mmsize == 16 && %2 == 8
    movhps           m3, [dst8q+mstrideq  ]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, m_p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, m_hevthr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova      m_maskres, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %1, v
    movrow           m4, [dst1q]     ; q0
%if mmsize == 16 && %2 == 8
    movhps           m4, [dst8q]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, m_q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, m_hevthr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, m_maskres
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, m_flimI
    psubusb          m6, m_hevthr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova      m_maskres, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, m_flimE
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, m_pb_80
    pxor             m7, m_pb_80
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, m_pb_80
    pxor             m7, m_pb_80
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, m_maskres
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova       m_limres, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand       m_limres, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova       m_limres, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
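    ; (three passes follow: a = (K*w + 63) >> 7 for K = 27, 18 and 9,
    ;  applied to p0/q0, p1/q1 and p2/q2 respectively)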
%if cpuflag(ssse3)
    mova             m7, [pb_1]
%else
    mova             m7, [pw_63]
%endif
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, m_limres
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
%if cpuflag(ssse3)
    punpcklbw        m6, m7         ; interleave with "1" for rounding
    punpckhbw        m1, m7
%else
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
%endif
    mova      m_limsign, m0
%if cpuflag(ssse3)
    mova             m7, [pb_27_63]
%ifndef m8
    mova       m_limres, m1
%endif
%ifdef m10
    SWAP              0, 10         ; don't lose lim_sign copy
%endif
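    ; (pmaddubsw of the unsigned (27,63) byte pairs against the signed
    ;  (w,1) pairs built above computes 27*w + 63 per word in one op,
    ;  replacing the pmullw/paddw of the generic path)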
    mova             m0, m7
    pmaddubsw        m7, m6
    SWAP              6, 7
    pmaddubsw        m0, m1
    SWAP              1, 0
%ifdef m10
    SWAP              0, 10
%else
    mova             m0, m_limsign
%endif
%else
    mova      m_maskres, m6         ; backup for later in filter
    mova       m_limres, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
%if cpuflag(ssse3)
    mova            m6, [pb_18_63]  ; pipelining
%endif
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

%if cpuflag(ssse3)
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, m_limres
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%ifdef m10
    SWAP             0, 10
%endif
    mova            m0, m_limsign
%else
    mova            m6, m_maskres
    mova            m1, m_limres
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
%endif
    mova            m0, m_limsign
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
%if cpuflag(ssse3)
    mova            m6, [pb_9_63]
%endif
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%if cpuflag(ssse3)
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, m_limres
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%else
%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, m_maskres
    mova            m1, m_limres
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%endif
%ifdef m9
    SWAP             7, 9
%else
    mova            m7, m_limsign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a2
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a2
    pandn           m7, m6          ; +a2
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, m_p2backup
    mova            m6, m_q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p2+a2
    psubusb         m6, m7          ; q2-a2

    ; store
%ifidn %1, v
    movrow [dst2q+mstrideq*4], m1
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq  ], m3
    movrow     [dst1q], m4
    movrow     [dst2q], m5
    movrow [dst2q+ strideq  ], m6
%if mmsize == 16 && %2 == 8
    add           dst8q, mstrideq
    movhps [dst8q+mstrideq*2], m1
    movhps [dst8q+mstrideq  ], m2
    movhps     [dst8q], m3
    add          dst8q, strideq
    movhps     [dst8q], m4
    movhps [dst8q+ strideq  ], m5
    movhps [dst8q+ strideq*2], m6
%endif
%else ; h
    inc          dst1q
    inc          dst2q

    ; 4x8/16 transpose
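    ; (6 modified columns p2..q2 go back: p2..q0 via the 4x4 transpose,
    ;  the q1/q2 pair via the byte SBUTTERFLY and the 2px-wide writes)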
    TRANSPOSE4x4B    1, 2, 3, 4, 0
    SBUTTERFLY      bw, 5, 6, 0

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D       1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add          dst1q, 4
    WRITE_2x4W      m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea          dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D       1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea          dst1q, [dst2q+mstrideq+4]
    lea          dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    add          dst2q, 4
%endif
    WRITE_8W        m5, dst2q, dst1q,  mstrideq, strideq
%if cpuflag(sse4)
    lea          dst2q, [dst8q+ strideq  ]
%endif
    WRITE_8W        m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub          dst1q, 5
%endif
    cmp          dst1q, dst8q
    mov          dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea          dst1q, [dst1q+ strideq*8-5]
%else ; v
    add          dst1q, 8
%endif
    dec          cntrq
    jg .next8px
%endif
    REP_RET
%else ; mmsize == 16
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_MMX mmxext
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8
%endif

INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h,  8