;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Subpel filter coefficient tables. Every "times ..." row below is 16 bytes;
; the MC functions select rows with offsets computed from mx/my.

; 4-tap coefficients as interleaved word pairs (for pmaddwd); two rows per
; filter: (F0,F1) followed by (F2,F3).
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

; 6-tap coefficients as interleaved word pairs (for pmaddwd); three rows per
; filter: (F0,F1), (F2,F3), (F4,F5).
sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

; Same 4-tap coefficients as fourtap_filter_hw_m, stored as byte pairs
; (for pmaddubsw).
fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; 6-tap coefficients as byte pairs (for pmaddubsw). Note the pairing differs
; from sixtap_filter_hw_m: the three rows per filter are (F0,F5), (F1,F2),
; (F3,F4).
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

; 4-tap coefficients, one coefficient broadcast to 8 words per row
; (for pmullw); four rows per filter: F0, F1, F2, F3.
fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

; 6-tap coefficients, one coefficient broadcast to 8 words per row
; (for pmullw); six rows per filter: F0 .. F5.
sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; Bilinear weights as broadcast words: row n-1 holds the value n (n = 1..7).
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

; Bilinear weights as byte pairs (for pmaddubsw): row n-1 holds (8-n, n).
bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

; Table-name indirection. In PIC builds every un-suffixed table name expands
; to picregq, which each function loads with "lea picregq, [<table>_m]" before
; indexing; npicregs is the number of extra GPRs those functions request in
; their cglobal line. In non-PIC builds the names alias the tables directly.
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb control masks that build adjacent-byte pairs for pmaddubsw.
; h2: pairs (i, i+1); h4: pairs (i+2, i+3), for i = 0..7.
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

; 6-tap masks: shuf1 pairs (i, i+5), shuf2 pairs (i+1, i+2), shuf3 pairs
; (i+3, i+4), matching the (F0,F5), (F1,F2), (F3,F4) rows of sixtap_filter_hb_m.
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

; pmulhrsw by 256 computes (x + 64) >> 7, i.e. the rounding/scaling step of
; the SSSE3 subpel filters.
pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

; word constants defined outside this file
cextern pw_3
cextern pw_4
cextern pw_64

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                                 uint8_t *src, int srcstride,
;                                                 int height,   int mx, int my);
;-------------------------------------------------------------------------------

;-------------------------------------------------------------------------------
; FILTER_SSSE3 <width>
;
; Emits put_vp8_epel<width>_{h6,h4,v4,v6}, built on pshufb/punpcklbw byte
; pairing + pmaddubsw against the byte-pair coefficient tables. Each function
; stores one output row per loop iteration and loops while the decremented
; height is still > 0. Rounding is done with pmulhrsw by 256, which is
; (sum + 64) >> 7; packuswb then clamps to unsigned bytes.
; Instantiated below for width 4 (INIT_MMX) and width 8 (INIT_XMM).
;-------------------------------------------------------------------------------
%macro FILTER_SSSE3 1
; Horizontal 6-tap: taps cover src[x-2 .. x+3], paired as (0,5), (1,2), (3,4).
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

; Horizontal 4-tap: taps cover src[x-1 .. x+2], paired as (0,1) and (2,3)
; relative to the load at srcq-1.
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

; Vertical 4-tap: rows y-1, y, y+1 are kept in m0-m2 across iterations; each
; iteration loads row y+2 and pairs (y-1,y) with m5 and (y+1,y+2) with m6.
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]      ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

; Vertical 6-tap: rows y-2 .. y+2 are kept in m0-m4 across iterations; each
; iteration loads row y+3 and pairs rows (y-2,y+3), (y-1,y), (y+1,y+2) with
; the three coefficient rows at [myq-48], [myq-32], [myq-16].
; After the preload srcq points one row below the row being output.
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
; Works on word pairs with pmaddwd: output pixels 0-1 and 2-3 are computed
; separately as dword sums, merged with packssdw, then rounded with
; (sum + 64) >> 7 and clamped to bytes. mm6 stays zero for the whole loop.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd   [dstq], mm3                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
; Same word-pair/pmaddwd scheme as the 4-tap version, with a third coefficient
; pair (F4/F5) in mm6. mm3 doubles as the zero register: it is zero on loop
; entry, gets reused as scratch, and is re-zeroed before the final unpack so it
; is zero again for packuswb and for the next iteration.
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; restore the zero register
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd   [dstq], mm1                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

; 8-wide block, H-only 4-tap filter
; Loads four 8-byte windows at src-1 .. src+2, widens them to words and
; multiplies each by one broadcast coefficient row of fourtap_filter_v
; (32 bytes * mx selects the filter). Result is (sum + 64) >> 7, clamped.
; When m8 is defined (more than 8 vector registers available) the last two
; coefficient rows are cached in m8/m9; otherwise they are memory operands.
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7                       ; zero, for byte->word unpack
    mova      m4, [pw_64]                  ; rounding constant
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

; 8-wide block, H-only 6-tap filter
; Loads six 8-byte windows at src-2 .. src+3, widens them to words and
; multiplies each by one broadcast coefficient row of sixtap_filter_v
; (mx*48 - 96 selects the filter). Result is (sum + 64) >> 7, clamped.
; When m8 is defined the six coefficient rows are cached in m8-m13; otherwise
; they are used as memory operands.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7                       ; zero, for byte->word unpack
    mova      m6, [pw_64]                  ; rounding constant
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

;-------------------------------------------------------------------------------
; FILTER_V <width>
;
; Emits put_vp8_epel<width>_{v4,v6}: vertical filters working on rows widened
; to words and multiplied (pmullw) by the broadcast coefficient rows of
; fourtap_filter_v / sixtap_filter_v. Previously loaded rows are carried in
; registers between iterations, so each iteration loads only one new row.
; Result is (sum + 64) >> 7, clamped to bytes by packuswb.
; Instantiated below for width 4 (INIT_MMX) and width 8 (INIT_XMM).
;-------------------------------------------------------------------------------
%macro FILTER_V 1
; <width>-wide block, V-only 4-tap filter
; m0-m2 hold rows y-1, y, y+1 (as words); m5 caches the F3 row, the other
; coefficient rows are read from [myq+0], [myq+16], [myq+32].
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m4, m7
    mova      m3, m4                       ; keep unscaled new row for next iteration
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET


; <width>-wide block, V-only 6-tap filter
; m0-m4 hold rows y-2 .. y+2 (as words); the six coefficient rows are read
; from [myq+0] .. [myq+80]. After the preload srcq points one row below the
; row being output, so [srcq+2*srcstrideq] is row y+3.
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8

;-------------------------------------------------------------------------------
; FILTER_BILINEAR <width>
;
; Emits put_vp8_bilinear<width>_{v,h}. With f = my (or mx), m5 holds the
; weight f and m4 the weight 8-f (both read from bilinear_filter_vw). Each
; output is (a*(8-f) + b*f + 4) >> 3: the sum is shifted right by 2 and then
; averaged with zero by pavgw, which adds the rounding bit and halves.
; Two output rows are produced per loop iteration (height is stepped by 2).
; Instantiated below for width 4 (INIT_MMX) and width 8 (INIT_XMM).
;-------------------------------------------------------------------------------
%macro FILTER_BILINEAR 1
; vertical: a/b are the same column in consecutive rows
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]  ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16]  ; weight 8-my
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                       ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2                       ; row 0 in low half, row 1 in high half
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

; horizontal: a/b are neighbouring pixels (x, x+1) in the same row
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]  ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]  ; weight 8-mx
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2                       ; row 0 in low half, row 1 in high half
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8

;-------------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 <width>
;
; Emits put_vp8_bilinear<width>_{v,h} using pmaddubsw on byte pairs (a, b)
; against the (8-f, f) weight row of bilinear_filter_vb. Rounding is the same
; as in the non-SSSE3 version: (sum >> 2) averaged with zero by pavgw, i.e.
; (sum + 4) >> 3. Two output rows are produced per loop iteration.
; Instantiated below for width 4 (INIT_MMX) and width 8 (INIT_XMM).
;-------------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
; vertical: byte pairs are built by interleaving consecutive rows
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1                       ; row 0 in low half, row 1 in high half
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

; horizontal: byte pairs (x, x+1) are built with the filter_h2_shuf mask
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1                       ; row 0 in low half, row 1 in high half
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8

; Plain copy of an 8-byte-wide block, two rows per iteration.
; The loop steps height by 2 and continues while the result is > 0
; (presumably callers always pass an even height -- TODO confirm).
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

; Plain copy of a 16-byte-wide block using two 8-byte MMX moves per row,
; two rows per iteration. Only assembled when ARCH_X86_32 is set.
%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

; Plain copy of a 16-byte-wide block, two rows per iteration.
; Source rows are read with unaligned loads (movups); destination rows are
; written with movaps, so dst and dststride must keep every row 16-byte aligned.
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; ADD_DC <add_reg>, <sub_reg>, <offset>, <mov_insn>
;
; Applies a signed DC to four destination rows using unsigned saturating byte
; arithmetic: each row gets paddusb <add_reg> followed by psubusb <sub_reg>.
;   %1  register added to every row (callers pass the clamped +dc bytes)
;   %2  register subtracted from every row (callers pass the clamped -dc bytes)
;   %3  byte offset within each row
;   %4  move instruction used for both the loads and the stores
; Rows touched: dst1q, dst1q+strideq, dst2q, dst2q+strideq.
; Uses m2-m5 as scratch.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

; DC-only inverse transform + add for one block, MMX version.
; dc = (block[0] + 4) >> 3 is added to four destination rows via ADD_DC.
; The sign is handled by building two unsigned byte vectors, clamp(+dc) in m0
; and clamp(-dc) in m1 (packuswb turns the negative one into 0), which ADD_DC
; applies with saturating add then saturating subtract.
; The 32-bit zero store to [blockq] clears block[0] and block[1].
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1
    psubw      m1, m0                      ; m1 = -dc
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0                      ; replicate byte 0 across the low 4 bytes
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET

; DC-only inverse transform + add for one block, SSE4 version.
; dc = (block[0] + 4) >> 3 is broadcast to 8 words; the four 4-byte
; destination rows are gathered into two registers, widened to words, summed
; with dc, and clamped back to bytes by packuswb. The rows are then written
; back with movd/pextrd.
; The 32-bit zero store to [blockq] clears block[0] and block[1].
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0                   ; broadcast dc to the low 4 words
    punpcklqdq m0, m0                      ; ... and to all 8 words
    punpckldq  m2, m3                      ; rows 0,1
    punpckldq  m4, m5                      ; rows 2,3
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                      ; dwords 0..3 = rows 0..3
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

; DC-only inverse transform + add for four horizontally adjacent blocks
; (A B C D, 4 pixels each), MMX version. Only assembled when ARCH_X86_32 is set.
; The four DCs are read from the first coefficient of each 32-byte block,
; computed as (dc + 4) >> 3, and expanded to byte vectors AAAABBBB / CCCCDDDD
; for both +dc (m0/m1) and -dc (m6/m7). ADD_DC is then applied to the left and
; right 8-byte halves of four rows, using mova for the row accesses.
; Each 32-bit zero store clears the first two coefficients of its block.
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0        ; m6 = -dc
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

1006
INIT_XMM sse2
; SSE2 version of vp8_idct_dc_add4y: same computation as the MMX variant, but
; all sixteen replicated bytes (AAAABBBBCCCCDDDD) fit in one XMM register, so
; a single ADD_DC invocation covers the whole row width.
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1      ; clear the four coefficients that were read
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0            ; m1 = -dc
    packuswb  m0, m0            ; unsigned saturation keeps max(dc, 0)
    packuswb  m1, m1            ; ... and max(-dc, 0)
    punpcklbw m0, m0            ; AABBCCDD in the low eight bytes
    punpcklbw m1, m1
    punpcklbw m0, m0            ; AAAABBBBCCCCDDDD
    punpcklbw m1, m1

    ; add DC
    ; ADD_DC is defined outside this section - see its definition for the
    ; exact rows touched.
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------

; Reads the first coefficient of each of the four 32-byte sub-blocks
; (A, B, C, D), clears them and computes (coeff + pw_4) >> 3 for each.
; A/B are applied at dst, C/D at dst + 4*stride, both at byte offset 0.
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6      ; clear the four coefficients that were read
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0            ; m6 = -dc
    packuswb  m0, m0            ; unsigned saturation keeps max(dc, 0)
    packuswb  m6, m6            ; ... and max(-dc, 0)
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    ; ADD_DC is defined outside this section - see its definition for the
    ; exact rows touched.
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova   ; sub-blocks A and B
    lea    dst1q, [dst1q+strideq*4]     ; step both row pointers down by four rows
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova   ; sub-blocks C and D
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
;
; %1, %2: input/output registers; %3, %4: scratch registers (clobbered).
; mul_20091(x) is formed as x + pmulhw(x, 20091), i.e. x + ((x*20091) >> 16).
; mul_35468(x) is formed as pmulhw(2*x, 17734): 17734 is 35468/2, and 35468
; itself does not fit in a signed 16-bit word.
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1            ; %3 = mul_20091(%1)
    paddw     %4, %2            ; %4 = mul_20091(%2)
    paddw     %1, %1            ; double the inputs so the halved constant in m7 can be used
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
;
; All six arguments are register numbers (without the "m" prefix).
; SUMSUB_BA and SWAP are not defined in this section; the two SWAPs at the end
; rename the registers so that the results land in %1..%4 as listed above.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

; Emits vp8_idct_add for the currently selected instruction set.
; The function loads the 16 coefficients of block (32 bytes), zeroes them in
; memory, runs the 1-D transform twice with a transpose after each pass and
; a pw_4 bias added in between, then passes the four result rows to
; STORE_DIFFx2 (shift argument 3) for dst rows 0/1 and 2/3.
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
    ; clear the coefficient block now that it lives in registers
%if cpuflag(sse)
    ; NOTE(review): xmm0 is named explicitly here; with INIT_MMX the m0
    ; loaded above is presumably an MMX register and therefore unaffected.
    ; movaps also requires blockq to be 16-byte aligned - confirm with callers.
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]             ; bias added once, before the second pass
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4                 ; zero register handed to STORE_DIFFx2
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

; Instantiate vp8_idct_add: the plain MMX build is emitted only on 32-bit
; x86, the SSE build (which takes the xorps/movaps clearing path) always.
%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------

; Scatter the four words of m%1 and m%2 into the first coefficient of
; individual 32-byte (2*16) sub-blocks of block:
;   word j of m%1 -> sub-block 4*j + %3
;   word j of m%2 -> sub-block 4*j + %3 + 1
; Uses dc1/dc2 as scalar temporaries and shifts m%1/m%2 right by 32 bits, so
; all four are clobbered.
%macro SCATTER_WHT 3
    movd dc1d, m%1                      ; words 0 and 1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16                       ; expose word 1
    shr  dc2d, 16
    psrlq m%1, 32                       ; bring words 2 and 3 down for the next movd
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1                      ; words 2 and 3
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16                       ; expose word 3
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

; One 1-D pass of the 4-point transform used by VP8_DC_WHT, built from two
; SUMSUB_BADC butterfly stages on word elements followed by a register
; rename. Arguments are four register numbers; SUMSUB_BADC and SWAP are not
; defined in this section.
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

; Emits vp8_luma_dc_wht for the currently selected instruction set.
; The function loads the 16 words at dc (32 bytes), zeroes them in memory,
; applies HADAMARD4_1D, a transpose, a pw_3 bias and a second HADAMARD4_1D,
; shifts every result right by 3 and scatters the 16 values into the first
; coefficient of each 32-byte sub-block of block.
; dc1 holds the dc pointer on entry; dc1 and dc2 are later reused as scalar
; temporaries by SCATTER_WHT.
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
    ; clear the dc array now that it lives in registers
%if cpuflag(sse)
    ; NOTE(review): xmm0 is named explicitly here; with INIT_MMX the m0
    ; loaded above is presumably an MMX register and therefore unaffected.
    ; movaps also requires dc1q to be 16-byte aligned - confirm with callers.
    xorps      xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor         m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]            ; bias added once, before the second pass and the >>3
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0               ; m0 -> sub-blocks 0,4,8,12; m1 -> 1,5,9,13
    SCATTER_WHT   2, 3, 2               ; m2 -> sub-blocks 2,6,10,14; m3 -> 3,7,11,15
    RET
%endmacro

; Instantiate vp8_luma_dc_wht: the plain MMX build is emitted only on 32-bit
; x86, the SSE build (which takes the xorps/movaps clearing path) always.
%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT