;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64
cextern pw_256

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
;                                                 uint8_t *src, ptrdiff_t srcstride,
;                                                 int height,   int mx, int my);
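;
; note: the coefficients of each 4-tap/6-tap filter sum to 128 (7 fractional
; bits): the word-based paths round with "+ pw_64, psraw 7", and the SSSE3
; byte paths use "pmulhrsw pw_256", which is the same rounded shift
; ((x*256 + 0x4000) >> 15 == (x + 64) >> 7).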
;-------------------------------------------------------------------------------

%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]      ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd   [dstq], mm3                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd   [dstq], mm1                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
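
; note: the bilinear filters below use 3-bit fractions (weights mx and 8-mx);
; "psraw 2" followed by "pavgw" against zero is an exact rounded >>3, since
; ((a >> 2) + 1) >> 1 == (a + 4) >> 3 for non-negative a.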

%macro FILTER_BILINEAR 1
%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

%if cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
INIT_MMX ssse3
FILTER_BILINEAR 4
INIT_XMM ssse3
FILTER_BILINEAR 8

INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
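;
; note: only the DC coefficient is used: dc = (block[0] + 4) >> 3 is added to
; all 16 pixels of the 4x4 block. The ADD_DC macro splits dc into max(dc,0)
; and max(-dc,0) so that paddusb/psubusb give a signed add with unsigned
; saturation to [0,255].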
;-----------------------------------------------------------------------------

%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET
%endif

%macro VP8_IDCT_DC_ADD 0
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd   [dst1q], m2
%if cpuflag(sse4)
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
%else
    psrldq     m2, 4
    movd [dst1q+strideq], m2
    psrldq     m2, 4
    movd [dst2q], m2
    psrldq     m2, 4
    movd [dst2q+strideq], m2
%endif
    RET
%endmacro

INIT_XMM sse2
VP8_IDCT_DC_ADD
INIT_XMM sse4
VP8_IDCT_DC_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
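;
; note: adds the DC of 4 horizontally adjacent luma blocks (one 16-pixel-wide
; row); each block is 16 int16_t, hence the blockq+32*n addressing.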
;-----------------------------------------------------------------------------

%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
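;
; note: the 4 blocks cover an 8x8 chroma area; A/B are added to the top 4
; rows, then dst is advanced by 4 lines for C/D.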
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded
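;           (17734 is 35468/2: 35468 does not fit in a signed 16-bit word, so
;           the inputs are doubled first and pmulhw by 17734 gives (x*35468)>>16)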
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
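;
; note: inverse 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients,
; rounded as (x + 3) >> 3; SCATTER_WHT writes each result back as coefficient
; 0 of the corresponding 16-word block (byte offset 2*16*n).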
;-----------------------------------------------------------------------------

%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
%if cpuflag(sse)
    xorps      xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor         m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT