;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
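; scan8_mem maps a 4x4 block index (the 16 luma blocks, then the blocks of the
; two chroma planes) to its byte offset in the decoder's non_zero_count_cache
; (nnzc[6*8]) layout, which uses a stride of 8 bytes per row.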

scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
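; For reference, one 1-D pass of the H.264 4x4 inverse transform (what IDCT4_1D
; computes on the rows and, after the transpose, on the columns) is roughly:
;     e0 = d0 + d2            e1 = d0 - d2
;     e2 = (d1 >> 1) - d3     e3 = d1 + (d3 >> 1)
;     out = { e0 + e3, e1 + e2, e1 - e2, e0 - e3 }
; Adding pw_32 before the second pass and shifting by 6 in STORE_DIFFx2 gives
; the (x + 32) >> 6 rounding before the result is added to the destination.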
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
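; One 1-D pass of the H.264 8x8 inverse transform.  With inputs d0..d7 (d0 and
; d4 are passed in memory as %1/%2 so the odd half can be computed first in
; registers), the butterfly is roughly:
;   even: a0 = d0 + d4              a2 = d0 - d4
;         a4 = (d2 >> 1) - d6       a6 = d2 + (d6 >> 1)
;   odd:  a1 = -d3 + d5 - d7 - (d7 >> 1)
;         a3 =  d1 + d7 - d3 - (d3 >> 1)
;         a5 = -d1 + d7 + d5 + (d5 >> 1)
;         a7 =  d3 + d5 + d1 + (d1 >> 1)
;         b1 = a1 + (a7 >> 2)       b7 = a7 - (a1 >> 2)
;         b3 = a3 + (a5 >> 2)       b5 = (a3 >> 2) - a5
; The outputs are the sums/differences of the even terms (a0+a6, a2+a4, a2-a4,
; a0-a6) with b7, b5, b3 and b1 respectively.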
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
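; The 8x8 block does not fit in the MMX register file, so the MMX version works
; in halves: IDCT8_ADD_MMX_START runs the first 1-D pass on an 8x4 half of the
; coefficients and writes the transposed result to a temporary buffer (the
; 128-byte stack scratch area in the callers), then IDCT8_ADD_MMX_END runs the
; second pass on a 4x8 half of that buffer and adds the result to dst.  Two
; START plus two END invocations cover the whole block.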
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block (transposed halves), %3=int stride, %4=int16_t *block to clear (optional)
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=scratch gpr (gets stride*3)
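; On x86-64 the full 8x8 block stays in xmm0-xmm9 across both passes; on x86-32
; only xmm0-xmm7 are available, so two rows are spilled into the coefficient
; buffer (%2) around the transpose and the second 1-D pass.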
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

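; DC-only add: dc = (block[0] + 32) >> 6 is splatted into m0 and its negation
; into m1, both packed to unsigned bytes (so one of the two packs to zero).
; Each row is then computed as dst = clip_uint8(dst + dc) using a saturating
; add of max(dc, 0) followed by a saturating subtract of max(-dc, 0), avoiding
; any unpack to words.  DC_ADD_MMXEXT_INIT also leaves stride*3 in %1 for use
; as the fourth-row offset in DC_ADD_MMXEXT_OP.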
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
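; Loop over the 16 luma 4x4 blocks: nnzc[scan8[i]] selects the blocks that have
; coefficients, block_offset[i] is the offset of block i inside dst, and the
; coefficients of block i start at block + i*32 bytes (16 int16_t per block).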
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
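; Same loop as the MMX version, plus a fast path: nnzc[scan8[i]] == 1 means the
; block has a single nonzero coefficient, so if that coefficient is the DC one
; (block[0] != 0) the cheaper DC-only add is used instead of a full 4x4 IDCT.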
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
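; Per-plane helper for h264_idct_add8_8: handles the four 4x4 blocks of one
; chroma plane.  r5 holds the plane's first scan8 index on entry (16 for U,
; 32 for V) and the plane's dst pointer is read from the uint8_t **dest array.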
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
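; Adds the DC of two horizontally adjacent 4x4 blocks ([r2] and [r2+32]) over
; an 8x4 pixel area in one pass; used by the SSE2 add16intra/add8 loops below.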
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
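; Runs two 4x4 IDCTs at once by packing two adjacent blocks ([r2] and [r2+32])
; into the low and high halves of each xmm register, then adds the 8x4 result
; to dst.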
h264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    mova [r2+ 0], m7
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
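; %1 = pair index, %2 = byte offset of scan8[2*%1] in nnzc: the word load tests
; the nnz flags of both blocks of a horizontal pair at once, and block_offset
; is indexed with %1*8 bytes (entry 2*%1) to get the pair's dst offset.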
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r5
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
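; The luma DC transform is a 4x4 Hadamard (Walsh) transform: WALSH4_1D over the
; rows, a transpose, then WALSH4_1D again.  Each of the 16 results is
; dequantized as roughly (dc * qmul + 128) >> 8 (qmul is pre-shifted for very
; large values, see .big_qmul) and stored to output[16*i], the DC slot of the
; corresponding 4x4 luma block.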

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
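; The coefficients are interleaved with pw_1 so every dword holds (dc, 1); the
; multiplier dword holds qmul in its low word and 128 (added into the high half
; of t3d by the caller) in its high word, so a single pmaddwd yields
; dc*qmul + 128, ready for the arithmetic shift by %3.  The SSE2 path in
; DEQUANT_STORE uses the same trick.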
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; t0 = shift, t1 = tmp, t2 = output, t3 = qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%if cpuflag(sse2)
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd        m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7