;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

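; scan8[] maps a 4x4 block index (as iterated by the add16/add8 functions
; below) to its position in the decoder's 8-wide non-zero-count cache, so
; that nnzc[scan8[i]] tells whether block i has any coefficients to add.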
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
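; Two 1-D passes of the 4-point H.264 inverse transform with a transpose in
; between; the +32 rounding bias is added before the second pass and
; STORE_DIFFx2 performs the final >>6 and the clipped add to dst.
; Illustrative C-style sketch of one 1-D pass (variable names are made up):
;   z0 = b0 + b2;         z1 = b0 - b2;
;   z2 = (b1 >> 1) - b3;  z3 = b1 + (b3 >> 1);
;   o0 = z0 + z3;  o1 = z1 + z2;  o2 = z1 - z2;  o3 = z0 - z3;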
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    SBUTTERFLY dq, 0, 2, 4
    MOVHL m1, m0
    MOVHL m3, m2
%endif
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    movsxdifnidn r2, r2d
    IDCT4_ADD    r0, r1, r2
    RET

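; One 1-D pass of the 8-point inverse transform: the odd half is built from
; m1/m3/m5/m7 (coefficients 1,3,5,7), the even half from m2/m6 plus the two
; memory operands %1/%2 (coefficients 0 and 4), and the halves are merged
; with the SUMSUB_BA butterflies at the end.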
%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP 0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    psraw        m7, m1, 2
    SWAP 7,1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    psraw        m5, m6, 1
    SWAP 5,6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
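; MMX registers hold only four coefficients, so the 8x8 transform is split
; into two 4-column halves: IDCT8_ADD_MMX_START runs the first 1-D pass on
; one half and transposes it into the scratch buffer %2, and
; IDCT8_ADD_MMX_END below runs the second pass on that scratch data before
; adding the result to dst.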
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    movsxdifnidn r2, r2d
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn  r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

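; DC-only block: %1 holds the DC coefficient, which is rounded to
; (dc + 32) >> 6 and splat into m0, with its negation in m1 (both packed to
; unsigned bytes); 3*stride is left in %1 for the addressing below.
; DC_ADD_MMXEXT_OP then applies the signed offset to four rows of pixels as
; a saturating add of m0 followed by a saturating subtract of m1.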
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
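; Loops over the 16 luma 4x4 blocks: nnzc[scan8[i]] == 0 means block i has
; no coefficients and is skipped, otherwise block_offset[i] gives the pixel
; offset of the block inside dst and IDCT4_ADD reconstructs it.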
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
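; dest is an array of per-plane pointers (uint8_t **): the plane helper
; above is called once per chroma plane, with r5/r2 advanced to the plane's
; first block and the plane pointer read through dst2q on x86-64 or
; reloaded from the stack argument on x86-32.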
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call

cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call         h264_idct_add8_mmx_plane
    add r5, 4
    call         h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add r5, 4   ; set to 32
    add r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call         h264_idct_add8_mmx_plane
    add r5, 4
    call         h264_idct_add8_mmx_plane

    RET ; TODO: check rep ret after a function call

h264_idct_add8_mmxext_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET ; TODO: check rep ret after a function call

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
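; Adds the DC terms of two horizontally adjacent 4x4 blocks (an 8x4 pixel
; area) in one call: the two DCs are read from [r2] and [r2+32] (adjacent
; blocks are stored 32 bytes apart), rounded to (dc + 32) >> 6 and expanded
; so each half of the register carries one block's offset.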
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
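; Packs two horizontally adjacent 4x4 blocks (stored 32 bytes apart) into
; the low and high halves of each XMM register and transforms them
; together, reconstructing an 8x4 pixel area per call.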
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    mova [r2+ 0], m7
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

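; One step of the unrolled add16 loop: %2 is the scan8 cache offset of an
; even-numbered block, so the 16-bit load from nnzc covers that block and
; its right-hand neighbour; if either has coefficients, the pair is
; reconstructed by h264_add8x4_idct_sse2.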
%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r5
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
REP_RET

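; Intra variant of the cycle above: when nnzc reports no coefficients, the
; DC terms at [r2] and [r2+32] may still be non-zero (for intra 16x16
; macroblocks they are transformed separately), so the block pair falls
; back to h264_idct_dc_add8_mmxext instead of being skipped.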
%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov         r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
REP_RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
REP_RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

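; 4-point Hadamard (Walsh) butterfly used on the 4x4 array of luma DC
; coefficients; two passes with a transpose in between form the 2-D inverse
; transform that precedes dequantization.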
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

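; Dequantizes four DC values per register: each 16-bit coefficient is
; interleaved with the constant 1 and multiplied with pmaddwd against the
; dword the caller builds from qmul and the 128 rounding bias, giving
; dc * qmul + 128 in one step; the arithmetic shift by the last argument
; (8 in the common case) then completes (dc * qmul + 128) >> 8.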
%macro DEQUANT 1-3
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
%else
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endif
%endmacro

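; Scatters the dequantized DC values back to the DC slot of each 4x4 block
; in the output array: consecutive blocks are 32 bytes (16 int16_t) apart,
; so each extracted word is stored at [t2 + block_index*32], with the block
; indices given by the remaining macro arguments.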
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT     %1
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT     m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT     m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%if cpuflag(sse2)
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd        m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7

; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd       %3, [%7]
    movd       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5
    packuswb   %4, %5
    movd     [%7], %3
    movd  [%7+%8], %4
%endmacro

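; SSE2/AVX counterpart of DC_ADD_MMXEXT_INIT above: same (dc + 32) >> 6
; splat into m0/m1, but broadcast with pshuflw, and 3*stride is left in %1
; for the row addressing in DC_ADD_MMXEXT_OP.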
%macro DC_ADD_INIT 1
    add      %1d, 32
    sar      %1d, 6
    movd     m0, %1d
    pshuflw  m0, m0, 0
    lea      %1, [3*stride_q]
    pxor     m1, m1
    psubw    m1, m0
    packuswb m0, m0
    packuswb m1, m1
%endmacro

%macro IDCT_XMM 1

INIT_XMM %1

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx             r3d, word [block_q]
    mov   dword [block_q], 0
    DC_ADD_INIT r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
RET

%endmacro

IDCT_XMM sse2
IDCT_XMM avx