;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NOTE(review): the macros used throughout this file (cglobal, INIT_MMX/INIT_XMM,
; mova, REP_RET, ...) are not defined here; presumably they come from this include.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f (bit pattern of 2^24 as an IEEE-754 single) - used in
; ff_float_to_fixed24(). Loaded with aligned 16-byte moves in the SSE versions.
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
; pmulhuw multipliers (result is the high word of x*mul):
;   21846 -> roughly x/3,  32768 -> x>>1,  0 -> lane cleared
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
; pmaddwd weights applied to the scaled counts produced with pw_bap_mul1
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
pd_1:   times 4 dd 1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------

; Emits ac3_exponent_min for the instruction set chosen by the preceding INIT_*.
; For every mmsize-byte group of exponents, the unsigned byte minimum over the
; rows at exp + k*256 (k = 0..reuse_blks) is written back to row 0.
; Nothing is done when reuse_blks is 0.
; LOOP_ALIGN must be defined (possibly empty) before the macro is expanded.
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    ; reuse_blks and expn are C ints, so only their low 32 bits are defined on
    ; x86-64. Work on the 32-bit registers; the 32-bit shl also zero-extends
    ; into reuse_blksq, which is used as a 64-bit byte offset below.
    shl  reuse_blksd, 8             ; byte offset of the last row (256 per row)
    jz .end
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq] ; start from the last row
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                    ; borrow means row 0 has just been handled
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; LOOP_ALIGN is expanded in front of both loop labels of AC3_EXPONENT_MIN.
; The plain MMX build uses no alignment padding.
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
; defined here as well so the SSE2 loops stay aligned when the MMXEXT build
; above is compiled out
%define LOOP_ALIGN ALIGN 16
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------

; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
; %1 = src (also receives the result in its low word; other words are garbage)
; %2 = tmp (clobbered)
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    movhlps     %2, %1              ; fold the high qword onto the low qword
    por         %1, %2
    pshuflw     %2, %1, q0032       ; fold words 2,3 onto words 0,1
    por         %1, %2
    pshuflw     %2, %1, q0001       ; fold word 1 onto word 0
    por         %1, %2
%elif cpuflag(mmxext)
    pshufw      %2, %1, q0032       ; fold words 2,3 onto words 0,1
    por         %1, %2
    pshufw      %2, %1, q0001       ; fold word 1 onto word 0
    por         %1, %2
%else ; mmx
    ; no word shuffle available: fold with 64-bit shifts instead
    movq        %2, %1
    psrlq       %2, 32
    por         %1, %2
    movq        %2, %1
    psrlq       %2, 16
    por         %1, %2
%endif
%endmacro

; Emits ac3_max_msb_abs_int16 for the instruction set chosen by the preceding
; INIT_*.
; %1 = min_max : track per-lane min and max, combine abs(min)|abs(max) at the end
;      anything else (or_abs) : 'or' together the abs of every element
; Each iteration consumes 2*mmsize bytes (mmsize int16 values) with aligned
; loads and decrements len by mmsize.
; The result is returned in eax, masked to 16 bits.
; NOTE(review): ABS2 is not defined in this file; it is used as
; "abs of the first two operands, last two operands are scratch" - confirm.
%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
    pxor        m2, m2              ; accumulator: running min (min_max) or 'or' (or_abs)
    pxor        m3, m3              ; running max (min_max); scratch for ABS2 otherwise
.loop:
%ifidn %1, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %1, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    OR_WORDS_HORIZ m2, m0
    movd       eax, m2
    and        eax, 0xFFFF          ; only the low word holds the folded result
    RET
%endmacro

; Method per instruction set, as described in the header comment above:
; or_abs for mmx and ssse3, min_max for mmxext and sse2.
INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs

;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------

; Emits ac3_<%1>shift_int<%2> for the instruction set chosen by the preceding
; INIT_*.
; %1 = l or r       (only used to build the function name)
; %2 = 16 or 32     (element width in bits; names the function and scales len)
; %3 = packed shift instruction applied to every register
; The shift is done in place. Each iteration handles 4*mmsize bytes with
; aligned loads/stores and decrements len by the matching element count
; (mmsize*32/%2).
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd      m0, shiftd            ; shift count, zero-extended into m0
.loop:
    mova      m1, [srcq         ]
    mova      m2, [srcq+mmsize  ]
    mova      m3, [srcq+mmsize*2]
    mova      m4, [srcq+mmsize*3]
    %3        m1, m0
    %3        m2, m0
    %3        m3, m0
    %3        m4, m0
    mova  [srcq         ], m1
    mova  [srcq+mmsize  ], m2
    mova  [srcq+mmsize*2], m3
    mova  [srcq+mmsize*3], m4
    add     srcq, mmsize*4
    sub     lend, mmsize*32/%2
    ja .loop
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

; in-place left shift of 16-bit words (psllw), MMX and SSE2 builds
INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

; in-place arithmetic right shift of 32-bit dwords (psrad), MMX and SSE2 builds
INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
; 3DNow! version: multiplies each float by 2^24 and converts it with pf2id.
; Processes 8 floats (32 bytes) per iteration.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq   m0, [pf_1_24]            ; two copies of 16777216.0f
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    femms                           ; leave MMX/3DNow! state before returning
    RET

; SSE version: the multiply is done in xmm registers, but SSE1 has no packed
; float->int conversion into xmm, so each pair of floats is converted into an
; MMX register with cvtps2pi. Processes 8 floats (32 bytes) per iteration;
; src is read with aligned 16-byte loads.
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1               ; low two floats of m1
    movhlps    m1, m1               ; bring the high two floats down
    cvtps2pi  mm1, m1
    cvtps2pi  mm2, m2               ; low two floats of m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    emms                            ; MMX registers were used: restore x87 state
    RET

; SSE2 version: multiply by 2^24 and convert with cvtps2dq.
; When m8 is defined (16 xmm registers available) the loop is unrolled to
; 32 floats per iteration, otherwise 16. src and dst are accessed with
; aligned 16-byte moves.
; NOTE(review): presumably callers pass a len that is a multiple of the
; per-iteration count - confirm.
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit unsigned int: count down on lend (as the 3dnow and sse
    ; versions do) so undefined upper register bits on x86-64 are never used.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

; Horizontal add of the 4 dwords of an xmm register; the total ends up in the
; low dword of %1. The remaining dwords of %1 and all of %2 are clobbered.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1                 ; tmp low qword = src high qword
    paddd    %1, %2                 ; dwords 0,1 now hold d0+d2, d1+d3
    pshufd   %2, %1, 0x1            ; tmp dword 0 = src dword 1
    paddd    %1, %2                 ; dword 0 = d0+d1+d2+d3
%endmacro

; mant_cnt is read as 6 rows of 16 uint16 (32 bytes per row), with aligned
; 16-byte loads. The result is returned in eax.
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; Part 1: sum the 6 rows word-wise. m0 accumulates entries 0-7 of each
    ; row, m1 entries 8-15.
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    ; weight every summed count with the matching word of ac3_bap_bits
    ; (external table, read here as 16 words) and add everything up
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0
    ; Part 2: entries 1-4 of each row (8 bytes at byte offset 2) are scaled
    ; per row before being summed: two rows per register.
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3              ; high word of count*multiplier
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd            ; total = part 2 + part 1
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

; Packed absolute value of dwords, in place.
; %1 = src/dst
; %2 = tmp, clobbered by the pre-SSSE3 fallback; unused when pabsd is available
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1                 ; tmp = all-ones in lanes where src < 0
    pxor     %1, %2                 ; one's complement of the negative lanes
    psubd    %1, %2                 ; subtract -1 there: two's complement negate
%endif
%endmacro

; Emits ac3_extract_exponents for the instruction set chosen by the preceding
; INIT_*. Handles 4 coefficients per iteration: reads 16 bytes of coef with an
; aligned load and writes 4 exponent bytes to exp.
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; len is a C int but is used as a 64-bit (negated) index below: sign-extend
    ; it first, since the upper register half is undefined on x86-64.
    movsxdifnidn lenq, lend
    ; point both arrays at their end and count a negative index up to zero
    add     expq, lenq
    lea    coefq, [coefq+4*lenq]
    neg     lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2                ; (2*|coef|)|1 is never zero
    cvtdq2ps  m1, m0
    psrld     m1, 23                ; biased float exponent (sign bit is 0)
    mova      m0, m3
    psubd     m0, m1                ; 151 - biased exponent
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd  [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

; SSE2 build uses the PABSD fallback, SSSE3 build uses pabsd
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif