;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
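;
; Rough scalar model of one .loop iteration below (a sketch in the
; spirit of the C reference; the ;~ lines are pseudocode only):
;~ for (l = start; l < end; l++) {
;~     const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
;~     float rscale = scale[l][0] * (1.0f / 16);
;~     for (i = 0; i < 8; i++)
;~         dst[l][i] = ptr[i] * rscale;
;~ }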

%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea       srcq, [srcq + offsetq]   ; srcq = &hf_vq[0][vq_offset]
    shl     startq, 2                  ; subband index -> byte offset (int32)
    mov    offsetd, endm
%define DICT offsetq
    shl    offsetq, 2                  ; end index -> byte offset
    mov       endm, offsetq            ; store back; offsetq is now free for DICT
.loop:
%if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov    offsetd, [numq + startq]
    mulss       m0, [pf_inv16]
    shl       DICT, 5                  ; vq_num[l] * 32: byte offset of the codebook row
    shufps      m0, m0, 0              ; broadcast the scale factor to all 4 lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
%else
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq + 8 * startq +  0], m1
    mova [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl       .loop
.end:
%if notcpuflag(sse2)
    emms
%endif
    REP_RET
%endmacro

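; SSE2 is part of the x86_64 baseline, so the SSE1 variant (which falls
; back on MMX registers and needs a trailing emms) is built for x86_32 only.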
%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF

; %1=v0/v1  %2=in1  %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
; the coefficients are walked forward for v0 and backward for v1
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
%if cpuflag(fma3)
    fmaddps     va, m4, %3, va
    fmaddps     vb, m0, %3, vb
%else
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    mova        m4, va
    unpcklps    va, vb ; va1 vb1 va2 vb2
    unpckhps    m4, vb ; va3 vb3 va4 vb4
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va2+4 vb2+4
    addps       vb, m4 ; va1..4 vb1..4
    movlps  [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl   .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
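;
; dca_lfe_fir0 produces 2*32 output samples from 8 taps each and
; dca_lfe_fir1 produces 2*64 from 4 taps each. Rough model of the v0
; half (a sketch; the v1 half mirrors it with the coefficient rows
; traversed backwards and the input sample order flipped):
;~ for (k = 0; k < NUM_OUT; k++) {
;~     float v0 = 0;
;~     for (j = 0; j < NUM_COEF; j++)
;~         v0 += in[-j] * coefs[k * NUM_COEF + j];
;~     out[k] = v0;
;~ }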
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif

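; SETZERO zeroes a register. pxor on XMM registers requires SSE2 and
; 256-bit vpxor requires AVX2, so the integer-domain idiom is only used
; on the SSE2 paths; SSE and AVX zero in the float domain via xorps
; (x86inc collapses the three-operand form when AVX is unavailable).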
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro

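; SHUF reverses the element order of the mmsize bytes of floats ending
; at address %2 + 16; %3 is a scratch register, used only by the AVX
; variant (vperm2f128 swaps the two 128-bit lanes, vshufps then
; reverses the elements within each lane).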
%macro SHUF 3
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP   1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
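;
; Rough scalar model of one i-iteration (a sketch matching the ;~
; annotations inside INNER_LOOP; synth_buf stands for synth_buf + offset):
;~ a = synth_buf2[i];      b = synth_buf2[i + 16];
;~ c = 0;                  d = 0;
;~ for (j = 0; j < 512 - offset; j += 64) {            // .loop1
;~     a += window[i + j]      * -synth_buf[15 - i + j];
;~     b += window[i + j + 16] *  synth_buf[     i + j];
;~     c += window[i + j + 32] *  synth_buf[16 + i + j];
;~     d += window[i + j + 48] *  synth_buf[31 - i + j];
;~ }
;~ // .loop2: same terms for the remaining j, wrapped around the
;~ // 512-sample ring buffer
;~ out[i]      = a * scale;  out[i + 16] = b * scale;
;~ synth_buf2[i] = c;        synth_buf2[i + 16] = d;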
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                              synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; prepare inner counter limit for .loop1: the last multiple of 64
    ; below (512 - offset), in bytes
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i        r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
%else
%define i 0
%define OFFQ  r5q
%endif

%define buf2     synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO       m3
    SETZERO       m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1     r0q
%define ptr2     r1q
%define win      r2q
%define j        r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1     r6q
%define ptr2     r7q ; must be loaded
%define win      r8q
%define j        r9q
    SETZERO       m9
    SETZERO      m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge       .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz          .end
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge       .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for the next iteration anyway
    mov         outq, outmp       ; clobbers j (r3q), which is reset each iteration
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova   [buf2 + i +  0 * 4], m3
    mova   [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova   [buf2 + i +  0 * 4 + mmsize], m9
    mova   [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova   [outq + i +  0 * 4], m1
    mova   [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova   [outq + i +  0 * 4 + mmsize], m7
    mova   [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge    .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
SYNTH_FILTER
%endif