;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION .text

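; FIR_LOOP emits one half (v0 or v1) of the LFE FIR: every iteration forms
; the NUM_COEF-tap products for a pair of outputs (va/vb), reduces them
; horizontally and stores two floats, so count advances by 8 bytes.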
; %1=v0/v1  %2=in1  %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
; the coefficient pointer walks forwards for v0 and backwards for v1
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
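    ; horizontal reduction: interleave va/vb, then fold twice so that the
    ; two finished dot-product sums end up in the low two lanes of vb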
    mova        m4, va
    unpcklps    va, vb ; va3 vb3 va4 vb4
    unpckhps    m4, vb ; va1 vb1 va2 vb2
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va1+3  vb1+3
    addps       vb, m4 ; va0..4 vb0..4
    movlps  [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl   .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
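; from the defines below:
; %1 == 0: NUM_COEF = 8, NUM_OUT = 32 (dca_lfe_fir0)
; %1 == 1: NUM_COEF = 4, NUM_OUT = 64 (dca_lfe_fir1)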
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif
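    ; the q0123 shuffles reverse the four floats of each input vector, so
    ; the FIR below walks the input back to front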

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
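    ; count runs from -4*NUM_OUT up to 0 in steps of 8 bytes (two output
    ; floats per FIR_LOOP iteration); outq and cf0q were advanced past the
    ; end above so that the negative count indexes back into them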
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf0q sits past the coefficient table; the sub below repositions it
    ; for the v1 pass
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro
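; Rough scalar model of one call, as a hedged sketch only -- dot() and the
; coef_fwd()/coef_bwd() block selectors are hypothetical helpers, and the
; real coefficient layout is specific to the table this code is fed:
;~ for (n = 0; n < NUM_OUT; n++) {
;~     out[n]           = dot(in_rev, coef_fwd(n), NUM_COEF); // v0 half
;~     out[NUM_OUT + n] = dot(in,     coef_bwd(n), NUM_COEF); // v1 half
;~ }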

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

%macro SETZERO 1
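; pxor on XMM registers needs SSE2, and a 256-bit pxor would need AVX2, so
; the plain SSE and AVX targets zero with xorps instead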
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro

%macro SHUF 3
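; SHUF %1, %2, %3: load the mmsize bytes ending at %2 + 16 and reverse the
; float order; vshufps only reverses within each 128-bit lane, so the AVX
; path swaps the two lanes first with vperm2f128 (%3 is scratch)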
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP   1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
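    ; step one block backwards through the buffers; the caller's jge keeps
    ; looping while the decremented j is still >= 0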
    sub            j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
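; Per call, the a/b accumulators are scaled and stored to out[] while c/d
; are written back to synth_buf2[] for the next call, as the ;~ pseudocode
; annotations spell out.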
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                              synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
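    ; on UNIX64 the scale argument already arrives in xmm0: broadcast it
    ; there and, for YMM builds, mirror the result into the high lane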
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; prepare inner counter limit for .loop1: ((480 - offset) & -64) * 4 bytes
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i        r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
%else
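; on x86_64 with YMM registers the two mmsize halves cover all 16 * 4 bytes
; per pass, so i is the constant 0 and .mainloop runs exactly once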
%define i 0
%define OFFQ  r5q
%endif

%define buf2     synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO       m3
    SETZERO       m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1     r0q
%define ptr2     r1q
%define win      r2q
%define j        r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1     r6q
%define ptr2     r7q ; must be loaded
%define win      r8q
%define j        r9q
    SETZERO       m9
    SETZERO      m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge       .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz          .end
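    ; second pass: rewind the buffer pointers and continue with the window
    ; offset by 64 floats for the wrapped part of the buffer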
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, hence the 64 * 4 OFFSET passed to INNER_LOOP below
    sub            j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge       .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for the next iteration anyway
    mov         outq, outmp       ; clobbers j (same register), re-set next iteration
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova   [buf2 + i +  0 * 4], m3
    mova   [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova   [buf2 + i +  0 * 4 + mmsize], m9
    mova   [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova   [outq + i +  0 * 4], m1
    mova   [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova   [outq + i +  0 * 4 + mmsize], m7
    mova   [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge    .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
INIT_YMM fma3
SYNTH_FILTER