sbrdsp.asm 16 KB
Newer Older
1 2 3 4
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
5
;* This file is part of FFmpeg.
6
;*
7
;* FFmpeg is free software; you can redistribute it and/or
8 9 10 11
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
12
;* FFmpeg is distributed in the hope that it will be useful,
13 14 15 16 17
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with FFmpeg; if not, write to the Free Software
19 20 21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

22
%include "libavutil/x86/x86util.asm"
23

24 25 26
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask         times 2 dd 1<<31, 0
27
ps_mask2        times 2 dd 0, 1<<31
28
ps_mask3        dd  0, 0, 0, 1<<31
29 30 31 32 33 34
ps_noise0       times 2 dd  1.0,  0.0,
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
cextern         sbr_noise_table
35
cextern         ps_neg
36

37
SECTION .text
38 39 40

INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
41
    mov        r2d, r1d
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
82
    movss       r0m,  m0
83 84 85
    fld         dword r0m
%endif
    RET
86 87 88 89 90 91 92 93 94

%define STEP  40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
95
    jz          .loop1
96
.loop4:
97 98 99 100
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
101 102
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
103 104
    unpcklps    m0, m0
    unpcklps    m1, m1
105 106 107 108 109 110 111 112 113 114 115
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
116 117
    movlps      m2, [r1]
    unpcklps    m0, m0
118
    mulps       m2, m0
119
    movlps    [r0], m2
120 121 122 123 124 125 126
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET
127

128 129 130
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw             ; (a1[0] a1[1])*bw
    mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov        r2d, Sm
    mov        r3d, Em
%define  start r2q
%define  end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define  start BWq
%define  end   Sq
%endif
    sub      start, end          ; neg num of loops
    lea    X_highq, [X_highq + end*2*4]
    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
    shl      start, 3            ; offset from num loops

    mova        m0, [X_lowq + start]
165 166 167 168 169 170
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
171
.loop2:
172
    movu        m7, [X_lowq + start + 8]        ; BbCc
173
    mova        m6, m0
174 175 176 177
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
178
    mulps       m7, m3
179 180 181 182
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start +16]        ; CcDd
183
    addps       m7, m0
184 185 186
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + start], m7
187 188 189
    add     start, 16
    jnz         .loop2
    RET
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211

cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]
.loop:
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230

INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]
    mova              m3, [ps_neg]
.loop:
    mova              m1, [zq]
    xorps             m0, m3, [r2q]
    shufps            m0, m0, m0, q0123
    unpcklps          m2, m0, m1
    unpckhps          m0, m0, m1
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q
    jl             .loop
    REP_RET
231 232 233

INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
234
    lea        r1q, [zq+256]
235
.loop:
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
251
    REP_RET
252

253
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
254 255
%macro SBR_QMF_DEINT_BFLY  0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
256 257
    mov               cq, 64*4-2*mmsize
    lea            vrevq, [vq + 64*4]
258
.loop:
259 260
    mova              m0, [src0q+cq]
    mova              m1, [src1q]
261 262
    mova              m4, [src0q+cq+mmsize]
    mova              m5, [src1q+mmsize]
263
%if cpuflag(sse2)
264 265 266 267
    pshufd            m2, m0, q0123
    pshufd            m3, m1, q0123
    pshufd            m6, m4, q0123
    pshufd            m7, m5, q0123
268
%else
269 270 271 272
    shufps            m2, m0, m0, q0123
    shufps            m3, m1, m1, q0123
    shufps            m6, m4, m4, q0123
    shufps            m7, m5, m5, q0123
273
%endif
274
    addps             m5, m2
275 276
    subps             m0, m7
    addps             m1, m6
277
    subps             m4, m3
278
    mova         [vrevq], m1
279
    mova  [vrevq+mmsize], m5
280
    mova         [vq+cq], m0
281
    mova  [vq+cq+mmsize], m4
282 283 284 285
    add            src1q, 2*mmsize
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge            .loop
286 287 288 289 290 291 292 293
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY
294 295

INIT_XMM sse2
296
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
297 298 299 300
%define OFFSET  (32*4-2*mmsize)
    mov       r3q, OFFSET
    lea       r1q, [zq + (32+1)*4]
    lea       r2q, [zq + 64*4]
301
    mova       m5, [ps_neg]
302 303 304 305 306 307
.loop:
    movu       m0, [r1q]
    movu       m2, [r1q + mmsize]
    movu       m1, [zq + r3q + 4 + mmsize]
    movu       m3, [zq + r3q + 4]

308 309
    pxor       m2, m5
    pxor       m0, m5
310 311
    pshufd     m2, m2, q0123
    pshufd     m0, m0, q0123
312
    SBUTTERFLY dq, 2, 3, 4
313 314 315 316 317 318 319 320
    SBUTTERFLY dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge      .loop
321
    movq       m2, [zq]
322 323
    movq    [r2q], m2
    REP_RET
324 325 326 327 328 329 330 331 332 333 334 335 336 337

%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST  1
338
%ifdef PIC
339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
385
    movsxdifnidn    noiseq, noised
386 387
    dec    noiseq
    shl    count, 2
388
%ifdef PIC
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*count]
    add      s_mq, count
    add   q_filtq, count
    shl    noiseq, 3
    pxor       m5, m5
    neg    count
.loop:
    mova       m1, [q_filtq + count]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3
    punpckhdq  m2, m1, m1
    punpckldq  m1, m1
    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 == 0
    pcmpeqd    m7, m4, m5 ; m7 == 0
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6
    pand       m2, m7
    movu       m6, [Yq + 2*count]
    movu       m7, [Yq + 2*count + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu    [Yq + 2*count], m6
    movu    [Yq + 2*count + mmsize], m7
    add    count, mmsize
    jl      .loop
    RET
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449

INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov        cq, -COUNT
    lea     vrevq, [vq + OFFSET + COUNT]
    add        vq, OFFSET-mmsize
    add      srcq, 2*COUNT
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize]
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020
    shufps     m1, m0, q1313
    xorps      m2, m3
    mova     [vq], m1
    mova  [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl      .loop
    REP_RET
450 451 452 453 454 455 456 457

%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov   cntq, 37*8
    add     xq, cntq
    neg   cntq

%if cpuflag(sse3)
458
%define   MOVH  movsd
459 460
    movddup m5, [xq+cntq]
%else
461
%define   MOVH  movlps
462 463 464
    movlps  m5, [xq+cntq]
    movlhps m5, m5
%endif
465 466
    MOVH    m7, [xq+cntq+8 ]
    MOVH    m1, [xq+cntq+16]
467 468 469 470 471 472 473 474 475
    shufps  m7, m7, q0110
    shufps  m1, m1, q0110
    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps  [rsp   ], m3
    movaps  [rsp+16], m4
    add   cntq, 8

476
    MOVH    m2, [xq+cntq+16]
477 478 479 480 481 482 483 484 485 486
    movlhps m7, m7
    shufps  m2, m2, q0110
    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps   m4, m7, m2
    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    add   cntq, 8
487
    MOVH    m0, [xq+cntq+16]
488 489 490 491 492 493 494 495 496
    movlhps m1, m1
    shufps  m0, m0, q0110
    mulps   m3, m1, m2
    mulps   m4, m1, m0
    mulps   m1, m1
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
497
    MOVH    m1, [xq+cntq+16]
498 499 500 501 502 503 504 505 506
    movlhps m2, m2
    shufps  m1, m1, q0110
    mulps   m3, m2, m0
    mulps   m4, m2, m1
    mulps   m2, m2
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
507
    MOVH    m2, [xq+cntq+16]
508 509 510 511 512 513 514 515 516 517 518
    movlhps m0, m0
    shufps  m2, m2, q0110
    mulps   m3, m0, m1
    mulps   m4, m0, m2
    mulps   m0, m0
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    jl .loop

    movlhps m1, m1
519
    mulps   m2, m1
520
    mulps   m1, m1
521
    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
522 523 524 525
    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

526
    xorps   m2, [ps_mask3]
527 528
    xorps   m5, [ps_mask3]
    xorps   m6, [ps_mask3]
529 530
    HADDPS  m2, m5, m3
    HADDPS  m7, m6, m4
531
%if cpuflag(sse3)
532
    movshdup m0, m1
533
%else
534
    movss   m0, m1
535 536
    shufps  m1, m1, q0001
%endif
537
    addss   m1, m0
538
    movaps  [phiq     ], m2
539 540 541 542 543 544 545 546 547 548
    movhps  [phiq+0x18], m7
    movss   [phiq+0x28], m7
    movss   [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE