;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0 (sign bit of the even elements)
ps_mask         times 2 dd 1<<31, 0
; mask equivalent for multiply by 1.0 -1.0 (sign bit of the odd elements)
ps_mask2        times 2 dd 0, 1<<31
; mask equivalent for multiply by -1.0 (sign bit of every element)
ps_neg          times 4 dd 1<<31

SECTION_TEXT

INIT_XMM sse
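; Reference sketch (not part of the original file): assuming x points to n
; interleaved (re,im) float pairs, the scalar equivalent is roughly
;     sum = 0.0f;
;     for (i = 0; i < n; i++)
;         sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
;     return sum;
; Two accumulators (m0/m1) are used below to hide addps latency before the
; final horizontal reduction at .end.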
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m,  m0
    fld         dword r0m
%endif
    RET

%define STEP  40*4*2  ; byte stride between consecutive X_high rows (40 float pairs)
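; Reference sketch (not part of the original file): assuming the arguments are
; (Y, X_high, g_filt, m_max, ixh) in r0..r4, with X_high rows STEP bytes apart,
; this is roughly
;     for (m = 0; m < m_max; m++) {
;         Y[m][0] = X_high[m][ixh][0] * g_filt[m];
;         Y[m][1] = X_high[m][ixh][1] * g_filt[m];
;     }
; The main loop handles four samples per iteration; the tail loop below does
; one sample (one re/im pair) at a time.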
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; elements 0 and 1 (one re/im pair) are computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps    [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
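; In scalar terms the loop below applies, roughly, the complex second-order
; recurrence (see the alpha0/alpha1 comments that follow):
;     X_high[i] = X_low[i] + (alpha0 * bw)    * X_low[i-1]
;                          + (alpha1 * bw*bw) * X_low[i-2]
; for i = start..end-1, where each element is an interleaved (re,im) pair and
; the products are complex multiplications.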
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw             ; (a1[0] a1[1])*bw
    mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end are the 6th and 7th args, passed on the stack
    mov        r2d, Sm
    mov        r3d, Em
%define  start r2q
%define  end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define  start BWq
%define  end   Sq
%endif
    sub      start, end          ; neg num of loops
    lea    X_highq, [X_highq + end*2*4]
    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
    shl      start, 3            ; offset from num loops

    mova        m0, [X_lowq + start]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]        ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start +16]        ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + start], m7
    add     start, 16
    jnz         .loop2
    RET

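; Sums five consecutive 64-float blocks of z in place; roughly
;     for (k = 0; k < 64; k++)
;         z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
; (reference sketch, not part of the original file)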
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]
.loop:
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET

INIT_XMM sse
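; Reference sketch (not part of the original file): interleaves the negated,
; reversed top half of z with its bottom half, roughly
;     for (k = 0; k < 32; k++) {
;         W[k][0] = -z[63 - k];
;         W[k][1] =  z[k];
;     }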
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]
    mova              m3, [ps_neg]
.loop:
    mova              m1, [zq]
    xorps             m0, m3, [r2q]
    shufps            m0, m0, m0, q0123
    unpcklps          m2, m0, m1
    unpckhps          m0, m0, m1
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q
    jl             .loop
    REP_RET

INIT_XMM sse
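; Negates every odd-indexed float of a 64-element buffer by XORing the sign
; bit with ps_mask2; roughly: for (i = 1; i < 64; i += 2) z[i] = -z[i];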
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
    REP_RET

; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
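; Reference sketch (not part of the original file): a deinterleaving butterfly,
; roughly
;     for (i = 0; i < 64; i++) {
;         v[i]       = src0[i] - src1[63 - i];
;         v[127 - i] = src0[i] + src1[63 - i];
;     }
; The SSE2 variant differs only in using pshufd instead of shufps for the
; in-register reversal.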
%macro SBR_QMF_DEINT_BFLY  0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov        cq, 64*4-2*mmsize
    lea     vrevq, [vq + 64*4]
.loop:
    mova       m0, [src0q+cq]
    mova       m1, [src1q]
    mova       m4, [src0q+cq+mmsize]
    mova       m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd     m2, m0, q0123
    pshufd     m3, m1, q0123
    pshufd     m6, m4, q0123
    pshufd     m7, m5, q0123
%else
    shufps     m2, m0, m0, q0123
    shufps     m3, m1, m1, q0123
    shufps     m6, m4, m4, q0123
    shufps     m7, m5, m5, q0123
%endif
    addps      m5, m2
    subps      m0, m7
    addps      m1, m6
    subps      m4, m3
    mova  [vrevq], m1
    mova  [vrevq+mmsize], m5
    mova  [vq+cq], m0
    mova  [vq+cq+mmsize], m4
    add     src1q, 2*mmsize
    add     vrevq, 2*mmsize
    sub        cq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY