;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

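; 1-D inverse Walsh-Hadamard transform, used for lossless blocks. Transforms
; one 4-point column per word lane of m0-m3 in place, using only additions,
; subtractions and a 1-bit shift; m4 and m5 are clobbered as scratch. Note
; that SWAP renames registers rather than moving data, so the trailing SWAP
; merely relabels the results back into m0-m3 order.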
%macro VP9_IWHT4_1D 0
    SWAP                 1, 2, 3
    paddw               m0, m2
    psubw               m3, m1
    psubw               m4, m0, m3
    psraw               m4, 1
    psubw               m5, m4, m1
    SWAP                 5, 1
    psubw               m4, m2
    SWAP                 4, 2
    psubw               m0, m1
    paddw               m3, m2
    SWAP                 3, 2, 1
%endmacro

; (a*x + b*y + round) >> shift
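; Each dword lane of dst2/src holds an interleaved (a, b) word pair; pmaddwd
; multiplies it by the packed-word coefficients and sums into a dword, then
; the dword rounding vector in %3 (presumably pd_8192, matching the >> 14)
; is added before the arithmetic right shift.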
%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
    pmaddwd            m%1, m%2, %4
    pmaddwd            m%2,  %5
    paddd              m%1,  %3
    paddd              m%2,  %3
    psrad              m%1,  14
    psrad              m%2,  14
%endmacro

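; Same computation, producing two packed-word results from both dword halves.
; The coefficient constants pw_mC1_C2 / pw_C2_C1 are pasted together from the
; numeric coef1/coef2 arguments, so matching tables must be defined in the
; file that includes this template.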
%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
    VP9_MULSUB_2W_2X    %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
    VP9_MULSUB_2W_2X    %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
    packssdw           m%1, m%7
    packssdw           m%2, m%6
%endmacro

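; As above, but first interleaves the word sources so each dword lane holds
; an input pair; for coefficients c1/c2 this computes the rotation
;   dst1 = (src1*c2 - src2*c1 + rnd) >> 14
;   dst2 = (src1*c1 + src2*c2 + rnd) >> 14
; The 7-argument form uses dst1/dst2 themselves as the sources.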
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
    punpckhwd          m%6, m%2, m%1
    punpcklwd          m%2, m%1
    VP9_MULSUB_2W_4X   %1, %2, %3, %4, %5, %6, %7
%else
    punpckhwd          m%8, m%4, m%3
    punpcklwd          m%2, m%4, m%3
    VP9_MULSUB_2W_4X   %1, %2, %5, %6, %7, %8, %9
%endif
%endmacro

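; Final butterfly stage: with t0 in m2, t1 in m0, t2 in m1 and t3 in m3,
; forms out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3,
; then relabels the registers so that m0-m3 hold out0-out3.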
%macro VP9_IDCT4_1D_FINALIZE 0
    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
    SWAP                 0, 3, 2                            ; 3102 -> 0123
%endmacro

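; 1-D 4-point IDCT. 11585, 15137 and 6270 are VP9's 2^14-scaled cosines
; (16384*cos(pi/4), 16384*cos(pi/8) and 16384*cos(3*pi/8)):
;   t0/t1 = ((in0 +/- in2) * 11585 + rnd) >> 14
;   t2    = (in1 * 6270  - in3 * 15137 + rnd) >> 14
;   t3    = (in1 * 15137 + in3 * 6270  + rnd) >> 14
; On SSSE3 the caller is expected to preload m6 with a doubled constant
; (presumably pw_11585x2): pmulhrsw computes (x*c + 0x4000) >> 15, which
; equals the (x*11585 + 8192) >> 14 of the pmaddwd path. m7 must hold the
; dword rounding vector (presumably pd_8192) for the remaining pmaddwd-based
; rotation.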
%macro VP9_IDCT4_1D 0
%if cpuflag(ssse3)
    SUMSUB_BA            w, 2, 0, 4                         ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
    pmulhrsw            m2, m6                              ; m2=t0
    pmulhrsw            m0, m6                              ; m0=t1
%else ; <= sse2
    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5    ; m0=t1, m2=t0
%endif
    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
    VP9_IDCT4_1D_FINALIZE
%endmacro

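; 1-D 4-point inverse ADST (iadst4). The four input words live in the mmx
; registers m0-m3; the 32-bit intermediate math is done in full xmm registers
; (hence the movq2dq/movdq2q hops). 5283, 9929, 13377 and 15212 are VP9's
; fixed-point sinpi_1_9..sinpi_4_9 constants, and xmm5 is expected to be
; preloaded with the dword rounding vector (presumably pd_8192). On SSSE3,
; out2 = sinpi_3_9 * (in0 - in2 + in3) is computed directly on words with
; pmulhrsw and the doubled constant pw_13377x2, skipping one dword path.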
%macro VP9_IADST4_1D 0
    movq2dq           xmm0, m0
    movq2dq           xmm1, m1
    movq2dq           xmm2, m2
    movq2dq           xmm3, m3
%if cpuflag(ssse3)
    paddw               m3, m0                  ; m3 = in3 + in0
%endif
    punpcklwd         xmm0, xmm1                  ; interleaved (in0, in1) word pairs
    punpcklwd         xmm2, xmm3                  ; interleaved (in2, in3) word pairs
    pmaddwd           xmm1, xmm0, [pw_5283_13377] ; in0*5283  + in1*13377
    pmaddwd           xmm4, xmm0, [pw_9929_13377] ; in0*9929  + in1*13377
%if notcpuflag(ssse3)
    pmaddwd           xmm6, xmm0, [pw_13377_0]    ; in0*13377
%endif
    pmaddwd           xmm0, [pw_15212_m13377]     ; in0*15212 - in1*13377
    pmaddwd           xmm3, xmm2, [pw_15212_9929] ; in2*15212 + in3*9929
%if notcpuflag(ssse3)
    pmaddwd           xmm7, xmm2, [pw_m13377_13377] ; in3*13377 - in2*13377
%endif
    pmaddwd           xmm2, [pw_m5283_m15212]     ; -in2*5283 - in3*15212
%if cpuflag(ssse3)
    psubw               m3, m2                  ; m3 = in0 - in2 + in3
%else
    paddd             xmm6, xmm7                ; 13377 * (in0 - in2 + in3)
%endif
    paddd             xmm0, xmm2                ; in0*15212 - in1*13377 - in2*5283 - in3*15212
    paddd             xmm3, xmm5                ; += rnd
    paddd             xmm2, xmm5                ; += rnd
%if notcpuflag(ssse3)
    paddd             xmm6, xmm5
%endif
    paddd             xmm1, xmm3                ; out0 + rnd
    paddd             xmm0, xmm3                ; out3 + rnd
    paddd             xmm4, xmm2                ; out1 + rnd
    psrad             xmm1, 14
    psrad             xmm0, 14
    psrad             xmm4, 14
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_13377x2]        ; out2
%else
    psrad             xmm6, 14
%endif
    packssdw          xmm0, xmm0
    packssdw          xmm1, xmm1
    packssdw          xmm4, xmm4
%if notcpuflag(ssse3)
    packssdw          xmm6, xmm6
%endif
    movdq2q             m0, xmm0                ; out3
    movdq2q             m1, xmm1                ; out0
    movdq2q             m2, xmm4                ; out1
%if notcpuflag(ssse3)
    movdq2q             m3, xmm6                ; out2
%endif
    SWAP                 0, 1, 2, 3
%endmacro