;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_7: times 8 dw 7

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

section .text

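; UNPACK_ADD out_lo, out_hi, src_a, src_b, align_a, align_b
; Loads two rows of packed bytes, widens them to words against the zero
; register m7, and adds them pairwise: %1/%2 receive the low/high word
; halves of the sum.  %5/%6 select the load form ('a' aligned, 'u'
; unaligned).  Clobbers m4 and m5; m7 must already be zeroed.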
%macro UNPACK_ADD 6
    mov%5   %1, %3
    mov%6   m5, %4
    mova    m4, %1
    mova    %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw   %1, m5
    paddw   %2, m4
%endmacro

%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
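; Dirac's 8-tap half-pel kernel is (-1, 3, -7, 21, 21, -7, 3, -1)/32; as a
; rough C model of what the loop below computes per output pixel (a sketch
; only, with offsets in units of stride and av_clip_uint8 from FFmpeg):
;   dst[x] = av_clip_uint8((21*(src[0] + src[1]) + 3*(src[-2] + src[3])
;                           - 7*(src[-1] + src[2]) - (src[-3] + src[4])
;                           + 16) >> 5);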
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q
    pxor    m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
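; Same kernel applied horizontally: the taps now sit at byte offsets, so
; the shifted loads are potentially misaligned and UNPACK_ADD is used in
; its unaligned 'u' form throughout.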
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7
    and     widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro

%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
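; As a rough C model (a sketch; av_clip_int8 borrowed from FFmpeg):
;   dst[x] = av_clip_int8(src[x]) + 128;
; i.e. signed 16->8 saturation (packsswb) plus a +0x80 bias (paddb with
; pb_80).  Two rows are handled per iteration of .loopy.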
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_80]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
    mov   r7d, r5m
    mov   r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov    r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub      wd, mmsize
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea   srcq, [srcq+src_strideq*2]
    lea   dstq, [dstq+dst_strideq*2]
    sub     hd, 2
    mov     wd, wspill
    jg      .loopy
    RET
%endm

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
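; As a rough C model (a sketch):
;   dst[x] = av_clip_uint8(((src[x] + 32) >> 6) + idwt[x]);
; pw_32 and psraw 6 implement the rounded shift, packuswb the final clamp.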
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   strideq, strided
    movsxd   idwt_strideq, idwt_strided
    mov   r8d, wd
    %define wspill r8d
%else
    mov    r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw   m1, m0
    psraw   m1, 6
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m2, m0
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    lea   srcq, [srcq + 2*strideq]
    add   dstq, strideq
    lea  idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill
    jg      .loop
    RET
%endm

%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
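; As a rough C model (a sketch; %1 is the block width in pixels):
;   for (x = 0; x < width; x++)
;       dst[x] += src[x] * obmc_weight[x];
; The weight rows use a fixed 32-byte stride ('add obmcq, 32' below).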
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endm

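; The MMX variants are only built on x86_32 (x86_64 always has SSE2); the
; one exception is the 8-pixel OBMC block, which is narrower than an XMM
; register and so keeps its MMX version everywhere.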
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2