;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Shared x86 assembly helpers; presumably the source of the cglobal / INIT_* /
; REP_RET macros and the r<N> / m<N> register abstraction used in this file.
%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------

; WEIGHT_SETUP: load the loop-invariant constants of the h264_weight_* kernels.
;
; Reads (assuming cglobal's usual r<N> = N-th C argument mapping and the
; ff_h264_weight_* prototype documented in this file):
;   r3d = log2_denom, r4d = weight, r5d = offset
; Leaves:
;   m3 = low 16 bits of weight, replicated into every word lane
;   m5 = low 16 bits of ((2*offset + 1) << log2_denom) >> 1, replicated into
;        every word lane (offset scaled by the denominator plus a rounding
;        term for log2_denom >= 1)
;   m6 = log2_denom, used as the shift count
;   m7 = 0, used to widen bytes to words
; Clobbers r5.
%macro WEIGHT_SETUP 0
    add        r5, r5
    inc        r5                       ; r5 = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1                    ; halve again: the extra low bit becomes the rounding term
%if mmsize == 16
    pshuflw    m3, m3, 0                ; word 0 -> words 0..3
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3                   ; low qword -> high qword
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0                ; word 0 -> all four words
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

; WEIGHT_OP off_a, off_b
; Weights two groups of pixels addressed relative to r0.
;   arg 1 / arg 2: offsets (immediate or register) added to r0 for the two loads
; For each group: bytes are widened to words, multiplied by m3, m5 is added
; with signed saturation, the sum is shifted right arithmetically by m6, and
; both groups are packed back to bytes with unsigned saturation.
; Result: m0 = [group at off_a | group at off_b] (low half | high half).
; Expects m3, m5, m6 and m7 (= 0) to be set up beforehand. Clobbers m0, m1.
; The two dependency chains are deliberately interleaved.
%macro WEIGHT_OP 2
    movh          m0, [r0+%1]
    movh          m1, [r0+%2]
    punpcklbw     m0, m7                ; zero-extend bytes to words (m7 = 0)
    punpcklbw     m1, m7
    pmullw        m0, m3                ; * weight
    pmullw        m1, m3
    paddsw        m0, m5                ; + rounding/offset term, saturating
    paddsw        m1, m5
    psraw         m0, m6                ; >> log2_denom
    psraw         m1, m6
    packuswb      m0, m1                ; clamp to 0..255 and merge both groups
%endmacro

; h264_weight_16 (MMXEXT): apply explicit weighting in place to a block that
; is 16 bytes wide.
;   r0 = dst, r1 = stride, r2d = height (row counter)
;   r3d, r4d, r5d = log2_denom, weight, offset (consumed by WEIGHT_SETUP)
; Each loop iteration processes one row as two 8-byte halves and then advances
; r0 by the stride. The row count is expected to be non-zero.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r1 carry no defined value;
    ; sign-extend before it is used in 64-bit pointer arithmetic.
    movsxd     r1, r1d
%endif
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0,  4
    mova     [r0  ], m0                 ; bytes 0..7 of the row
    WEIGHT_OP 8, 12
    mova     [r0+8], m0                 ; bytes 8..15 of the row
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET

; WEIGHT_FUNC_MM width, xmm_regs
; Emits h264_weight_<width> for the register size selected by the preceding
; INIT_* line. One full SIMD register (mmsize bytes) is processed per row, so
; <width> must equal mmsize.
;   arg 1: block width, used only to form the function name
;   arg 2: number of XMM registers passed to cglobal
; Register use inside the function:
;   r0 = dst, r1 = stride, r2d = height (row counter),
;   r3d, r4d, r5d = log2_denom, weight, offset (consumed by WEIGHT_SETUP)
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r1 carry no defined value;
    ; sign-extend before it is used in 64-bit pointer arithmetic.
    movsxd     r1, r1d
%endif
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2               ; low and high half of the row
    mova     [r0], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; WEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_weight_<width> for blocks that are only half a SIMD register
; wide (<width> = mmsize/2). Two rows are packed into one register and
; processed per loop iteration, so the row count is halved up front.
;   arg 1: block width, used only to form the function name
;   arg 2: number of XMM registers passed to cglobal
; Register use inside the function:
;   r0 = dst, r1 = stride, r2d = height, then height/2 (iteration counter),
;   r3 = 2*stride (reused after WEIGHT_SETUP has copied log2_denom into m6)
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r1 carry no defined value;
    ; sign-extend before it is used in 64-bit address arithmetic.
    movsxd     r1, r1d
%endif
    WEIGHT_SETUP
    sar       r2d, 1                    ; two rows per iteration
    lea        r3, [r1*2]
.nextrow:
    WEIGHT_OP 0, r1                     ; m0 = [row 0 | row 1]
    movh     [r0], m0                   ; store row 0 (low half)
%if mmsize == 16
    movhps   [r0+r1], m0                ; store row 1 (high 8 bytes)
%else
    psrlq      m0, 32                   ; bring the high 4 bytes down
    movh     [r0+r1], m0                ; store row 1
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SETUP: load the loop-invariant constants of the h264_biweight_*
; kernels. Must be expanded inside a cglobal function, since it defines the
; local labels .nonnormal and .normal.
;
; Reads (assuming cglobal's usual r<N> = N-th C argument mapping and the
; ff_h264_biweight_* prototype documented in this file):
;   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
; Computes:
;   off   = (offset + 1) | 1
;   shift = log2_denom + 1
;   if weightd == 128 or weights == 128:
;       both weights and off are halved and shift goes back to log2_denom
;       (128 does not fit the signed byte lanes that pmaddubsw multiplies by
;       in the SSSE3 kernels)
; Leaves:
;   m5 = low 16 bits of (off << shift) >> 1, replicated into every word lane
;   m6 = shift
;   SSSE3 build:  m4 = {weightd, weights} byte pair in every word lane;
;                 m0 is clobbered and m7 is left untouched
;   other builds: m3 = weightd and m4 = weights in every word lane, m7 = 0
; Clobbers r4, r5, r6 and off_regd. off_regd is r7d on x86-64 and r3d on
; x86-32, which is why the callers reload the height with
; "movifnidn r3d, r3m" after this macro.
;
; All scalar operations use the 32-bit register views: the inputs are int
; arguments, so bits 63:32 of the 64-bit registers carry no defined value and
; must not take part in the comparison against 128.
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add       r4d, 1
    cmp       r6d, 128
    je .nonnormal
    cmp       r5d, 128
    jne .normal
.nonnormal:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0                   ; byte 0 = weightd, byte 1 = weights
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT_STEPA acc, tmp, off
; First half of the non-SSSE3 bi-weighting of one group of pixels.
;   arg 1: index of the m register that receives the result
;   arg 2: index of a scratch m register (clobbered)
;   arg 3: offset (immediate or register) added to both r0 and r1
; Result: m<acc> = [r0+off] * m3 + [r1+off] * m4 as 16-bit words, the sum
; being formed with signed saturation. Expects m7 = 0.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7                  ; zero-extend bytes to words (m7 = 0)
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; BIWEIGHT_STEPB
; Second half of the non-SSSE3 bi-weighting: takes the two word sums in m0
; and m1, adds m5 with signed saturation, shifts right arithmetically by m6
; and packs both to bytes with unsigned saturation.
; Result: m0 = [m0 group | m1 group]. Clobbers m1.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; h264_biweight_16 (MMXEXT): bi-weight a block that is 16 bytes wide; the
; result replaces the pixels at dst.
;   r0 = dst, r1 = src, r2 = stride (shared by both pointers),
;   r3d = height (row counter)
;   log2_denom, weightd, weights and offset are consumed by BIWEIGHT_SETUP
; Each loop iteration processes one row as two 8-byte halves. The row count
; is expected to be non-zero.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; x86-32: BIWEIGHT_SETUP used r3d as scratch, reload height
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r2 carry no defined value;
    ; sign-extend before it is used in 64-bit pointer arithmetic.
    movsxd     r2, r2d
%endif
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0                 ; bytes 0..7 of the row
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0                 ; bytes 8..15 of the row
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; BIWEIGHT_FUNC_MM width, xmm_regs
; Emits h264_biweight_<width> for the register size selected by the preceding
; INIT_* line. One full SIMD register (mmsize bytes) is processed per row, so
; <width> must equal mmsize.
;   arg 1: block width, used only to form the function name
;   arg 2: number of XMM registers passed to cglobal
; Register use inside the function:
;   r0 = dst, r1 = src, r2 = stride (shared by both pointers),
;   r3d = height (row counter)
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; x86-32: BIWEIGHT_SETUP used r3d as scratch, reload height
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r2 carry no defined value;
    ; sign-extend before it is used in 64-bit pointer arithmetic.
    movsxd     r2, r2d
%endif
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_biweight_<width> for blocks that are only half a SIMD register
; wide (<width> = mmsize/2). Two rows are packed into one register and
; processed per loop iteration, so the row count is halved up front.
;   arg 1: block width, used only to form the function name
;   arg 2: number of XMM registers passed to cglobal
; Register use inside the function:
;   r0 = dst, r1 = src, r2 = stride (shared by both pointers),
;   r3d = height, then height/2 (iteration counter),
;   r4 = 2*stride (reused once BIWEIGHT_SETUP has copied the shift into m6)
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; x86-32: BIWEIGHT_SETUP used r3d as scratch, reload height
    ; height is an int: shift the 32-bit view so that an undefined bit 32
    ; cannot be shifted into the counter tested by "dec r3d" below.
    sar       r3d, 1
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r2 carry no defined value;
    ; sign-extend before it is used in 64-bit address arithmetic.
    movsxd     r2, r2d
%endif
    lea        r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; row 0 -> m0
    BIWEIGHT_STEPA 1, 2, r2             ; row 1 -> m1
    BIWEIGHT_STEPB                      ; m0 = [row 0 | row 1]
    movh       [r0], m0                 ; store row 0 (low half)
%if mmsize == 16
    movhps     [r0+r2], m0              ; store row 1 (high 8 bytes)
%else
    psrlq      m0, 32                   ; bring the high 4 bytes down
    movh       [r0+r2], m0              ; store row 1
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SSSE3_OP
; Core of the SSSE3 bi-weighting. m0 and m2 each hold byte pairs of two
; interleaved pixel sources; m4 holds the matching pair of byte weights in
; every word lane. pmaddubsw multiplies the unsigned bytes of m0/m2 by the
; signed bytes of m4 and adds each pair into one 16-bit word. m5 is then
; added with signed saturation, the sum is shifted right arithmetically by
; m6, and both registers are packed to bytes with unsigned saturation.
; Result: m0 = [m0 group | m2 group]. Clobbers m2.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; h264_biweight_16 (SSSE3): bi-weight a block that is 16 bytes wide; the
; result replaces the pixels at dst.
;   r0 = dst, r1 = src, r2 = stride (shared by both pointers),
;   r3d = height (row counter)
;   log2_denom, weightd, weights and offset are consumed by BIWEIGHT_SETUP
; Each row is interleaved as (dst, src) byte pairs so that a single pmaddubsw
; against the (weightd, weights) pairs in m4 yields both products summed.
; NOTE: "punpcklbw m0, [r1]" and "mova [r0], m0" are 16-byte memory accesses
; that need aligned addresses in their legacy-SSE encodings.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; x86-32: BIWEIGHT_SETUP used r3d as scratch, reload height
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r2 carry no defined value;
    ; sign-extend before it is used in 64-bit pointer arithmetic.
    movsxd     r2, r2d
%endif
.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]                 ; (dst, src) pairs for bytes 0..7
    punpcklbw  m2, m3                   ; (dst, src) pairs for bytes 8..15
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; h264_biweight_8 (SSSE3): bi-weight a block that is 8 bytes wide; the result
; replaces the pixels at dst. Two rows are processed per loop iteration.
;   r0 = dst, r1 = src, r2 = stride (shared by both pointers),
;   r3d = height, then height/2 (iteration counter),
;   r4 = 2*stride (reused once BIWEIGHT_SETUP has copied the shift into m6)
;   log2_denom, weightd, weights and offset are consumed by BIWEIGHT_SETUP
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; x86-32: BIWEIGHT_SETUP used r3d as scratch, reload height
    ; height is an int: shift the 32-bit view so that an undefined bit 32
    ; cannot be shifted into the counter tested by "dec r3d" below.
    sar       r3d, 1
%if ARCH_X86_64
    ; stride is declared as int, so bits 63:32 of r2 carry no defined value;
    ; sign-extend before it is used in 64-bit address arithmetic.
    movsxd     r2, r2d
%endif
    lea        r4, [r2*2]
.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1                   ; (dst, src) pairs of row 0
    punpcklbw  m2, m3                   ; (dst, src) pairs of row 1
    BIWEIGHT_SSSE3_OP                   ; m0 = [row 0 | row 1]
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET