;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------

; Build the per-pixel weighting constants used by WEIGHT_OP.
; In:  r3d = log2_denom, r4d = weight, r5d = offset
; Out: m3 = weight splatted to all words
;      m5 = rounding term: (2*offset+1) << log2_denom >> 1
;           (= offset << log2_denom, plus the 1 << (log2_denom-1) rounding bit)
;      m6 = log2_denom (shift count)
;      m7 = 0 (zero register for byte->word unpacking)
%macro WEIGHT_SETUP 0
    add        r5, r5               ; offset *= 2
    inc        r5                   ; offset = 2*offset + 1
    movd       m3, r4d              ; weight
    movd       m5, r5d              ; 2*offset + 1
    movd       m6, r3d              ; log2_denom
    pslld      m5, m6
    psrld      m5, 1                ; (2*offset+1) << log2_denom >> 1
%if mmsize == 16
    pshuflw    m3, m3, 0            ; splat low word across the register
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0            ; MMX: splat across 4 words
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7               ; zero for punpcklbw
%endmacro

; Weight two groups of mmsize/2 pixels loaded from [r0+%1] and [r0+%2]:
;   out = clip_u8((pix * weight + rounding) >> log2_denom)
; Uses m3 = weight, m5 = rounding term, m6 = shift, m7 = 0 (from WEIGHT_SETUP).
; Result is packed to bytes in m0 (group %1 in the low half, %2 in the high).
%macro WEIGHT_OP 2
    movh          m0, [r0+%1]
    movh          m1, [r0+%2]
    punpcklbw     m0, m7            ; widen bytes -> words
    punpcklbw     m1, m7
    pmullw        m0, m3            ; * weight
    pmullw        m1, m3
    paddsw        m0, m5            ; + (offset << log2_denom | rounding bit)
    paddsw        m1, m5
    psraw         m0, m6            ; >> log2_denom
    psraw         m1, m6
    packuswb      m0, m1            ; clip to [0,255] and repack
%endmacro

; void ff_h264_weight_16_mmxext(uint8_t *dst, int stride, int height,
;                               int log2_denom, int weight, int offset)
; 16-wide block processed as two 8-byte MMX halves per row.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0,  4
    mova     [r0  ], m0             ; pixels 0..7
    WEIGHT_OP 8, 12
    mova     [r0+8], m0             ; pixels 8..15
    add        r0, r1               ; dst += stride
    dec        r2d                  ; height--
    jnz .nextrow
    REP_RET

; void ff_h264_weight_%1_<cpu>(uint8_t *dst, int stride, int height,
;                              int log2_denom, int weight, int offset)
; %1 = block width (= mmsize: one full register per row), %2 = xmm regs used.
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1               ; dst += stride
    dec        r2d                  ; height--
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0                ; ff_h264_weight_8_mmxext
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8                ; ff_h264_weight_16_sse2

; void ff_h264_weight_%1_<cpu>(uint8_t *dst, int stride, int height,
;                              int log2_denom, int weight, int offset)
; %1 = block width (= mmsize/2), so two rows are weighted per iteration.
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1                ; height /= 2: two rows per loop
    lea        r3, [r1*2]           ; r3 (log2_denom, consumed above) = 2*stride
.nextrow:
    WEIGHT_OP 0, r1                 ; low half = row 0, high half = row 1
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r1], m0            ; second row from the high qword
%else
    psrlq      m0, 32
    movh     [r0+r1], m0            ; MMX: second row from the high dword
%endif
    add        r0, r3               ; dst += 2*stride
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0            ; ff_h264_weight_4_mmxext
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8            ; ff_h264_weight_8_sse2

; Build the biweight constants.
; In:  r4 = log2_denom, r5 = weightd (dst weight), r6 = weights (src weight),
;      r7m = offset
; Out (SSSE3 path):     m4 = interleaved (weightd, weights) byte pairs for
;                       pmaddubsw, m5 = rounding term, m6 = shift count
; Out (non-SSSE3 path): m3 = weightd, m4 = weights (splatted words),
;                       m5 = rounding term, m6 = shift count, m7 = 0
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d                ; x86_32: reuse height reg; caller reloads r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1                ; off = (offset+1)|1: force odd for rounding
    add        r4, 1                ; biweight shifts by log2_denom + 1
    cmp        r5, 128              ; weightd == 128 does not fit a signed byte
    jne .normal                     ; (pmaddubsw path): halve weights/off/shift
    sar        r5, 1
    sar        r6, 1
    sar  off_regd, 1
    sub        r4, 1
.normal
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1                ; m5 = off << (log2_denom+1) >> 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; pair (weightd, weights) bytes for pmaddubsw
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
%if mmsize == 16
    pshuflw    m3, m3, 0            ; splat all three constants across words
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0            ; MMX splat
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7               ; zero for punpcklbw
%endif
%endmacro

; Load mmsize/2 pixels from dst (r0) and src (r1) at byte offset %3, widen to
; words and compute  m%1 = dst*weightd + src*weights  (signed saturated).
; Uses m3 = weightd, m4 = weights, m7 = 0 (from non-SSSE3 BIWEIGHT_SETUP).
; m%2 is a scratch register.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; Finish the biweight computation for the word sums in m0/m1: add the
; rounding/offset term (m5), arithmetic-shift by log2_denom+1 (m6), then
; pack both results into m0 with unsigned byte saturation.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; void ff_h264_biweight_16_mmxext(uint8_t *dst, uint8_t *src, int stride,
;                                 int height, int log2_denom, int weightd,
;                                 int weights, int offset)
; 16-wide block as two 8-byte MMX halves per row.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d doubled as off_regd on x86_32)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0             ; pixels 0..7
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0             ; pixels 8..15
    add        r0, r2               ; dst += stride
    add        r1, r2               ; src += stride
    dec        r3d
    jnz .nextrow
    REP_RET

; void ff_h264_biweight_%1_<cpu>(uint8_t *dst, uint8_t *src, int stride,
;                                int height, int log2_denom, int weightd,
;                                int weights, int offset)
; %1 = block width (= mmsize), %2 = xmm regs used.
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d doubled as off_regd on x86_32)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2               ; dst += stride
    add        r1, r2               ; src += stride
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0              ; ff_h264_biweight_8_mmxext
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8              ; ff_h264_biweight_16_sse2

; void ff_h264_biweight_%1_<cpu>(uint8_t *dst, uint8_t *src, int stride,
;                                int height, int log2_denom, int weightd,
;                                int weights, int offset)
; %1 = block width (= mmsize/2): two rows are processed per iteration.
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d doubled as off_regd on x86_32)
    sar        r3, 1                ; height /= 2: two rows per loop
    lea        r4, [r2*2]           ; r4 (log2_denom, consumed above) = 2*stride
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; row 0
    BIWEIGHT_STEPA 1, 2, r2         ; row 1
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0          ; second row from the high qword
%else
    psrlq      m0, 32
    movh       [r0+r2], m0          ; MMX: second row from the high dword
%endif
    add        r0, r4               ; dst += 2*stride
    add        r1, r4               ; src += 2*stride
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0          ; ff_h264_biweight_4_mmxext
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8          ; ff_h264_biweight_8_sse2

; SSSE3 biweight core. m0/m2 hold interleaved dst/src bytes; m4 holds the
; interleaved (weightd, weights) byte pairs, so one pmaddubsw computes
; dst*weightd + src*weights per pixel. Then add the rounding term (m5),
; shift (m6) and pack with unsigned saturation into m0.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; void ff_h264_biweight_16_ssse3(uint8_t *dst, uint8_t *src, int stride,
;                                int height, int log2_denom, int weightd,
;                                int weights, int offset)
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d doubled as off_regd on x86_32)

.nextrow:
    movh       m0, [r0]             ; dst[0..7]
    movh       m2, [r0+8]           ; dst[8..15]
    movh       m3, [r1+8]           ; src[8..15]
    punpcklbw  m0, [r1]             ; interleave dst/src bytes for pmaddubsw
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2               ; dst += stride
    add        r1, r2               ; src += stride
    dec        r3d
    jnz .nextrow
    REP_RET

; void ff_h264_biweight_8_ssse3(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset)
; 8-wide block: two rows per iteration, packed into one xmm register.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d doubled as off_regd on x86_32)
    sar        r3, 1                ; height /= 2: two rows per loop
    lea        r4, [r2*2]           ; r4 = 2*stride

.nextrow:
    movh       m0, [r0]             ; dst row 0
    movh       m1, [r1]             ; src row 0
    movh       m2, [r0+r2]          ; dst row 1
    movh       m3, [r1+r2]          ; src row 1
    punpcklbw  m0, m1               ; interleave dst/src bytes for pmaddubsw
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0
    movhps     [r0+r2], m0          ; second row from the high qword
    add        r0, r4               ; dst += 2*stride
    add        r1, r4               ; src += 2*stride
    dec        r3d
    jnz .nextrow
    REP_RET