;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32

; Eight words holding the largest 10-bit sample value, (1 << 10) - 1 = 1023.
; Loaded as the upper clamp bound for the packed results.
pw_pixel_max: times 8 dw ((1 << 10)-1)
; 128-bit constant whose low quadword is 1 and whose high quadword is 0.
; Added to the register holding log2_denom to form the shift count
; log2_denom + 1.
sq_1: dq 1
      dq 0

; Declared here, defined in another object file.
; NOTE(review): presumably packed words of 1 - confirm against its definition.
cextern pw_1

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE
; Common entry sequence of the h264_weight_*_10 functions.  Requests six
; general-purpose and eight vector registers, then makes sure the arguments
; that are used as integers are in registers:
;   r0  = dst      r1  = stride (sign-extended on x86-64)   r2d = height
;   r4d = weight   r5d = offset
; Argument 3 (log2_denom) is not loaded into a general-purpose register here.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
%if ARCH_X86_64
    ; stride is declared as int, so the upper half of its 64-bit register is
    ; not guaranteed to be meaningful; the callers of this macro add the full
    ; register to the dst pointer.
    movsxd     r1, r1d
%endif
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; WEIGHT_SETUP
; Builds the loop-invariant vector constants for unidirectional weighting.
; In:   r3m = log2_denom, r4d = weight, r5d = offset
; Out:  m0 = (1 << log2_denom) in every word
;       m2 = log2_denom + 1 (shift count in the low quadword)
;       m3 = every dword is { low word: weight*2, high word: offset*8 + 1 }
;       m4 = pw_pixel_max
;       m7 = 0 (only assembled when SSE4 is not available)
; Clobbers: r4, r5.
;
; A sample interleaved with an m0 word and fed through pmaddwd with m3 gives
;   sample*weight*2 + ((offset*8 + 1) << log2_denom)
; which, shifted right by log2_denom + 1, is the weighted, rounded sample
; plus offset*4.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    add       r4d, r4d      ; weight*2
    ; Keep only the low word: a negative weight sign-extended to 32 bits
    ; would otherwise subtract 1 from the high word below and cancel the
    ; +1 rounding term.
    and       r4d, 0xffff
    shl        r5, 19       ; *8, move to upper half of dword
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower clamp bound for CLIPW in WEIGHT_OP
%endif
%endmacro

; WEIGHT_OP off [, off2]
; Weights eight 16-bit samples and leaves the clamped words in m5.
;   one argument:  the eight samples at [r0+off] (aligned 16-byte load)
;   two arguments: four samples at [r0+off] (low half of the result) and
;                  four samples at [r0+off2] (high half of the result)
; Reads the constants m0, m2, m3, m4 (and m7 without SSE4) as prepared by
; WEIGHT_SETUP.
; Clobbers: m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0  ; samples 4..7 interleaved with the m0 words
    punpcklwd   m5, m0      ; samples 0..3 interleaved with the m0 words
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    pmaddwd     m5, m3      ; sample*m3.lo + m0word*m3.hi per dword
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6      ; saturate to 0..65535, so no lower clamp is needed
    ; The upper clamp must be unsigned: words >= 0x8000 produced by packusdw
    ; would look negative to a signed min and slip through unclamped.
    pminuw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4
%endif
%endmacro

; h264_weight_16_10
; Weights a block in place, 32 bytes (sixteen 16-bit samples) per row, as two
; 16-byte halves.
; Loop state: r0 = current row of dst, r1 = stride, r2d = rows remaining.
; The row counter is decremented before it is tested, so the loop body always
; executes at least once.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1        ; advance to the next row
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; h264_weight_8_10
; Weights a block in place, 16 bytes (eight 16-bit samples) per row.
; Loop state: r0 = current row of dst, r1 = stride, r2d = rows remaining.
; The row counter is decremented before it is tested, so the loop body always
; executes at least once.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova     [r0], m5
    add        r0, r1       ; advance to the next row
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; h264_weight_4_10
; Weights a block in place, 8 bytes (four 16-bit samples) per row.  Two rows
; are packed into one vector per iteration, so the row count is halved first.
; Loop state: r0 = current row pair of dst, r1 = stride, r3 = 2*stride,
;             r2d = row pairs remaining.
; NOTE(review): an odd height would leave the last row unprocessed, so height
; is presumably always even - confirm against the callers.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1      ; height/2 row pairs
    WEIGHT_SETUP
    lea         r3, [r1*2]  ; r3 is free here: WEIGHT_SETUP has already read log2_denom via r3m
.nextrow:
    WEIGHT_OP    0, r1
    movh      [r0], m5      ; low half  -> first row of the pair
    movhps [r0+r1], m5      ; high half -> second row of the pair
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
158
%if ARCH_X86_32
159
DECLARE_REG_TMP 3
160
%else
161
DECLARE_REG_TMP 7
162 163
%endif

164
; BIWEIGHT_PROLOGUE
; Common entry sequence of the h264_biweight_*_10 functions.  Requests eight
; general-purpose and eight vector registers, then makes sure the arguments
; that are used as integers are in registers:
;   r0  = dst       r1  = src       r2 = stride (sign-extended on x86-64)
;   r5d = weightd   r6d = weights   t0d = offset
; height (argument 3) and log2_denom (argument 4) are not loaded here.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
%if ARCH_X86_64
    ; stride is declared as int, so the upper half of its 64-bit register is
    ; not guaranteed to be meaningful; the callers of this macro add the full
    ; register to both pointers.
    movsxd     r2, r2d
%endif
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; BIWEIGHT_SETUP
; Builds the loop-invariant vector constants for bidirectional weighting and
; loads the row count.
; In:   r5d = weightd, r6d = weights, t0d = offset, r4m = log2_denom,
;       r3m = height
; Out:  m4 = every dword is { low word: weightd, high word: weights }
;       m5 = ((offset << 2) + 1) << log2_denom in every dword
;       m6 = log2_denom + 1 (shift count in the low quadword)
;       m3 = pw_pixel_max
;       m7 = 0 (only assembled when SSE4 is not available)
;       r3d = height
; Clobbers: r5, r6, t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1; bit 0 is always set, so no extra "or 1" is needed
    ; Keep only the low word of weightd: the sign bits of a negative weightd
    ; would otherwise be OR-ed over the weights word.
    and       r5d, 0xffff
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weightd | weights<<16
    movd       m5, t0d      ; (offset<<2)+1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; ((offset<<2)+1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; must come after the last use of t0, which aliases r3 on x86-32
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower clamp bound for CLIPW in BIWEIGHT
%endif
%endmacro

; BIWEIGHT off [, off2]
; Blends eight 16-bit samples of dst (r0) with the co-located samples of src
; (r1) and leaves the clamped words in m0.
;   one argument:  the eight samples at offset off (aligned 16-byte loads)
;   two arguments: four samples at off (low half of the result) and four
;                  samples at off2 (high half of the result)
; Per sample: (dst*weightd + src*weights + m5) >> m6, clamped to
; 0..pw_pixel_max.  Reads the constants m3, m4, m5, m6 (and m7 without SSE4)
; as prepared by BIWEIGHT_SETUP.
; Clobbers: m1, m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova       m0, [r0+%1]
    mova       m1, [r1+%1]
    punpckhwd  m2, m0, m1   ; dst/src pairs 4..7
    punpcklwd  m0, m1       ; dst/src pairs 0..3
%else
    movq       m0, [r0+%1]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m2, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%endif
    pmaddwd    m0, m4       ; dst*weightd + src*weights per dword
    pmaddwd    m2, m4
    paddd      m0, m5
    paddd      m2, m5
    psrad      m0, m6
    psrad      m2, m6
%if cpuflag(sse4)
    packusdw   m0, m2       ; saturate to 0..65535, so no lower clamp is needed
    ; The upper clamp must be unsigned: words >= 0x8000 produced by packusdw
    ; would look negative to a signed min and slip through unclamped.
    pminuw     m0, m3
%else
    packssdw   m0, m2
    CLIPW      m0, m7, m3
%endif
%endmacro

; h264_biweight_16_10
; Blends src into dst, 32 bytes (sixteen 16-bit samples) per row, as two
; 16-byte halves.  The result is written to dst.
; Loop state: r0 = dst row, r1 = src row, r2 = stride (shared by both
;             pointers), r3d = rows remaining.
; The row counter is decremented before it is tested, so the loop body always
; executes at least once.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova [r0   ], m0
    BIWEIGHT  16
    mova [r0+16], m0
    add       r0, r2        ; advance both pointers to the next row
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; h264_biweight_8_10
; Blends src into dst, 16 bytes (eight 16-bit samples) per row.  The result is
; written to dst.
; Loop state: r0 = dst row, r1 = src row, r2 = stride (shared by both
;             pointers), r3d = rows remaining.
; The row counter is decremented before it is tested, so the loop body always
; executes at least once.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova   [r0], m0
    add      r0, r2         ; advance both pointers to the next row
    add      r1, r2
    dec      r3d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; h264_biweight_4_10
; Blends src into dst, 8 bytes (four 16-bit samples) per row.  Two rows are
; packed into one vector per iteration, so the row count is halved first.
; Loop state: r0 = dst row pair, r1 = src row pair, r2 = stride,
;             r4 = 2*stride, r3d = row pairs remaining.
; NOTE(review): an odd height would leave the last row unprocessed, so height
; is presumably always even - confirm against the callers.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar        r3d, 1       ; height/2 row pairs
    lea        r4, [r2*2]   ; r4 is free here: BIWEIGHT_SETUP has already read log2_denom via r4m
.nextrow:
    BIWEIGHT     0, r2
    movh   [r0   ], m0      ; low half  -> first row of the pair
    movhps [r0+r2], m0      ; high half -> second row of the pair
    add         r0, r4
    add         r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

; Assemble one version per instruction set.
INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF