; h264_weight_10bit.asm
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Macro layers this file is written against (the register/function
; abstraction macros and SIMD helper macros used below are not defined here).
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

; Eight words holding the largest 10-bit sample value (1023); loaded as the
; upper clamp bound by the setup macros.
pw_pixel_max: times 8 dw (1 << 10) - 1

; 128-bit value: low qword = 1, high qword = 0.  Added to the register that
; holds log2_denom to obtain the final shift count log2_denom+1.
sq_1:         dq 1, 0

; Constant defined outside this file (presumably packed words of 1).
cextern pw_1

SECTION .text

;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
;                  int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE
;   Shared entry sequence of the h264_weight_*_10 functions.
;   Register assignment after the macro:
;     r0  = dst      r1d = stride     r2d = height
;     r4d = weight   r5d = offset
;   log2_denom (argument 3) is not loaded here; WEIGHT_SETUP reads it
;   directly from its argument slot (r3m).
;   NOTE(review): stride is fetched as a 32-bit value, but the row loops add
;   the full-width r1 to the dst pointer -- confirm the upper half of r1 is
;   well defined on 64-bit targets.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8          ; no auto-loaded args, 6 GPRs, 8 vector registers
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; WEIGHT_SETUP cpu
;   Builds the loop-invariant vectors used by WEIGHT_OP.
;   In:   r4d = weight, r5d = offset, r3m = log2_denom
;   Out:  m0 = 1<<log2_denom in every word
;         m2 = log2_denom+1 (shift count)
;         m3 = { weight<<1, 1+(offset<<3) } word pair in every dword
;         m4 = pw_pixel_max
;         m7 = 0 (only when cpu is not sse4; lower clamp bound for CLIPW)
;   Clobbers: r4, r5
;
;   WEIGHT_OP interleaves each sample with 1<<log2_denom and multiplies the
;   pairs by m3 with pmaddwd, so every dword becomes
;       sample*(weight<<1) + (1<<log2_denom)*(1 + (offset<<3))
;   which is then shifted right by log2_denom+1.
%macro WEIGHT_SETUP 1
    mova       m0, [pw_1]
    movd       m2, r3m      ; log2_denom
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r5, 19       ; *8, move to upper half of dword
    ; Confine weight<<1 to the low word before merging.  A negative weight
    ; is sign-extended to 32 bits; adding it unmasked would carry 0xFFFF
    ; into the upper word and cancel the +0x10000 rounding term below.
    add       r4d, r4d      ; weight<<1
    movzx     r4d, r4w
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; weight<<1 | (1+(offset<<3))<<16
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%ifnidn %1, sse4
    pxor       m7, m7
%endif
%endmacro

; WEIGHT_OP cpu, off0 [, off1]
;   Weights one vector of samples and leaves the clamped result in m5.
;   Two-argument form:   one full vector is read from [r0+off0].
;   Three-argument form: two half vectors are read from [r0+off0] and
;                        [r0+off1]; they end up in the low and high half
;                        of m5 respectively.
;   Uses the registers prepared by WEIGHT_SETUP:
;     m0 = 1<<log2_denom words, m2 = shift count, m3 = multiplier pairs,
;     m4 = upper clamp, m7 = zero (read on the non-sse4 path only).
;   Clobbers: m5, m6
%macro WEIGHT_OP 2-3
%if %0==3
    ; low halves of two separate rows
    movq        m5, [r0+%2]
    punpcklwd   m5, m0
    movq        m6, [r0+%3]
    punpcklwd   m6, m0
%else
    ; one full-width row: the high half must be split off first because the
    ; low-half unpack overwrites m5
    mova        m5, [r0+%2]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%endif
    ; first half of the result
    pmaddwd     m5, m3
    psrad       m5, m2
    ; second half of the result
    pmaddwd     m6, m3
    psrad       m6, m2
%ifnidn %1, sse4
    ; signed pack, then clamp to [m7, m4]
    packssdw    m5, m6
    CLIPW       m5, m7, m4
%else
    ; unsigned-saturating pack, so only the upper bound remains to apply
    packusdw    m5, m6
    pminsw      m5, m4
%endif
%endmacro

; WEIGHT_FUNC_DBL cpu
;   Emits h264_weight_16_10_<cpu>.  Every loop iteration weights one row in
;   place as two vector-sized pieces at byte offsets 0 and 16 from r0, then
;   advances r0 by the stride in r1.  The loop runs r2d (height) times; the
;   counter is tested only after the first row has been processed.
;   The stores use mova, so dst is presumably expected to be 16-byte aligned.
%macro WEIGHT_FUNC_DBL 1
cglobal h264_weight_16_10_%1
    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
.nextrow:
    WEIGHT_OP %1,  0
    mova [r0   ], m5
    WEIGHT_OP %1, 16
    mova [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
WEIGHT_FUNC_DBL sse2
WEIGHT_FUNC_DBL sse4


; WEIGHT_FUNC_MM cpu
;   Emits h264_weight_8_10_<cpu>.  Every loop iteration weights one
;   vector-sized row at [r0] in place and advances r0 by the stride in r1.
;   The loop runs r2d (height) times; the counter is tested only after the
;   first row has been processed.
%macro WEIGHT_FUNC_MM 1
cglobal h264_weight_8_10_%1
    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
.nextrow:
    WEIGHT_OP  %1, 0
    mova     [r0], m5
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
WEIGHT_FUNC_MM sse2
WEIGHT_FUNC_MM sse4


; WEIGHT_FUNC_HALF_MM cpu
;   Emits h264_weight_4_10_<cpu>.  Two rows are handled per iteration: the
;   8 bytes at [r0] and the 8 bytes at [r0+r1] are weighted together in one
;   vector and written back with movh / movhps.  The row counter is therefore
;   halved up front and the pointer advances by 2*stride.
%macro WEIGHT_FUNC_HALF_MM 1
cglobal h264_weight_4_10_%1
    WEIGHT_PROLOGUE
    sar         r2d, 1      ; height/2 iterations, two rows each
    WEIGHT_SETUP %1
    ; r3 becomes a scratch register only after WEIGHT_SETUP has read
    ; log2_denom through r3m.
    lea         r3, [r1*2]  ; 2*stride
.nextrow:
    WEIGHT_OP   %1, 0, r1
    movh      [r0], m5      ; low half  -> first row
    movhps [r0+r1], m5      ; high half -> second row
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
WEIGHT_FUNC_HALF_MM sse2
WEIGHT_FUNC_HALF_MM sse4


;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
;                    int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
; t0 receives the offset argument (r7m) in BIWEIGHT_PROLOGUE.
; On x86-32 it is mapped onto r3; this works because height (r3m) is only
; loaded into r3d at the end of BIWEIGHT_SETUP, after t0 has been consumed.
; On other targets t0 is r7.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; BIWEIGHT_PROLOGUE
;   Shared entry sequence of the h264_biweight_*_10 functions.
;   Register assignment after the macro:
;     r0  = dst       r1  = src       r2d = stride
;     r5d = weightd   r6d = weights   t0d = offset (argument 7)
;   height (argument 3) and log2_denom (argument 4) are not loaded here;
;   BIWEIGHT_SETUP fetches them from r3m / r4m.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8          ; no auto-loaded args, 8 GPRs, 8 vector registers
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; BIWEIGHT_SETUP cpu
;   Builds the loop-invariant vectors used by BIWEIGHT and loads the row
;   counter.
;   In:   r5d = weightd, r6d = weights, t0d = offset,
;         r3m = height, r4m = log2_denom
;   Out:  m3 = pw_pixel_max
;         m4 = { weightd, weights } word pair in every dword
;         m5 = (((offset<<2)+1)|1) << log2_denom in every dword
;         m6 = log2_denom+1 (shift count)
;         m7 = 0 (only when cpu is not sse4; lower clamp bound for CLIPW)
;         r3d = height
;   Clobbers: r5, r6, t0
%macro BIWEIGHT_SETUP 1
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1        ; force bit 0 (the +1 above has already set it)
    shl        r6, 16       ; weights -> upper word
    ; Confine weightd to the low word before merging.  A negative weightd
    ; is sign-extended to 32 bits; OR-ing it unmasked would set the upper
    ; word to 0xFFFF and overwrite weights.
    movzx     r5d, r5w
    or         r5, r6
    movd       m4, r5d      ; weightd | weights<<16
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    ; Loaded last: on x86-32 t0 is r3 (DECLARE_REG_TMP 3), so r3 must not be
    ; overwritten before t0 has been moved into m5.
    movifnidn r3d, r3m
%ifnidn %1, sse4
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT cpu, off0 [, off1]
;   Blends samples from r0 and r1 and leaves the clamped result in m0.
;   Two-argument form:   one full vector from each of [r0+off0], [r1+off0].
;   Three-argument form: half vectors from offsets off0 and off1 of both
;                        pointers; they end up in the low and high half of
;                        m0 respectively.
;   Uses the registers prepared by BIWEIGHT_SETUP:
;     m3 = upper clamp, m4 = weight pairs, m5 = rounding/offset term,
;     m6 = shift count, m7 = zero (read on the non-sse4 path only).
;   Clobbers: m0, m1, m2
%macro BIWEIGHT 2-3
%if %0==3
    ; low halves of two separate rows, each interleaved r0/r1 sample pairs
    movq       m0, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m0, m1
    movq       m2, [r0+%3]
    movq       m1, [r1+%3]
    punpcklwd  m2, m1
%else
    ; one full-width row: the high half must be split off first because the
    ; low-half unpack overwrites m0
    mova       m0, [r0+%2]
    mova       m1, [r1+%2]
    punpckhwd  m2, m0, m1
    punpcklwd  m0, m1
%endif
    ; first half of the result
    pmaddwd    m0, m4
    paddd      m0, m5
    psrad      m0, m6
    ; second half of the result
    pmaddwd    m2, m4
    paddd      m2, m5
    psrad      m2, m6
%ifnidn %1, sse4
    ; signed pack, then clamp to [m7, m3]
    packssdw   m0, m2
    CLIPW      m0, m7, m3
%else
    ; unsigned-saturating pack, so only the upper bound remains to apply
    packusdw   m0, m2
    pminsw     m0, m3
%endif
%endmacro

; BIWEIGHT_FUNC_DBL cpu
;   Emits h264_biweight_16_10_<cpu>.  Every loop iteration blends one row of
;   r0 and r1 as two vector-sized pieces at byte offsets 0 and 16 and stores
;   the result to r0, then advances both pointers by the stride in r2.
;   The loop runs r3d (height) times; the counter is tested only after the
;   first row has been processed.
%macro BIWEIGHT_FUNC_DBL 1
cglobal h264_biweight_16_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
.nextrow:
    BIWEIGHT  %1,  0
    mova [r0   ], m0
    BIWEIGHT  %1, 16
    mova [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
BIWEIGHT_FUNC_DBL sse2
BIWEIGHT_FUNC_DBL sse4

; BIWEIGHT_FUNC cpu
;   Emits h264_biweight_8_10_<cpu>.  Every loop iteration blends one
;   vector-sized row of r0 and r1, stores the result to r0 and advances both
;   pointers by the stride in r2.  The loop runs r3d (height) times; the
;   counter is tested only after the first row has been processed.
%macro BIWEIGHT_FUNC 1
cglobal h264_biweight_8_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
.nextrow:
    BIWEIGHT %1, 0
    mova   [r0], m0
    add      r0, r2
    add      r1, r2
    dec      r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
BIWEIGHT_FUNC sse2
BIWEIGHT_FUNC sse4

; BIWEIGHT_FUNC_HALF cpu
;   Emits h264_biweight_4_10_<cpu>.  Two rows are handled per iteration: the
;   8 bytes at offset 0 and at offset r2 (stride) of both pointers are
;   blended in one vector and written to r0 with movh / movhps.  The row
;   counter is therefore halved and both pointers advance by 2*stride.
%macro BIWEIGHT_FUNC_HALF 1
cglobal h264_biweight_4_10_%1
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
    ; r3d (height) is only valid once BIWEIGHT_SETUP has loaded it; r4 is
    ; used as scratch here, log2_denom having been read from r4m in setup.
    sar        r3d, 1       ; height/2 iterations, two rows each
    lea        r4, [r2*2]   ; 2*stride
.nextrow:
    BIWEIGHT    %1, 0, r2
    movh   [r0   ], m0      ; low half  -> first row
    movhps [r0+r2], m0      ; high half -> second row
    add         r0, r4
    add         r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM
BIWEIGHT_FUNC_HALF sse2
BIWEIGHT_FUNC_HALF sse4