Commit 8a1cff1c authored by Ronald S. Bultje

vp9/x86: make filter_44_h work on 32-bit.

parent 047088b8
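The h (vertical-edge) variants of the VP9 loop filter need a transpose: for each of the 16 rows being filtered, the 8 pixels straddling the edge (P3..P0, Q0..Q3) are contiguous in memory, while the filter kernel wants each tap position as one 16-pixel vector. The horizontal 4/4 filter previously kept all 16 rows live in xmm0-xmm15 during that transpose, which is why it was only built on x86-64; x86-32 exposes just xmm0-xmm7. This commit adds an eight-register transpose path that goes through stack scratch instead, so the init table can install ff_vp9_loop_filter_h_44_16_* unconditionally. A minimal scalar model of the reshaping the assembly performs, in C (the array names are mine, purely for illustration):

    #include <stdint.h>

    /* row[r] holds the 8 pixels P3..P0,Q0..Q3 around the edge in row r.
     * After the transpose, tap[t] is one 16-pixel vector per filter tap,
     * which is the layout the SIMD filter kernel consumes. */
    static void transpose_16x8(const uint8_t row[16][8], uint8_t tap[8][16])
    {
        for (int r = 0; r < 16; r++)
            for (int t = 0; t < 8; t++)
                tap[t][r] = row[r][t];
    }

On x86-64 the whole 16x8 block stays in registers; the new x86-32 path in the assembly below does the same job with punpcklbw interleaves in two stages, parking one intermediate in [rsp] and landing the eight results in the 128-byte P3..Q3 scratch area.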
@@ -346,9 +346,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
} \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
if (ARCH_X86_64) { \
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
} \
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
if (ARCH_X86_64) { \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
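The hunk above, in ff_vp9dsp_init_x86(), simply drops the ARCH_X86_64 guard around the h_44 pointer; on 32-bit, mix2[0][0][0] previously kept the C fallback. The table indexing can be read off the hunk itself: the last index is the direction (0 = h, 1 = v) and the first two encode the 4- or 8-wide filter of each adjacent block, giving the 44/48/84/88 names. A hypothetical helper showing that layout; the E/I/H threshold signature is recalled from vp9dsp.h and should be treated as an assumption, not gospel:

    #include <stddef.h>
    #include <stdint.h>
    #include "vp9dsp.h"   /* FFmpeg-internal header */

    /* dir: 0 = h (vertical edge), 1 = v (horizontal edge), per the hunk.
     * E/I/H are the VP9 edge/inner/high-edge-variance thresholds; the
     * exact pointer signature is an assumption recalled from vp9dsp.h. */
    static void filter_44_edge(VP9DSPContext *dsp, uint8_t *dst,
                               ptrdiff_t stride, int E, int I, int H, int dir)
    {
        dsp->loop_filter_mix2[0][0][dir](dst, stride, E, I, H);
    }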
@@ -289,38 +289,6 @@ SECTION .text
SWAP %12, %14
%endmacro
; transpose 16 half lines (high part) to 8 full centered lines
%macro TRANSPOSE16x8B 16
punpcklbw m%1, m%2
punpcklbw m%3, m%4
punpcklbw m%5, m%6
punpcklbw m%7, m%8
punpcklbw m%9, m%10
punpcklbw m%11, m%12
punpcklbw m%13, m%14
punpcklbw m%15, m%16
SBUTTERFLY wd, %1, %3, %2
SBUTTERFLY wd, %5, %7, %2
SBUTTERFLY wd, %9, %11, %2
SBUTTERFLY wd, %13, %15, %2
SBUTTERFLY dq, %1, %5, %2
SBUTTERFLY dq, %3, %7, %2
SBUTTERFLY dq, %9, %13, %2
SBUTTERFLY dq, %11, %15, %2
SBUTTERFLY qdq, %1, %9, %2
SBUTTERFLY qdq, %3, %11, %2
SBUTTERFLY qdq, %5, %13, %2
SBUTTERFLY qdq, %7, %15, %2
SWAP %5, %1
SWAP %6, %9
SWAP %7, %1
SWAP %8, %13
SWAP %9, %3
SWAP %10, %11
SWAP %11, %1
SWAP %12, %15
%endmacro
%macro DEFINE_REAL_P7_TO_Q7 0-1 0
%define P7 dstq + 4*mstrideq + %1
%define P6 dstq + mstride3q + %1
@@ -396,6 +364,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movx m5, [P2]
movx m6, [P1]
movx m7, [P0]
%if ARCH_X86_64
movx m8, [Q0]
movx m9, [Q1]
movx m10, [Q2]
@@ -404,32 +373,67 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
movx m13, [Q5]
movx m14, [Q6]
movx m15, [Q7]
%define P7 rsp + 0
%define P6 rsp + 16
%define P5 rsp + 32
%define P4 rsp + 48
%define P3 rsp + 64
%define P2 rsp + 80
%define P1 rsp + 96
%define P0 rsp + 112
%define Q0 rsp + 128
%define Q1 rsp + 144
%define Q2 rsp + 160
%define Q3 rsp + 176
%if %2 == 16
TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
%define P7 rsp + 128
%define P6 rsp + 144
%define P5 rsp + 160
%define P4 rsp + 176
%define Q4 rsp + 192
%define Q5 rsp + 208
%define Q6 rsp + 224
%define Q7 rsp + 240
%if %2 == 16
TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
mova [P7], m0
mova [P6], m1
mova [P5], m2
mova [P4], m3
%else
TRANSPOSE16x8B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%endif
; 8x16 transpose
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
punpcklbw m8, m9
punpcklbw m10, m11
punpcklbw m12, m13
punpcklbw m14, m15
TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15
SWAP 0, 4
SWAP 2, 5
SWAP 0, 6
SWAP 0, 7
SWAP 10, 9
SWAP 12, 10
SWAP 14, 11
%endif
%else ; x86-32
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
movx m1, [Q0]
movx m3, [Q1]
movx m5, [Q2]
movx m7, [Q3]
punpcklbw m1, m3
punpcklbw m5, m7
movx m3, [Q4]
movx m7, [Q5]
punpcklbw m3, m7
mova [rsp], m3
movx m3, [Q6]
movx m7, [Q7]
punpcklbw m3, m7
%endif
%define P3 rsp + 0
%define P2 rsp + 16
%define P1 rsp + 32
%define P0 rsp + 48
%define Q0 rsp + 64
%define Q1 rsp + 80
%define Q2 rsp + 96
%define Q3 rsp + 112
%if ARCH_X86_64
mova [P3], m4
mova [P2], m5
mova [P1], m6
@@ -444,7 +448,17 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
mova [Q6], m14
mova [Q7], m15
%endif
%else ; x86-32
TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
mova [P3], m0
mova [P2], m2
mova [P1], m4
mova [P0], m6
mova [Q1], m5
mova [Q2], m7
mova [Q3], m3
%endif
%endif ; %1 == h
; calc fm mask
%if %2 == 16
@@ -960,22 +974,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
RET
%endmacro
%macro LPF_16_VH 4
INIT_XMM %4
LOOPFILTER v, %1, %2, 0, %3
%if ARCH_X86_64
LOOPFILTER h, %1, %2, 256, %3
%macro LPF_16_VH 5
INIT_XMM %5
LOOPFILTER v, %1, %2, 0, %4
%if ARCH_X86_64 || %1 == 44
LOOPFILTER h, %1, %2, %3, %4
%endif
%endmacro
%macro LPF_16_VH_ALL_OPTS 2-3 0
LPF_16_VH %1, %2, %3, sse2
LPF_16_VH %1, %2, %3, ssse3
LPF_16_VH %1, %2, %3, avx
%macro LPF_16_VH_ALL_OPTS 4
LPF_16_VH %1, %2, %3, %4, sse2
LPF_16_VH %1, %2, %3, %4, ssse3
LPF_16_VH %1, %2, %3, %4, avx
%endmacro
LPF_16_VH_ALL_OPTS 16, 512, 32
LPF_16_VH_ALL_OPTS 44, 0, 0
LPF_16_VH_ALL_OPTS 48, 256, 16
LPF_16_VH_ALL_OPTS 84, 256, 16
LPF_16_VH_ALL_OPTS 88, 256, 16
LPF_16_VH_ALL_OPTS 16, 512, 256, 32
LPF_16_VH_ALL_OPTS 44, 0, 128, 0
LPF_16_VH_ALL_OPTS 48, 256, 128, 16
LPF_16_VH_ALL_OPTS 84, 256, 128, 16
LPF_16_VH_ALL_OPTS 88, 256, 128, 16
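The macro changes at the end thread the horizontal variant's scratch size through as a new third parameter instead of the previous hardcoded 256 bytes, and relax the guard to ARCH_X86_64 || %1 == 44, so that of the horizontal filters only 44 is built on x86-32. Working through cglobal's %3 + %4 + %5 stack expression for each invocation (my arithmetic, read off the macros above):

    #include <stdio.h>

    /* base = arg 2, h_extra = arg 3, extra = arg 4 of LPF_16_VH_ALL_OPTS;
     * v variants reserve base + extra bytes of stack scratch,
     * h variants reserve base + h_extra + extra. */
    int main(void)
    {
        struct { const char *name; int base, h_extra, extra; } f[] = {
            { "16", 512, 256, 32 },
            { "44",   0, 128,  0 },
            { "48", 256, 128, 16 },   /* 84 and 88 use the same sizes */
        };
        for (int i = 0; i < 3; i++)
            printf("%s: v stack = %d, h stack = %d bytes\n", f[i].name,
                   f[i].base + f[i].extra,
                   f[i].base + f[i].h_extra + f[i].extra);
        return 0;
    }

The 128 bytes for h_44 are exactly the eight 16-byte P3..Q3 slots the new %define block addresses (rsp + 0 through rsp + 112), which is what lets the 44 horizontal filter, alone among the h variants, fit the x86-32 register and scratch budget.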