Commit 6f40e9f0, authored by Ronald S. Bultje, committed by Luca Barbato

x86inc: support stack mem allocation and re-alignment in PROLOGUE

Use this in VP8/H264-8bit loopfilter functions so they can be used if
there is no aligned stack (e.g. MSVC 32bit or ICC 10.x).
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
parent 14758e32
......@@ -398,14 +398,12 @@ DEBLOCK_LUMA
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5
cglobal deblock_%1_luma_8, 5,5,8,2*%2
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
%assign pad 2*%2+12-(stack_offset&15)
SUB esp, pad
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
......@@ -443,22 +441,19 @@ cglobal deblock_%1_luma_8, 5,5
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
ADD esp, pad
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
%assign pad 0x78-(stack_offset&15)
SUB esp, pad
%define pix_tmp esp+12
%define pix_tmp esp+12*HAVE_ALIGNED_STACK
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
......@@ -500,7 +495,6 @@ cglobal deblock_h_luma_8, 0,5
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
ADD esp, pad
RET
%endmacro ; DEBLOCK_LUMA
......@@ -631,7 +625,7 @@ DEBLOCK_LUMA v, 16
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
......@@ -646,10 +640,7 @@ DEBLOCK_LUMA v, 16
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra_8, 4,6,16
%if ARCH_X86_64 == 0
sub esp, 0x60
%endif
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
......@@ -698,9 +689,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%if ARCH_X86_64 == 0
add esp, 0x60
%endif
RET
INIT_MMX cpuname
......@@ -737,12 +725,10 @@ cglobal deblock_h_luma_intra_8, 4,9
add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra_8, 2,4
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
SUB rsp, pad
%define pix_tmp rsp
; transpose 8x16 -> tmp space
......@@ -773,7 +759,6 @@ cglobal deblock_h_luma_intra_8, 2,4
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
ADD rsp, pad
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
......
......@@ -275,18 +275,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSSE3(mm_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) {
if (EXTERNAL_AVX(mm_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
......
......@@ -1631,28 +1631,31 @@ SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result
%define stack_size mmsize * -4
%else ; h ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result
%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
%else ; h ; extra storage space for transposes
%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
%endif
%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
SUB rsp, pad
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
......@@ -2082,12 +2085,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
dec cntrq
jg .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
ADD rsp, pad
%endif
REP_RET
%else ; mmsize == 16
RET
%endif
%endmacro
%if ARCH_X86_32
......@@ -2122,31 +2123,34 @@ INNER_LOOPFILTER h, 8
;-----------------------------------------------------------------------------
%macro MBEDGE_LOOPFILTER 2
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ; [3]=hev() result
; [4]=filter tmp result
; [5]/[6] = p2/q2 backup
; [7]=lim_res sign result
%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
%define stack_size mmsize * -7
%else ; 8 ; extra storage space for transposes
%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
%define stack_size mmsize * -8
%endif
%endif
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
SUB rsp, pad
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
......@@ -2740,12 +2744,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
dec cntrq
jg .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
ADD rsp, pad
%endif
REP_RET
%else ; mmsize == 16
RET
%endif
%endmacro
%if ARCH_X86_32
......
......@@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSE2) {
......@@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
......@@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
......@@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
#endif
}
#endif /* HAVE_YASM */
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment