Commit aeaf268e authored by Ronald S. Bultje

vp3: integrate clear_blocks with idct of previous block.

This is identical to what e.g. vp8 does, and avoids the function call
overhead (plus the dependency on dsputil for this particular function).

Arm asm updated by Janne Grunau <janne-libav@jannau.net>.
Signed-off-by: Janne Grunau <janne-libav@jannau.net>
parent 992b0318
......@@ -108,14 +108,20 @@ endfunc
function vp3_idct_start_neon
vpush {d8-d15}
vmov.i16 q4, #0
vmov.i16 q5, #0
movrel r3, vp3_idct_constants
vld1.64 {d0-d1}, [r3,:128]
vld1.64 {d16-d19}, [r2,:128]!
vld1.64 {d20-d23}, [r2,:128]!
vld1.64 {d24-d27}, [r2,:128]!
vld1.64 {d16-d19}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vld1.64 {d20-d23}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vld1.64 {d24-d27}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vadd.s16 q1, q8, q12
vsub.s16 q8, q8, q12
vld1.64 {d28-d31}, [r2,:128]!
vld1.64 {d28-d31}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vp3_idct_core_neon:
vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
......@@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1
endfunc
function ff_vp3_idct_dc_add_neon, export=1
ldrsh r2, [r2]
ldrsh r12, [r2]
mov r3, r0
add r2, r2, #15
vdup.16 q15, r2
add r12, r12, #15
vdup.16 q15, r12
mov r12, 0
strh r12, [r2]
vshr.s16 q15, q15, #5
vld1.8 {d0}, [r0,:64], r1
......
......@@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
PUT(b5) dst += stride;
PUT(b6) dst += stride;
PUT(b7)
memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
......@@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
ADD(b5) dst += stride;
ADD(b6) dst += stride;
ADD(b7)
memset(block, 0, sizeof(*block) * 64);
}
#endif /* HAVE_ALTIVEC */
......
......@@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext {
DSPContext dsp;
VideoDSPContext vdsp;
VP3DSPContext vp3dsp;
DECLARE_ALIGNED(16, DCTELEM, block)[64];
int flipped_image;
int last_slice_end;
int skip_loop_filter;
......@@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int
static void render_slice(Vp3DecodeContext *s, int slice)
{
int x, y, i, j, fragment;
LOCAL_ALIGNED_16(DCTELEM, block, [64]);
DCTELEM *block = s->block;
int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
int motion_halfpel_index;
uint8_t *motion_source;
......@@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
}
}
s->dsp.clear_block(block);
/* invert DCT and place (or add) in final output */
if (s->all_fragments[i].coding_method == MODE_INTRA) {
......
......@@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
/*
 * C implementation of the VP3 "idct_put" entry point.
 *
 * Runs idct() on the 64-coefficient block with mode argument 1 and then
 * zeroes the whole block before returning. Per the commit description, the
 * clearing is folded into the IDCT so that no separate clear_block call is
 * needed.
 *
 * dest      - output pixels (annotated as 8-byte aligned)
 * line_size - passed through to idct() as its stride argument
 * block     - 64 DCTELEM coefficients (annotated as 16-byte aligned);
 *             all zero on return
 *
 * NOTE(review): mode 1 presumably makes idct() overwrite dest rather than
 * add to it; the idct() body is not visible here, so confirm.
 */
static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 1);
/* Leave all 64 coefficients zeroed on return. */
memset(block, 0, sizeof(*block) * 64);
}
/*
 * C implementation of the VP3 "idct_add" entry point.
 *
 * Runs idct() on the 64-coefficient block with mode argument 2 and then
 * zeroes the whole block before returning. Per the commit description, the
 * clearing is folded into the IDCT so that no separate clear_block call is
 * needed.
 *
 * dest      - destination pixels (annotated as 8-byte aligned)
 * line_size - passed through to idct() as its stride argument
 * block     - 64 DCTELEM coefficients (annotated as 16-byte aligned);
 *             all zero on return
 *
 * NOTE(review): mode 2 presumably makes idct() add the result to the pixels
 * already in dest; the idct() body is not visible here, so confirm.
 */
static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 2);
/* Leave all 64 coefficients zeroed on return. */
memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
const DCTELEM *block/*align 16*/){
DCTELEM *block/*align 16*/){
int i, dc = (block[0] + 15) >> 5;
for(i = 0; i < 8; i++){
......@@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
dest[7] = av_clip_uint8(dest[7] + dc);
dest += line_size;
}
block[0] = 0;
}
static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride,
......
......@@ -25,7 +25,7 @@
typedef struct VP3DSPContext {
void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block);
void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block);
void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block);
void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
......
......@@ -561,6 +561,13 @@ cglobal vp3_idct_put, 3, 4, 9
movhps [r0+r3 ], m3
%endif
%assign %%i %%i+64
%endrep
pxor m0, m0
%assign %%offset 0
%rep 128/mmsize
mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
RET
......@@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9
movhps [r0+r1], m0
%endif
lea r0, [r0+r1*2]
%assign %%offset 0
%rep 32/mmsize
mova [r2+%%offset], m4
%assign %%offset %%offset+mmsize
%endrep
add r2, 32
dec r3
jg .loop
......@@ -620,7 +632,7 @@ vp3_idct_funcs
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
movq m5, [r0+r3 ]
movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
......@@ -630,7 +642,7 @@ vp3_idct_funcs
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
movq [r0+r3 ], m5
movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
......@@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
lea r3, [r1*3]
movsx r2, word [r2]
add r2, 15
sar r2, 5
movd m0, r2d
movsx r3, word [r2]
mov word [r2], 0
lea r2, [r1*3]
add r3, 15
sar r3, 5
movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
......
......@@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
const DCTELEM *block);
DCTELEM *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment