Commit e25dee60 authored by Jason Garrett-Glaser

VP8: Much faster SSE2 MC

5-10% faster or more on Phenom, Athlon 64, and some others.
Helps some on pre-SSSE3 Intel chips as well, but not as much.

Originally committed as revision 24513 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 9dd9d67b
...@@ -438,48 +438,43 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 ...@@ -438,48 +438,43 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow jg .nextrow
REP_RET REP_RET
; 8-pixel-wide block, H-only 4-tap filter
INIT_XMM INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
shl r5d, 4 shl r5d, 5
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_hw_m] lea r11, [fourtap_filter_v_m]
%endif %endif
mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words lea r5, [fourtap_filter_v+r5-32]
mova m6, [fourtap_filter_hw+r5]
pxor m7, m7 pxor m7, m7
mova m4, [pw_64]
mova m5, [r5+ 0]
mova m6, [r5+16]
%ifdef m8
mova m8, [r5+32]
mova m9, [r5+48]
%endif
.nextrow .nextrow
movh m0, [r2-1] movq m0, [r2-1]
punpcklbw m0, m7 ; ABCDEFGH movq m1, [r2-0]
mova m1, m0 movq m2, [r2+1]
mova m2, m0 movq m3, [r2+2]
mova m3, m0 punpcklbw m0, m7
psrldq m1, 2 ; BCDEFGH punpcklbw m1, m7
psrldq m2, 4 ; CDEFGH punpcklbw m2, m7
psrldq m3, 6 ; DEFGH punpcklbw m3, m7
punpcklwd m0, m1 ; ABBCCDDE pmullw m0, m5
punpcklwd m2, m3 ; CDDEEFFG pmullw m1, m6
pmaddwd m0, m5 %ifdef m8
pmaddwd m2, m6 pmullw m2, m8
paddd m0, m2 pmullw m3, m9
%else
movh m1, [r2+3] pmullw m2, [r5+32]
punpcklbw m1, m7 ; ABCDEFGH pmullw m3, [r5+48]
mova m2, m1 %endif
mova m3, m1 paddsw m0, m1
mova m4, m1 paddsw m2, m3
psrldq m2, 2 ; BCDEFGH paddsw m0, m2
psrldq m3, 4 ; CDEFGH paddsw m0, m4
psrldq m4, 6 ; DEFGH
punpcklwd m1, m2 ; ABBCCDDE
punpcklwd m3, m4 ; CDDEEFFG
pmaddwd m1, m5
pmaddwd m3, m6
paddd m1, m3
packssdw m0, m1
paddsw m0, [pw_64]
psraw m0, 7 psraw m0, 7
packuswb m0, m7 packuswb m0, m7
movh [r0], m0 ; store movh [r0], m0 ; store
...@@ -491,62 +486,57 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 ...@@ -491,62 +486,57 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
lea r5d, [r5*3] lea r5d, [r5*3]
shl r5d, 4
%ifdef PIC %ifdef PIC
lea r11, [sixtap_filter_hw_m] lea r11, [sixtap_filter_v_m]
%endif %endif
lea r5, [sixtap_filter_hw+r5*8] lea r5, [sixtap_filter_v+r5-96]
pxor m7, m7 pxor m7, m7
mova m6, [pw_64]
%ifdef m8
mova m8, [r5+ 0]
mova m9, [r5+16]
mova m10, [r5+32]
mova m11, [r5+48]
mova m12, [r5+64]
mova m13, [r5+80]
%endif
.nextrow .nextrow
movu m0, [r2-2] movq m0, [r2-2]
mova m6, m0 movq m1, [r2-1]
mova m4, m0 movq m2, [r2-0]
punpcklbw m0, m7 ; ABCDEFGHI movq m3, [r2+1]
mova m1, m0 movq m4, [r2+2]
mova m2, m0 movq m5, [r2+3]
mova m3, m0 punpcklbw m0, m7
psrldq m1, 2 ; BCDEFGH punpcklbw m1, m7
psrldq m2, 4 ; CDEFGH punpcklbw m2, m7
psrldq m3, 6 ; DEFGH punpcklbw m3, m7
psrldq m4, 4 punpcklbw m4, m7
punpcklbw m4, m7 ; EFGH punpcklbw m5, m7
mova m5, m4 %ifdef m8
psrldq m5, 2 ; FGH pmullw m0, m8
punpcklwd m0, m1 ; ABBCCDDE pmullw m1, m9
punpcklwd m2, m3 ; CDDEEFFG pmullw m2, m10
punpcklwd m4, m5 ; EFFGGHHI pmullw m3, m11
pmaddwd m0, [r5-48] pmullw m4, m12
pmaddwd m2, [r5-32] pmullw m5, m13
pmaddwd m4, [r5-16] %else
paddd m0, m2 pmullw m0, [r5+ 0]
paddd m0, m4 pmullw m1, [r5+16]
pmullw m2, [r5+32]
psrldq m6, 4 pmullw m3, [r5+48]
mova m4, m6 pmullw m4, [r5+64]
punpcklbw m6, m7 ; ABCDEFGHI pmullw m5, [r5+80]
mova m1, m6 %endif
mova m2, m6 paddsw m1, m4
mova m3, m6 paddsw m0, m5
psrldq m1, 2 ; BCDEFGH paddsw m1, m2
psrldq m2, 4 ; CDEFGH paddsw m0, m3
psrldq m3, 6 ; DEFGH paddsw m0, m1
psrldq m4, 4 paddsw m0, m6
punpcklbw m4, m7 ; EFGH
mova m5, m4
psrldq m5, 2 ; FGH
punpcklwd m6, m1 ; ABBCCDDE
punpcklwd m2, m3 ; CDDEEFFG
punpcklwd m4, m5 ; EFFGGHHI
pmaddwd m6, [r5-48]
pmaddwd m2, [r5-32]
pmaddwd m4, [r5-16]
paddd m6, m2
paddd m6, m4
packssdw m0, m6
paddsw m0, [pw_64]
psraw m0, 7 psraw m0, 7
packuswb m0, m7 packuswb m0, m7
movh [r0], m0 ; store movh [r0], m0 ; store
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment