Commit 3deb5384 authored by Alex Converse

Implement an SSE version of scalarproduct_float().

Originally committed as revision 21386 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 57835fc1
...@@ -2510,6 +2510,8 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui ...@@ -2510,6 +2510,8 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
/* Assembly (yasm) SSE dot product: returns the sum of v1[i] * v2[i].
 * The implementation uses aligned 16-byte loads and consumes four floats per
 * iteration, so both vectors must be 16-byte aligned and the count a positive
 * multiple of 4.
 * NOTE(review): the third parameter is named `order` here but `len` in the
 * comment on the assembly implementation; both denote the element count. */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{ {
mm_flags = mm_support(); mm_flags = mm_support();
...@@ -2965,6 +2967,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ...@@ -2965,6 +2967,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->vector_clipf = vector_clipf_sse; c->vector_clipf = vector_clipf_sse;
c->float_to_int16 = float_to_int16_sse; c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
#endif
} }
if(mm_flags & FF_MM_3DNOW) if(mm_flags & FF_MM_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
......
...@@ -397,3 +397,27 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left ...@@ -397,3 +397,27 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
.unaligned: .unaligned:
ADD_HFYU_LEFT_LOOP 0 ADD_HFYU_LEFT_LOOP 0
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the dot product sum(v1[i] * v2[i]) for i in [0, len).
;
; Requirements (imposed by the code below, not checked at run time):
;   - v1 and v2 must be 16-byte aligned (movaps / mulps with memory operand).
;   - len must be a positive multiple of 4: the loop body always runs at least
;     once and consumes four floats (16 bytes) per iteration.
;
; Only SSE1 instructions are used so the routine is safe on SSE-only CPUs.
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    ; Convert the element count to a byte count. Operating on the 32-bit
    ; register zero-extends into the full register on x86_64, so stale upper
    ; bits of the int argument cannot corrupt the pointer arithmetic.
    shl     offsetd, 2
    ; Point both inputs at their ends and walk a negative offset up to zero,
    ; so a single add + sign test drives the loop.
    add     v1q, offsetq
    add     v2q, offsetq
    neg     offsetq
    xorps   xmm0, xmm0                  ; four partial sums = 0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop                       ; continue while offset is still negative
    ; Horizontal reduction of the four partial sums into lane 0.
    movhlps xmm1, xmm0                  ; xmm1[0..1] = xmm0[2..3]
    addps   xmm0, xmm1                  ; lanes 0,1 = (s0+s2), (s1+s3)
    movss   xmm1, xmm0                  ; xmm1[0] = s0+s2
    shufps  xmm0, xmm0, 1               ; xmm0[0] = s1+s3
    addss   xmm0, xmm1                  ; xmm0[0] = s0+s1+s2+s3
%ifndef ARCH_X86_64
    ; 32-bit calling conventions return floats in st0: bounce the result
    ; through the first argument's stack slot. movss (SSE1) is used instead of
    ; movd, whose xmm form requires SSE2.
    movss   r0m, xmm0
    fld     dword r0m
%endif
    RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment