Commit 45c7f399 authored by Clément Bœsch

avutil/pixelutils: faster pixelutils_sad_[au]_16x16

~560 → ~500 decicycles

This is following the comments from Michael in
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html

Using two registers for the accumulator didn't help. On the other hand,
some reordering between the movs and psadbw allowed going from ~538 to ~500 decicycles.
parent c82a288f
......@@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
pxor m2, m2
%rep 8
mov%1 m0, [src2q]
mov%1 m2, [src2q]
psadbw m2, [src1q]
mov%1 m1, [src2q + stride2q]
psadbw m0, [src1q]
psadbw m1, [src1q + stride1q]
paddw m2, m0
paddw m2, m1
%rep 7
lea src1q, [src1q + 2*stride1q]
lea src2q, [src2q + 2*stride2q]
mov%1 m0, [src2q]
psadbw m0, [src1q]
mov%1 m1, [src2q + stride2q]
psadbw m1, [src1q + stride1q]
paddw m2, m0
paddw m2, m1
%endrep
movhlps m0, m2
paddw m2, m0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment