Linshizhi / ffmpeg.wasm-core · Commits

Commit 610e00b3
Authored Oct 13, 2012 by Daniel Kang
Committed by Diego Biurrun, Nov 25, 2012
Parent: ad01ba6c

x86: h264: Convert 8-bit QPEL inline assembly to YASM

Signed-off-by: Diego Biurrun <diego@biurrun.de>

Showing 6 changed files with 1276 additions and 1198 deletions (+1276 -1198)
libavcodec/x86/Makefile                 +2    -1
libavcodec/x86/dsputil.asm              +231  -0
libavcodec/x86/dsputil_avg_template.c   +0    -130
libavcodec/x86/dsputil_mmx.c            +8    -99
libavcodec/x86/h264_qpel.c              +173  -968
libavcodec/x86/h264_qpel_8bit.asm       +862  -0
libavcodec/x86/Makefile  (view file @ 610e00b3)
...
@@ -51,7 +51,8 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
                                           x86/h264_weight_10bit.o
 YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \
                                           x86/h264_intrapred_10bit.o
-YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_10bit.o
+YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
+                                          x86/h264_qpel_10bit.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
...
libavcodec/x86/dsputil.asm  (view file @ 610e00b3)
...
@@ -1354,3 +1354,234 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
%macro op_avgh 3
    movh   %3, %2
    pavgb  %1, %3
    movh   %2, %1
%endmacro

%macro op_avg 2
    pavgb  %1, %2
    mova   %2, %1
%endmacro

%macro op_puth 2-3
    movh   %2, %1
%endmacro

%macro op_put 2
    mova   %2, %1
%endmacro
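A note for readers new to the x264asm macro style: the put/avg distinction above is easiest to see in scalar C. The sketch below is illustrative only (rnd_avg8, op_put_c and op_avg_c are names invented for this note, not part of the commit); pavgb computes a per-byte rounded average.

#include <stdint.h>

/* Illustrative scalar equivalents of the op_put / op_avg macros above:
 * "put" overwrites the destination byte, "avg" combines it with what is
 * already there using the same rounding rule as pavgb. */
static inline uint8_t rnd_avg8(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);   /* pavgb: rounded average */
}

static inline void op_put_c(uint8_t *dst, uint8_t src)
{
    *dst = src;
}

static inline void op_avg_c(uint8_t *dst, uint8_t src)
{
    *dst = rnd_avg8(*dst, src);
}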
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    movd   m0, [r1]
    movd   m1, [r2]
    add    r1, r4
    add    r2, 4
    pavgb  m0, m1
    OP     m0, [r0], m3
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2]
    pavgb  m1, [r2+4]
    OP     m0, [r0], m3
    OP     m1, [r0+r3], m3
    lea    r0, [r0+2*r3]
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2+8]
    pavgb  m1, [r2+12]
    OP     m0, [r0], m3
    OP     m1, [r0+r3], m3
    lea    r0, [r0+2*r3]
    add    r2, 16
    sub    r5d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
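Functionally, the %1_pixels4_l2 routines average two sources into the destination: src1 advances by src1Stride per row, while src2 is a packed buffer read 4 bytes per row (hence the add r2, 4 / add r2, 16 increments). A hedged C reference for the put variant (name and code invented for illustration, not part of the commit):

#include <stdint.h>

static void put_pixels4_l2_c(uint8_t *dst, const uint8_t *src1,
                             const uint8_t *src2, int dstStride,
                             int src1Stride, int h)
{
    /* dst = rounded average of src1 and src2, 4 pixels wide, h rows;
     * src2 has a fixed 4-byte row pitch.  The avg variant additionally
     * averages the result with the existing dst contents. */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 4; x++)
            dst[x] = (src1[x] + src2[x] + 1) >> 1;
        dst  += dstStride;
        src1 += src1Stride;
        src2 += 4;
    }
}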
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    mova   m0, [r1]
    mova   m1, [r2]
    add    r1, r4
    add    r2, 8
    pavgb  m0, m1
    OP     m0, [r0]
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    OP     m0, [r0]
    OP     m1, [r0+r3]
    lea    r0, [r0+2*r3]
    mova   m0, [r1]
    mova   m1, [r1+r4]
    lea    r1, [r1+2*r4]
    pavgb  m0, [r2+16]
    pavgb  m1, [r2+24]
    OP     m0, [r0]
    OP     m1, [r0+r3]
    lea    r0, [r0+2*r3]
    add    r2, 32
    sub    r5d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test   r5d, 1
    je     .loop
    mova   m0, [r1]
    mova   m1, [r1+8]
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    add    r1, r4
    add    r2, 16
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    dec    r5d
.loop:
    mova   m0, [r1]
    mova   m1, [r1+8]
    add    r1, r4
    pavgb  m0, [r2]
    pavgb  m1, [r2+8]
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    mova   m0, [r1]
    mova   m1, [r1+8]
    add    r1, r4
    pavgb  m0, [r2+16]
    pavgb  m1, [r2+24]
    OP     m0, [r0]
    OP     m1, [r0+8]
    add    r0, r3
    add    r2, 32
    sub    r5d, 2
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
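The test r5d, 1 / je .loop prologue in these macros peels one row when h is odd, so the unrolled loop body can assume a multiple of its unroll factor (four rows per iteration for the 4- and 8-wide versions, two for the 16-wide one). A rough C sketch of that control flow (copy_rows is an invented name; assumes h > 0 and 16-byte rows):

#include <stdint.h>
#include <string.h>

static void copy_rows(uint8_t *dst, const uint8_t *src, int stride, int h)
{
    if (h & 1) {                       /* prologue: handle one odd row */
        memcpy(dst, src, 16);
        dst += stride; src += stride; h--;
    }
    while (h > 0) {                    /* unrolled body: two rows per pass */
        memcpy(dst,          src,          16);
        memcpy(dst + stride, src + stride, 16);
        dst += 2 * stride; src += 2 * stride; h -= 2;
    }
}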
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*3]
.loop:
    OP     m0, [r1]
    OP     m1, [r1+r2]
    OP     m2, [r1+r2*2]
    OP     m3, [r1+r4]
    lea    r1, [r1+r2*4]
%ifidn %1, avg
    pavgb  m0, [r0]
    pavgb  m1, [r0+r2]
    pavgb  m2, [r0+r2*2]
    pavgb  m3, [r0+r4]
%endif
    OP     [r0], m0
    OP     [r0+r2], m1
    OP     [r0+r2*2], m2
    OP     [r0+r4], m3
    sub    r3d, 4
    lea    r0, [r0+r2*4]
    jne    .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
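PIXELS48 generates put/avg_pixels4 and put/avg_pixels8 from one template, switching between movh (4 bytes) and mova (8 bytes) loads/stores and conditionally averaging with the destination. An illustrative C reference (invented name, not the commit's code):

#include <stdint.h>

static void pixels_c(uint8_t *block, const uint8_t *pixels,
                     int line_size, int h, int w, int avg)
{
    /* w = 4 or 8, matching the PIXELS48 instantiations above */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            block[x] = avg ? (uint8_t)((block[x] + pixels[x] + 1) >> 1)
                           : pixels[x];
        block  += line_size;
        pixels += line_size;
    }
}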
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal put_pixels16, 4,5,4
    movsxdifnidn r2, r2d
    lea    r4, [r2*3]
.loop:
    movu   m0, [r1]
    movu   m1, [r1+r2]
    movu   m2, [r1+r2*2]
    movu   m3, [r1+r4]
    lea    r1, [r1+r2*4]
    mova   [r0], m0
    mova   [r0+r2], m1
    mova   [r0+r2*2], m2
    mova   [r0+r4], m3
    sub    r3d, 4
    lea    r0, [r0+r2*4]
    jnz    .loop
    REP_RET
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal avg_pixels16, 4,5,4
    movsxdifnidn r2, r2d
    lea    r4, [r2*3]
.loop:
    movu   m0, [r1]
    movu   m1, [r1+r2]
    movu   m2, [r1+r2*2]
    movu   m3, [r1+r4]
    lea    r1, [r1+r2*4]
    pavgb  m0, [r0]
    pavgb  m1, [r0+r2]
    pavgb  m2, [r0+r2*2]
    pavgb  m3, [r0+r4]
    mova   [r0], m0
    mova   [r0+r2], m1
    mova   [r0+r2*2], m2
    mova   [r0+r4], m3
    sub    r3d, 4
    lea    r0, [r0+r2*4]
    jnz    .loop
    REP_RET
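Through the x86inc cglobal machinery these entry points come out with the library prefix and the CPU suffix selected by INIT_MMX/INIT_XMM, so the C side links against ff_-prefixed, _mmxext/_sse2-suffixed symbols. The corresponding prototypes appear verbatim in the libavcodec/x86/h264_qpel.c hunk further down, e.g.:

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);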
libavcodec/x86/dsputil_avg_template.c  (view file @ 610e00b3)
...
@@ -56,57 +56,6 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
}
#ifndef SKIP_FOR_3DNOW
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h)
{
    __asm__ volatile (
        "testl $1, %0              \n\t"
        " jz 1f                    \n\t"
        "movd  (%1), %%mm0         \n\t"
        "movd  (%2), %%mm1         \n\t"
        "add   %4, %1              \n\t"
        "add   $4, %2              \n\t"
        PAVGB" %%mm1, %%mm0        \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "decl  %0                  \n\t"
        "1:                        \n\t"
        "movd  (%1), %%mm0         \n\t"
        "add   %4, %1              \n\t"
        "movd  (%1), %%mm1         \n\t"
        "movd  (%2), %%mm2         \n\t"
        "movd  4(%2), %%mm3        \n\t"
        "add   %4, %1              \n\t"
        PAVGB" %%mm2, %%mm0        \n\t"
        PAVGB" %%mm3, %%mm1        \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "movd  %%mm1, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "movd  (%1), %%mm0         \n\t"
        "add   %4, %1              \n\t"
        "movd  (%1), %%mm1         \n\t"
        "movd  8(%2), %%mm2        \n\t"
        "movd  12(%2), %%mm3       \n\t"
        "add   %4, %1              \n\t"
        PAVGB" %%mm2, %%mm0        \n\t"
        PAVGB" %%mm3, %%mm1        \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "movd  %%mm1, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "add   $16, %2             \n\t"
        "subl  $4, %0              \n\t"
        "jnz   1b                  \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        : "+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        : "+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        : "S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        : "memory");
}
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h)
{
    __asm__ volatile (
...
@@ -227,58 +176,6 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
        :"memory");*/
}
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h)
{
    __asm__ volatile (
        "testl $1, %0              \n\t"
        " jz 1f                    \n\t"
        "movd  (%1), %%mm0         \n\t"
        "movd  (%2), %%mm1         \n\t"
        "add   %4, %1              \n\t"
        "add   $4, %2              \n\t"
        PAVGB" %%mm1, %%mm0        \n\t"
        PAVGB" (%3), %%mm0         \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "decl  %0                  \n\t"
        "1:                        \n\t"
        "movd  (%1), %%mm0         \n\t"
        "add   %4, %1              \n\t"
        "movd  (%1), %%mm1         \n\t"
        "add   %4, %1              \n\t"
        PAVGB" (%2), %%mm0         \n\t"
        PAVGB" 4(%2), %%mm1        \n\t"
        PAVGB" (%3), %%mm0         \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        PAVGB" (%3), %%mm1         \n\t"
        "movd  %%mm1, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "movd  (%1), %%mm0         \n\t"
        "add   %4, %1              \n\t"
        "movd  (%1), %%mm1         \n\t"
        "add   %4, %1              \n\t"
        PAVGB" 8(%2), %%mm0        \n\t"
        PAVGB" 12(%2), %%mm1       \n\t"
        PAVGB" (%3), %%mm0         \n\t"
        "movd  %%mm0, (%3)         \n\t"
        "add   %5, %3              \n\t"
        PAVGB" (%3), %%mm1         \n\t"
        "movd  %%mm1, (%3)         \n\t"
        "add   %5, %3              \n\t"
        "add   $16, %2             \n\t"
        "subl  $4, %0              \n\t"
        "jnz   1b                  \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        : "+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        : "+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        : "S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        : "memory");
}
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h)
{
    __asm__ volatile (
...
...
@@ -876,33 +773,6 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
        : "%"REG_a, "memory");
}
#ifndef SKIP_FOR_3DNOW
static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    do {
        __asm__ volatile (
            "movd  (%1), %%mm0         \n\t"
            "movd  (%1, %2), %%mm1     \n\t"
            "movd  (%1, %2, 2), %%mm2  \n\t"
            "movd  (%1, %3), %%mm3     \n\t"
            PAVGB" (%0), %%mm0         \n\t"
            PAVGB" (%0, %2), %%mm1     \n\t"
            PAVGB" (%0, %2, 2), %%mm2  \n\t"
            PAVGB" (%0, %3), %%mm3     \n\t"
            "movd  %%mm0, (%1)         \n\t"
            "movd  %%mm1, (%1, %2)     \n\t"
            "movd  %%mm2, (%1, %2, 2)  \n\t"
            "movd  %%mm3, (%1, %3)     \n\t"
            :: "S"(pixels), "D"(block),
               "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
            : "memory");
        block  += 4*line_size;
        pixels += 4*line_size;
        h      -= 4;
    } while (h > 0);
}
#endif /* SKIP_FOR_3DNOW */
//FIXME the following could be optimized too ...
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels,
                                        int line_size, int h){
    DEF(put_no_rnd_pixels8_x2)(block, pixels, line_size, h);
...
...
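These deleted MMX inline-asm helpers are replaced one-for-one by the YASM routines added to dsputil.asm above; callers simply switch to the external ff_-prefixed symbols. A small usage sketch (example_call is invented for this note; the declaration is the one from the h264_qpel.c hunk below, and src2 is expected as a packed buffer with 4-byte row pitch):

#include <stdint.h>

void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);

static void example_call(uint8_t *dst, uint8_t *a, uint8_t *b_packed, int stride)
{
    /* average a 4x4 block of a (strided) and b_packed (4-byte rows) into dst */
    ff_put_pixels4_l2_mmxext(dst, a, b_packed, stride, stride, 4);
}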
libavcodec/x86/dsputil_mmx.c  (view file @ 610e00b3)
...
@@ -366,33 +366,6 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"   \n\t"
        ".p2align 3                  \n\t"
        "1:                          \n\t"
        "movd  (%1    ), %%mm0       \n\t"
        "movd  (%1, %3), %%mm1       \n\t"
        "movd  %%mm0, (%2)           \n\t"
        "movd  %%mm1, (%2, %3)       \n\t"
        "add   %%"REG_a", %1         \n\t"
        "add   %%"REG_a", %2         \n\t"
        "movd  (%1    ), %%mm0       \n\t"
        "movd  (%1, %3), %%mm1       \n\t"
        "movd  %%mm0, (%2)           \n\t"
        "movd  %%mm1, (%2, %3)       \n\t"
        "add   %%"REG_a", %1         \n\t"
        "add   %%"REG_a", %2         \n\t"
        "subl  $4, %0                \n\t"
        "jnz   1b                    \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
...
@@ -455,56 +428,6 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
        );
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl   $4, %0                   \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz    1b                       \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory");
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4),    %%xmm3      \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl   $4, %0                   \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz    1b                       \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory");
}
#define CLEAR_BLOCKS(name, n) \
static void name(DCTELEM *blocks) \
{ \
...
...
@@ -2381,27 +2304,23 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
}
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
#endif /* HAVE_INLINE_ASM */
        if (!high_bit_depth) {
#if HAVE_INLINE_ASM
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
#endif /* HAVE_INLINE_ASM */
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
...
...
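SET_QPEL_FUNCS is a helper macro defined elsewhere in dsputil_mmx.c (not visible in this hunk); conceptually it fills one row of the 16-entry quarter-pel motion-compensation table with CPU-specific functions. A hedged sketch of the underlying data structure (all names below are invented stand-ins, not FFmpeg's real types):

#include <stdint.h>

typedef void (*qpel_mc_func)(uint8_t *dst, const uint8_t *src, int stride);

/* one entry per (dx, dy) quarter-sample position, mc00 .. mc33 */
static void mc00_copy(uint8_t *dst, const uint8_t *src, int stride)
{
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dst[y * stride + x] = src[y * stride + x];
}

static qpel_mc_func put_h264_qpel16_tab[16] = {
    mc00_copy, /* mc10, mc20, mc30, mc01, ... mc33 would follow */
};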
@@ -2410,18 +2329,14 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif /* HAVE_YASM */
        }
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmxext, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmxext, );
#endif /* HAVE_INLINE_ASM */
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
...
...
@@ -2447,7 +2362,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_YASM */
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
...
...
@@ -2546,17 +2461,16 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE2_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_INLINE_ASM
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
...
...
@@ -2583,9 +2497,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
...
...
@@ -2615,16 +2527,16 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
#endif /* HAVE_SSE2_EXTERNAL */
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;
#if HAVE_SSSE3_INLINE
    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
...
...
@@ -2639,9 +2551,6 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#endif /* HAVE_SSSE3_INLINE */
#if HAVE_SSSE3_EXTERNAL
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
...
...
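All of these dsputil_init_* functions follow the same pattern: start from the portable C function pointers and overwrite them with SIMD versions when the CPU flags allow it (note the AV_CPU_FLAG_SSE2SLOW check above). A simplified, self-contained illustration of that dispatch pattern (ExampleDSP and the function names are stand-ins, not FFmpeg's real types):

#include <stdint.h>

typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);

typedef struct ExampleDSP {
    op_pixels_func put_pixels16;
} ExampleDSP;

static void put_pixels16_c(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h)
{
    for (int y = 0; y < h; y++, block += line_size, pixels += line_size)
        for (int x = 0; x < 16; x++)
            block[x] = pixels[x];
}

static void put_pixels16_fast(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    /* stand-in for a SIMD version such as ff_put_pixels16_sse2 */
    put_pixels16_c(block, pixels, line_size, h);
}

static void example_dsp_init(ExampleDSP *c, int cpu_has_sse2)
{
    c->put_pixels16 = put_pixels16_c;          /* portable baseline */
    if (cpu_has_sse2)
        c->put_pixels16 = put_pixels16_fast;   /* SIMD override */
}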
libavcodec/x86/h264_qpel.c  (view file @ 610e00b3)
...
@@ -19,1019 +19,229 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"
#if HAVE_INLINE_ASM
/***********************************/
/* motion compensation */
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"add %2, %0 \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", "#T" \n\t"\
"psraw $5, "#T" \n\t"\
"packuswb "#T", "#T" \n\t"\
OP(T, (%1), A, d)\
"add %3, %1 \n\t"
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "#F", "#A" \n\t"\
"add %2, %0 \n\t"\
"paddw "#A", "#T" \n\t"\
"mov"#q" "#T", "#OF"(%1) \n\t"
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
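The QPEL_H264V/QPEL_H264HV macros above implement one step of the H.264 six-tap half-sample filter: out = (A - 5B + 20C + 20D - 5E + F + 16) >> 5, saturated to 8 bits (the HV variant stores the unscaled 16-bit intermediate to the temp buffer instead of packing). An illustrative scalar equivalent, derived from the macro rather than taken from the commit:

#include <stdint.h>

/* A..F are six adjacent source samples along the filter direction. */
static uint8_t h264_lowpass_6tap(int A, int B, int C, int D, int E, int F)
{
    int v = (A - 5 * B + 20 * C + 20 * D - 5 * E + F + 16) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}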
#if HAVE_YASM
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);

static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
{
    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
DEF_QPEL(avg)
DEF_QPEL(put)
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=4;\
\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"1: \n\t"\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=4;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm4 \n\t"\
"movq %1, %%mm5 \n\t"\
:: "m"(ff_pw_5), "m"(ff_pw_16)\
);\
do{\
__asm__ volatile(\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"movd (%2), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
PAVGB" %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
    src += 4 - 9*srcStride;\
    src += 4;\
}\
tmp -= 3*4;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"
/*a-b (abccba)*/
\
"psraw $2, %%mm0 \n\t"
/*(a-b)/4 */
\
"psubw %%mm1, %%mm0 \n\t"
/*(a-b)/4-b */
\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"
/*((a-b)/4-b+c)/4 */
\
"paddw %%mm2, %%mm0 \n\t"
/*(a-5*b+20*c)/16 */
\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: "memory"\
);\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
"cmpl $16, %3 \n\t"\
"jne 2f \n\t"\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
"2: \n\t"\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
tmp += 4;\
    src += 4 - (size+5)*srcStride;\
    src += 4;\
}\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
int h = size;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
:"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
do{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"movq 48(%1), %%mm2 \n\t"\
"movq 8+48(%1), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"psraw $5, %%mm2 \n\t"\
"psraw $5, %%mm3 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
"packuswb %%mm3, %%mm2 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm2 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
OP(%%mm2, (%2,%4), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst),\
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
:"memory");\
src8 += 2L*src8Stride;\
src16 += 48;\
dst += 2L*dstStride;\
}while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=16;\
__asm__ volatile(\
"pxor %%xmm15, %%xmm15 \n\t"\
"movdqa %6, %%xmm14 \n\t"\
"movdqa %7, %%xmm13 \n\t"\
"1: \n\t"\
"lddqu 6(%0), %%xmm1 \n\t"\
"lddqu -2(%0), %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm15, %%xmm1 \n\t"\
"punpcklbw %%xmm15, %%xmm0 \n\t"\
"punpcklbw %%xmm15, %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm0, %%xmm6 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm0, %%xmm8 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm0, %%xmm9 \n\t"\
"movdqa %%xmm0, %%xmm12 \n\t"\
"movdqa %%xmm1, %%xmm11 \n\t"\
"palignr $10,%%xmm0, %%xmm11\n\t"\
"palignr $10,%%xmm7, %%xmm12\n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm9 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm7, %%xmm8 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $6, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm0 ,%%xmm11 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $8, %%xmm7, %%xmm0 \n\t"\
"paddw %%xmm12,%%xmm7 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm8, %%xmm6 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm9, %%xmm0 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psllw $2, %%xmm6 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"psubw %%xmm0, %%xmm6 \n\t"\
"paddw %%xmm13,%%xmm11 \n\t"\
"paddw %%xmm13,%%xmm7 \n\t"\
"pmullw %%xmm14,%%xmm2 \n\t"\
"pmullw %%xmm14,%%xmm6 \n\t"\
"lddqu (%2), %%xmm3 \n\t"\
"paddw %%xmm11,%%xmm2 \n\t"\
"paddw %%xmm7, %%xmm6 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"psraw $5, %%xmm6 \n\t"\
"packuswb %%xmm2,%%xmm6 \n\t"\
"pavgb %%xmm3, %%xmm6 \n\t"\
OP(%%xmm6, (%1), %%xmm4, dqa)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
"%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
"%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
"memory"\
);\
}
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2,
                                           int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2,
                                           int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"movq (%2), %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
"pavgb %%xmm3, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size + 8) >> 3;
    src -= 2*srcStride + 2;
    while (w--) {
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm1          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm2          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm3          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm4          \n\t"
            "add %2, %0                 \n\t"
            "punpcklbw %%xmm7, %%xmm0   \n\t"
            "punpcklbw %%xmm7, %%xmm1   \n\t"
            "punpcklbw %%xmm7, %%xmm2   \n\t"
            "punpcklbw %%xmm7, %%xmm3   \n\t"
            "punpcklbw %%xmm7, %%xmm4   \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5,  0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0,  1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3,  4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4,  5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5,  6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0,  7*48)
            "cmpl $16, %3               \n\t"
            "jne 2f                     \n\t"
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
            "2:                         \n\t"
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
              "memory"
        );
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
        tmp += 8;
        src += 8 - (size+5)*srcStride;
        src += 8;
    }
}
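For the centre (half, half) position the code runs two passes: hv1 applies the six-tap filter vertically and stores unscaled 16-bit intermediates into tmp (48-byte row pitch), then hv2 applies the same filter horizontally on tmp and does the final rounding and clipping. An illustrative C reference of the combined operation (invented names, not the commit's code; src must have two rows/columns of valid context around the block):

#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void h264_qpel_hv_c(uint8_t *dst, const uint8_t *src,
                           int dstStride, int srcStride, int size)
{
    int16_t tmp[16][16 + 5];                  /* enough for size <= 16 */

    /* pass 1 (hv1): vertical 6-tap into a 16-bit temp, including the two
     * extra columns on each side that the horizontal pass needs */
    for (int y = 0; y < size; y++)
        for (int x = -2; x < size + 3; x++) {
            const uint8_t *s = src + y * srcStride + x;
            tmp[y][x + 2] = s[-2 * srcStride] - 5 * s[-srcStride] +
                            20 * s[0] + 20 * s[srcStride] -
                            5 * s[2 * srcStride] + s[3 * srcStride];
        }

    /* pass 2 (hv2): horizontal 6-tap on the temp, then (v + 512) >> 10 */
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++) {
            const int16_t *t = tmp[y] + x;
            int v = t[0] - 5 * t[1] + 20 * t[2] + 20 * t[3] - 5 * t[4] + t[5];
            dst[y * dstStride + x] = clip_u8((v + 512) >> 10);
        }
}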
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
#define put_pixels8_l2_sse2 put_pixels8_l2_mmxext
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmxext
#define put_pixels16_l2_sse2 put_pixels16_l2_mmxext
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmxext
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmxext
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmxext
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmxext
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmxext
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmxext
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmxext
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmxext
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmxext
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmxext
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmxext
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmxext
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmxext
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmxext
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmxext
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmxext
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmxext
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
...
...
@@ -1040,77 +250,77 @@ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
    ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
...
...
@@ -1118,8 +328,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
...
...
@@ -1127,8 +337,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
...
...
@@ -1136,8 +346,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
...
...
@@ -1145,8 +355,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
#define H264_MC_4816(MMX)\
...
...
@@ -1171,25 +381,18 @@ QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMXEXT_OP, sse2)
#if HAVE_SSSE3_INLINE
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMXEXT_OP, ssse3)
#endif

#undef PAVGB

H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3_INLINE
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif

#endif /* HAVE_INLINE_ASM */
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
...
...
@@ -1285,3 +488,5 @@ QPEL16_OP(mc33, MMX)
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_YASM */
libavcodec/x86/h264_qpel_8bit.asm
0 → 100644
View file @ 610e00b3
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text
%macro op_avgh 3
    movh  %3, %2
    pavgb %1, %3
    movh  %2, %1
%endmacro

%macro op_avg 2-3
    pavgb %1, %2
    mova  %2, %1
%endmacro

%macro op_puth 2-3
    movh  %2, %1
%endmacro

%macro op_put 2-3
    mova  %2, %1
%endmacro
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4, 5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor m7, m7
    mova m4, [pw_5]
    mova m5, [pw_16]
    mov r4d, 4
.loop:
    movh m1, [r1-1]
    movh m2, [r1+0]
    movh m3, [r1+1]
    movh m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw m1, m0
    paddw m2, m3
    movh m0, [r1-2]
    movh m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw m0, m3
    psllw m2, 2
    psubw m2, m1
    pmullw m2, m4
    paddw m0, m5
    paddw m0, m2
    psraw m0, 5
    packuswb m0, m0
    op_%1h m0, [r0], m6
    add r0, r2
    add r1, r3
    dec r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4, 5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov r4d, 8
    pxor m7, m7
    mova m6, [pw_5]
.loop:
    mova m0, [r1]
    mova m2, [r1+1]
    mova m1, m0
    mova m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw m0, m2
    paddw m1, m3
    psllw m0, 2
    psllw m1, 2
    mova m2, [r1-1]
    mova m4, [r1+2]
    mova m3, m2
    mova m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw m2, m4
    paddw m5, m3
    psubw m0, m2
    psubw m1, m5
    pmullw m0, m6
    pmullw m1, m6
    movd m2, [r1-2]
    movd m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw m2, m3
    paddw m4, m5
    mova m5, [pw_16]
    paddw m2, m5
    paddw m4, m5
    paddw m0, m2
    paddw m1, m4
    psraw m0, 5
    psraw m1, 5
    packuswb m0, m1
    op_%1 m0, [r0], m4
    add r0, r2
    add r1, r3
    dec r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4, 5, 7 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov r4d, 8
    pxor m7, m7
    mova m6, [pw_5]
.loop:
    movu m1, [r1-2]
    mova m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova m2, m1
    mova m3, m1
    mova m4, m1
    mova m5, m1
    palignr m4, m0, 2
    palignr m3, m0, 4
    palignr m2, m0, 6
    palignr m1, m0, 8
    palignr m5, m0, 10
    paddw m0, m5
    paddw m2, m3
    paddw m1, m4
    psllw m2, 2
    psubw m2, m1
    paddw m0, [pw_16]
    pmullw m2, m6
    paddw m2, m0
    psraw m2, 5
    packuswb m2, m2
    op_%1h m2, [r0], m4
    add r1, r3
    add r0, r2
    dec r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5, 6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor m7, m7
    mova m4, [pw_5]
    mova m5, [pw_16]
    mov r5d, 4
.loop:
    movh m1, [r1-1]
    movh m2, [r1+0]
    movh m3, [r1+1]
    movh m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw m1, m0
    paddw m2, m3
    movh m0, [r1-2]
    movh m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw m0, m3
    psllw m2, 2
    psubw m2, m1
    pmullw m2, m4
    paddw m0, m5
    paddw m0, m2
    movh m3, [r2]
    psraw m0, 5
    packuswb m0, m0
    pavgb m0, m3
    op_%1h m0, [r0], m6
    add r0, r3
    add r1, r3
    add r2, r4
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5, 6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov r5d, 8
    pxor m7, m7
    mova m6, [pw_5]
.loop:
    mova m0, [r1]
    mova m2, [r1+1]
    mova m1, m0
    mova m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw m0, m2
    paddw m1, m3
    psllw m0, 2
    psllw m1, 2
    mova m2, [r1-1]
    mova m4, [r1+2]
    mova m3, m2
    mova m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw m2, m4
    paddw m5, m3
    psubw m0, m2
    psubw m1, m5
    pmullw m0, m6
    pmullw m1, m6
    movd m2, [r1-2]
    movd m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw m2, m3
    paddw m4, m5
    mova m5, [pw_16]
    paddw m2, m5
    paddw m4, m5
    paddw m0, m2
    paddw m1, m4
    psraw m0, 5
    psraw m1, 5
    mova m4, [r2]
    packuswb m0, m1
    pavgb m0, m4
    op_%1 m0, [r0], m4
    add r0, r3
    add r1, r3
    add r2, r4
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5, 6, 7 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov r5d, 8
    pxor m7, m7
    mova m6, [pw_5]
.loop:
    lddqu m1, [r1-2]
    mova m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova m2, m1
    mova m3, m1
    mova m4, m1
    mova m5, m1
    palignr m4, m0, 2
    palignr m3, m0, 4
    palignr m2, m0, 6
    palignr m1, m0, 8
    palignr m5, m0, 10
    paddw m0, m5
    paddw m2, m3
    paddw m1, m4
    psllw m2, 2
    movh m3, [r2]
    psubw m2, m1
    paddw m0, [pw_16]
    pmullw m2, m6
    paddw m2, m0
    psraw m2, 5
    packuswb m2, m2
    pavgb m2, m3
    op_%1h m2, [r0], m4
    add r1, r3
    add r0, r3
    add r2, r4
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova m6, m2
    movh m5, [r1]
    paddw m6, m3
    psllw m6, 2
    psubw m6, m1
    psubw m6, m4
    punpcklbw m5, m7
    pmullw m6, [pw_5]
    paddw m0, [pw_16]
    add r1, r3
    paddw m0, m5
    paddw m6, m0
    psraw m6, 5
    packuswb m6, m6
    op_%1h m6, [r0], m0 ; 1
    add r0, r2
    SWAP 0, 1, 2, 3, 4, 5
%endmacro
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4, 4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub r1, r3
    sub r1, r3
    pxor m7, m7
    movh m0, [r1]
    movh m1, [r1+r3]
    lea r1, [r1+2*r3]
    movh m2, [r1]
    movh m3, [r1+r3]
    lea r1, [r1+2*r3]
    movh m4, [r1]
    add r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5, 5, 7 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub r1, r3
    sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5, 5, 7 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor m7, m7
    movh m0, [r1]
    movh m1, [r1+r3]
    lea r1, [r1+2*r3]
    movh m2, [r1]
    movh m3, [r1+r3]
    lea r1, [r1+2*r3]
    movh m4, [r1]
    add r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp r4d, 16
    jne .end
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcSize
%macro FILT_HV 1 ; offset
    mova m6, m2
    movh m5, [r0]
    paddw m6, m3
    psllw m6, 2
    paddw m0, [pw_16]
    psubw m6, m1
    psubw m6, m4
    punpcklbw m5, m7
    pmullw m6, [pw_5]
    paddw m0, m5
    add r0, r2
    paddw m6, m0
    mova [r1+%1], m6
    SWAP 0, 1, 2, 3, 4, 5
%endmacro
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3, 3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor m7, m7
    movh m0, [r0]
    movh m1, [r0+r2]
    lea r0, [r0+2*r2]
    movh m2, [r0]
    movh m3, [r0+r2]
    lea r0, [r0+2*r2]
    movh m4, [r0]
    add r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*24
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3, 4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov r3d, 4
.loop:
    mova m0, [r0]
    paddw m0, [r0+10]
    mova m1, [r0+2]
    paddw m1, [r0+8]
    mova m2, [r0+4]
    paddw m2, [r0+6]
    psubw m0, m1
    psraw m0, 2
    psubw m0, m1
    paddsw m0, m2
    psraw m0, 2
    paddw m0, m2
    psraw m0, 6
    packuswb m0, m0
    op_%1h m0, [r1], m7
    add r0, 24
    add r1, r2
    dec r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4, 4, 7 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor m7, m7
    movh m0, [r0]
    movh m1, [r0+r2]
    lea r0, [r0+2*r2]
    movh m2, [r0]
    movh m3, [r0+r2]
    lea r0, [r0+2*r2]
    movh m4, [r0]
    add r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*48
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp r3d, 16
    jne .end
    FILT_HV 8*48
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5, 5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova m0, [r1]
    mova m3, [r1+8]
    mova m1, [r1+2]
    mova m4, [r1+10]
    paddw m0, m4
    paddw m1, m3
    paddw m3, [r1+18]
    paddw m4, [r1+16]
    mova m2, [r1+4]
    mova m5, [r1+12]
    paddw m2, [r1+6]
    paddw m5, [r1+14]
    psubw m0, m1
    psubw m3, m4
    psraw m0, 2
    psraw m3, 2
    psubw m0, m1
    psubw m3, m4
    paddsw m0, m2
    paddsw m3, m5
    psraw m0, 2
    psraw m3, 2
    paddw m0, m2
    paddw m3, m5
    psraw m0, 6
    psraw m3, 6
    packuswb m0, m3
    op_%1 m0, [r0], m7
    add r1, 48
    add r0, r2
    dec r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5, 5, 7 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp r4d, 16
    je .op16
.loop8:
    mova m1, [r1+16]
    mova m0, [r1]
    mova m2, m1
    mova m3, m1
    mova m4, m1
    mova m5, m1
    palignr m5, m0, 10
    palignr m4, m0, 8
    palignr m3, m0, 6
    palignr m2, m0, 4
    palignr m1, m0, 2
    paddw m0, m5
    paddw m1, m4
    paddw m2, m3
    psubw m0, m1
    psraw m0, 2
    psubw m0, m1
    paddw m0, m2
    psraw m0, 2
    paddw m0, m2
    psraw m0, 6
    packuswb m0, m0
    op_%1h m0, [r0], m7
    add r1, 48
    add r0, r2
    dec r4d
    jne .loop8
    jmp .done
.op16:
    mova m4, [r1+32]
    mova m5, [r1+16]
    mova m7, [r1]
    mova m3, m4
    mova m2, m4
    mova m1, m4
    mova m0, m4
    palignr m0, m5, 10
    palignr m1, m5, 8
    palignr m2, m5, 6
    palignr m3, m5, 4
    palignr m4, m5, 2
    paddw m0, m5
    paddw m1, m4
    paddw m2, m3
    mova m6, m5
    mova m4, m5
    mova m3, m5
    palignr m4, m7, 8
    palignr m6, m7, 2
    palignr m3, m7, 10
    paddw m4, m6
    mova m6, m5
    palignr m5, m7, 6
    palignr m6, m7, 4
    paddw m3, m7
    paddw m5, m6
    psubw m0, m1
    psubw m3, m4
    psraw m0, 2
    psraw m3, 2
    psubw m0, m1
    psubw m3, m4
    paddw m0, m2
    paddw m3, m5
    psraw m0, 2
    psraw m3, 2
    paddw m0, m2
    paddw m3, m5
    psraw m0, 6
    psraw m3, 6
    packuswb m3, m0
    op_%1 m3, [r0], m7
    add r1, 48
    add r0, r2
    dec r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova m0, [r1]
    mova m1, [r1+24]
    psraw m0, 5
    psraw m1, 5
    packuswb m0, m0
    packuswb m1, m1
    pavgb m0, [r2]
    pavgb m1, [r2+r4]
    op_%1h m0, [r0], m4
    op_%1h m1, [r0+r3], m5
    lea r2, [r2+r4*2]
    lea r0, [r0+r3*2]
    mova m0, [r1+48]
    mova m1, [r1+72]
    psraw m0, 5
    psraw m1, 5
    packuswb m0, m0
    packuswb m1, m1
    pavgb m0, [r2]
    pavgb m1, [r2+r4]
    op_%1h m0, [r0], m4
    op_%1h m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova m0, [r1]
    mova m1, [r1+8]
    mova m2, [r1+48]
    mova m3, [r1+48+8]
    psraw m0, 5
    psraw m1, 5
    psraw m2, 5
    psraw m3, 5
    packuswb m0, m1
    packuswb m2, m3
    pavgb m0, [r2]
    pavgb m2, [r2+r4]
    op_%1 m0, [r0], m4
    op_%1 m2, [r0+r3], m5
    lea r2, [r2+2*r4]
    add r1, 48*2
    lea r0, [r0+2*r3]
    sub r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov r5d, 16
    pxor m15, m15
    mova m14, [pw_5]
    mova m13, [pw_16]
.loop:
    lddqu m1, [r1+6]
    lddqu m7, [r1-2]
    mova m0, m1
    punpckhbw m1, m15
    punpcklbw m0, m15
    punpcklbw m7, m15
    mova m2, m1
    mova m6, m0
    mova m3, m1
    mova m8, m0
    mova m4, m1
    mova m9, m0
    mova m12, m0
    mova m11, m1
    palignr m11, m0, 10
    palignr m12, m7, 10
    palignr m4, m0, 2
    palignr m9, m7, 2
    palignr m3, m0, 4
    palignr m8, m7, 4
    palignr m2, m0, 6
    palignr m6, m7, 6
    paddw m11, m0
    palignr m1, m0, 8
    palignr m0, m7, 8
    paddw m7, m12
    paddw m2, m3
    paddw m6, m8
    paddw m1, m4
    paddw m0, m9
    psllw m2, 2
    psllw m6, 2
    psubw m2, m1
    psubw m6, m0
    paddw m11, m13
    paddw m7, m13
    pmullw m2, m14
    pmullw m6, m14
    lddqu m3, [r2]
    paddw m2, m11
    paddw m6, m7
    psraw m2, 5
    psraw m6, 5
    packuswb m6, m2
    pavgb m6, m3
    op_%1 m6, [r0], m11
    add r1, r3
    add r0, r3
    add r2, r4
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif