Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
42251a2a
Commit
42251a2a
authored
Apr 25, 2005
by
Loren Merritt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
MMX for H.264 deblocking filter
Originally committed as revision 4158 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
dee6dde6
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
333 additions
and
149 deletions
+333
-149
dsputil.c
libavcodec/dsputil.c
+91
-0
dsputil.h
libavcodec/dsputil.h
+5
-0
h264.c
libavcodec/h264.c
+33
-149
dsputil_mmx.c
libavcodec/i386/dsputil_mmx.c
+204
-0
No files found.
libavcodec/dsputil.c
View file @
42251a2a
...
...
@@ -2640,6 +2640,92 @@ static void h261_loop_filter_c(uint8_t *src, int stride){
}
}
/**
 * Deblock one luma edge in plain C.
 *
 * The edge is processed as 4 segments of 4 positions each. Segment i uses
 * tc0[i] as its clipping bound and is skipped entirely when tc0[i] < 0.
 * For every position the samples p2,p1,p0 | q0,q1,q2 are read across the
 * edge; p1 and q1 are conditionally adjusted, p0 and q0 are adjusted
 * whenever the alpha/beta thresholds pass.
 *
 * @param pix     points at q0 of the first position; p samples are at
 *                negative multiples of xstride
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   upper bound (exclusive) for ABS(p0 - q0)
 * @param beta    upper bound (exclusive) for the p1/p0, q1/q0, p2/p0 and
 *                q2/q0 differences
 * @param tc0     4 clipping values, one per segment; negative = skip
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* nothing to do for this segment, step over its 4 positions */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* tc grows by one for each of p1/q1 that gets filtered */
                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[xstride] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* q0 - p0 may be negative: multiply rather than left-shift,
                 * since shifting a negative int left is undefined in C */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * C luma deblocking entry point installed as h264_v_loop_filter_luma:
 * samples across the edge are `stride` bytes apart, consecutive positions
 * along the edge are adjacent bytes.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    const int across_edge = stride; /* xstride of the generic filter */
    const int along_edge  = 1;      /* ystride of the generic filter */

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * C luma deblocking entry point installed as h264_h_loop_filter_luma:
 * samples across the edge are adjacent bytes, consecutive positions along
 * the edge are `stride` bytes apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    const int across_edge = 1;      /* xstride of the generic filter */
    const int along_edge  = stride; /* ystride of the generic filter */

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Deblock one chroma edge in plain C.
 *
 * The edge is processed as 4 segments of 2 positions each. Segment i uses
 * tc0[i] as its clipping bound and is skipped when tc0[i] <= 0.
 * Only p0 and q0 are modified; p1 and q1 are read for the threshold test
 * and the delta computation.
 *
 * @param pix     points at q0 of the first position; p samples are at
 *                negative multiples of xstride
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   upper bound (exclusive) for ABS(p0 - q0)
 * @param beta    upper bound (exclusive) for ABS(p1 - p0) and ABS(q1 - q0)
 * @param tc0     4 clipping values, one per segment; <= 0 means skip
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* nothing to do for this segment, step over its 2 positions */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* q0 - p0 may be negative: multiply rather than left-shift,
                 * since shifting a negative int left is undefined in C */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * C chroma deblocking entry point installed as h264_v_loop_filter_chroma:
 * samples across the edge are `stride` bytes apart, consecutive positions
 * along the edge are adjacent bytes.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    const int across_edge = stride; /* xstride of the generic filter */
    const int along_edge  = 1;      /* ystride of the generic filter */

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * C chroma deblocking entry point installed as h264_h_loop_filter_chroma:
 * samples across the edge are adjacent bytes, consecutive positions along
 * the edge are `stride` bytes apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    const int across_edge = 1;      /* xstride of the generic filter */
    const int along_edge  = stride; /* ystride of the generic filter */

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
static
inline
int
pix_abs16_c
(
void
*
v
,
uint8_t
*
pix1
,
uint8_t
*
pix2
,
int
line_size
,
int
h
)
{
int
s
,
i
;
...
...
@@ -3739,6 +3825,11 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c
->
diff_bytes
=
diff_bytes_c
;
c
->
sub_hfyu_median_prediction
=
sub_hfyu_median_prediction_c
;
c
->
bswap_buf
=
bswap_buf
;
c
->
h264_v_loop_filter_luma
=
h264_v_loop_filter_luma_c
;
c
->
h264_h_loop_filter_luma
=
h264_h_loop_filter_luma_c
;
c
->
h264_v_loop_filter_chroma
=
h264_v_loop_filter_chroma_c
;
c
->
h264_h_loop_filter_chroma
=
h264_h_loop_filter_chroma_c
;
c
->
h263_h_loop_filter
=
h263_h_loop_filter_c
;
c
->
h263_v_loop_filter
=
h263_v_loop_filter_c
;
...
...
libavcodec/dsputil.h
View file @
42251a2a
...
...
@@ -274,6 +274,11 @@ typedef struct DSPContext {
*/
void
(
*
sub_hfyu_median_prediction
)(
uint8_t
*
dst
,
uint8_t
*
src1
,
uint8_t
*
src2
,
int
w
,
int
*
left
,
int
*
left_top
);
void
(
*
bswap_buf
)(
uint32_t
*
dst
,
uint32_t
*
src
,
int
w
);
void
(
*
h264_v_loop_filter_luma
)(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int
*
tc0
);
void
(
*
h264_h_loop_filter_luma
)(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int
*
tc0
);
void
(
*
h264_v_loop_filter_chroma
)(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int
*
tc0
);
void
(
*
h264_h_loop_filter_chroma
)(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int
*
tc0
);
void
(
*
h263_v_loop_filter
)(
uint8_t
*
src
,
int
stride
,
int
qscale
);
void
(
*
h263_h_loop_filter
)(
uint8_t
*
src
,
int
stride
,
int
qscale
);
...
...
libavcodec/h264.c
View file @
42251a2a
...
...
@@ -5624,48 +5624,15 @@ static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4]
const
int
alpha
=
alpha_table
[
index_a
];
const
int
beta
=
beta_table
[
clip
(
qp
+
h
->
slice_beta_offset
,
0
,
51
)];
for
(
i
=
0
;
i
<
4
;
i
++
)
{
if
(
bS
[
i
]
==
0
)
{
pix
+=
4
*
stride
;
continue
;
}
if
(
bS
[
i
]
<
4
)
{
const
int
tc0
=
tc0_table
[
index_a
][
bS
[
i
]
-
1
];
/* 4px edge length */
for
(
d
=
0
;
d
<
4
;
d
++
)
{
const
int
p0
=
pix
[
-
1
];
const
int
p1
=
pix
[
-
2
];
const
int
p2
=
pix
[
-
3
];
const
int
q0
=
pix
[
0
];
const
int
q1
=
pix
[
1
];
const
int
q2
=
pix
[
2
];
if
(
ABS
(
p0
-
q0
)
<
alpha
&&
ABS
(
p1
-
p0
)
<
beta
&&
ABS
(
q1
-
q0
)
<
beta
)
{
int
tc
=
tc0
;
int
i_delta
;
if
(
ABS
(
p2
-
p0
)
<
beta
)
{
pix
[
-
2
]
=
p1
+
clip
(
(
p2
+
(
(
p0
+
q0
+
1
)
>>
1
)
-
(
p1
<<
1
)
)
>>
1
,
-
tc0
,
tc0
);
tc
++
;
}
if
(
ABS
(
q2
-
q0
)
<
beta
)
{
pix
[
1
]
=
q1
+
clip
(
(
q2
+
(
(
p0
+
q0
+
1
)
>>
1
)
-
(
q1
<<
1
)
)
>>
1
,
-
tc0
,
tc0
);
tc
++
;
}
i_delta
=
clip
(
(((
q0
-
p0
)
<<
2
)
+
(
p1
-
q1
)
+
4
)
>>
3
,
-
tc
,
tc
);
pix
[
-
1
]
=
clip_uint8
(
p0
+
i_delta
);
/* p0' */
pix
[
0
]
=
clip_uint8
(
q0
-
i_delta
);
/* q0' */
tprintf
(
"filter_mb_edgev i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d
\n
# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]
\n
"
,
i
,
d
,
qp
,
index_a
,
alpha
,
beta
,
tc
,
bS
[
i
],
pix
[
-
3
],
p1
,
p0
,
q0
,
q1
,
pix
[
2
],
pix
[
-
2
],
pix
[
-
1
],
pix
[
0
],
pix
[
1
]);
}
pix
+=
stride
;
}
}
else
{
/* 4px edge length */
for
(
d
=
0
;
d
<
4
;
d
++
)
{
if
(
bS
[
0
]
<
4
)
{
int
tc
[
4
];
for
(
i
=
0
;
i
<
4
;
i
++
)
tc
[
i
]
=
bS
[
i
]
?
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
:
-
1
;
h
->
s
.
dsp
.
h264_h_loop_filter_luma
(
pix
,
stride
,
alpha
,
beta
,
tc
);
}
else
{
/* 16px edge length, because bS=4 is triggered by being at
* the edge of an intra MB, so all 4 bS are the same */
for
(
d
=
0
;
d
<
16
;
d
++
)
{
const
int
p0
=
pix
[
-
1
];
const
int
p1
=
pix
[
-
2
];
const
int
p2
=
pix
[
-
3
];
...
...
@@ -5710,7 +5677,6 @@ static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4]
}
pix
+=
stride
;
}
}
}
}
static
void
filter_mb_edgecv
(
H264Context
*
h
,
uint8_t
*
pix
,
int
stride
,
int
bS
[
4
],
int
qp
)
{
...
...
@@ -5719,35 +5685,14 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
const
int
alpha
=
alpha_table
[
index_a
];
const
int
beta
=
beta_table
[
clip
(
qp
+
h
->
slice_beta_offset
,
0
,
51
)];
for
(
i
=
0
;
i
<
4
;
i
++
)
{
if
(
bS
[
i
]
==
0
)
{
pix
+=
2
*
stride
;
continue
;
}
if
(
bS
[
i
]
<
4
)
{
const
int
tc
=
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
+
1
;
/* 2px edge length (because we use same bS than the one for luma) */
for
(
d
=
0
;
d
<
2
;
d
++
){
const
int
p0
=
pix
[
-
1
];
const
int
p1
=
pix
[
-
2
];
const
int
q0
=
pix
[
0
];
const
int
q1
=
pix
[
1
];
if
(
ABS
(
p0
-
q0
)
<
alpha
&&
ABS
(
p1
-
p0
)
<
beta
&&
ABS
(
q1
-
q0
)
<
beta
)
{
const
int
i_delta
=
clip
(
(((
q0
-
p0
)
<<
2
)
+
(
p1
-
q1
)
+
4
)
>>
3
,
-
tc
,
tc
);
pix
[
-
1
]
=
clip_uint8
(
p0
+
i_delta
);
/* p0' */
pix
[
0
]
=
clip_uint8
(
q0
-
i_delta
);
/* q0' */
tprintf
(
"filter_mb_edgecv i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d
\n
# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]
\n
"
,
i
,
d
,
qp
,
index_a
,
alpha
,
beta
,
tc
,
bS
[
i
],
pix
[
-
3
],
p1
,
p0
,
q0
,
q1
,
pix
[
2
],
p1
,
pix
[
-
1
],
pix
[
0
],
q1
);
}
pix
+=
stride
;
}
}
else
{
/* 2px edge length (because we use same bS than the one for luma) */
for
(
d
=
0
;
d
<
2
;
d
++
){
if
(
bS
[
0
]
<
4
)
{
int
tc
[
4
];
for
(
i
=
0
;
i
<
4
;
i
++
)
tc
[
i
]
=
bS
[
i
]
?
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
+
1
:
0
;
h
->
s
.
dsp
.
h264_h_loop_filter_chroma
(
pix
,
stride
,
alpha
,
beta
,
tc
);
}
else
{
/* 8px edge length, see filter_mb_edgev */
for
(
d
=
0
;
d
<
8
;
d
++
){
const
int
p0
=
pix
[
-
1
];
const
int
p1
=
pix
[
-
2
];
const
int
q0
=
pix
[
0
];
...
...
@@ -5763,7 +5708,6 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
}
pix
+=
stride
;
}
}
}
}
...
...
@@ -5928,49 +5872,14 @@ static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4]
const
int
beta
=
beta_table
[
clip
(
qp
+
h
->
slice_beta_offset
,
0
,
51
)];
const
int
pix_next
=
stride
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
if
(
bS
[
i
]
==
0
)
{
pix
+=
4
;
continue
;
}
if
(
bS
[
i
]
<
4
)
{
const
int
tc0
=
tc0_table
[
index_a
][
bS
[
i
]
-
1
];
/* 4px edge length */
for
(
d
=
0
;
d
<
4
;
d
++
)
{
const
int
p0
=
pix
[
-
1
*
pix_next
];
const
int
p1
=
pix
[
-
2
*
pix_next
];
const
int
p2
=
pix
[
-
3
*
pix_next
];
const
int
q0
=
pix
[
0
];
const
int
q1
=
pix
[
1
*
pix_next
];
const
int
q2
=
pix
[
2
*
pix_next
];
if
(
ABS
(
p0
-
q0
)
<
alpha
&&
ABS
(
p1
-
p0
)
<
beta
&&
ABS
(
q1
-
q0
)
<
beta
)
{
int
tc
=
tc0
;
int
i_delta
;
if
(
ABS
(
p2
-
p0
)
<
beta
)
{
pix
[
-
2
*
pix_next
]
=
p1
+
clip
(
(
p2
+
(
(
p0
+
q0
+
1
)
>>
1
)
-
(
p1
<<
1
)
)
>>
1
,
-
tc0
,
tc0
);
tc
++
;
}
if
(
ABS
(
q2
-
q0
)
<
beta
)
{
pix
[
pix_next
]
=
q1
+
clip
(
(
q2
+
(
(
p0
+
q0
+
1
)
>>
1
)
-
(
q1
<<
1
)
)
>>
1
,
-
tc0
,
tc0
);
tc
++
;
}
i_delta
=
clip
(
(((
q0
-
p0
)
<<
2
)
+
(
p1
-
q1
)
+
4
)
>>
3
,
-
tc
,
tc
);
pix
[
-
pix_next
]
=
clip_uint8
(
p0
+
i_delta
);
/* p0' */
pix
[
0
]
=
clip_uint8
(
q0
-
i_delta
);
/* q0' */
tprintf
(
"filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d
\n
# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]
\n
"
,
i
,
d
,
qp
,
index_a
,
alpha
,
beta
,
tc
,
bS
[
i
],
p2
,
p1
,
p0
,
q0
,
q1
,
q2
,
pix
[
-
2
*
pix_next
],
pix
[
-
pix_next
],
pix
[
0
],
pix
[
pix_next
]);
}
pix
++
;
}
}
else
{
/* 4px edge length */
for
(
d
=
0
;
d
<
4
;
d
++
)
{
if
(
bS
[
0
]
<
4
)
{
int
tc
[
4
];
for
(
i
=
0
;
i
<
4
;
i
++
)
tc
[
i
]
=
bS
[
i
]
?
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
:
-
1
;
h
->
s
.
dsp
.
h264_v_loop_filter_luma
(
pix
,
stride
,
alpha
,
beta
,
tc
);
}
else
{
/* 16px edge length, see filter_mb_edgev */
for
(
d
=
0
;
d
<
16
;
d
++
)
{
const
int
p0
=
pix
[
-
1
*
pix_next
];
const
int
p1
=
pix
[
-
2
*
pix_next
];
const
int
p2
=
pix
[
-
3
*
pix_next
];
...
...
@@ -6013,7 +5922,6 @@ static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4]
}
pix
++
;
}
}
}
}
...
...
@@ -6024,37 +5932,14 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
const
int
beta
=
beta_table
[
clip
(
qp
+
h
->
slice_beta_offset
,
0
,
51
)];
const
int
pix_next
=
stride
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
if
(
bS
[
i
]
==
0
)
{
pix
+=
2
;
continue
;
}
if
(
bS
[
i
]
<
4
)
{
int
tc
=
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
+
1
;
/* 2px edge length (see deblocking_filter_edgecv) */
for
(
d
=
0
;
d
<
2
;
d
++
)
{
const
int
p0
=
pix
[
-
1
*
pix_next
];
const
int
p1
=
pix
[
-
2
*
pix_next
];
const
int
q0
=
pix
[
0
];
const
int
q1
=
pix
[
1
*
pix_next
];
if
(
ABS
(
p0
-
q0
)
<
alpha
&&
ABS
(
p1
-
p0
)
<
beta
&&
ABS
(
q1
-
q0
)
<
beta
)
{
int
i_delta
=
clip
(
(((
q0
-
p0
)
<<
2
)
+
(
p1
-
q1
)
+
4
)
>>
3
,
-
tc
,
tc
);
pix
[
-
pix_next
]
=
clip_uint8
(
p0
+
i_delta
);
/* p0' */
pix
[
0
]
=
clip_uint8
(
q0
-
i_delta
);
/* q0' */
tprintf
(
"filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d
\n
# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]
\n
"
,
i
,
d
,
qp
,
index_a
,
alpha
,
beta
,
tc
,
bS
[
i
],
pix
[
-
3
*
pix_next
],
p1
,
p0
,
q0
,
q1
,
pix
[
2
*
pix_next
],
pix
[
-
2
*
pix_next
],
pix
[
-
pix_next
],
pix
[
0
],
pix
[
pix_next
]);
}
pix
++
;
}
}
else
{
/* 2px edge length (see deblocking_filter_edgecv) */
for
(
d
=
0
;
d
<
2
;
d
++
)
{
if
(
bS
[
0
]
<
4
)
{
int
tc
[
4
];
for
(
i
=
0
;
i
<
4
;
i
++
)
tc
[
i
]
=
bS
[
i
]
?
tc0_table
[
index_a
][
bS
[
i
]
-
1
]
+
1
:
0
;
h
->
s
.
dsp
.
h264_v_loop_filter_chroma
(
pix
,
stride
,
alpha
,
beta
,
tc
);
}
else
{
/* 8px edge length, see filter_mb_edgev */
for
(
d
=
0
;
d
<
8
;
d
++
)
{
const
int
p0
=
pix
[
-
1
*
pix_next
];
const
int
p1
=
pix
[
-
2
*
pix_next
];
const
int
q0
=
pix
[
0
];
...
...
@@ -6070,7 +5955,6 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
}
pix
++
;
}
}
}
}
...
...
libavcodec/i386/dsputil_mmx.c
View file @
42251a2a
...
...
@@ -39,6 +39,7 @@ static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x00
static
const
uint64_t
ff_pw_20
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0014001400140014ULL
;
static
const
uint64_t
ff_pw_3
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0003000300030003ULL
;
static
const
uint64_t
ff_pw_4
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0004000400040004ULL
;
static
const
uint64_t
ff_pw_5
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0005000500050005ULL
;
static
const
uint64_t
ff_pw_16
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0010001000100010ULL
;
static
const
uint64_t
ff_pw_32
attribute_used
__attribute__
((
aligned
(
8
)))
=
0x0020002000200020ULL
;
...
...
@@ -691,6 +692,204 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
);
}
/*
 * Building blocks for the MMX2 H.264 deblocking filters. Each macro expands
 * to a sequence of adjacent string literals meant to be pasted into an
 * asm() template. Trailing comments must stay in front of the line
 * continuation backslash, otherwise the macro body ends early.
 */

// dst = ABS( a - b )
// z is a scratch register; a and b are left unchanged
#define MMABS_DIFF_MMX2(a,b,dst,z)\
    "movq " #b ", " #dst " \n\t"\
    "movq " #a ", " #z " \n\t"\
    "psubusw " #b ", " #z " \n\t"\
    "psubusw " #a ", " #dst " \n\t"\
    "pmaxsw " #z ", " #dst " \n\t"

// a = clip( a, -tc, tc )
// z is a scratch register and receives -tc
#define CLIP_MMX2(a,tc,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #tc ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"\
    "pminsw " #tc ", " #a " \n\t"

// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1
// out: mm7 = do we filter this pixel?
// mm0-mm3 are widened from bytes to words; mm4 is left holding beta in
// every word, mm5 and mm6 are overwritten
#define H264_DEBLOCK_THRESH(alpha,beta)\
    "pxor %%mm7, %%mm7 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    MMABS_DIFF_MMX2(%%mm1, %%mm2, %%mm5, %%mm4)\
    "movd " #alpha ", %%mm6 \n\t"\
    "pshufw $0, %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm6 \n\t" /* ABS(p0-q0) < alpha */\
    MMABS_DIFF_MMX2(%%mm0, %%mm1, %%mm5, %%mm4)\
    MMABS_DIFF_MMX2(%%mm3, %%mm2, %%mm7, %%mm4)\
    "pmaxsw %%mm7, %%mm5 \n\t"\
    "movd " #beta ", %%mm7 \n\t"\
    "pshufw $0, %%mm7, %%mm7 \n\t"\
    "movq %%mm7, %%mm4 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t" /* ABS(p1-p0) < beta && ABS(q1-q0) < beta */\
    "pand %%mm6, %%mm7 \n\t"

// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1, mm6=tc
// out: mm1=p0', mm2=q0'
// pw4 must name an operand holding the word constant 4; mm0, mm4 and mm5
// are overwritten
#define H264_DEBLOCK_P0_Q0(pw4)\
    "movq " #pw4 ", %%mm4 \n\t"\
    "movq %%mm2, %%mm5 \n\t"\
    "paddw %%mm4, %%mm0 \n\t"\
    "psubw %%mm1, %%mm5 \n\t"\
    "psubw %%mm3, %%mm0 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm5 \n\t"\
    "psraw $3, %%mm5 \n\t" /* mm5 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */\
    CLIP_MMX2(%%mm5, %%mm6, %%mm4) /* delta = clip( mm5, -tc, tc ) */\
    "paddw %%mm5, %%mm1 \n\t" /* p0 += delta */\
    "psubw %%mm5, %%mm2 \n\t" /* q0 -= delta */

// in: mm1=p0, mm2=q0, mm6=tc0
// out: mm5=delta
// p1/p2 name the registers holding the neighbouring samples on the side
// being filtered; z is a scratch register
#define H264_DEBLOCK_DELTA_PQ1(p1,p2,z)\
    "movq %%mm1, %%mm5 \n\t"\
    "pavgb %%mm2, %%mm5 \n\t"\
    "paddw " #p2 ", %%mm5 \n\t"\
    "psraw $1, %%mm5 \n\t"\
    "psubw " #p1 ", %%mm5 \n\t" /* ( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 */\
    CLIP_MMX2(%%mm5, %%mm6, z)
/**
 * MMX2 luma deblocking of 4 neighbouring positions that share one tc0.
 *
 * Four bytes are loaded from each of the rows p2, p1, p0 (at pix-3*stride,
 * pix-2*stride, pix-stride) and q0, q1, q2 (at pix, pix+stride,
 * pix+2*stride); p1, p0, q0 and q1 are written back.
 *
 * @param pix    first byte of the q0 row
 * @param stride distance in bytes between consecutive rows
 * @param alpha  threshold passed to H264_DEBLOCK_THRESH
 * @param beta   threshold passed to H264_DEBLOCK_THRESH and used for the
 *               p2/p0 and q2/q0 tests
 * @param tc0    clipping bound, broadcast to all 4 positions
 */
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int tc0)
{
    /* spill slots: tmp0 keeps the "filter this pixel" mask, tmp1 keeps tc */
    uint64_t tmp0, tmp1;
    asm volatile(
        "movd (%2,%4), %%mm0 \n\t" //p1
        "movd (%2,%4,2), %%mm1 \n\t" //p0
        "movd (%3), %%mm2 \n\t" //q0
        "movd (%3,%4), %%mm3 \n\t" //q1
        H264_DEBLOCK_THRESH(%6,%7)
        "movq %%mm7, %0 \n\t"

        // filter p1 if ABS(p2-p0) < beta
        "movd (%2), %%mm3 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "punpcklbw %%mm6, %%mm3 \n\t" //p2
        MMABS_DIFF_MMX2(%%mm1, %%mm3, %%mm5, %%mm6)
        "pcmpgtw %%mm5, %%mm4 \n\t"
        "pand %%mm7, %%mm4 \n\t" // mm4 = ( ABS( p2 - p0 ) < beta && filterp )
        "movd %5, %%mm6 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t" //tc

        H264_DEBLOCK_DELTA_PQ1(%%mm0, %%mm3, %%mm7) // delta = clip( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1 )
        "pand %%mm4, %%mm5 \n\t"
        "paddw %%mm0, %%mm5 \n\t"
        "packuswb %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%2,%4) \n\t" // *p1 += delta
        "psrlw $15, %%mm4 \n\t"
        "paddw %%mm6, %%mm4 \n\t" // tc++
        "movq %%mm4, %1 \n\t"

        // filter q1 if ABS(q2-q0) < beta
        "pxor %%mm7, %%mm7 \n\t"
        "movd (%3,%4), %%mm3 \n\t" //q1
        "movd (%3,%4,2), %%mm4 \n\t" //q2
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        MMABS_DIFF_MMX2(%%mm2, %%mm4, %%mm5, %%mm7)
        "movd %7, %%mm7 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pcmpgtw %%mm5, %%mm7 \n\t"

        H264_DEBLOCK_DELTA_PQ1(%%mm3, %%mm4, %%mm4) // delta = clip( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 )
        "movq %0, %%mm4 \n\t"
        "pand %%mm4, %%mm7 \n\t" // mm7 = ( ABS( q2 - q0 ) < beta && filterp )
        "pand %%mm7, %%mm5 \n\t"
        "paddw %%mm3, %%mm5 \n\t"
        "packuswb %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%3,%4) \n\t" // *q1 += delta
        "movq %1, %%mm6 \n\t"
        "psrlw $15, %%mm7 \n\t"
        "paddw %%mm7, %%mm6 \n\t" // tc++
        "movq %0, %%mm4 \n\t"
        "pand %%mm4, %%mm6 \n\t"

        H264_DEBLOCK_P0_Q0(%8)
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm1, (%2,%4,2) \n\t"
        "movd %%mm2, (%3) \n\t"

        : "=m"(tmp0), "=m"(tmp1)
        : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
          "r"(tc0), "r"(alpha), "r"(beta), "m"(ff_pw_4)
        /* the pixel rows are read and written through %2/%3, which the
         * operand list does not describe, so tell the compiler */
        : "memory"
    );
}
/**
 * MMX2 implementation installed as h264_v_loop_filter_luma.
 *
 * The edge is split into four groups of 4 adjacent bytes; group n is
 * filtered with tc0[n] and left untouched when tc0[n] is negative.
 */
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    int seg;
    for( seg = 0; seg < 4; seg++ ) {
        if( tc0[seg] >= 0 )
            h264_loop_filter_luma_mmx2(pix + 4*seg, stride, alpha, beta, tc0[seg]);
    }
}
/**
 * MMX2 implementation installed as h264_h_loop_filter_luma.
 *
 * Works on four groups of 4 rows; group n is filtered with tc0[n] and left
 * untouched when tc0[n] is negative. Each group is copied into a small
 * stride-4 scratch buffer through transpose4x4 (defined elsewhere in the
 * file), filtered there, and the middle four scratch rows are copied back.
 */
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    uint8_t trans[4*8];
    int seg;
    for( seg = 0; seg < 4; seg++ ) {
        uint8_t *rows = pix + 4*seg*stride;

        if( tc0[seg] < 0 )
            continue;

        //FIXME: could cut some load/stores by merging transpose with filter
        transpose4x4(trans,       rows-4, 4, stride); /* 4 bytes left of the edge  */
        transpose4x4(trans + 4*4, rows,   4, stride); /* 4 bytes right of the edge */
        h264_loop_filter_luma_mmx2(trans + 4*4, 4, alpha, beta, tc0[seg]);
        transpose4x4(rows-2, trans + 2*4, stride, 4); /* write back the 4 middle columns */
    }
}
/**
 * MMX2 chroma deblocking of 4 neighbouring positions.
 *
 * Four bytes are loaded from each of the rows p1, p0 (at pix-2*stride,
 * pix-stride) and q0, q1 (at pix, pix+stride); p0 and q0 are written back.
 * The first two positions are clipped with tc0[0], the last two with
 * tc0[1].
 *
 * @param pix    first byte of the q0 row
 * @param stride distance in bytes between consecutive rows
 * @param alpha  threshold passed to H264_DEBLOCK_THRESH
 * @param beta   threshold passed to H264_DEBLOCK_THRESH
 * @param tc0    two clipping bounds; they are packed into one 32-bit
 *               operand as 16-bit halves, so each is presumably expected
 *               to be non-negative and to fit in 16 bits -- verify against
 *               the callers
 */
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    asm volatile(
        "movd (%0), %%mm0 \n\t"
        "movd (%0,%2), %%mm1 \n\t"
        "movd (%1), %%mm2 \n\t"
        "movd (%1,%2), %%mm3 \n\t"
        H264_DEBLOCK_THRESH(%4,%5)
        "movd %3, %%mm6 \n\t"
        "pshufw $0x50, %%mm6, %%mm6 \n\t" // mm6 = tc[1], tc[1], tc[0], tc[0]
        "pand %%mm7, %%mm6 \n\t"
        H264_DEBLOCK_P0_Q0(%6)
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm1, (%0,%2) \n\t"
        "movd %%mm2, (%1) \n\t"
        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
           "r"(tc0[1]<<16 | tc0[0]),
           "r"(alpha), "r"(beta), "m"(ff_pw_4)
        /* the pixel rows are read and written through %0/%1, which the
         * operand list does not describe, so tell the compiler */
        : "memory"
    );
}
/**
 * MMX2 implementation installed as h264_v_loop_filter_chroma.
 *
 * The edge is handled as two halves of 4 adjacent bytes; each half
 * consumes two consecutive tc0 entries.
 */
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    int half;
    for( half = 0; half < 2; half++ )
        h264_loop_filter_chroma_mmx2(pix + 4*half, stride, alpha, beta, tc0 + 2*half);
}
/**
 * MMX2 implementation installed as h264_h_loop_filter_chroma.
 *
 * Works on two groups of 4 rows; each group consumes two consecutive tc0
 * entries. A group is copied into a stride-4 scratch buffer through
 * transpose4x4 (defined elsewhere in the file), filtered there, and copied
 * back.
 */
static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    uint8_t trans[4*4];
    int half;
    for( half = 0; half < 2; half++ ) {
        uint8_t *rows = pix + 4*half*stride;
        int *tc = tc0 + 2*half;

        //FIXME: could cut some load/stores by merging transpose with filter
        transpose4x4(trans, rows-2, 4, stride);
        h264_loop_filter_chroma_mmx2(trans + 2*4, 4, alpha, beta, tc);
        transpose4x4(rows-2, trans, stride, 4);
    }
}
#ifdef CONFIG_ENCODERS
static
int
pix_norm1_mmx
(
uint8_t
*
pix
,
int
line_size
)
{
int
tmp
;
...
...
@@ -3184,6 +3383,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
dspfunc
(
avg_h264_qpel
,
2
,
4
);
#undef dspfunc
c
->
h264_v_loop_filter_luma
=
h264_v_loop_filter_luma_mmx2
;
c
->
h264_h_loop_filter_luma
=
h264_h_loop_filter_luma_mmx2
;
c
->
h264_v_loop_filter_chroma
=
h264_v_loop_filter_chroma_mmx2
;
c
->
h264_h_loop_filter_chroma
=
h264_h_loop_filter_chroma_mmx2
;
#ifdef CONFIG_ENCODERS
c
->
sub_hfyu_median_prediction
=
sub_hfyu_median_prediction_mmx2
;
#endif //CONFIG_ENCODERS
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment