Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
c9d98c56
Commit
c9d98c56
authored
Apr 04, 2017
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
cavs: convert idct from inline asm to yasm.
parent
b51d7d89
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
171 additions
and
160 deletions
+171
-160
Makefile
libavcodec/x86/Makefile
+1
-0
cavsdsp.c
libavcodec/x86/cavsdsp.c
+5
-160
cavsidct.asm
libavcodec/x86/cavsidct.asm
+165
-0
No files found.
libavcodec/x86/Makefile
View file @
c9d98c56
...
@@ -142,6 +142,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
...
@@ -142,6 +142,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER)
+=
x86/g722dsp.o
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER)
+=
x86/g722dsp.o
YASM-OBJS-$(CONFIG_ALAC_DECODER)
+=
x86/alacdsp.o
YASM-OBJS-$(CONFIG_ALAC_DECODER)
+=
x86/alacdsp.o
YASM-OBJS-$(CONFIG_APNG_DECODER)
+=
x86/pngdsp.o
YASM-OBJS-$(CONFIG_APNG_DECODER)
+=
x86/pngdsp.o
YASM-OBJS-$(CONFIG_CAVS_DECODER)
+=
x86/cavsidct.o
YASM-OBJS-$(CONFIG_DCA_DECODER)
+=
x86/dcadsp.o
x86/synth_filter.o
YASM-OBJS-$(CONFIG_DCA_DECODER)
+=
x86/dcadsp.o
x86/synth_filter.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER)
+=
x86/diracdsp.o
\
YASM-OBJS-$(CONFIG_DIRAC_DECODER)
+=
x86/diracdsp.o
\
x86/dirac_dwt.o
x86/dirac_dwt.o
...
...
libavcodec/x86/cavsdsp.c
View file @
c9d98c56
...
@@ -34,172 +34,19 @@
...
@@ -34,172 +34,19 @@
#include "idctdsp.h"
#include "idctdsp.h"
#include "config.h"
#include "config.h"
#if HAVE_MMX_INLINE
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#if HAVE_MMX_EXTERNAL
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
/*****************************************************************************
*
* inverse transform
*
****************************************************************************/
static
inline
void
cavs_idct8_1d
(
int16_t
*
block
,
uint64_t
bias
)
{
__asm__
volatile
(
"movq 112(%0), %%mm4
\n\t
"
/* mm4 = src7 */
"movq 16(%0), %%mm5
\n\t
"
/* mm5 = src1 */
"movq 80(%0), %%mm2
\n\t
"
/* mm2 = src5 */
"movq 48(%0), %%mm7
\n\t
"
/* mm7 = src3 */
"movq %%mm4, %%mm0
\n\t
"
"movq %%mm5, %%mm3
\n\t
"
"movq %%mm2, %%mm6
\n\t
"
"movq %%mm7, %%mm1
\n\t
"
"paddw %%mm4, %%mm4
\n\t
"
/* mm4 = 2*src7 */
"paddw %%mm3, %%mm3
\n\t
"
/* mm3 = 2*src1 */
"paddw %%mm6, %%mm6
\n\t
"
/* mm6 = 2*src5 */
"paddw %%mm1, %%mm1
\n\t
"
/* mm1 = 2*src3 */
"paddw %%mm4, %%mm0
\n\t
"
/* mm0 = 3*src7 */
"paddw %%mm3, %%mm5
\n\t
"
/* mm5 = 3*src1 */
"paddw %%mm6, %%mm2
\n\t
"
/* mm2 = 3*src5 */
"paddw %%mm1, %%mm7
\n\t
"
/* mm7 = 3*src3 */
"psubw %%mm4, %%mm5
\n\t
"
/* mm5 = 3*src1 - 2*src7 = a0 */
"paddw %%mm6, %%mm7
\n\t
"
/* mm7 = 3*src3 + 2*src5 = a1 */
"psubw %%mm2, %%mm1
\n\t
"
/* mm1 = 2*src3 - 3*src5 = a2 */
"paddw %%mm0, %%mm3
\n\t
"
/* mm3 = 2*src1 + 3*src7 = a3 */
"movq %%mm5, %%mm4
\n\t
"
"movq %%mm7, %%mm6
\n\t
"
"movq %%mm3, %%mm0
\n\t
"
"movq %%mm1, %%mm2
\n\t
"
SUMSUB_BA
(
%%
mm7
,
%%
mm5
)
/* mm7 = a0 + a1 mm5 = a0 - a1 */
"paddw %%mm3, %%mm7
\n\t
"
/* mm7 = a0 + a1 + a3 */
"paddw %%mm1, %%mm5
\n\t
"
/* mm5 = a0 - a1 + a2 */
"paddw %%mm7, %%mm7
\n\t
"
"paddw %%mm5, %%mm5
\n\t
"
"paddw %%mm6, %%mm7
\n\t
"
/* mm7 = b4 */
"paddw %%mm4, %%mm5
\n\t
"
/* mm5 = b5 */
SUMSUB_BA
(
%%
mm1
,
%%
mm3
)
/* mm1 = a3 + a2 mm3 = a3 - a2 */
"psubw %%mm1, %%mm4
\n\t
"
/* mm4 = a0 - a2 - a3 */
"movq %%mm4, %%mm1
\n\t
"
/* mm1 = a0 - a2 - a3 */
"psubw %%mm6, %%mm3
\n\t
"
/* mm3 = a3 - a2 - a1 */
"paddw %%mm1, %%mm1
\n\t
"
"paddw %%mm3, %%mm3
\n\t
"
"psubw %%mm2, %%mm1
\n\t
"
/* mm1 = b7 */
"paddw %%mm0, %%mm3
\n\t
"
/* mm3 = b6 */
"movq 32(%0), %%mm2
\n\t
"
/* mm2 = src2 */
"movq 96(%0), %%mm6
\n\t
"
/* mm6 = src6 */
"movq %%mm2, %%mm4
\n\t
"
"movq %%mm6, %%mm0
\n\t
"
"psllw $2, %%mm4
\n\t
"
/* mm4 = 4*src2 */
"psllw $2, %%mm6
\n\t
"
/* mm6 = 4*src6 */
"paddw %%mm4, %%mm2
\n\t
"
/* mm2 = 5*src2 */
"paddw %%mm6, %%mm0
\n\t
"
/* mm0 = 5*src6 */
"paddw %%mm2, %%mm2
\n\t
"
"paddw %%mm0, %%mm0
\n\t
"
"psubw %%mm0, %%mm4
\n\t
"
/* mm4 = 4*src2 - 10*src6 = a7 */
"paddw %%mm2, %%mm6
\n\t
"
/* mm6 = 4*src6 + 10*src2 = a6 */
"movq (%0), %%mm2
\n\t
"
/* mm2 = src0 */
"movq 64(%0), %%mm0
\n\t
"
/* mm0 = src4 */
SUMSUB_BA
(
%%
mm0
,
%%
mm2
)
/* mm0 = src0+src4 mm2 = src0-src4 */
"psllw $3, %%mm0
\n\t
"
"psllw $3, %%mm2
\n\t
"
"paddw %1, %%mm0
\n\t
"
/* add rounding bias */
"paddw %1, %%mm2
\n\t
"
/* add rounding bias */
SUMSUB_BA
(
%%
mm6
,
%%
mm0
)
/* mm6 = a4 + a6 mm0 = a4 - a6 */
SUMSUB_BA
(
%%
mm4
,
%%
mm2
)
/* mm4 = a5 + a7 mm2 = a5 - a7 */
SUMSUB_BA
(
%%
mm7
,
%%
mm6
)
/* mm7 = dst0 mm6 = dst7 */
SUMSUB_BA
(
%%
mm5
,
%%
mm4
)
/* mm5 = dst1 mm4 = dst6 */
SUMSUB_BA
(
%%
mm3
,
%%
mm2
)
/* mm3 = dst2 mm2 = dst5 */
SUMSUB_BA
(
%%
mm1
,
%%
mm0
)
/* mm1 = dst3 mm0 = dst4 */
::
"r"
(
block
),
"m"
(
bias
)
);
}
#define SBUTTERFLY(a,b,t,n,m)\
"mov" #m " " #a ", " #t " \n\t"
/* abcd */
\
"punpckl" #n " " #b ", " #a " \n\t"
/* aebf */
\
"punpckh" #n " " #b ", " #t " \n\t"
/* cgdh */
\
#define TRANSPOSE4(a,b,c,d,t)\
void
ff_cavs_idct8_mmx
(
int16_t
*
out
,
const
int16_t
*
in
);
SBUTTERFLY(a,b,t,wd,q)
/* a=aebf t=cgdh */
\
SBUTTERFLY(c,d,b,wd,q)
/* c=imjn b=kolp */
\
SBUTTERFLY(a,c,d,dq,q)
/* a=aeim d=bfjn */
\
SBUTTERFLY(t,b,c,dq,q)
/* t=cgko c=dhlp */
static
void
cavs_idct8_add_mmx
(
uint8_t
*
dst
,
int16_t
*
block
,
ptrdiff_t
stride
)
static
void
cavs_idct8_add_mmx
(
uint8_t
*
dst
,
int16_t
*
block
,
ptrdiff_t
stride
)
{
{
int
i
;
LOCAL_ALIGNED
(
16
,
int16_t
,
b2
,
[
64
]);
LOCAL_ALIGNED
(
16
,
int16_t
,
b2
,
[
64
]);
ff_cavs_idct8_mmx
(
b2
,
block
);
for
(
i
=
0
;
i
<
2
;
i
++
){
cavs_idct8_1d
(
block
+
4
*
i
,
ff_pw_4
.
a
);
__asm__
volatile
(
"psraw $3, %%mm7
\n\t
"
"psraw $3, %%mm6
\n\t
"
"psraw $3, %%mm5
\n\t
"
"psraw $3, %%mm4
\n\t
"
"psraw $3, %%mm3
\n\t
"
"psraw $3, %%mm2
\n\t
"
"psraw $3, %%mm1
\n\t
"
"psraw $3, %%mm0
\n\t
"
"movq %%mm7, (%0)
\n\t
"
TRANSPOSE4
(
%%
mm0
,
%%
mm2
,
%%
mm4
,
%%
mm6
,
%%
mm7
)
"movq %%mm0, 8(%0)
\n\t
"
"movq %%mm6, 24(%0)
\n\t
"
"movq %%mm7, 40(%0)
\n\t
"
"movq %%mm4, 56(%0)
\n\t
"
"movq (%0), %%mm7
\n\t
"
TRANSPOSE4
(
%%
mm7
,
%%
mm5
,
%%
mm3
,
%%
mm1
,
%%
mm0
)
"movq %%mm7, (%0)
\n\t
"
"movq %%mm1, 16(%0)
\n\t
"
"movq %%mm0, 32(%0)
\n\t
"
"movq %%mm3, 48(%0)
\n\t
"
:
:
"r"
(
b2
+
32
*
i
)
:
"memory"
);
}
for
(
i
=
0
;
i
<
2
;
i
++
){
cavs_idct8_1d
(
b2
+
4
*
i
,
ff_pw_64
.
a
);
__asm__
volatile
(
"psraw $7, %%mm7
\n\t
"
"psraw $7, %%mm6
\n\t
"
"psraw $7, %%mm5
\n\t
"
"psraw $7, %%mm4
\n\t
"
"psraw $7, %%mm3
\n\t
"
"psraw $7, %%mm2
\n\t
"
"psraw $7, %%mm1
\n\t
"
"psraw $7, %%mm0
\n\t
"
"movq %%mm7, (%0)
\n\t
"
"movq %%mm5, 16(%0)
\n\t
"
"movq %%mm3, 32(%0)
\n\t
"
"movq %%mm1, 48(%0)
\n\t
"
"movq %%mm0, 64(%0)
\n\t
"
"movq %%mm2, 80(%0)
\n\t
"
"movq %%mm4, 96(%0)
\n\t
"
"movq %%mm6, 112(%0)
\n\t
"
::
"r"
(
b2
+
4
*
i
)
:
"memory"
);
}
ff_add_pixels_clamped
(
b2
,
dst
,
stride
);
ff_add_pixels_clamped
(
b2
,
dst
,
stride
);
}
}
#endif
/* HAVE_MMX_
INLINE
*/
#endif
/* HAVE_MMX_
EXTERNAL
*/
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
...
@@ -529,12 +376,10 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
...
@@ -529,12 +376,10 @@ static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
c
->
put_cavs_qpel_pixels_tab
[
1
][
0
]
=
put_cavs_qpel8_mc00_mmx
;
c
->
put_cavs_qpel_pixels_tab
[
1
][
0
]
=
put_cavs_qpel8_mc00_mmx
;
c
->
avg_cavs_qpel_pixels_tab
[
0
][
0
]
=
avg_cavs_qpel16_mc00_mmx
;
c
->
avg_cavs_qpel_pixels_tab
[
0
][
0
]
=
avg_cavs_qpel16_mc00_mmx
;
c
->
avg_cavs_qpel_pixels_tab
[
1
][
0
]
=
avg_cavs_qpel8_mc00_mmx
;
c
->
avg_cavs_qpel_pixels_tab
[
1
][
0
]
=
avg_cavs_qpel8_mc00_mmx
;
#endif
#if HAVE_MMX_INLINE
c
->
cavs_idct8_add
=
cavs_idct8_add_mmx
;
c
->
cavs_idct8_add
=
cavs_idct8_add_mmx
;
c
->
idct_perm
=
FF_IDCT_PERM_TRANSPOSE
;
c
->
idct_perm
=
FF_IDCT_PERM_TRANSPOSE
;
#endif
/* HAVE_MMX_
INLINE
*/
#endif
/* HAVE_MMX_
EXTERNAL
*/
}
}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
#define DSPFUNC(PFX, IDX, NUM, EXT) \
...
...
libavcodec/x86/cavsidct.asm
0 → 100644
View file @
c9d98c56
; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
; Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
;
; MMX-optimized DSP functions, based on H.264 optimizations by
; Michael Niedermayer and Loren Merritt
; Conversion from gcc syntax to x264asm syntax with modifications
; by Ronald S. Bultje <rsbultje@gmail.com>
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
%include
"libavutil/x86/x86util.asm"
cextern
pw_4
cextern
pw_64
SECTION
.
text
; CAVS_IDCT8_1D source, round
;
; One-dimensional 8-point inverse transform over the eight rows located at
; [%1 + k*16], k = 0..7.
;   %1 = base address of the source rows (rows are 16 bytes apart)
;   %2 = rounding bias operand, added to 8*(src0 + src4) and 8*(src0 - src4)
; The macro performs no final shift and stores nothing; results are left in
; registers as m7=dst0, m5=dst1, m3=dst2, m1=dst3, m0=dst4, m2=dst5,
; m4=dst6, m6=dst7. All of m0-m7 are overwritten.
%macro
CAVS_IDCT8_1D
2
; source, round
mova
m4
,
[
%1
+
7
*
16
]
; m4 = src7
mova
m5
,
[
%1
+
1
*
16
]
; m5 = src1
mova
m2
,
[
%1
+
5
*
16
]
; m2 = src5
mova
m7
,
[
%1
+
3
*
16
]
; m7 = src3
mova
m0
,
m4
mova
m3
,
m5
mova
m6
,
m2
mova
m1
,
m7
paddw
m4
,
m4
; m4 = 2*src7
paddw
m3
,
m3
; m3 = 2*src1
paddw
m6
,
m6
; m6 = 2*src5
paddw
m1
,
m1
; m1 = 2*src3
paddw
m0
,
m4
; m0 = 3*src7
paddw
m5
,
m3
; m5 = 3*src1
paddw
m2
,
m6
; m2 = 3*src5
paddw
m7
,
m1
; m7 = 3*src3
psubw
m5
,
m4
; m5 = 3*src1 - 2*src7 = a0
paddw
m7
,
m6
; m7 = 3*src3 + 2*src5 = a1
psubw
m1
,
m2
; m1 = 2*src3 - 3*src5 = a2
paddw
m3
,
m0
; m3 = 2*src1 + 3*src7 = a3
; keep copies of a0..a3: m4 = a0, m6 = a1, m0 = a3, m2 = a2
mova
m4
,
m5
mova
m6
,
m7
mova
m0
,
m3
mova
m2
,
m1
SUMSUB_BA
w
,
7
,
5
; m7 = a0 + a1, m5 = a0 - a1
paddw
m7
,
m3
; m7 = a0 + a1 + a3
paddw
m5
,
m1
; m5 = a0 - a1 + a2
paddw
m7
,
m7
; m7 = 2*(a0 + a1 + a3)
paddw
m5
,
m5
; m5 = 2*(a0 - a1 + a2)
paddw
m7
,
m6
; m7 = b4
paddw
m5
,
m4
; m5 = b5
SUMSUB_BA
w
,
1
,
3
; m1 = a3 + a2, m3 = a3 - a2
psubw
m4
,
m1
; m4 = a0 - a2 - a3
mova
m1
,
m4
; m1 = a0 - a2 - a3
psubw
m3
,
m6
; m3 = a3 - a2 - a1
paddw
m1
,
m1
; m1 = 2*(a0 - a2 - a3)
paddw
m3
,
m3
; m3 = 2*(a3 - a2 - a1)
psubw
m1
,
m2
; m1 = b7
paddw
m3
,
m0
; m3 = b6
mova
m2
,
[
%1
+
2
*
16
]
; m2 = src2
mova
m6
,
[
%1
+
6
*
16
]
; m6 = src6
mova
m4
,
m2
mova
m0
,
m6
psllw
m4
,
2
; m4 = 4*src2
psllw
m6
,
2
; m6 = 4*src6
paddw
m2
,
m4
; m2 = 5*src2
paddw
m0
,
m6
; m0 = 5*src6
paddw
m2
,
m2
; m2 = 10*src2
paddw
m0
,
m0
; m0 = 10*src6
psubw
m4
,
m0
; m4 = 4*src2 - 10*src6 = a7
paddw
m6
,
m2
; m6 = 4*src6 + 10*src2 = a6
mova
m2
,
[
%1
+
0
*
16
]
; m2 = src0
mova
m0
,
[
%1
+
4
*
16
]
; m0 = src4
SUMSUB_BA
w
,
0
,
2
; m0 = src0 + src4, m2 = src0 - src4
psllw
m0
,
3
; m0 = 8*(src0 + src4)
psllw
m2
,
3
; m2 = 8*(src0 - src4)
paddw
m0
,
%2
; add rounding bias
paddw
m2
,
%2
; add rounding bias
SUMSUB_BA
w
,
6
,
0
; m6 = a4 + a6, m0 = a4 - a6
SUMSUB_BA
w
,
4
,
2
; m4 = a5 + a7, m2 = a5 - a7
SUMSUB_BA
w
,
7
,
6
; m7 = dst0, m6 = dst7
SUMSUB_BA
w
,
5
,
4
; m5 = dst1, m4 = dst6
SUMSUB_BA
w
,
3
,
2
; m3 = dst2, m2 = dst5
SUMSUB_BA
w
,
1
,
0
; m1 = dst3, m0 = dst4
%endmacro
; cavs_idct8: two-pass 8x8 inverse transform built on CAVS_IDCT8_1D.
; The C side of this commit declares it as
;   void ff_cavs_idct8_mmx(int16_t *out, const int16_t *in);
; cglobal arguments below, read in x86inc order (NOTE(review): confirm against
; x86inc.asm): 2 function arguments, 4 general-purpose registers, 8 vector
; registers, 8*16 bytes of stack scratch, then the register names
; out, in, cnt, tmp.
INIT_MMX
mmx
cglobal
cavs_idct8
,
2
,
4
,
8
,
8
*
16
,
out
,
in
,
cnt
,
tmp
; cnt = 2 iterations for the first pass; tmp = start of the stack scratch area
mov
cntd
,
2
mov
tmpq
,
rsp
; Pass 1: 1-D transform of the input with bias pw_4, arithmetic shift right
; by 3, then two 4x4 word transposes whose results go to the scratch buffer.
.
loop_1
:
CAVS_IDCT8_1D
inq
,
[
pw_4
]
psraw
m7
,
3
psraw
m6
,
3
psraw
m5
,
3
psraw
m4
,
3
psraw
m3
,
3
psraw
m2
,
3
psraw
m1
,
3
psraw
m0
,
3
; m7 is passed as the fifth TRANSPOSE4x4W argument below, so save it first
mova
[tmpq],
m7
TRANSPOSE4x4W
0
,
2
,
4
,
6
,
7
; transposed m0/m2/m4/m6 go to the odd 8-byte slots of this 64-byte chunk
mova
[
tmpq
+
1
*
8
]
,
m0
mova
[
tmpq
+
3
*
8
]
,
m2
mova
[
tmpq
+
5
*
8
]
,
m4
mova
[
tmpq
+
7
*
8
]
,
m6
; reload the saved m7 and transpose the remaining four registers
mova
m7
,
[tmpq]
TRANSPOSE4x4W
7
,
5
,
3
,
1
,
0
; transposed m7/m5/m3/m1 go to the even 8-byte slots
mova
[
tmpq
+
0
*
8
]
,
m7
mova
[
tmpq
+
2
*
8
]
,
m5
mova
[
tmpq
+
4
*
8
]
,
m3
mova
[
tmpq
+
6
*
8
]
,
m1
; advance the input by mmsize bytes and the scratch pointer by 64 bytes
add
inq
,
mmsize
add
tmpq
,
64
dec
cntd
jg
.
loop_1
; Pass 2: reset the counter and rewind tmp to the start of the scratch area
mov
cntd
,
2
mov
tmpq
,
rsp
; 1-D transform of the scratch data with bias pw_64, arithmetic shift right by 7
.
loop_2
:
CAVS_IDCT8_1D
tmpq
,
[
pw_64
]
psraw
m7
,
7
psraw
m6
,
7
psraw
m5
,
7
psraw
m4
,
7
psraw
m3
,
7
psraw
m2
,
7
psraw
m1
,
7
psraw
m0
,
7
; store the results to out, rows 16 bytes apart, in dst0..dst7 order
; (m7, m5, m3, m1, m0, m2, m4, m6 as labelled at the end of CAVS_IDCT8_1D)
mova
[
outq
+
0
*
16
]
,
m7
mova
[
outq
+
1
*
16
]
,
m5
mova
[
outq
+
2
*
16
]
,
m3
mova
[
outq
+
3
*
16
]
,
m1
mova
[
outq
+
4
*
16
]
,
m0
mova
[
outq
+
5
*
16
]
,
m2
mova
[
outq
+
6
*
16
]
,
m4
mova
[
outq
+
7
*
16
]
,
m6
; advance both the output and the scratch pointer by mmsize bytes
add
outq
,
mmsize
add
tmpq
,
mmsize
dec
cntd
jg
.
loop_2
RET
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment