Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
d45be178
Commit
d45be178
authored
Nov 14, 2008
by
Baptiste Coudurier
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
convert fdct_mmx to plain asm
Originally committed as revision 15819 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
e202cc25
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
170 additions
and
159 deletions
+170
-159
fdct_mmx.c
libavcodec/i386/fdct_mmx.c
+170
-159
No files found.
libavcodec/i386/fdct_mmx.c
View file @
d45be178
...
@@ -32,7 +32,6 @@
...
@@ -32,7 +32,6 @@
#include "libavutil/common.h"
#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/dsputil.h"
#include "mmx.h"
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
...
@@ -286,84 +285,90 @@ TABLE_SSE2
...
@@ -286,84 +285,90 @@ TABLE_SSE2
TABLE_SSE2
TABLE_SSE2
}};
}};
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
#define FDCT_COL(cpu, mm, mov)\
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
{\
mov##_m2r(*(in + offset + 1 * 8), mm##0);\
__asm__ volatile (\
mov##_m2r(*(in + offset + 6 * 8), mm##1);\
#mov" 16(%0), %%"#mm"0 \n\t" \
mov##_r2r(mm##0, mm##2);\
#mov" 96(%0), %%"#mm"1 \n\t" \
mov##_m2r(*(in + offset + 2 * 8), mm##3);\
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
paddsw_r2r(mm##1, mm##0);\
#mov" 32(%0), %%"#mm"3 \n\t" \
mov##_m2r(*(in + offset + 5 * 8), mm##4);\
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##0);\
#mov" 80(%0), %%"#mm"4 \n\t" \
mov##_m2r(*(in + offset + 0 * 8), mm##5);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
paddsw_r2r(mm##3, mm##4);\
#mov" (%0), %%"#mm"5 \n\t" \
paddsw_m2r(*(in + offset + 7 * 8), mm##5);\
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##4);\
"paddsw 112(%0), %%"#mm"5 \n\t" \
mov##_r2r(mm##0, mm##6);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
psubsw_r2r(mm##1, mm##2);\
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
mov##_m2r(*(fdct_tg_all_16 + 8), mm##1);\
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
psubsw_r2r(mm##4, mm##0);\
#mov" 16(%1), %%"#mm"1 \n\t" \
mov##_m2r(*(in + offset + 3 * 8), mm##7);\
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
pmulhw_r2r(mm##0, mm##1);\
#mov" 48(%0), %%"#mm"7 \n\t" \
paddsw_m2r(*(in + offset + 4 * 8), mm##7);\
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##5);\
"paddsw 64(%0), %%"#mm"7 \n\t" \
paddsw_r2r(mm##4, mm##6);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##7);\
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
mov##_r2r(mm##5, mm##4);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
psubsw_r2r(mm##7, mm##5);\
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
paddsw_r2r(mm##5, mm##1);\
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
paddsw_r2r(mm##7, mm##4);\
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
por_m2r(*fdct_one_corr, mm##1);\
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
psllw_i2r(SHIFT_FRW_COL + 1, mm##2);\
"por (%2), %%"#mm"1 \n\t" \
pmulhw_m2r(*(fdct_tg_all_16 + 8), mm##5);\
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
mov##_r2r(mm##4, mm##7);\
"pmulhw 16(%1), %%"#mm"5 \n\t" \
psubsw_m2r(*(in + offset + 5 * 8), mm##3);\
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
psubsw_r2r(mm##6, mm##4);\
"psubsw 80(%0), %%"#mm"3 \n\t" \
mov##_r2m(mm##1, *(out + offset + 2 * 8));\
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
paddsw_r2r(mm##6, mm##7);\
#mov" %%"#mm"1, 32(%3) \n\t" \
mov##_m2r(*(in + offset + 3 * 8), mm##1);\
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
psllw_i2r(SHIFT_FRW_COL + 1, mm##3);\
#mov" 48(%0), %%"#mm"1 \n\t" \
psubsw_m2r(*(in + offset + 4 * 8), mm##1);\
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
mov##_r2r(mm##2, mm##6);\
"psubsw 64(%0), %%"#mm"1 \n\t" \
mov##_r2m(mm##4, *(out + offset + 4 * 8));\
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
paddsw_r2r(mm##3, mm##2);\
#mov" %%"#mm"4, 64(%3) \n\t" \
pmulhw_m2r(*ocos_4_16, mm##2);\
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
psubsw_r2r(mm##3, mm##6);\
"pmulhw (%4), %%"#mm"2 \n\t" \
pmulhw_m2r(*ocos_4_16, mm##6);\
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
psubsw_r2r(mm##0, mm##5);\
"pmulhw (%4), %%"#mm"6 \n\t" \
por_m2r(*fdct_one_corr, mm##5);\
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##1);\
"por (%2), %%"#mm"5 \n\t" \
por_m2r(*fdct_one_corr, mm##2);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
mov##_r2r(mm##1, mm##4);\
"por (%2), %%"#mm"2 \n\t" \
mov##_m2r(*(in + offset + 0 * 8), mm##3);\
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
paddsw_r2r(mm##6, mm##1);\
#mov" (%0), %%"#mm"3 \n\t" \
psubsw_m2r(*(in + offset + 7 * 8), mm##3);\
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
psubsw_r2r(mm##6, mm##4);\
"psubsw 112(%0), %%"#mm"3 \n\t" \
mov##_m2r(*(fdct_tg_all_16 + 0), mm##0);\
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
psllw_i2r(SHIFT_FRW_COL, mm##3);\
#mov" (%1), %%"#mm"0 \n\t" \
mov##_m2r(*(fdct_tg_all_16 + 16), mm##6);\
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
pmulhw_r2r(mm##1, mm##0);\
#mov" 32(%1), %%"#mm"6 \n\t" \
mov##_r2m(mm##7, *(out + offset + 0 * 8));\
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
pmulhw_r2r(mm##4, mm##6);\
#mov" %%"#mm"7, (%3) \n\t" \
mov##_r2m(mm##5, *(out + offset + 6 * 8));\
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
mov##_r2r(mm##3, mm##7);\
#mov" %%"#mm"5, 96(%3) \n\t" \
mov##_m2r(*(fdct_tg_all_16 + 16), mm##5);\
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
psubsw_r2r(mm##2, mm##7);\
#mov" 32(%1), %%"#mm"5 \n\t" \
paddsw_r2r(mm##2, mm##3);\
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
pmulhw_r2r(mm##7, mm##5);\
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
paddsw_r2r(mm##3, mm##0);\
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
paddsw_r2r(mm##4, mm##6);\
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
pmulhw_m2r(*(fdct_tg_all_16 + 0), mm##3);\
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
por_m2r(*fdct_one_corr, mm##0);\
"pmulhw (%1), %%"#mm"3 \n\t" \
paddsw_r2r(mm##7, mm##5);\
"por (%2), %%"#mm"0 \n\t" \
psubsw_r2r(mm##6, mm##7);\
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
mov##_r2m(mm##0, *(out + offset + 1 * 8));\
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
paddsw_r2r(mm##4, mm##5);\
#mov" %%"#mm"0, 16(%3) \n\t" \
mov##_r2m(mm##7, *(out + offset + 3 * 8));\
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
psubsw_r2r(mm##1, mm##3);\
#mov" %%"#mm"7, 48(%3) \n\t" \
mov##_r2m(mm##5, *(out + offset + 5 * 8));\
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
mov##_r2m(mm##3, *(out + offset + 7 * 8));\
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
}
FDCT_COL
(
mmx
,
mm
,
movq
)
FDCT_COL
(
mmx
,
mm
,
movq
)
...
@@ -433,93 +438,99 @@ static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
...
@@ -433,93 +438,99 @@ static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
static
av_always_inline
void
fdct_row_mmx2
(
const
int16_t
*
in
,
int16_t
*
out
,
const
int16_t
*
table
)
static
av_always_inline
void
fdct_row_mmx2
(
const
int16_t
*
in
,
int16_t
*
out
,
const
int16_t
*
table
)
{
{
pshufw_m2r
(
*
(
in
+
4
),
mm5
,
0x1B
);
__asm__
volatile
(
movq_m2r
(
*
(
in
+
0
),
mm0
);
"pshufw $0x1B, 8(%0), %%mm5
\n\t
"
movq_r2r
(
mm0
,
mm1
);
"movq (%0), %%mm0
\n\t
"
paddsw_r2r
(
mm5
,
mm0
);
"movq %%mm0, %%mm1
\n\t
"
psubsw_r2r
(
mm5
,
mm1
);
"paddsw %%mm5, %%mm0
\n\t
"
movq_r2r
(
mm0
,
mm2
);
"psubsw %%mm5, %%mm1
\n\t
"
punpckldq_r2r
(
mm1
,
mm0
);
"movq %%mm0, %%mm2
\n\t
"
punpckhdq_r2r
(
mm1
,
mm2
);
"punpckldq %%mm1, %%mm0
\n\t
"
movq_m2r
(
*
(
table
+
0
),
mm1
);
"punpckhdq %%mm1, %%mm2
\n\t
"
movq_m2r
(
*
(
table
+
4
),
mm3
);
"movq (%1), %%mm1
\n\t
"
movq_m2r
(
*
(
table
+
8
),
mm4
);
"movq 8(%1), %%mm3
\n\t
"
movq_m2r
(
*
(
table
+
12
),
mm5
);
"movq 16(%1), %%mm4
\n\t
"
movq_m2r
(
*
(
table
+
16
),
mm6
);
"movq 24(%1), %%mm5
\n\t
"
movq_m2r
(
*
(
table
+
20
),
mm7
);
"movq 32(%1), %%mm6
\n\t
"
pmaddwd_r2r
(
mm0
,
mm1
);
"movq 40(%1), %%mm7
\n\t
"
pmaddwd_r2r
(
mm2
,
mm3
);
"pmaddwd %%mm0, %%mm1
\n\t
"
pmaddwd_r2r
(
mm0
,
mm4
);
"pmaddwd %%mm2, %%mm3
\n\t
"
pmaddwd_r2r
(
mm2
,
mm5
);
"pmaddwd %%mm0, %%mm4
\n\t
"
pmaddwd_r2r
(
mm0
,
mm6
);
"pmaddwd %%mm2, %%mm5
\n\t
"
pmaddwd_r2r
(
mm2
,
mm7
);
"pmaddwd %%mm0, %%mm6
\n\t
"
pmaddwd_m2r
(
*
(
table
+
24
),
mm0
);
"pmaddwd %%mm2, %%mm7
\n\t
"
pmaddwd_m2r
(
*
(
table
+
28
),
mm2
);
"pmaddwd 48(%1), %%mm0
\n\t
"
paddd_r2r
(
mm1
,
mm3
);
"pmaddwd 56(%1), %%mm2
\n\t
"
paddd_r2r
(
mm4
,
mm5
);
"paddd %%mm1, %%mm3
\n\t
"
paddd_r2r
(
mm6
,
mm7
);
"paddd %%mm4, %%mm5
\n\t
"
paddd_r2r
(
mm0
,
mm2
);
"paddd %%mm6, %%mm7
\n\t
"
movq_m2r
(
*
fdct_r_row
,
mm0
);
"paddd %%mm0, %%mm2
\n\t
"
paddd_r2r
(
mm0
,
mm3
);
"movq (%2), %%mm0
\n\t
"
paddd_r2r
(
mm0
,
mm5
);
"paddd %%mm0, %%mm3
\n\t
"
paddd_r2r
(
mm0
,
mm7
);
"paddd %%mm0, %%mm5
\n\t
"
paddd_r2r
(
mm0
,
mm2
);
"paddd %%mm0, %%mm7
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm3
);
"paddd %%mm0, %%mm2
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm5
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm3
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm7
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm5
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm2
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm7
\n\t
"
packssdw_r2r
(
mm5
,
mm3
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm2
\n\t
"
packssdw_r2r
(
mm2
,
mm7
);
"packssdw %%mm5, %%mm3
\n\t
"
movq_r2m
(
mm3
,
*
(
out
+
0
));
"packssdw %%mm2, %%mm7
\n\t
"
movq_r2m
(
mm7
,
*
(
out
+
4
));
"movq %%mm3, (%3)
\n\t
"
"movq %%mm7, 8(%3)
\n\t
"
:
:
"r"
(
in
),
"r"
(
table
),
"r"
(
fdct_r_row
),
"r"
(
out
));
}
}
static
av_always_inline
void
fdct_row_mmx
(
const
int16_t
*
in
,
int16_t
*
out
,
const
int16_t
*
table
)
static
av_always_inline
void
fdct_row_mmx
(
const
int16_t
*
in
,
int16_t
*
out
,
const
int16_t
*
table
)
{
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
movd_m2r
(
*
(
in
+
6
),
mm1
);
__asm__
volatile
(
punpcklwd_m2r
(
*
(
in
+
4
),
mm1
);
"movd 12(%0), %%mm1
\n\t
"
movq_r2r
(
mm1
,
mm2
);
"punpcklwd 8(%0), %%mm1
\n\t
"
psrlq_i2r
(
0x20
,
mm1
);
"movq %%mm1, %%mm2
\n\t
"
movq_m2r
(
*
(
in
+
0
),
mm0
);
"psrlq $0x20, %%mm1
\n\t
"
punpcklwd_r2r
(
mm2
,
mm1
);
"movq 0(%0), %%mm0
\n\t
"
movq_r2r
(
mm0
,
mm5
);
"punpcklwd %%mm2, %%mm1
\n\t
"
paddsw_r2r
(
mm1
,
mm0
);
"movq %%mm0, %%mm5
\n\t
"
psubsw_r2r
(
mm1
,
mm5
);
"paddsw %%mm1, %%mm0
\n\t
"
movq_r2r
(
mm0
,
mm2
);
"psubsw %%mm1, %%mm5
\n\t
"
punpckldq_r2r
(
mm5
,
mm0
);
"movq %%mm0, %%mm2
\n\t
"
punpckhdq_r2r
(
mm5
,
mm2
);
"punpckldq %%mm5, %%mm0
\n\t
"
movq_m2r
(
*
(
table
+
0
),
mm1
);
"punpckhdq %%mm5, %%mm2
\n\t
"
movq_m2r
(
*
(
table
+
4
),
mm3
);
"movq 0(%1), %%mm1
\n\t
"
movq_m2r
(
*
(
table
+
8
),
mm4
);
"movq 8(%1), %%mm3
\n\t
"
movq_m2r
(
*
(
table
+
12
),
mm5
);
"movq 16(%1), %%mm4
\n\t
"
movq_m2r
(
*
(
table
+
16
),
mm6
);
"movq 24(%1), %%mm5
\n\t
"
movq_m2r
(
*
(
table
+
20
),
mm7
);
"movq 32(%1), %%mm6
\n\t
"
pmaddwd_r2r
(
mm0
,
mm1
);
"movq 40(%1), %%mm7
\n\t
"
pmaddwd_r2r
(
mm2
,
mm3
);
"pmaddwd %%mm0, %%mm1
\n\t
"
pmaddwd_r2r
(
mm0
,
mm4
);
"pmaddwd %%mm2, %%mm3
\n\t
"
pmaddwd_r2r
(
mm2
,
mm5
);
"pmaddwd %%mm0, %%mm4
\n\t
"
pmaddwd_r2r
(
mm0
,
mm6
);
"pmaddwd %%mm2, %%mm5
\n\t
"
pmaddwd_r2r
(
mm2
,
mm7
);
"pmaddwd %%mm0, %%mm6
\n\t
"
pmaddwd_m2r
(
*
(
table
+
24
),
mm0
);
"pmaddwd %%mm2, %%mm7
\n\t
"
pmaddwd_m2r
(
*
(
table
+
28
),
mm2
);
"pmaddwd 48(%1), %%mm0
\n\t
"
paddd_r2r
(
mm1
,
mm3
);
"pmaddwd 56(%1), %%mm2
\n\t
"
paddd_r2r
(
mm4
,
mm5
);
"paddd %%mm1, %%mm3
\n\t
"
paddd_r2r
(
mm6
,
mm7
);
"paddd %%mm4, %%mm5
\n\t
"
paddd_r2r
(
mm0
,
mm2
);
"paddd %%mm6, %%mm7
\n\t
"
movq_m2r
(
*
fdct_r_row
,
mm0
);
"paddd %%mm0, %%mm2
\n\t
"
paddd_r2r
(
mm0
,
mm3
);
"movq (%2), %%mm0
\n\t
"
paddd_r2r
(
mm0
,
mm5
);
"paddd %%mm0, %%mm3
\n\t
"
paddd_r2r
(
mm0
,
mm7
);
"paddd %%mm0, %%mm5
\n\t
"
paddd_r2r
(
mm0
,
mm2
);
"paddd %%mm0, %%mm7
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm3
);
"paddd %%mm0, %%mm2
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm5
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm3
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm7
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm5
\n\t
"
psrad_i2r
(
SHIFT_FRW_ROW
,
mm2
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm7
\n\t
"
packssdw_r2r
(
mm5
,
mm3
);
"psrad $"
S
(
SHIFT_FRW_ROW
)
", %%mm2
\n\t
"
packssdw_r2r
(
mm2
,
mm7
);
"packssdw %%mm5, %%mm3
\n\t
"
movq_r2m
(
mm3
,
*
(
out
+
0
));
"packssdw %%mm2, %%mm7
\n\t
"
movq_r2m
(
mm7
,
*
(
out
+
4
));
"movq %%mm3, 0(%3)
\n\t
"
"movq %%mm7, 8(%3)
\n\t
"
:
:
"r"
(
in
),
"r"
(
table
),
"r"
(
fdct_r_row
),
"r"
(
out
));
}
}
void
ff_fdct_mmx
(
int16_t
*
block
)
void
ff_fdct_mmx
(
int16_t
*
block
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment