Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
9f00b1cb
Commit
9f00b1cb
authored
Jan 16, 2013
by
Daniel Kang
Committed by
Luca Barbato
Jan 21, 2013
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
dsputilenc: x86: Convert pixel inline asm to yasm
Signed-off-by:
Luca Barbato
<
lu_zero@gentoo.org
>
parent
c7df1532
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
172 additions
and
181 deletions
+172
-181
dsputilenc.asm
libavcodec/x86/dsputilenc.asm
+152
-0
dsputilenc_mmx.c
libavcodec/x86/dsputilenc_mmx.c
+20
-181
No files found.
libavcodec/x86/dsputilenc.asm
View file @
9f00b1cb
...
...
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
paddd
m7
,
m1
movd
eax
,
m7
; return value
RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
;
; Copies an 8-pixel-wide strip from pixels into block, zero-extending every
; byte to a 16-bit word.  Two source rows are converted per iteration
; (32 output bytes); four iterations write 128 bytes in total.
;
;   r0 = block + 128 (end of the output, addressed through the negative r3)
;   r1 = current source row
;   r2 = line_size
;   r3 = output offset, doubling as loop counter: -128, -96, -64, -32
;   m7 = 0 (provides the high byte of each widened pixel)
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d
    pxor         m7, m7
    mov          r3, -128
    add          r0, 128
.loop:
    mova         m0, [r1]               ; row n
    mova         m2, [r1+r2]            ; row n+1
    lea          r1, [r1+r2*2]          ; advance two rows; lea leaves flags alone
    mova         m1, m0
    punpcklbw    m0, m7                 ; row n,   pixels 0-3 -> words
    punpckhbw    m1, m7                 ; row n,   pixels 4-7 -> words
    mova         m3, m2
    punpcklbw    m2, m7                 ; row n+1, pixels 0-3 -> words
    punpckhbw    m3, m7                 ; row n+1, pixels 4-7 -> words
    mova         [r0+r3+ 0], m0
    mova         [r0+r3+ 8], m1
    mova         [r0+r3+16], m2
    mova         [r0+r3+24], m3
    add          r3, 32
    js           .loop                  ; offset still negative -> rows remain
    REP_RET
INIT_XMM sse2
;-----------------------------------------------------------------------------
; void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
;
; Fully unrolled: eight rows of 8 bytes are zero-extended to 16-bit words and
; written as eight 16-byte vectors to block+0x00 .. block+0x70.
;
;   r0 = block (stored to with mova, i.e. aligned 16-byte stores)
;   r1 = source pointer (rows 0-3, then bumped by 4*line_size for rows 4-7)
;   r2 = line_size
;   r3 = 3*line_size, so four rows can be addressed from a single base
;   m4 = 0
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d
    pxor         m4, m4
    lea          r3, [r2*3]

    ; ---- rows 0-3 ----------------------------------------------------------
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    lea          r1, [r1+r2*4]          ; r1 -> row 4
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [r0], m0
    mova         [r0+0x10], m1
    mova         [r0+0x20], m2
    mova         [r0+0x30], m3

    ; ---- rows 4-7 ----------------------------------------------------------
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [r0+0x40], m0
    mova         [r0+0x50], m1
    mova         [r0+0x60], m2
    mova         [r0+0x70], m3
    RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
;                      int stride)
;
; For each row, widens 8 bytes of s1 and 8 bytes of s2 to 16-bit words and
; stores s1[i] - s2[i] to block.  One row (16 output bytes) per iteration,
; eight iterations, 128 bytes written.
;
;   r0 = block + 128 (end of the output, addressed through the negative r4)
;   r1 = s1 row, r2 = s2 row, r3 = stride
;   r4 = output offset, doubling as loop counter: -128, -112, ..., -16
;   m7 = 0
;-----------------------------------------------------------------------------
cglobal diff_pixels, 4, 5
    movsxdifnidn r3, r3d
    mov          r4, -128
    add          r0, 128
    pxor         m7, m7
.loop:
    mova         m0, [r1]               ; s1 row
    mova         m2, [r2]               ; s2 row
    mova         m1, m0
    punpcklbw    m0, m7                 ; s1 pixels 0-3 -> words
    punpckhbw    m1, m7                 ; s1 pixels 4-7 -> words
    mova         m3, m2
    punpcklbw    m2, m7                 ; s2 pixels 0-3 -> words
    punpckhbw    m3, m7                 ; s2 pixels 4-7 -> words
    psubw        m0, m2
    psubw        m1, m3
    mova         [r0+r4+0], m0
    mova         [r0+r4+8], m1
    add          r1, r3
    add          r2, r3
    add          r4, 16                 ; must stay last: jne tests its result
    jne          .loop
    REP_RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; int pix_sum16_mmx(uint8_t *pix, int line_size)
;
; Sums the bytes of 16 rows of 16 pixels and returns the total in eax.
; Partial sums are kept in the four 16-bit lanes of m6, folded into lane 0
; after the loop, and the result is masked to 16 bits.
;
;   r0 = pix + 16*line_size (rows are reached through the negative offset r2)
;   r1 = line_size
;   r2 = row offset: starts at -16*line_size, loop runs while it is negative
;   m6 = accumulator, m7 = 0
;-----------------------------------------------------------------------------
cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    pxor         m6, m6
    pxor         m7, m7
    mov          r2, r1
    neg          r2
    shl          r2, 4                  ; r2 = -16 * line_size
    sub          r0, r2                 ; r0 = pix + 16 * line_size
.loop:
    mova         m0, [r0+r2+0]          ; left 8 pixels (low half widened below)
    mova         m1, [r0+r2+0]          ; left 8 pixels (high half widened below)
    mova         m2, [r0+r2+8]          ; right 8 pixels (low half)
    mova         m3, [r0+r2+8]          ; right 8 pixels (high half)
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1                 ; m3 = four lane sums for this row
    paddw        m6, m3
    add          r2, r1
    js           .loop

    ; horizontal reduction of the four word lanes into lane 0
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5                 ; lanes 0,1 += lanes 2,3
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5                 ; lane 0 += lane 1
    movd         eax, m6
    and          eax, 0xffff            ; drop the leftover upper lane
    RET
INIT_MMX mmx
;-----------------------------------------------------------------------------
; int pix_norm1_mmx(uint8_t *pix, int line_size)
;
; Returns in eax the sum of the squares of the bytes in 16 rows of 16 pixels.
; Each row is widened to words, squared and pair-summed with pmaddwd, and
; accumulated in the two 32-bit lanes of m7, which are added together at
; the end.
;
;   r0 = current row, r1 = line_size, r2 = rows remaining (16 .. 1)
;   m0 = 0, m7 = accumulator
;
; Only r0-r2 are used, so three GPRs are requested from cglobal; asking for a
; fourth makes x86inc save/restore a callee-saved register on x86-32 for no
; benefit.
;-----------------------------------------------------------------------------
cglobal pix_norm1, 2, 3
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
.loop:
    mova         m2, [r0+0]             ; pixels 0-7
    mova         m3, [r0+8]             ; pixels 8-15
    mova         m1, m2
    punpckhbw    m1, m0                 ; pixels 4-7   -> words
    punpcklbw    m2, m0                 ; pixels 0-3   -> words
    mova         m4, m3
    punpckhbw    m3, m0                 ; pixels 12-15 -> words
    punpcklbw    m4, m0                 ; pixels 8-11  -> words
    pmaddwd      m1, m1                 ; (p4^2+p5^2,   p6^2+p7^2)
    pmaddwd      m2, m2                 ; (p0^2+p1^2,   p2^2+p3^2)
    pmaddwd      m3, m3                 ; (p12^2+p13^2, p14^2+p15^2)
    pmaddwd      m4, m4                 ; (p8^2+p9^2,   p10^2+p11^2)
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec          r2
    jne          .loop

    mova         m1, m7
    psrlq        m7, 32                 ; bring the high dword lane down
    paddd        m1, m7
    movd         eax, m1
    RET
libavcodec/x86/dsputilenc_mmx.c
View file @
9f00b1cb
...
...
@@ -30,181 +30,14 @@
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"
/* Prototypes of the ff_-prefixed routines that ff_dsputilenc_init_mmx()
 * assigns to the DSPContext function pointers inside its HAVE_YASM section.
 * NOTE(review): presumably these are the symbols emitted by the cglobal
 * definitions in dsputilenc.asm -- confirm against the build. */
void
ff_get_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
);
void
ff_get_pixels_sse2
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
);
void
ff_diff_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
s1
,
const
uint8_t
*
s2
,
int
stride
);
int
ff_pix_sum16_mmx
(
uint8_t
*
pix
,
int
line_size
);
int
ff_pix_norm1_mmx
(
uint8_t
*
pix
,
int
line_size
);
#if HAVE_INLINE_ASM
/* Inline-asm MMX routine: reads 8-byte rows from pixels, zero-extends every
 * byte to a 16-bit word (punpcklbw/punpckhbw against the zeroed mm7) and
 * stores the words to block.
 * Each loop pass handles two rows (pixels and pixels + line_size), writes
 * 32 bytes and advances pixels by line_size * 2 (operand %3).
 * REG_a is the store offset relative to block + 64; it runs from -128 in
 * steps of 32 and the loop ends once it is no longer negative, so 128 bytes
 * are written in total. */
static
void
get_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
)
{
__asm__
volatile
(
"mov $-128, %%"
REG_a
"
\n\t
"
"pxor %%mm7, %%mm7
\n\t
"
/* mm7 = 0: supplies the high byte of each widened pixel */
".p2align 4
\n\t
"
"1:
\n\t
"
"movq (%0), %%mm0
\n\t
"
"movq (%0, %2), %%mm2
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpckhbw %%mm7, %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm2
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"movq %%mm0, (%1, %%"
REG_a
")
\n\t
"
"movq %%mm1, 8(%1, %%"
REG_a
")
\n\t
"
"movq %%mm2, 16(%1, %%"
REG_a
")
\n\t
"
"movq %%mm3, 24(%1, %%"
REG_a
")
\n\t
"
"add %3, %0
\n\t
"
"add $32, %%"
REG_a
"
\n\t
"
"js 1b
\n\t
"
:
"+r"
(
pixels
)
:
"r"
(
block
+
64
),
"r"
((
x86_reg
)
line_size
),
"r"
((
x86_reg
)
line_size
*
2
)
:
"%"
REG_a
);
}
/* Inline-asm SSE2 routine, fully unrolled: loads eight rows of 8 bytes from
 * pixels, zero-extends them to 16-bit words (punpcklbw against the zeroed
 * xmm4) and stores them with movdqa to block + 0, 16, ..., 112.
 * Operand %2 is line_size and %3 is line_size * 3, so rows 0-3 are addressed
 * from one base; the lea then moves pixels forward by four rows for rows 4-7.
 * movdqa is the aligned 16-byte store, so block has to be 16-byte aligned. */
static
void
get_pixels_sse2
(
DCTELEM
*
block
,
const
uint8_t
*
pixels
,
int
line_size
)
{
__asm__
volatile
(
"pxor %%xmm4, %%xmm4
\n\t
"
"movq (%0), %%xmm0
\n\t
"
"movq (%0, %2), %%xmm1
\n\t
"
"movq (%0, %2,2), %%xmm2
\n\t
"
"movq (%0, %3), %%xmm3
\n\t
"
"lea (%0,%2,4), %0
\n\t
"
/* pixels now points at row 4 */
"punpcklbw %%xmm4, %%xmm0
\n\t
"
"punpcklbw %%xmm4, %%xmm1
\n\t
"
"punpcklbw %%xmm4, %%xmm2
\n\t
"
"punpcklbw %%xmm4, %%xmm3
\n\t
"
"movdqa %%xmm0, (%1)
\n\t
"
"movdqa %%xmm1, 16(%1)
\n\t
"
"movdqa %%xmm2, 32(%1)
\n\t
"
"movdqa %%xmm3, 48(%1)
\n\t
"
"movq (%0), %%xmm0
\n\t
"
"movq (%0, %2), %%xmm1
\n\t
"
"movq (%0, %2,2), %%xmm2
\n\t
"
"movq (%0, %3), %%xmm3
\n\t
"
"punpcklbw %%xmm4, %%xmm0
\n\t
"
"punpcklbw %%xmm4, %%xmm1
\n\t
"
"punpcklbw %%xmm4, %%xmm2
\n\t
"
"punpcklbw %%xmm4, %%xmm3
\n\t
"
"movdqa %%xmm0, 64(%1)
\n\t
"
"movdqa %%xmm1, 80(%1)
\n\t
"
"movdqa %%xmm2, 96(%1)
\n\t
"
"movdqa %%xmm3, 112(%1)
\n\t
"
:
"+r"
(
pixels
)
:
"r"
(
block
),
"r"
((
x86_reg
)
line_size
),
"r"
((
x86_reg
)
line_size
*
3
)
);
}
/* Inline-asm MMX routine: for each row, widens 8 bytes of s1 and 8 bytes of
 * s2 to 16-bit words (unpack against the zeroed mm7), subtracts s2 from s1
 * with psubw and stores the 8 word differences to block.
 * One row (16 output bytes) is produced per loop pass; s1 and s2 both advance
 * by stride. REG_a is the store offset relative to block + 64 and runs from
 * -128 in steps of 16 until it reaches zero, i.e. eight rows / 128 bytes. */
static
inline
void
diff_pixels_mmx
(
DCTELEM
*
block
,
const
uint8_t
*
s1
,
const
uint8_t
*
s2
,
int
stride
)
{
__asm__
volatile
(
"pxor %%mm7, %%mm7
\n\t
"
"mov $-128, %%"
REG_a
"
\n\t
"
".p2align 4
\n\t
"
"1:
\n\t
"
"movq (%0), %%mm0
\n\t
"
"movq (%1), %%mm2
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpckhbw %%mm7, %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm2
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"psubw %%mm2, %%mm0
\n\t
"
"psubw %%mm3, %%mm1
\n\t
"
"movq %%mm0, (%2, %%"
REG_a
")
\n\t
"
"movq %%mm1, 8(%2, %%"
REG_a
")
\n\t
"
"add %3, %0
\n\t
"
"add %3, %1
\n\t
"
"add $16, %%"
REG_a
"
\n\t
"
/* the jnz below tests the flags of this last add */
"jnz 1b
\n\t
"
:
"+r"
(
s1
),
"+r"
(
s2
)
:
"r"
(
block
+
64
),
"r"
((
x86_reg
)
stride
)
:
"%"
REG_a
);
}
/* Inline-asm MMX routine returning the sum of the bytes in h = 16 rows of
 * 16 pixels starting at pix.
 * index starts at -line_size * h and the base operand is pix - index, so
 * (base + index) walks the rows; line_size is added to index each pass and
 * the loop runs while index is negative.
 * Each row is widened to words and accumulated in the four 16-bit lanes of
 * mm6; after the loop the lanes are folded into the lowest one and the
 * result is masked to 16 bits. */
static
int
pix_sum16_mmx
(
uint8_t
*
pix
,
int
line_size
){
const
int
h
=
16
;
int
sum
;
x86_reg
index
=
-
line_size
*
h
;
__asm__
volatile
(
"pxor %%mm7, %%mm7
\n\t
"
"pxor %%mm6, %%mm6
\n\t
"
/* mm7 = 0 for unpacking, mm6 = running lane sums */
"1:
\n\t
"
"movq (%2, %1), %%mm0
\n\t
"
"movq (%2, %1), %%mm1
\n\t
"
"movq 8(%2, %1), %%mm2
\n\t
"
"movq 8(%2, %1), %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpckhbw %%mm7, %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm2
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"paddw %%mm0, %%mm1
\n\t
"
"paddw %%mm2, %%mm3
\n\t
"
"paddw %%mm1, %%mm3
\n\t
"
"paddw %%mm3, %%mm6
\n\t
"
"add %3, %1
\n\t
"
" js 1b
\n\t
"
/* fold the four word lanes of mm6 into the lowest lane */
"movq %%mm6, %%mm5
\n\t
"
"psrlq $32, %%mm6
\n\t
"
"paddw %%mm5, %%mm6
\n\t
"
"movq %%mm6, %%mm5
\n\t
"
"psrlq $16, %%mm6
\n\t
"
"paddw %%mm5, %%mm6
\n\t
"
"movd %%mm6, %0
\n\t
"
"andl $0xFFFF, %0
\n\t
"
:
"=&r"
(
sum
),
"+r"
(
index
)
:
"r"
(
pix
-
index
),
"r"
((
x86_reg
)
line_size
)
);
return
sum
;
}
/* Inline-asm MMX routine returning the sum of the squared bytes in 16 rows
 * of 16 pixels starting at pix (ecx counts the rows down from 16; pix
 * advances by line_size per row).
 * Each row is widened to words against the zeroed mm0, squared and
 * pair-summed with pmaddwd, and accumulated in the two 32-bit lanes of mm7,
 * which are added together after the loop. */
static
int
pix_norm1_mmx
(
uint8_t
*
pix
,
int
line_size
)
{
int
tmp
;
__asm__
volatile
(
"movl $16,%%ecx
\n
"
"pxor %%mm0,%%mm0
\n
"
"pxor %%mm7,%%mm7
\n
"
"1:
\n
"
"movq (%0),%%mm2
\n
"
/* mm2 = pix[0-7] */
"movq 8(%0),%%mm3
\n
"
/* mm3 = pix[8-15] */
"movq %%mm2,%%mm1
\n
"
/* mm1 = mm2 = pix[0-7] */
"punpckhbw %%mm0,%%mm1
\n
"
/* mm1 = [pix4-7] */
"punpcklbw %%mm0,%%mm2
\n
"
/* mm2 = [pix0-3] */
"movq %%mm3,%%mm4
\n
"
/* mm4 = mm3 = pix[8-15] */
"punpckhbw %%mm0,%%mm3
\n
"
/* mm3 = [pix12-15] */
"punpcklbw %%mm0,%%mm4
\n
"
/* mm4 = [pix8-11] */
"pmaddwd %%mm1,%%mm1
\n
"
/* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
"pmaddwd %%mm2,%%mm2
\n
"
/* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
"pmaddwd %%mm3,%%mm3
\n
"
"pmaddwd %%mm4,%%mm4
\n
"
"paddd %%mm1,%%mm2
\n
"
/* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
pix2^2+pix3^2+pix6^2+pix7^2) */
"paddd %%mm3,%%mm4
\n
"
"paddd %%mm2,%%mm7
\n
"
"add %2, %0
\n
"
"paddd %%mm4,%%mm7
\n
"
"dec %%ecx
\n
"
"jnz 1b
\n
"
"movq %%mm7,%%mm1
\n
"
"psrlq $32, %%mm7
\n
"
/* shift hi dword to lo */
"paddd %%mm7,%%mm1
\n
"
"movd %%mm1,%1
\n
"
:
"+r"
(
pix
),
"=r"
(
tmp
)
:
"r"
((
x86_reg
)
line_size
)
:
"%ecx"
);
return
tmp
;
}
static
int
sse8_mmx
(
void
*
v
,
uint8_t
*
pix1
,
uint8_t
*
pix2
,
int
line_size
,
int
h
)
{
int
tmp
;
__asm__
volatile
(
...
...
@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
void
ff_dsputilenc_init_mmx
(
DSPContext
*
c
,
AVCodecContext
*
avctx
)
{
int
mm_flags
=
av_get_cpu_flags
();
#if HAVE_INLINE_ASM
int
bit_depth
=
avctx
->
bits_per_raw_sample
;
#if HAVE_YASM
if
(
EXTERNAL_MMX
(
mm_flags
))
{
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
ff_get_pixels_mmx
;
c
->
diff_pixels
=
ff_diff_pixels_mmx
;
c
->
pix_sum
=
ff_pix_sum16_mmx
;
c
->
pix_norm1
=
ff_pix_norm1_mmx
;
}
if
(
EXTERNAL_SSE2
(
mm_flags
))
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
ff_get_pixels_sse2
;
#endif
/* HAVE_YASM */
#if HAVE_INLINE_ASM
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
const
int
dct_algo
=
avctx
->
dct_algo
;
if
(
avctx
->
bits_per_raw_sample
<=
8
&&
...
...
@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
get_pixels_mmx
;
c
->
diff_pixels
=
diff_pixels_mmx
;
c
->
pix_sum
=
pix_sum16_mmx
;
c
->
diff_bytes
=
diff_bytes_mmx
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_mmx
;
c
->
pix_norm1
=
pix_norm1_mmx
;
c
->
sse
[
0
]
=
sse16_mmx
;
c
->
sse
[
1
]
=
sse8_mmx
;
c
->
vsad
[
4
]
=
vsad_intra16_mmx
;
...
...
@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
){
if
(
bit_depth
<=
8
)
c
->
get_pixels
=
get_pixels_sse2
;
c
->
sum_abs_dctelem
=
sum_abs_dctelem_sse2
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment