Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
607dce96
Commit
607dce96
authored
May 17, 2002
by
Michael Niedermayer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
hopefully faster mmx2&3dnow MC
Originally committed as revision 506 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
59fe111e
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
362 additions
and
323 deletions
+362
-323
dsputil_mmx.c
libavcodec/i386/dsputil_mmx.c
+98
-89
dsputil_mmx_avg.h
libavcodec/i386/dsputil_mmx_avg.h
+264
-234
No files found.
libavcodec/i386/dsputil_mmx.c
View file @
607dce96
...
...
@@ -21,6 +21,7 @@
#include "../dsputil.h"
#include "../simple_idct.h"
#include "../mangle.h"
int
mm_flags
;
/* multimedia extension flags */
...
...
@@ -49,6 +50,7 @@ void ff_mmx_idct(DCTELEM *block);
void
ff_mmxext_idct
(
DCTELEM
*
block
);
/* pixel operations */
static
const
unsigned
long
long
int
mm_bone
__attribute__
((
aligned
(
8
)))
=
0x0101010101010101LL
;
static
const
unsigned
long
long
int
mm_wone
__attribute__
((
aligned
(
8
)))
=
0x0001000100010001LL
;
static
const
unsigned
long
long
int
mm_wtwo
__attribute__
((
aligned
(
8
)))
=
0x0002000200020002LL
;
//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
...
...
@@ -90,7 +92,7 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000
/***********************************/
/* MMX2 specific */
#define DEF(x) x ## _
sse
#define DEF(x) x ## _
mmx2
/* Introduced only in MMX2 set */
#define PAVGB "pavgb"
...
...
@@ -105,41 +107,38 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000
static
void
get_pixels_mmx
(
DCTELEM
*
block
,
const
UINT8
*
pixels
,
int
line_size
)
{
DCTELEM
*
p
;
const
UINT8
*
pix
;
int
i
;
/* read the pixels */
p
=
block
;
pix
=
pixels
;
MOVQ_ZERO
(
mm7
);
for
(
i
=
0
;
i
<
4
;
i
++
)
{
__asm
__volatile
(
"movq %1, %%mm0
\n\t
"
"movq %2, %%mm1
\n\t
"
"movq %%mm0, %%mm2
\n\t
"
"movq %%mm1, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpckhbw %%mm7, %%mm2
\n\t
"
"punpcklbw %%mm7, %%mm1
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"movq %%mm0, %0
\n\t
"
"movq %%mm2, 8%0
\n\t
"
"movq %%mm1, 16%0
\n\t
"
"movq %%mm3, 24%0
\n\t
"
:
"=m"
(
*
p
)
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
))
:
"memory"
);
pix
+=
line_size
*
2
;
p
+=
16
;
}
asm
volatile
(
"movl $-128, %%eax
\n\t
"
"pxor %%mm7, %%mm7
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%0), %%mm0
\n\t
"
"movq (%0, %2), %%mm2
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpckhbw %%mm7, %%mm1
\n\t
"
"punpcklbw %%mm7, %%mm2
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"movq %%mm0, (%1, %%eax)
\n\t
"
"movq %%mm1, 8(%1, %%eax)
\n\t
"
"movq %%mm2, 16(%1, %%eax)
\n\t
"
"movq %%mm3, 24(%1, %%eax)
\n\t
"
"addl %3, %0
\n\t
"
"addl $32, %%eax
\n\t
"
"js 1b
\n\t
"
:
"+r"
(
pixels
)
:
"r"
(
block
+
64
),
"r"
(
line_size
),
"r"
(
line_size
*
2
)
:
"%eax"
);
}
static
void
diff_pixels_mmx
(
DCTELEM
*
block
,
const
UINT8
*
s1
,
const
UINT8
*
s2
,
int
stride
)
{
asm
volatile
(
"
.balign 16
\n\t
"
"
pxor %%mm7, %%mm7
\n\t
"
"movl $-128, %%eax
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%0), %%mm0
\n\t
"
"movq (%1), %%mm2
\n\t
"
...
...
@@ -261,56 +260,62 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
static
void
put_pixels_mmx
(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
// 2s
#if 0
do {
__asm __volatile(
"movq %1, %%mm0\n\t"
"movq %%mm0, %0\n\t"
:"=m"(*p)
:"m"(*pix)
:"memory");
pix += line_size;
p += line_size;
} while (--h);
#if 0 //FIXME h==4 case
asm volatile(
"xorl %%eax, %%eax \n\t"
"movl %3, %%esi \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq %%mm0, (%0, %%eax) \n\t"
"addl %2, %%eax \n\t"
"subl $8, %%esi \n\t"
" jnz 1b \n\t"
:: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
: "%eax", "%esi", "memory"
);
#else
// this optimized code is not very usefull
// the above loop is definitely faster
// at least on Celeron 500MHz
hh
=
h
&
3
;
while
(
hh
)
{
__asm
__volatile
(
"movq %1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"=m"
(
*
p
)
:
"m"
(
*
pix
)
:
"memory"
);
pix
+=
line_size
;
p
+=
line_size
;
hh
--
;
}
hh
=
h
>>
2
;
while
(
hh
)
{
__asm
__volatile
(
"movq (%1), %%mm0
\n\t
"
"movq (%1, %2), %%mm1
\n\t
"
"movq (%1, %2, 2), %%mm2
\n\t
"
"movq (%1, %3), %%mm3
\n\t
"
"movq %%mm0, (%0)
\n\t
"
"movq %%mm1, (%0, %2)
\n\t
"
"movq %%mm2, (%0, %2, 2)
\n\t
"
"movq %%mm3, (%0, %3)
\n\t
"
::
"r"
(
p
),
"r"
(
pix
),
"r"
(
line_size
),
"r"
(
line_size
*
3
)
:
"memory"
);
pix
+=
line_size
*
4
;
p
+=
line_size
*
4
;
hh
--
;
}
asm
volatile
(
"xorl %%eax, %%eax
\n\t
"
"movl %3, %%esi
\n\t
"
"1:
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq %%mm0, (%0, %%eax)
\n\t
"
"addl %2, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq %%mm0, (%0, %%eax)
\n\t
"
"addl %2, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq %%mm0, (%0, %%eax)
\n\t
"
"addl %2, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq %%mm0, (%0, %%eax)
\n\t
"
"addl %2, %%eax
\n\t
"
"subl $4, %%esi
\n\t
"
" jnz 1b
\n\t
"
::
"r"
(
block
),
"r"
(
pixels
),
"r"
(
line_size
),
"m"
(
h
)
:
"%eax"
,
"%esi"
,
"memory"
);
#endif
}
...
...
@@ -1124,7 +1129,7 @@ void dsputil_init_mmx(void)
avg_no_rnd_pixels_tab
[
1
]
=
avg_no_rnd_pixels_x2_mmx
;
avg_no_rnd_pixels_tab
[
2
]
=
avg_no_rnd_pixels_y2_mmx
;
avg_no_rnd_pixels_tab
[
3
]
=
avg_no_rnd_pixels_xy2_mmx
;
sub_pixels_tab
[
0
]
=
sub_pixels_mmx
;
sub_pixels_tab
[
1
]
=
sub_pixels_x2_mmx
;
sub_pixels_tab
[
2
]
=
sub_pixels_y2_mmx
;
...
...
@@ -1140,20 +1145,24 @@ void dsputil_init_mmx(void)
pix_abs8x8_x2
=
pix_abs8x8_x2_mmx2
;
pix_abs8x8_y2
=
pix_abs8x8_y2_mmx2
;
pix_abs8x8_xy2
=
pix_abs8x8_xy2_mmx2
;
put_pixels_tab
[
1
]
=
put_pixels_x2_mmx2
;
put_pixels_tab
[
2
]
=
put_pixels_y2_mmx2
;
put_no_rnd_pixels_tab
[
1
]
=
put_no_rnd_pixels_x2_mmx2
;
put_no_rnd_pixels_tab
[
2
]
=
put_no_rnd_pixels_y2_mmx2
;
put_pixels_tab
[
1
]
=
put_pixels_x2_sse
;
put_pixels_tab
[
2
]
=
put_pixels_y2_sse
;
avg_pixels_tab
[
0
]
=
avg_pixels_sse
;
avg_pixels_tab
[
1
]
=
avg_pixels_x2_sse
;
avg_pixels_tab
[
2
]
=
avg_pixels_y2_sse
;
avg_pixels_tab
[
3
]
=
avg_pixels_xy2_sse
;
avg_pixels_tab
[
0
]
=
avg_pixels_mmx2
;
avg_pixels_tab
[
1
]
=
avg_pixels_x2_mmx2
;
avg_pixels_tab
[
2
]
=
avg_pixels_y2_mmx2
;
avg_pixels_tab
[
3
]
=
avg_pixels_xy2_mmx2
;
sub_pixels_tab
[
1
]
=
sub_pixels_x2_
sse
;
sub_pixels_tab
[
2
]
=
sub_pixels_y2_
sse
;
sub_pixels_tab
[
1
]
=
sub_pixels_x2_
mmx2
;
sub_pixels_tab
[
2
]
=
sub_pixels_y2_
mmx2
;
}
else
if
(
mm_flags
&
MM_3DNOW
)
{
put_pixels_tab
[
1
]
=
put_pixels_x2_3dnow
;
put_pixels_tab
[
2
]
=
put_pixels_y2_3dnow
;
put_no_rnd_pixels_tab
[
1
]
=
put_no_rnd_pixels_x2_3dnow
;
put_no_rnd_pixels_tab
[
2
]
=
put_no_rnd_pixels_y2_3dnow
;
avg_pixels_tab
[
0
]
=
avg_pixels_3dnow
;
avg_pixels_tab
[
1
]
=
avg_pixels_x2_3dnow
;
...
...
libavcodec/i386/dsputil_mmx_avg.h
View file @
607dce96
/*
* DSP utils : average functions are compiled twice for 3dnow/mmx2
* Copyright (c) 2000, 2001 Gerard Lantau.
* Copyright (c) 2002 Michael Niedermayer
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
...
...
@@ -17,271 +18,300 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
*/
static
void
DEF
(
put_pixels_x2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
dh
,
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
hh
=
h
>>
2
;
dh
=
h
&
3
;
while
(
hh
--
)
{
__asm
__volatile
(
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
"movq (%1, %2), %%mm2
\n\t
"
"movq 1(%1, %2), %%mm3
\n\t
"
"movq (%1, %2, 2), %%mm4
\n\t
"
"movq 1(%1, %2, 2), %%mm5
\n\t
"
"movq (%1, %3), %%mm6
\n\t
"
"movq 1(%1, %3), %%mm7
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm5, %%mm4
\n\t
"
PAVGB
" %%mm7, %%mm6
\n\t
"
"movq %%mm0, (%0)
\n\t
"
"movq %%mm2, (%0, %2)
\n\t
"
"movq %%mm4, (%0, %2, 2)
\n\t
"
"movq %%mm6, (%0, %3)
\n\t
"
::
"r"
(
p
),
"r"
(
pix
),
"r"
(
line_size
),
"r"
(
line_size
*
3
)
:
"memory"
);
pix
+=
line_size
*
4
;
p
+=
line_size
*
4
;
}
while
(
dh
--
)
{
"xorl %%eax, %%eax
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"r"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
put_no_rnd_pixels_x2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
__asm
__volatile
(
"movq %1, %%mm0
\n\t
"
"movq 1%1, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"=m"
(
*
p
)
:
"m"
(
*
pix
)
:
"memory"
);
pix
+=
line_size
;
p
+=
line_size
;
}
"xorl %%eax, %%eax
\n\t
"
"movq "
MANGLE
(
mm_bone
)
", %%mm7
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
"psubusb %%mm7, %%mm0
\n\t
"
"psubusb %%mm7, %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
"psubusb %%mm7, %%mm0
\n\t
"
"psubusb %%mm7, %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"r"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
put_pixels_y2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
dh
,
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
hh
=
h
>>
1
;
dh
=
h
&
1
;
while
(
hh
--
)
{
__asm
__volatile
(
"movq %2, %%mm0
\n\t
"
"movq %3, %%mm1
\n\t
"
"movq %4, %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
"movq %%mm0, %0
\n\t
"
"movq %%mm1, %1
\n\t
"
:
"=m"
(
*
p
),
"=m"
(
*
(
p
+
line_size
))
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
)),
"m"
(
*
(
pix
+
line_size
*
2
))
:
"memory"
);
pix
+=
line_size
*
2
;
p
+=
line_size
*
2
;
}
if
(
dh
)
{
"xorl %%eax, %%eax
\n\t
"
"movq (%1), %%mm0
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
"movq %%mm0, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm0
\n\t
"
PAVGB
" %%mm1, %%mm2
\n\t
"
PAVGB
" %%mm0, %%mm1
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
pixels
+
line_size
*
2
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"g"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
put_no_rnd_pixels_y2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
__asm
__volatile
(
"movq %1, %%mm0
\n\t
"
"movq %2, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"=m"
(
*
p
)
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
))
:
"memory"
);
}
"movq "
MANGLE
(
mm_bone
)
", %%mm7
\n\t
"
"xorl %%eax, %%eax
\n\t
"
"movq (%1), %%mm0
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm2
\n\t
"
"psubusb %%mm7, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
"movq %%mm0, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm0
\n\t
"
"psubusb %%mm7, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm2
\n\t
"
PAVGB
" %%mm0, %%mm1
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
pixels
+
line_size
*
2
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"g"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
avg_pixels
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
dh
,
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
hh
=
h
>>
2
;
dh
=
h
&
3
;
while
(
hh
--
)
{
__asm
__volatile
(
"movq (%0), %%mm0
\n\t
"
"movq (%1), %%mm1
\n\t
"
"movq (%0, %2), %%mm2
\n\t
"
"movq (%1, %2), %%mm3
\n\t
"
"movq (%0, %2, 2), %%mm4
\n\t
"
"movq (%1, %2, 2), %%mm5
\n\t
"
"movq (%0, %3), %%mm6
\n\t
"
"movq (%1, %3), %%mm7
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm5, %%mm4
\n\t
"
PAVGB
" %%mm7, %%mm6
\n\t
"
"movq %%mm0, (%0)
\n\t
"
"movq %%mm2, (%0, %2)
\n\t
"
"movq %%mm4, (%0, %2, 2)
\n\t
"
"movq %%mm6, (%0, %3)
\n\t
"
::
"r"
(
p
),
"r"
(
pix
),
"r"
(
line_size
),
"r"
(
line_size
*
3
)
:
"memory"
);
pix
+=
line_size
*
4
;
p
+=
line_size
*
4
;
}
while
(
dh
--
)
{
__asm
__volatile
(
"movq %0, %%mm0
\n\t
"
"movq %1, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"+m"
(
*
p
)
:
"m"
(
*
pix
)
:
"memory"
);
pix
+=
line_size
;
p
+=
line_size
;
}
"xorl %%eax, %%eax
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq (%3, %%eax), %%mm3
\n\t
"
"movq (%4, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq (%3, %%eax), %%mm3
\n\t
"
"movq (%4, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"r"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
avg_pixels_x2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
static
void
DEF
(
avg_pixels_x2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
dh
,
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
hh
=
h
>>
1
;
dh
=
h
&
1
;
while
(
hh
--
)
{
__asm
__volatile
(
"movq %2, %%mm2
\n\t
"
"movq 1%2, %%mm3
\n\t
"
"movq %3, %%mm4
\n\t
"
"movq 1%3, %%mm5
\n\t
"
"movq %0, %%mm0
\n\t
"
"movq %1, %%mm1
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm2, %%mm0
\n\t
"
PAVGB
" %%mm5, %%mm4
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm0, %0
\n\t
"
"movq %%mm1, %1
\n\t
"
:
"+m"
(
*
p
),
"+m"
(
*
(
p
+
line_size
))
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
))
:
"memory"
);
pix
+=
line_size
*
2
;
p
+=
line_size
*
2
;
}
if
(
dh
)
{
__asm
__volatile
(
"movq %1, %%mm1
\n\t
"
"movq 1%1, %%mm2
\n\t
"
"movq %0, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"+m"
(
*
p
)
:
"m"
(
*
pix
)
:
"memory"
);
}
"xorl %%eax, %%eax
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq (%3, %%eax), %%mm3
\n\t
"
"movq (%4, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"movq (%1, %%eax), %%mm0
\n\t
"
"movq 1(%1, %%eax), %%mm1
\n\t
"
"movq (%2, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
"movq (%3, %%eax), %%mm3
\n\t
"
"movq (%4, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm2
\n\t
"
"movq %%mm0, (%3, %%eax)
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"addl %5, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"r"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
avg_pixels_y2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
static
void
DEF
(
avg_pixels_y2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
int
dh
,
hh
;
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
hh
=
h
>>
1
;
dh
=
h
&
1
;
while
(
hh
--
)
{
__asm
__volatile
(
"movq %2, %%mm2
\n\t
"
"movq %3, %%mm3
\n\t
"
"movq %3, %%mm4
\n\t
"
"movq %4, %%mm5
\n\t
"
"movq %0, %%mm0
\n\t
"
"movq %1, %%mm1
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm2, %%mm0
\n\t
"
PAVGB
" %%mm5, %%mm4
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm0, %0
\n\t
"
"movq %%mm1, %1
\n\t
"
:
"+m"
(
*
p
),
"+m"
(
*
(
p
+
line_size
))
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
)),
"m"
(
*
(
pix
+
line_size
*
2
))
:
"memory"
);
pix
+=
line_size
*
2
;
p
+=
line_size
*
2
;
}
if
(
dh
)
{
__asm
__volatile
(
"movq %1, %%mm1
\n\t
"
"movq %2, %%mm2
\n\t
"
"movq %0, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"+m"
(
*
p
)
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
))
:
"memory"
);
}
"xorl %%eax, %%eax
\n\t
"
"movq (%1), %%mm0
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
"movq (%4, %%eax), %%mm3
\n\t
"
"movq (%5, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm0, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm0
\n\t
"
PAVGB
" %%mm1, %%mm2
\n\t
"
PAVGB
" %%mm0, %%mm1
\n\t
"
"movq (%4, %%eax), %%mm3
\n\t
"
"movq (%5, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
pixels
+
line_size
*
2
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"g"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
static
void
DEF
(
avg_pixels_xy2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
static
void
DEF
(
avg_pixels_xy2
)(
UINT8
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
UINT8
*
p
;
const
UINT8
*
pix
;
p
=
block
;
pix
=
pixels
;
__asm
__volatile
(
"pxor %%mm7, %%mm7
\n\t
"
"movq %0, %%mm6
\n\t
"
::
"m"
(
mm_wtwo
));
do
{
__asm
__volatile
(
"movq %1, %%mm0
\n\t
"
"movq %2, %%mm1
\n\t
"
"movq 1%1, %%mm4
\n\t
"
"movq 1%2, %%mm5
\n\t
"
"movq %%mm0, %%mm2
\n\t
"
"movq %%mm1, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm0
\n\t
"
"punpcklbw %%mm7, %%mm1
\n\t
"
"punpckhbw %%mm7, %%mm2
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"paddusw %%mm1, %%mm0
\n\t
"
"paddusw %%mm3, %%mm2
\n\t
"
"movq %%mm4, %%mm1
\n\t
"
"movq %%mm5, %%mm3
\n\t
"
"punpcklbw %%mm7, %%mm4
\n\t
"
"punpcklbw %%mm7, %%mm5
\n\t
"
"punpckhbw %%mm7, %%mm1
\n\t
"
"punpckhbw %%mm7, %%mm3
\n\t
"
"paddusw %%mm5, %%mm4
\n\t
"
"paddusw %%mm3, %%mm1
\n\t
"
"paddusw %%mm6, %%mm4
\n\t
"
"paddusw %%mm6, %%mm1
\n\t
"
"paddusw %%mm4, %%mm0
\n\t
"
"paddusw %%mm1, %%mm2
\n\t
"
"psrlw $2, %%mm0
\n\t
"
"psrlw $2, %%mm2
\n\t
"
"packuswb %%mm2, %%mm0
\n\t
"
PAVGB
" %0, %%mm0
\n\t
"
"movq %%mm0, %0
\n\t
"
:
"+m"
(
*
p
)
:
"m"
(
*
pix
),
"m"
(
*
(
pix
+
line_size
))
:
"memory"
);
pix
+=
line_size
;
p
+=
line_size
;
}
while
(
--
h
);
"movq "
MANGLE
(
mm_bone
)
", %%mm7
\n\t
"
"xorl %%eax, %%eax
\n\t
"
"movq (%1), %%mm0
\n\t
"
"movq 1(%1), %%mm1
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
".balign 16
\n\t
"
"1:
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm2
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
"movq 1(%3, %%eax), %%mm4
\n\t
"
"psubusb %%mm7, %%mm2
\n\t
"
PAVGB
" %%mm3, %%mm1
\n\t
"
PAVGB
" %%mm4, %%mm2
\n\t
"
PAVGB
" %%mm1, %%mm0
\n\t
"
PAVGB
" %%mm2, %%mm1
\n\t
"
"movq (%4, %%eax), %%mm3
\n\t
"
"movq (%5, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm0
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm0, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"movq (%2, %%eax), %%mm1
\n\t
"
"movq (%3, %%eax), %%mm0
\n\t
"
"movq 1(%2, %%eax), %%mm3
\n\t
"
"movq 1(%3, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm1
\n\t
"
PAVGB
" %%mm4, %%mm0
\n\t
"
PAVGB
" %%mm1, %%mm2
\n\t
"
PAVGB
" %%mm0, %%mm1
\n\t
"
"movq (%4, %%eax), %%mm3
\n\t
"
"movq (%5, %%eax), %%mm4
\n\t
"
PAVGB
" %%mm3, %%mm2
\n\t
"
PAVGB
" %%mm4, %%mm1
\n\t
"
"movq %%mm2, (%4, %%eax)
\n\t
"
"movq %%mm1, (%5, %%eax)
\n\t
"
"addl %6, %%eax
\n\t
"
"subl $4, %0
\n\t
"
" jnz 1b
\n\t
"
:
"+g"
(
h
)
:
"r"
(
pixels
),
"r"
(
pixels
+
line_size
),
"r"
(
pixels
+
line_size
*
2
),
"r"
(
block
),
"r"
(
block
+
line_size
),
"g"
(
line_size
<<
1
)
:
"%eax"
,
"memory"
);
}
//Note: the sub* functions are no used
static
void
DEF
(
sub_pixels_x2
)(
DCTELEM
*
block
,
const
UINT8
*
pixels
,
int
line_size
,
int
h
)
{
DCTELEM
*
p
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment