Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
73d33554
Commit
73d33554
authored
Oct 24, 2001
by
Michael Niedermayer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
more speed
Originally committed as revision 2438 to
svn://svn.mplayerhq.hu/mplayer/trunk/postproc
parent
2d83f323
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
112 additions
and
20 deletions
+112
-20
postprocess.c
postproc/postprocess.c
+56
-10
postprocess_template.c
postproc/postprocess_template.c
+56
-10
No files found.
postproc/postprocess.c
View file @
73d33554
...
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/
after watching a black picture for 5 hours*/
static
uint64_t
*
yHistogram
=
NULL
;
static
uint64_t
*
yHistogram
=
NULL
;
int
black
=
0
,
white
=
255
;
// blackest black and whitest white in the picture
int
black
=
0
,
white
=
255
;
// blackest black and whitest white in the picture
int
QPCorrecture
=
256
;
/* Temporary buffers for handling the last row(s) */
/* Temporary buffers for handling the last row(s) */
static
uint8_t
*
tempDst
=
NULL
;
static
uint8_t
*
tempDst
=
NULL
;
...
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset
=
0
;
packedYOffset
=
0
;
}
}
if
(
mode
&
LEVEL_FIX
)
QPCorrecture
=
packedYScale
&
0xFFFF
;
else
QPCorrecture
=
256
;
/* copy first row of 8x8 blocks */
/* copy first row of 8x8 blocks */
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
blockCopy
(
dst
+
x
,
dstStride
,
src
+
x
,
srcStride
,
8
,
mode
&
LEVEL_FIX
);
blockCopy
(
dst
+
x
,
dstStride
,
src
+
x
,
srcStride
,
8
,
mode
&
LEVEL_FIX
);
...
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop
//1% speedup if these are here instead of the inner loop
uint8_t
*
srcBlock
=
&
(
src
[
y
*
srcStride
]);
uint8_t
*
srcBlock
=
&
(
src
[
y
*
srcStride
]);
uint8_t
*
dstBlock
=
&
(
dst
[
y
*
dstStride
]);
uint8_t
*
dstBlock
=
&
(
dst
[
y
*
dstStride
]);
#ifdef ARCH_X86
int
*
QPptr
=
isColor
?
&
QPs
[(
y
>>
3
)
*
QPStride
]
:&
QPs
[(
y
>>
4
)
*
QPStride
];
int
QPDelta
=
isColor
?
1
<<
(
32
-
3
)
:
1
<<
(
32
-
4
);
int
QPFrac
=
QPDelta
;
#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */
than use a temporary buffer */
if
(
y
+
15
>=
height
)
if
(
y
+
15
>=
height
)
...
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
{
{
const
int
stride
=
dstStride
;
const
int
stride
=
dstStride
;
int
QP
;
#ifdef ARCH_X86
if
(
isColor
)
int
QP
=
*
QPptr
;
{
asm
volatile
(
QP
=
QPs
[(
y
>>
3
)
*
QPStride
+
(
x
>>
3
)];
"addl %2, %1
\n\t
"
}
"sbbl %%eax, %%eax
\n\t
"
else
"shll $2, %%eax
\n\t
"
"subl %%eax, %0
\n\t
"
:
"+r"
(
QPptr
),
"+m"
(
QPFrac
)
:
"r"
(
QPDelta
)
:
"%eax"
);
#else
int
QP
=
isColor
?
QPs
[(
y
>>
3
)
*
QPStride
+
(
x
>>
3
)]
:
QPs
[(
y
>>
4
)
*
QPStride
+
(
x
>>
4
)];
#endif
if
(
!
isColor
)
{
{
QP
=
QPs
[(
y
>>
4
)
*
QPStride
+
(
x
>>
4
)];
QP
=
(
QP
*
QPCorrecture
)
>>
8
;
if
(
mode
&
LEVEL_FIX
)
QP
=
(
QP
*
(
packedYScale
&
0xFFFF
))
>>
8
;
yHistogram
[
srcBlock
[
srcStride
*
4
+
4
]
]
++
;
yHistogram
[
srcBlock
[
srcStride
*
5
]
]
++
;
}
}
#ifdef HAVE_MMX
#ifdef HAVE_MMX
asm
volatile
(
asm
volatile
(
...
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif
#endif
#ifdef HAVE_MMX2
#ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm
(
"movl %4, %%eax
\n\t
"
"shrl $2, %%eax
\n\t
"
"andl $6, %%eax
\n\t
"
"addl $5, %%eax
\n\t
"
"movl %%eax, %%ebx
\n\t
"
"imul %1, %%eax
\n\t
"
"imul %3, %%ebx
\n\t
"
"prefetchnta 32(%%eax, %0)
\n\t
"
"prefetcht0 32(%%ebx, %2)
\n\t
"
"addl %1, %%eax
\n\t
"
"addl %3, %%ebx
\n\t
"
"prefetchnta 32(%%eax, %0)
\n\t
"
"prefetcht0 32(%%ebx, %2)
\n\t
"
::
"r"
(
srcBlock
),
"r"
(
srcStride
),
"r"
(
dstBlock
),
"r"
(
dstStride
),
"m"
(
x
)
:
"%eax"
,
"%ebx"
);
#elif defined(HAVE_3DNOW)
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
...
...
postproc/postprocess_template.c
View file @
73d33554
...
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/
after watching a black picture for 5 hours*/
static
uint64_t
*
yHistogram
=
NULL
;
static
uint64_t
*
yHistogram
=
NULL
;
int
black
=
0
,
white
=
255
;
// blackest black and whitest white in the picture
int
black
=
0
,
white
=
255
;
// blackest black and whitest white in the picture
int
QPCorrecture
=
256
;
/* Temporary buffers for handling the last row(s) */
/* Temporary buffers for handling the last row(s) */
static
uint8_t
*
tempDst
=
NULL
;
static
uint8_t
*
tempDst
=
NULL
;
...
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset
=
0
;
packedYOffset
=
0
;
}
}
if
(
mode
&
LEVEL_FIX
)
QPCorrecture
=
packedYScale
&
0xFFFF
;
else
QPCorrecture
=
256
;
/* copy first row of 8x8 blocks */
/* copy first row of 8x8 blocks */
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
blockCopy
(
dst
+
x
,
dstStride
,
src
+
x
,
srcStride
,
8
,
mode
&
LEVEL_FIX
);
blockCopy
(
dst
+
x
,
dstStride
,
src
+
x
,
srcStride
,
8
,
mode
&
LEVEL_FIX
);
...
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop
//1% speedup if these are here instead of the inner loop
uint8_t
*
srcBlock
=
&
(
src
[
y
*
srcStride
]);
uint8_t
*
srcBlock
=
&
(
src
[
y
*
srcStride
]);
uint8_t
*
dstBlock
=
&
(
dst
[
y
*
dstStride
]);
uint8_t
*
dstBlock
=
&
(
dst
[
y
*
dstStride
]);
#ifdef ARCH_X86
int
*
QPptr
=
isColor
?
&
QPs
[(
y
>>
3
)
*
QPStride
]
:&
QPs
[(
y
>>
4
)
*
QPStride
];
int
QPDelta
=
isColor
?
1
<<
(
32
-
3
)
:
1
<<
(
32
-
4
);
int
QPFrac
=
QPDelta
;
#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */
than use a temporary buffer */
if
(
y
+
15
>=
height
)
if
(
y
+
15
>=
height
)
...
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
for
(
x
=
0
;
x
<
width
;
x
+=
BLOCK_SIZE
)
{
{
const
int
stride
=
dstStride
;
const
int
stride
=
dstStride
;
int
QP
;
#ifdef ARCH_X86
if
(
isColor
)
int
QP
=
*
QPptr
;
{
asm
volatile
(
QP
=
QPs
[(
y
>>
3
)
*
QPStride
+
(
x
>>
3
)];
"addl %2, %1
\n\t
"
}
"sbbl %%eax, %%eax
\n\t
"
else
"shll $2, %%eax
\n\t
"
"subl %%eax, %0
\n\t
"
:
"+r"
(
QPptr
),
"+m"
(
QPFrac
)
:
"r"
(
QPDelta
)
:
"%eax"
);
#else
int
QP
=
isColor
?
QPs
[(
y
>>
3
)
*
QPStride
+
(
x
>>
3
)]
:
QPs
[(
y
>>
4
)
*
QPStride
+
(
x
>>
4
)];
#endif
if
(
!
isColor
)
{
{
QP
=
QPs
[(
y
>>
4
)
*
QPStride
+
(
x
>>
4
)];
QP
=
(
QP
*
QPCorrecture
)
>>
8
;
if
(
mode
&
LEVEL_FIX
)
QP
=
(
QP
*
(
packedYScale
&
0xFFFF
))
>>
8
;
yHistogram
[
srcBlock
[
srcStride
*
4
+
4
]
]
++
;
yHistogram
[
srcBlock
[
srcStride
*
5
]
]
++
;
}
}
#ifdef HAVE_MMX
#ifdef HAVE_MMX
asm
volatile
(
asm
volatile
(
...
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
...
@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif
#endif
#ifdef HAVE_MMX2
#ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm
(
"movl %4, %%eax
\n\t
"
"shrl $2, %%eax
\n\t
"
"andl $6, %%eax
\n\t
"
"addl $5, %%eax
\n\t
"
"movl %%eax, %%ebx
\n\t
"
"imul %1, %%eax
\n\t
"
"imul %3, %%ebx
\n\t
"
"prefetchnta 32(%%eax, %0)
\n\t
"
"prefetcht0 32(%%ebx, %2)
\n\t
"
"addl %1, %%eax
\n\t
"
"addl %3, %%ebx
\n\t
"
"prefetchnta 32(%%eax, %0)
\n\t
"
"prefetcht0 32(%%ebx, %2)
\n\t
"
::
"r"
(
srcBlock
),
"r"
(
srcStride
),
"r"
(
dstBlock
),
"r"
(
dstStride
),
"m"
(
x
)
:
"%eax"
,
"%ebx"
);
#elif defined(HAVE_3DNOW)
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment