Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
9b15c0a9
Commit
9b15c0a9
authored
Jul 16, 2012
by
Diego Biurrun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: dsputilenc: port to cpuflags
parent
1f3f8965
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
55 additions
and
57 deletions
+55
-57
dsputilenc.asm
libavcodec/x86/dsputilenc.asm
+55
-57
No files found.
libavcodec/x86/dsputilenc.asm
View file @
9b15c0a9
...
...
@@ -99,35 +99,33 @@ SECTION .text
paddusw
m0
,
m1
%endmacro
; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro
HSUM
_MMX
3
mova
%2
,
%1
psrlq
%1
,
32
%macro
HSUM
3
%if
cpuflag
(
sse2
)
movhlps
%2
,
%1
paddusw
%1
,
%2
mova
%2
,
%1
psrlq
%1
,
16
pshuflw
%2
,
%1
,
0xE
paddusw
%1
,
%2
pshuflw
%2
,
%1
,
0x1
paddusw
%1
,
%2
movd
%3
,
%1
%endmacro
%macro
HSUM_MMXEXT
3
%elif
cpuflag
(
mmxext
)
pshufw
%2
,
%1
,
0xE
paddusw
%1
,
%2
pshufw
%2
,
%1
,
0x1
paddusw
%1
,
%2
movd
%3
,
%1
%endmacro
%macro
HSUM_SSE2
3
movhlps
%2
,
%1
paddusw
%1
,
%2
pshuflw
%2
,
%1
,
0xE
%elif
cpuflag
(
mmx
)
mova
%2
,
%1
psrlq
%1
,
32
paddusw
%1
,
%2
pshuflw
%2
,
%1
,
0x1
mova
%2
,
%1
psrlq
%1
,
16
paddusw
%1
,
%2
movd
%3
,
%1
%endif
%endmacro
%macro
STORE4
5
...
...
@@ -144,30 +142,30 @@ SECTION .text
mova
%5
,
[
%1
+
mmsize
*
3
]
%endmacro
%macro
hadamard8_16_wrapper
3
cglobal
hadamard8_diff
_
%1
,
4
,
4
,
%2
%macro
hadamard8_16_wrapper
2
cglobal
hadamard8_diff
,
4
,
4
,
%1
%ifndef
m8
%
assign
pad
%
3
*
mmsize
-
(
4
+
stack_offset
&
(
mmsize
-
1
))
%
assign
pad
%
2
*
mmsize
-
(
4
+
stack_offset
&
(
mmsize
-
1
))
SUB
rsp
,
pad
%endif
call
hadamard8x8_diff
_
%1
call
hadamard8x8_diff
%
+
SUFFIX
%ifndef
m8
ADD
rsp
,
pad
%endif
RET
cglobal
hadamard8_diff16
_
%1
,
5
,
6
,
%2
cglobal
hadamard8_diff16
,
5
,
6
,
%1
%ifndef
m8
%
assign
pad
%
3
*
mmsize
-
(
4
+
stack_offset
&
(
mmsize
-
1
))
%
assign
pad
%
2
*
mmsize
-
(
4
+
stack_offset
&
(
mmsize
-
1
))
SUB
rsp
,
pad
%endif
call
hadamard8x8_diff
_
%1
call
hadamard8x8_diff
%
+
SUFFIX
mov
r5d
,
eax
add
r1
,
8
add
r2
,
8
call
hadamard8x8_diff
_
%1
call
hadamard8x8_diff
%
+
SUFFIX
add
r5d
,
eax
cmp
r4d
,
16
...
...
@@ -175,12 +173,12 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
lea
r1
,
[
r1
+
r3
*
8
-
8
]
lea
r2
,
[
r2
+
r3
*
8
-
8
]
call
hadamard8x8_diff
_
%1
call
hadamard8x8_diff
%
+
SUFFIX
add
r5d
,
eax
add
r1
,
8
add
r2
,
8
call
hadamard8x8_diff
_
%1
call
hadamard8x8_diff
%
+
SUFFIX
add
r5d
,
eax
.
done
:
...
...
@@ -191,7 +189,25 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
RET
%endmacro
%macro
HADAMARD8_DIFF_MMX
1
%macro
HADAMARD8_DIFF
0
-
1
%if
cpuflag
(
sse2
)
hadamard8x8_diff
%
+
SUFFIX
:
lea
r0
,
[
r3
*
3
]
DIFF_PIXELS_8
r1
,
r2
,
0
,
r3
,
r0
,
rsp
+
gprsize
HADAMARD8
%if
ARCH_X86_64
TRANSPOSE8x8W
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
%else
TRANSPOSE8x8W
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
[
rsp
+
gprsize
]
,
[
rsp
+
mmsize
+
gprsize
]
%endif
HADAMARD8
ABS_SUM_8x8
rsp
+
gprsize
HSUM
m0
,
m1
,
eax
and
eax
,
0xFFFF
ret
hadamard8_16_wrapper
%1
,
3
%elif
cpuflag
(
mmx
)
ALIGN
16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
; int stride, int h)
...
...
@@ -199,7 +215,7 @@ ALIGN 16
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is rsp of the calling function)
hadamard8x8_diff
_
%1
:
hadamard8x8_diff
%
+
SUFFIX
:
lea
r0
,
[
r3
*
3
]
; first 4x8 pixels
...
...
@@ -236,53 +252,35 @@ hadamard8x8_diff_%1:
and
rax
,
0xFFFF
ret
hadamard8_16_wrapper
%1
,
0
,
14
%endmacro
%macro
HADAMARD8_DIFF_SSE2
2
hadamard8x8_diff_
%1
:
lea
r0
,
[
r3
*
3
]
DIFF_PIXELS_8
r1
,
r2
,
0
,
r3
,
r0
,
rsp
+
gprsize
HADAMARD8
%if
ARCH_X86_64
TRANSPOSE8x8W
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
%else
TRANSPOSE8x8W
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
[
rsp
+
gprsize
]
,
[
rsp
+
mmsize
+
gprsize
]
hadamard8_16_wrapper
0
,
14
%endif
HADAMARD8
ABS_SUM_8x8
rsp
+
gprsize
HSUM_SSE2
m0
,
m1
,
eax
and
eax
,
0xFFFF
ret
hadamard8_16_wrapper
%1
,
%2
,
3
%endmacro
INIT_MMX
INIT_MMX
mmx
%define
ABS1
ABS1_MMX
%define
HSUM
HSUM_MMX
HADAMARD8_DIFF_MMX
mmx
HADAMARD8_DIFF
INIT_MMX
mmxext
%define
ABS1
ABS1_MMXEXT
%define
HSUM
HSUM_MMXEXT
HADAMARD8_DIFF_MMX
mmxext
HADAMARD8_DIFF
INIT_XMM
INIT_XMM
sse2
%define
ABS2
ABS2_MMXEXT
%if
ARCH_X86_64
%define
ABS_SUM_8x8
ABS_SUM_8x8_64
%else
%define
ABS_SUM_8x8
ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF
_SSE2
sse2
,
10
HADAMARD8_DIFF
10
INIT_XMM
ssse3
%define
ABS2
ABS2_SSSE3
%define
ABS_SUM_8x8
ABS_SUM_8x8_64
HADAMARD8_DIFF
_SSE2
ssse3
,
9
HADAMARD8_DIFF
9
INIT_XMM
INIT_XMM
sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal
sse16
_sse2
,
5
,
5
,
8
cglobal
sse16
,
5
,
5
,
8
shr
r4d
,
1
pxor
m0
,
m0
; mm0 = 0
pxor
m7
,
m7
; mm7 holds the sum
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment