Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
8e89f6fd
Commit
8e89f6fd
authored
May 11, 2017
by
James Darnley
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avcodec/x86: move simple_idct to external assembly
parent
87bddba4
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
892 additions
and
936 deletions
+892
-936
dct.c
libavcodec/tests/x86/dct.c
+1
-1
Makefile
libavcodec/x86/Makefile
+2
-2
idctdsp_init.c
libavcodec/x86/idctdsp_init.c
+0
-4
simple_idct.asm
libavcodec/x86/simple_idct.asm
+889
-0
simple_idct.c
libavcodec/x86/simple_idct.c
+0
-929
No files found.
libavcodec/tests/x86/dct.c
View file @
8e89f6fd
...
...
@@ -67,7 +67,7 @@ static const struct algo fdct_tab_arch[] = {
};
static
const
struct
algo
idct_tab_arch
[]
=
{
#if HAVE_MMX_
INLINE
#if HAVE_MMX_
EXTERNAL
{
"SIMPLE-MMX"
,
ff_simple_idct_mmx
,
FF_IDCT_PERM_SIMPLE
,
AV_CPU_FLAG_MMX
},
#endif
#if CONFIG_MPEG4_DECODER && HAVE_YASM
...
...
libavcodec/x86/Makefile
View file @
8e89f6fd
...
...
@@ -79,7 +79,6 @@ OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
# GCC inline assembly optimizations
# subsystems
MMX-OBJS-$(CONFIG_FDCTDSP)
+=
x86/fdct.o
MMX-OBJS-$(CONFIG_IDCTDSP)
+=
x86/simple_idct.o
MMX-OBJS-$(CONFIG_VC1DSP)
+=
x86/vc1dsp_mmx.o
# decoders/encoders
...
...
@@ -128,7 +127,8 @@ YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
YASM-OBJS-$(CONFIG_RV34DSP)
+=
x86/rv34dsp.o
YASM-OBJS-$(CONFIG_VC1DSP)
+=
x86/vc1dsp_loopfilter.o
\
x86/vc1dsp_mc.o
YASM-OBJS-$(CONFIG_IDCTDSP)
+=
x86/simple_idct10.o
YASM-OBJS-$(CONFIG_IDCTDSP)
+=
x86/simple_idct10.o
\
x86/simple_idct.o
YASM-OBJS-$(CONFIG_VIDEODSP)
+=
x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP)
+=
x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP8DSP)
+=
x86/vp8dsp.o
\
...
...
libavcodec/x86/idctdsp_init.c
View file @
8e89f6fd
...
...
@@ -68,7 +68,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c
->
put_pixels_clamped
=
ff_put_pixels_clamped_mmx
;
c
->
add_pixels_clamped
=
ff_add_pixels_clamped_mmx
;
if
(
INLINE_MMX
(
cpu_flags
))
{
if
(
!
high_bit_depth
&&
avctx
->
lowres
==
0
&&
(
avctx
->
idct_algo
==
FF_IDCT_AUTO
||
...
...
@@ -80,14 +79,12 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c
->
perm_type
=
FF_IDCT_PERM_SIMPLE
;
}
}
}
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
c
->
put_signed_pixels_clamped
=
ff_put_signed_pixels_clamped_sse2
;
c
->
put_pixels_clamped
=
ff_put_pixels_clamped_sse2
;
c
->
add_pixels_clamped
=
ff_add_pixels_clamped_sse2
;
if
(
INLINE_SSE2
(
cpu_flags
))
{
if
(
!
high_bit_depth
&&
avctx
->
lowres
==
0
&&
(
avctx
->
idct_algo
==
FF_IDCT_AUTO
||
...
...
@@ -98,7 +95,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c
->
perm_type
=
FF_IDCT_PERM_SIMPLE
;
}
}
}
if
(
ARCH_X86_64
&&
avctx
->
lowres
==
0
)
{
if
(
avctx
->
bits_per_raw_sample
==
10
&&
...
...
libavcodec/x86/simple_idct.asm
0 → 100644
View file @
8e89f6fd
;
; Simple IDCT MMX
;
; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
;
; Conversion from gcc syntax to x264asm syntax with minimal modifications
; by James Darnley <jdarnley@obe.tv>.
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with FFmpeg; if not, write to the Free Software
; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;
%include
"libavutil/x86/x86util.asm"
SECTION_RODATA

; External symbol declaration. pb_80 is not referenced by any code visible in
; this view of the file.
cextern pb_80

; Four 16-bit words, lowest address first: 0x0000, 0xffff, 0x0000, 0xffff.
; DC_COND_IDCT ANDs this with its first input qword, so only words 1 and 3 of
; that qword take part in its "everything else is zero" test.
wm1010: dw 0, 0xffff, 0, 0xffff
; Two dwords: 4 << 16 followed by 0. Added in the DC-only shortcut of
; DC_COND_IDCT just before its arithmetic right shift by 13.
d40000: dd 4 << 16, 0

; 23170.475006
; 22725.260826
; 21406.727617
; 19265.545870
; 16384.000000
; 12872.826198
; 8866.956905
; 4520.335430

; Integer cosine constants. C4 is rounded down (note the "- 0.5"), the others
; are rounded to nearest. C0 is defined but not used by the table below.
%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C6 8867 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
%define C7 4520 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

; ROW_SHIFT feeds the rounding words at the start of coeffs. The IDCT macro
; itself passes the literals 11 and 20 as shift counts rather than these names.
%define ROW_SHIFT 11
%define COL_SHIFT 20 ; 6

; Constant table addressed by byte offset from the macros below. Every group
; of four words is 8 bytes, so the offsets are:
;   +0   rounder added by Z_COND_IDCT          (paddd ..., [coeffs])
;   +8   rounder added by DC_COND_IDCT         (paddd ..., [coeffs + 8])
;   +16 .. +104  pmaddwd multiplier pairs, in the order listed
coeffs:
    ; +0
    dw 1 << (ROW_SHIFT - 1), 0
    dw 1 << (ROW_SHIFT - 1), 0
    ; +8 (the second word of the first dword is 1, not 0)
    dw 1 << (ROW_SHIFT - 1), 1
    dw 1 << (ROW_SHIFT - 1), 0

    dw C4,  C4,  C4,  C4        ; +16
    dw C4, -C4,  C4, -C4        ; +24

    dw C2,  C6,  C2,  C6        ; +32
    dw C6, -C2,  C6, -C2        ; +40

    dw C1,  C3,  C1,  C3        ; +48
    dw C5,  C7,  C5,  C7        ; +56

    dw C3, -C7,  C3, -C7        ; +64
    dw -C1, -C5, -C1, -C5       ; +72

    dw C5, -C1,  C5, -C1        ; +80
    dw C7,  C3,  C7,  C3        ; +88

    dw C7, -C5,  C7, -C5        ; +96
    dw C3, -C1,  C3, -C1        ; +104
SECTION
.
text
; DC_COND_IDCT src0, src4, src1, src5, dst, rounder, shift
;
; First-pass butterfly over four input qwords, with a shortcut when only the
; even words of the first qword are non-zero.
;   %1-%4  byte offsets from blockq of the four input qwords
;   %5     destination address expression; qwords are written at %5, 8+%5,
;          16+%5 and 24+%5
;   %6     not referenced in this body (the caller passes "null"); the rounder
;          is hard-wired to [coeffs + 8]
;   %7     arithmetic right-shift count applied to each 32-bit result
; Overwrites mm0-mm7 and t0d. Both exits fall through to the end of the macro.
%macro DC_COND_IDCT 7
    movq            mm0, [blockq + %1]      ; R4 R0 r4 r0
    movq            mm1, [blockq + %2]      ; R6 R2 r6 r2
    movq            mm2, [blockq + %3]      ; R3 R1 r3 r1
    movq            mm3, [blockq + %4]      ; R7 R5 r7 r5
    ; Zero test: words 1 and 3 of mm0 (via the wm1010 mask) ORed with all of
    ; mm1..mm3. packssdw saturates, so a non-zero dword stays non-zero.
    movq            mm4, [wm1010]
    pand            mm4, mm0
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4
    movd            t0d, mm4
    or              t0d, t0d
    jz              %%1                     ; only words 0/2 of mm0 may be set
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, [coeffs + 8]       ; add rounder
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm5, mm3                ; C7R7+C5R5 C7r7+C5r5
    paddd           mm0, [coeffs + 8]       ; add rounder
    paddd           mm1, mm0                ; A1 a1
    paddd           mm0, mm0
    psubd           mm0, mm1                ; A2 a2
    pmaddwd         mm2, [coeffs + 64]      ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm5                ; B0 b0
    movq            mm5, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm5, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    paddd           mm5, mm2                ; B1 b1
    psrad           mm7, %7
    psrad           mm4, %7
    movq            mm2, mm1                ; A1 a1
    paddd           mm1, mm5                ; A1+B1 a1+b1
    psubd           mm2, mm5                ; A1-B1 a1-b1
    psrad           mm1, %7
    psrad           mm2, %7
    packssdw        mm7, mm1                ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw        mm2, mm4                ; A0-B0 a0-b0 A1-B1 a1-b1
    movq            [%5], mm7
    movq            mm1, [blockq + %3]      ; R3 R1 r3 r1 (reloaded; mm2 was consumed)
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    movq            [24 + %5], mm2
    pmaddwd         mm4, mm1                ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm1, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm0                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7                ; B2 b2
    paddd           mm2, mm4                ; A2+B2 a2+b2
    psubd           mm0, mm4                ; A2-B2 a2-b2
    psrad           mm2, %7
    psrad           mm0, %7
    movq            mm4, mm6                ; A3 a3
    paddd           mm3, mm1                ; B3 b3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm4, mm3                ; A3-B3 a3-b3
    psrad           mm6, %7
    packssdw        mm2, mm6                ; A3+B3 a3+b3 A2+B2 a2+b2
    movq            [8 + %5], mm2
    psrad           mm4, %7
    packssdw        mm4, mm0                ; A2-B2 a2-b2 A3-B3 a3-b3
    movq            [16 + %5], mm4
    jmp             %%2
%%1:
    ; Shortcut: move the low word of each dword of mm0 into the high half,
    ; add d40000, shift right by 13, pack, and store the same qword to all
    ; four destination slots.
    pslld           mm0, 16
    paddd           mm0, [d40000]
    psrad           mm0, 13
    packssdw        mm0, mm0
    movq            [%5], mm0
    movq            [8 + %5], mm0
    movq            [16 + %5], mm0
    movq            [24 + %5], mm0
%%2:
%endmacro
; Z_COND_IDCT src0, src4, src1, src5, dst, rounder, shift, bt
;
; Same butterfly as DC_COND_IDCT, but with an all-zero early exit instead of
; the DC shortcut.
;   %1-%4  byte offsets from blockq of the four input qwords
;   %5     destination address expression (qwords at %5, 8+%5, 16+%5, 24+%5)
;   %6     not referenced in this body (the caller passes "null"); the rounder
;          is hard-wired to [coeffs]
;   %7     arithmetic right-shift count applied to each 32-bit result
;   %8     label jumped to when all four input qwords are zero; in that case
;          nothing is written to the destination
; Overwrites mm0-mm7 and t0d.
%macro Z_COND_IDCT 8
    movq            mm0, [blockq + %1]      ; R4 R0 r4 r0
    movq            mm1, [blockq + %2]      ; R6 R2 r6 r2
    movq            mm2, [blockq + %3]      ; R3 R1 r3 r1
    movq            mm3, [blockq + %4]      ; R7 R5 r7 r5
    ; OR everything together; unlike DC_COND_IDCT no word is masked out.
    movq            mm4, mm0
    por             mm4, mm1
    por             mm4, mm2
    por             mm4, mm3
    packssdw        mm4, mm4
    movd            t0d, mm4
    or              t0d, t0d
    jz              %8
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, [coeffs]           ; add rounder
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm5, mm3                ; C7R7+C5R5 C7r7+C5r5
    paddd           mm0, [coeffs]           ; add rounder
    paddd           mm1, mm0                ; A1 a1
    paddd           mm0, mm0
    psubd           mm0, mm1                ; A2 a2
    pmaddwd         mm2, [coeffs + 64]      ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm5                ; B0 b0
    movq            mm5, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm5, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    paddd           mm5, mm2                ; B1 b1
    psrad           mm7, %7
    psrad           mm4, %7
    movq            mm2, mm1                ; A1 a1
    paddd           mm1, mm5                ; A1+B1 a1+b1
    psubd           mm2, mm5                ; A1-B1 a1-b1
    psrad           mm1, %7
    psrad           mm2, %7
    packssdw        mm7, mm1                ; A1+B1 a1+b1 A0+B0 a0+b0
    packssdw        mm2, mm4                ; A0-B0 a0-b0 A1-B1 a1-b1
    movq            [%5], mm7
    movq            mm1, [blockq + %3]      ; R3 R1 r3 r1 (reloaded; mm2 was consumed)
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    movq            [24 + %5], mm2
    pmaddwd         mm4, mm1                ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm1, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm0                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7                ; B2 b2
    paddd           mm2, mm4                ; A2+B2 a2+b2
    psubd           mm0, mm4                ; A2-B2 a2-b2
    psrad           mm2, %7
    psrad           mm0, %7
    movq            mm4, mm6                ; A3 a3
    paddd           mm3, mm1                ; B3 b3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm4, mm3                ; A3-B3 a3-b3
    psrad           mm6, %7
    packssdw        mm2, mm6                ; A3+B3 a3+b3 A2+B2 a2+b2
    movq            [8 + %5], mm2
    psrad           mm4, %7
    packssdw        mm4, mm0                ; A2-B2 a2-b2 A3-B3 a3-b3
    movq            [16 + %5], mm4
%endmacro
; IDCT1 src0, src4, src1, src5, dst, shift
;
; Second-pass butterfly using all four inputs.
;   %1-%4  complete memory operands (brackets supplied by the caller)
;   %5     destination address expression
;   %6     arithmetic right-shift count
; Stores eight dwords (two packed 16-bit results each) at %5 + 16*k:
;   k=0 A0+B0, k=1 A1+B1, k=2 A2+B2, k=3 A3+B3,
;   k=4 A3-B3, k=5 A2-B2, k=6 A1-B1, k=7 A0-B0.
; No rounder is added here. Overwrites mm0-mm7.
%macro IDCT1 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm1, %2                 ; R6 R2 r6 r2
    movq            mm2, %3                 ; R3 R1 r3 r1
    movq            mm3, %4                 ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1                ; A1 a1
    psubd           mm5, mm1                ; A2 a2
    movq            mm1, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm1, mm3                ; C7R7+C5R5 C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]      ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm1                ; B0 b0
    movq            mm1, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm1, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    paddd           mm1, mm2                ; B1 b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0                ; A1 a1
    paddd           mm0, mm1                ; A1+B1 a1+b1
    psubd           mm2, mm1                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7                ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2                ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm0, %3                 ; R3 R1 r3 r1 (reloaded; mm2 was consumed)
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm0                ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm0, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7                ; B2 b2
    paddd           mm2, mm4                ; A2+B2 a2+b2
    psubd           mm5, mm4                ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6                ; A3 a3
    paddd           mm3, mm0                ; B3 b3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm4, mm3                ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2                ; A2+B2 a2+b2
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm4, mm4                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro
; IDCT2 src0, src4, src1, src5, dst, shift
;
; Variant of IDCT1 that never reads %3 (src1): the odd part is built from %4
; alone.
;   %1, %2, %4  complete memory operands; %3 is accepted but not referenced
;   %5          destination address expression
;   %6          arithmetic right-shift count
; Stores eight dwords at %5 + 16*k in the same layout as IDCT1.
; Overwrites mm0-mm7.
%macro IDCT2 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm1, %2                 ; R6 R2 r6 r2
    movq            mm3, %4                 ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1                ; A1 a1
    psubd           mm5, mm1                ; A2 a2
    movq            mm1, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm1, mm3                ; C7R7+C5R5 C7r7+C5r5
    movq            mm7, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm7, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm1, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm1                ; A0-B0 a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0                ; A1 a1
    paddd           mm0, mm7                ; A1+B1 a1+b1
    psubd           mm2, mm7                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1                ; A0+B0 a0+b0
    movd            [%5], mm1
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2                ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm1, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm1, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm2, mm1                ; A2+B2 a2+b2
    psubd           mm5, mm1                ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6                ; A3 a3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm1, mm3                ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2                ; A2+B2 a2+b2
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm1, mm1                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm1
    movd            [80 + %5], mm5
%endmacro
; IDCT3 src0, src4, src1, src5, dst, shift
;
; Variant of IDCT1 that reads only %1 (src0) and %4 (src5). With no %2
; contribution, A0 equals A3 and A1 equals A2, so the two even-part products
; are simply copied.
;   %1, %4  complete memory operands; %2 and %3 are accepted but not referenced
;   %5      destination address expression
;   %6      arithmetic right-shift count
; Stores eight dwords at %5 + 16*k in the same layout as IDCT1.
; Overwrites mm0-mm7.
%macro IDCT3 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm3, %4                 ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm1, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm1, mm3                ; C7R7+C5R5 C7r7+C5r5
    movq            mm7, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm7, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm1, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm1                ; A0-B0 a0-b0
    psrad           mm1, %6
    psrad           mm4, %6
    movq            mm2, mm0                ; A1 a1
    paddd           mm0, mm7                ; A1+B1 a1+b1
    psubd           mm2, mm7                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm1, mm1                ; A0+B0 a0+b0
    movd            [%5], mm1
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2                ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm1, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm1, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm2, mm1                ; A2+B2 a2+b2
    psubd           mm5, mm1                ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm1, mm6                ; A3 a3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm1, mm3                ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm1, %6
    packssdw        mm2, mm2                ; A2+B2 a2+b2
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm1, mm1                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm1
    movd            [80 + %5], mm5
%endmacro
; IDCT4 src0, src4, src1, src5, dst, shift
;
; Variant of IDCT1 that never reads %2 (src4): the even part comes from %1
; alone, the odd part from %3 and %4.
;   %1, %3, %4  complete memory operands; %2 is accepted but not referenced
;   %5          destination address expression
;   %6          arithmetic right-shift count
; Stores eight dwords at %5 + 16*k in the same layout as IDCT1.
; Overwrites mm0-mm7.
%macro IDCT4 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm2, %3                 ; R3 R1 r3 r1
    movq            mm3, %4                 ; R7 R5 r7 r5
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm1, [coeffs + 56]      ; C7 C5 C7 C5
    pmaddwd         mm1, mm3                ; C7R7+C5R5 C7r7+C5r5
    pmaddwd         mm2, [coeffs + 64]      ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm1                ; B0 b0
    movq            mm1, [coeffs + 72]      ; -C5 -C1 -C5 -C1
    pmaddwd         mm1, mm3                ; -C5R7-C1R5 -C5r7-C1r5
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    paddd           mm1, mm2                ; B1 b1
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm2, mm0                ; A1 a1
    paddd           mm0, mm1                ; A1+B1 a1+b1
    psubd           mm2, mm1                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm7, mm7                ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm2, mm2                ; A1-B1 a1-b1
    movd            [96 + %5], mm2
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm0, %3                 ; R3 R1 r3 r1 (reloaded; mm2 was consumed)
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm0                ; -C1R3+C5R1 -C1r3+C5r1
    movq            mm7, [coeffs + 88]      ; C3 C7 C3 C7
    pmaddwd         mm0, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    pmaddwd         mm7, mm3                ; C3R7+C7R5 C3r7+C7r5
    movq            mm2, mm5                ; A2 a2
    pmaddwd         mm3, [coeffs + 104]     ; -C1R7+C3R5 -C1r7+C3r5
    paddd           mm4, mm7                ; B2 b2
    paddd           mm2, mm4                ; A2+B2 a2+b2
    psubd           mm5, mm4                ; A2-B2 a2-b2
    psrad           mm2, %6
    psrad           mm5, %6
    movq            mm4, mm6                ; A3 a3
    paddd           mm3, mm0                ; B3 b3
    paddd           mm6, mm3                ; A3+B3 a3+b3
    psubd           mm4, mm3                ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm2, mm2                ; A2+B2 a2+b2
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [32 + %5], mm2
    packssdw        mm4, mm4                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro
; IDCT5 src0, src4, src1, src5, dst, shift
;
; Variant of IDCT1 that reads only %1 (src0) and %3 (src1): the even part
; comes from %1, the odd part from %3.
;   %1, %3  complete memory operands; %2 and %4 are accepted but not referenced
;   %5      destination address expression
;   %6      arithmetic right-shift count
; Stores eight dwords at %5 + 16*k in the same layout as IDCT1.
; Overwrites mm0-mm7.
%macro IDCT5 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm2, %3                 ; R3 R1 r3 r1
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm3, [coeffs + 64]
    pmaddwd         mm3, mm2                ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm1, mm0                ; A1 a1
    paddd           mm0, mm3                ; A1+B1 a1+b1
    psubd           mm1, mm3                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm1, %6
    packssdw        mm7, mm7                ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm1, mm1                ; A1-B1 a1-b1
    movd            [96 + %5], mm1
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm2                ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    movq            mm1, mm5                ; A2 a2
    paddd           mm1, mm4                ; A2+B2 a2+b2
    psubd           mm5, mm4                ; A2-B2 a2-b2
    psrad           mm1, %6
    psrad           mm5, %6
    movq            mm4, mm6                ; A3 a3
    paddd           mm6, mm2                ; A3+B3 a3+b3
    psubd           mm4, mm2                ; A3-B3 a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm1, mm1                ; A2+B2 a2+b2
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [32 + %5], mm1
    packssdw        mm4, mm4                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [48 + %5], mm6
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro
; IDCT6 src0, src4, src1, src5, dst, shift
;
; Variant with no odd (B) terms: only %1 (src0) and %2 (src4) are read.
; Unlike IDCT1-5 and IDCT7, this macro handles 16 input bytes per source in
; one invocation: it reads [%1]/[%2] and [8 + %1]/[8 + %2].
;   %1, %2  bare address expressions (this macro adds the brackets);
;           %3 and %4 are accepted but not referenced
;   %5      destination address expression
;   %6      arithmetic right-shift count
; Stores eight qwords at %5 + 16*k. Because there are no B terms the output is
; mirrored: k=0 and k=7 get A0, k=1 and k=6 get A1, k=2 and k=5 get A2,
; k=3 and k=4 get A3. Overwrites mm0-mm7.
%macro IDCT6 6
    movq            mm0, [%1]               ; R4 R0 r4 r0
    movq            mm1, [%2]               ; R6 R2 r6 r2
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1                ; A1 a1
    psubd           mm5, mm1                ; A2 a2
    ; Same even-part computation for the next 8 bytes of each source.
    movq            mm2, [8 + %1]           ; R4 R0 r4 r0
    movq            mm3, [8 + %2]           ; R6 R2 r6 r2
    movq            mm1, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm1, mm2                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm2, mm7                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm7, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm7, mm3                ; C6R6+C2R2 C6r6+C2r2
    pmaddwd         mm3, [coeffs + 40]      ; -C2R6+C6R2 -C2r6+C6r2
    paddd           mm7, mm1                ; A0 a0
    paddd           mm1, mm1                ; 2C0 2c0
    psubd           mm1, mm7                ; A3 a3
    paddd           mm3, mm2                ; A1 a1
    paddd           mm2, mm2                ; 2C1 2c1
    psubd           mm2, mm3                ; A2 a2
    psrad           mm4, %6
    psrad           mm7, %6
    psrad           mm3, %6
    packssdw        mm4, mm7                ; A0 a0
    movq            [%5], mm4
    psrad           mm0, %6
    packssdw        mm0, mm3                ; A1 a1
    movq            [16 + %5], mm0
    movq            [96 + %5], mm0
    movq            [112 + %5], mm4
    psrad           mm5, %6
    psrad           mm6, %6
    psrad           mm2, %6
    packssdw        mm5, mm2                ; A2 a2
    movq            [32 + %5], mm5
    psrad           mm1, %6
    packssdw        mm6, mm1                ; A3 a3
    movq            [48 + %5], mm6
    movq            [64 + %5], mm6
    movq            [80 + %5], mm5
%endmacro
; IDCT7 src0, src4, src1, src5, dst, shift
;
; Variant of IDCT1 that never reads %4 (src5): the odd part is built from %3
; alone.
;   %1-%3  complete memory operands; %4 is accepted but not referenced
;   %5     destination address expression
;   %6     arithmetic right-shift count
; Stores eight dwords at %5 + 16*k in the same layout as IDCT1.
; Overwrites mm0-mm7.
%macro IDCT7 6
    movq            mm0, %1                 ; R4 R0 r4 r0
    movq            mm1, %2                 ; R6 R2 r6 r2
    movq            mm2, %3                 ; R3 R1 r3 r1
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    movq            mm5, [coeffs + 32]      ; C6 C2 C6 C2
    pmaddwd         mm5, mm1                ; C6R6+C2R2 C6r6+C2r2
    movq            mm6, [coeffs + 40]      ; -C2 C6 -C2 C6
    pmaddwd         mm1, mm6                ; -C2R6+C6R2 -C2r6+C6r2
    movq            mm6, mm4                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 48]      ; C3 C1 C3 C1
    pmaddwd         mm7, mm2                ; C3R3+C1R1 C3r3+C1r1
    paddd           mm4, mm5                ; A0 a0
    psubd           mm6, mm5                ; A3 a3
    movq            mm5, mm0                ; -C4R4+C4R0 -C4r4+C4r0
    paddd           mm0, mm1                ; A1 a1
    psubd           mm5, mm1                ; A2 a2
    movq            mm1, [coeffs + 64]
    pmaddwd         mm1, mm2                ; -C7R3+C3R1 -C7r3+C3r1
    paddd           mm7, mm4                ; A0+B0 a0+b0
    paddd           mm4, mm4                ; 2A0 2a0
    psubd           mm4, mm7                ; A0-B0 a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    movq            mm3, mm0                ; A1 a1
    paddd           mm0, mm1                ; A1+B1 a1+b1
    psubd           mm3, mm1                ; A1-B1 a1-b1
    psrad           mm0, %6
    psrad           mm3, %6
    packssdw        mm7, mm7                ; A0+B0 a0+b0
    movd            [%5], mm7
    packssdw        mm0, mm0                ; A1+B1 a1+b1
    movd            [16 + %5], mm0
    packssdw        mm3, mm3                ; A1-B1 a1-b1
    movd            [96 + %5], mm3
    packssdw        mm4, mm4                ; A0-B0 a0-b0
    movd            [112 + %5], mm4
    movq            mm4, [coeffs + 80]      ; -C1 C5 -C1 C5
    pmaddwd         mm4, mm2                ; -C1R3+C5R1 -C1r3+C5r1
    pmaddwd         mm2, [coeffs + 96]      ; -C5R3+C7R1 -C5r3+C7r1
    movq            mm3, mm5                ; A2 a2
    paddd           mm3, mm4                ; A2+B2 a2+b2
    psubd           mm5, mm4                ; A2-B2 a2-b2
    psrad           mm3, %6
    psrad           mm5, %6
    movq            mm4, mm6                ; A3 a3
    paddd           mm6, mm2                ; A3+B3 a3+b3
    psubd           mm4, mm2                ; A3-B3 a3-b3
    psrad           mm6, %6
    packssdw        mm3, mm3                ; A2+B2 a2+b2
    movd            [32 + %5], mm3
    psrad           mm4, %6
    packssdw        mm6, mm6                ; A3+B3 a3+b3
    movd            [48 + %5], mm6
    packssdw        mm4, mm4                ; A3-B3 a3-b3
    packssdw        mm5, mm5                ; A2-B2 a2-b2
    movd            [64 + %5], mm4
    movd            [80 + %5], mm5
%endmacro
; IDCT8 src0, src4, src1, src5, dst, shift
;
; Smallest variant: only %1 (src0) is read, 16 bytes per invocation
; ([%1] and [8 + %1]). Two products are formed per input qword, using the
; (C4, C4) and (C4, -C4) multiplier pairs, and replicated over the outputs.
;   %1     bare address expression (this macro adds the brackets);
;          %2-%4 are accepted but not referenced
;   %5     destination address expression
;   %6     arithmetic right-shift count
; Stores eight qwords at %5 + 16*k: the (C4, C4) result goes to k = 0, 3, 4, 7
; and the (C4, -C4) result to k = 1, 2, 5, 6.
; Overwrites mm0, mm1, mm2, mm4, mm5 and mm7.
%macro IDCT8 6
    movq            mm0, [%1]               ; R4 R0 r4 r0
    movq            mm4, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm4, mm0                ; C4R4+C4R0 C4r4+C4r0
    movq            mm5, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm0, mm5                ; -C4R4+C4R0 -C4r4+C4r0
    psrad           mm4, %6
    psrad           mm0, %6
    movq            mm2, [8 + %1]           ; R4 R0 r4 r0
    movq            mm1, [coeffs + 16]      ; C4 C4 C4 C4
    pmaddwd         mm1, mm2                ; C4R4+C4R0 C4r4+C4r0
    movq            mm7, [coeffs + 24]      ; -C4 C4 -C4 C4
    pmaddwd         mm2, mm7                ; -C4R4+C4R0 -C4r4+C4r0
    ; NOTE(review): mm7 is not read again inside this macro, so this load looks
    ; dead; it is kept as-is. Confirm before removing.
    movq            mm7, [coeffs + 32]      ; C6 C2 C6 C2
    psrad           mm1, %6
    packssdw        mm4, mm1                ; A0 a0
    movq            [%5], mm4
    psrad           mm2, %6
    packssdw        mm0, mm2                ; A1 a1
    movq            [16 + %5], mm0
    movq            [96 + %5], mm0
    movq            [112 + %5], mm4
    movq            [32 + %5], mm0
    movq            [48 + %5], mm4
    movq            [64 + %5], mm4
    movq            [80 + %5], mm0
%endmacro
; IDCT (no parameters)
;
; Complete two-pass transform of the 128 bytes at blockq, using the 128 bytes
; at rsp as scratch.
;
; Pass 1 (shift 11) processes the input in four 32-byte groups, G0..G3, at
; block offsets 0, 32, 64 and 96, and writes each result to the same offset
; from rsp. G0 always goes through DC_COND_IDCT. G1..G3 go through
; Z_COND_IDCT, which skips an all-zero group (writing nothing) and branches.
;
; Pass 2 (shift 20) reads the scratch area and writes back to blockq. The
; variant is chosen by which of G1..G3 were non-zero, so the scratch slots that
; were never written are never read:
;
;   G1  G2  G3   label   pass-2 macro   scratch inputs read
;   nz  nz  nz   -       IDCT1          rsp+0, +64, +32, +96
;   0   nz  nz   %%4     IDCT2          rsp+0, +64,      +96
;   0   0   nz   %%6     IDCT3          rsp+0,           +96
;   nz  0   nz   %%2     IDCT4          rsp+0,      +32, +96
;   nz  0   0    %%3     IDCT5          rsp+0,      +32
;   0   nz  0    %%5     IDCT6          rsp+0, +64
;   nz  nz  0    %%1     IDCT7          rsp+0, +64, +32
;   0   0   0    %%7     IDCT8          rsp+0
;
; IDCT1-5 and IDCT7 are invoked four times (input step 8 bytes, output step
; 4 bytes). IDCT6 and IDCT8 are invoked twice (input step 16, output step 8).
; Every path ends at %%9. The "null" arguments fill the rounder slot that the
; first-pass macros do not reference.
%macro IDCT 0
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1

    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%4:                                    ; G1 was zero
    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%6
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%6:                                    ; G1 and G2 were zero
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%2:                                    ; G1 non-zero, G2 was zero
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%3:                                    ; G1 non-zero, G2 and G3 were zero
    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%5:                                    ; G1 zero, G2 non-zero, G3 was zero
    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, 20
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
    jmp %%9

    ALIGN 16
    %%1:                                    ; G1 and G2 non-zero, G3 was zero
    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%9

    ALIGN 16
    %%7:                                    ; G1, G2 and G3 were all zero
    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq + 0, 20
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20

    %%9:
%endmacro
; PUT_PIXELS_CLAMPED_HALF offset
;
; Converts 64 bytes of 16-bit values at blockq + %1 to unsigned bytes with
; saturation (packuswb) and stores them as four 8-byte lines at pixelsq,
; pixelsq + lsizeq, pixelsq + 2*lsizeq and pixelsq + lsize3q.
;   %1  byte offset into the block (the callers use 0 and 64)
; Expects lsize3q to have been set by the caller. The mmsize == 8 branches use
; one register per line (m0-m3); otherwise each register holds two lines and
; its high half is stored with movhps (m0-m1 only).
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova            m0, [blockq + mmsize * 0 + %1]
    mova            m1, [blockq + mmsize * 2 + %1]
%if mmsize == 8
    mova            m2, [blockq + mmsize * 4 + %1]
    mova            m3, [blockq + mmsize * 6 + %1]
%endif
    packuswb        m0, [blockq + mmsize * 1 + %1]
    packuswb        m1, [blockq + mmsize * 3 + %1]
%if mmsize == 8
    packuswb        m2, [blockq + mmsize * 5 + %1]
    packuswb        m3, [blockq + mmsize * 7 + %1]
    movq            [pixelsq], m0
    movq            [lsizeq + pixelsq], m1
    movq            [2 * lsizeq + pixelsq], m2
    movq            [lsize3q + pixelsq], m3
%else
    movq            [pixelsq], m0
    movhps          [lsizeq + pixelsq], m0
    movq            [2 * lsizeq + pixelsq], m1
    movhps          [lsize3q + pixelsq], m1
%endif
%endmacro
; ADD_PIXELS_CLAMPED offset
;
; Adds 32 bytes of 16-bit values at blockq + %1 to two 8-byte lines of pixels
; (at pixelsq and pixelsq + lsizeq) and writes the lines back. The addition
; uses signed word saturation (paddsw); the repack to bytes uses unsigned
; saturation (packuswb).
;   %1  byte offset into the block (the callers use 0, 32, 64 and 96)
; Requires m4 to be zero on entry: it is the zero source for the byte-to-word
; unpack, and the callers clear it with pxor beforehand.
; Overwrites m0-m3, plus m5-m7 in the mmsize == 8 branches.
%macro ADD_PIXELS_CLAMPED 1
    mova            m0, [blockq + mmsize * 0 + %1]
    mova            m1, [blockq + mmsize * 1 + %1]
%if mmsize == 8
    mova            m5, [blockq + mmsize * 2 + %1]
    mova            m6, [blockq + mmsize * 3 + %1]
%endif
    movq            m2, [pixelsq]
    movq            m3, [pixelsq + lsizeq]
%if mmsize == 8
    ; One line needs two registers: low four pixels in m2/m3, high four in m7.
    mova            m7, m2
    punpcklbw       m2, m4
    punpckhbw       m7, m4
    paddsw          m0, m2
    paddsw          m1, m7
    mova            m7, m3
    punpcklbw       m3, m4
    punpckhbw       m7, m4
    paddsw          m5, m3
    paddsw          m6, m7
%else
    punpcklbw       m2, m4
    punpcklbw       m3, m4
    paddsw          m0, m2
    paddsw          m1, m3
%endif
    packuswb        m0, m1
%if mmsize == 8
    packuswb        m5, m6
    movq            [pixelsq], m0
    movq            [pixelsq + lsizeq], m5
%else
    ; Both lines live in m0: low half is line 0, high half is line 1.
    movq            [pixelsq], m0
    movhps          [pixelsq + lsizeq], m0
%endif
%endmacro
INIT_MMX
mmx
; simple_idct (assembled under the preceding INIT_MMX mmx)
;
; In-place transform: the IDCT macro reads the block at blockq and writes the
; result back to blockq.
; Declared names: block (the only argument) and t0 (scratch, used by the
; first-pass zero tests).
; NOTE(review): the 128 in the cglobal line presumably reserves the 128-byte
; scratch area that IDCT addresses through rsp - confirm against x86inc.
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
RET
; simple_idct_put (assembled under the preceding INIT_MMX mmx)
;
; Runs the IDCT macro in place on the block, then stores the result as eight
; lines of eight saturated unsigned bytes starting at pixels, lsize bytes
; apart.
; Declared names: pixels, lsize, block (arguments), lsize3 and t0 (scratch).
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea             lsize3q, [lsizeq * 3]           ; offset of the 4th line
    PUT_PIXELS_CLAMPED_HALF 0                       ; lines 0-3 from block bytes 0-63
    lea             pixelsq, [pixelsq + lsizeq * 4] ; advance four lines
    PUT_PIXELS_CLAMPED_HALF 64                      ; lines 4-7 from block bytes 64-127
RET
; simple_idct_add (assembled under the preceding INIT_MMX mmx)
;
; Runs the IDCT macro in place on the block, then adds the result to eight
; lines of eight pixels starting at pixels, lsize bytes apart, saturating to
; unsigned bytes.
; Declared names: pixels, lsize, block (arguments) and t0 (scratch).
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor            m4, m4                          ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                            ; lines 0-1
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 32                           ; lines 2-3
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 64                           ; lines 4-5
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 96                           ; lines 6-7
RET
INIT_XMM
sse2
; simple_idct_put (assembled under the preceding INIT_XMM sse2)
;
; Same sequence as the MMX build of simple_idct_put. The IDCT macro names
; mm0-mm7 explicitly, so the transform itself is unchanged; only
; PUT_PIXELS_CLAMPED_HALF, which uses the m0.. aliases and mmsize, differs.
; Declared names: pixels, lsize, block (arguments), lsize3 and t0 (scratch).
; NOTE(review): the mova loads in PUT_PIXELS_CLAMPED_HALF presumably require
; the block to be 16-byte aligned in this build - confirm against callers.
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT
    lea             lsize3q, [lsizeq * 3]           ; offset of the 4th line
    PUT_PIXELS_CLAMPED_HALF 0                       ; lines 0-3 from block bytes 0-63
    lea             pixelsq, [pixelsq + lsizeq * 4] ; advance four lines
    PUT_PIXELS_CLAMPED_HALF 64                      ; lines 4-7 from block bytes 64-127
RET
; simple_idct_add (assembled under the preceding INIT_XMM sse2)
;
; Same sequence as the MMX build of simple_idct_add. The IDCT macro names
; mm0-mm7 explicitly, so the transform itself is unchanged; only
; ADD_PIXELS_CLAMPED, which uses the m0.. aliases and mmsize, differs.
; Declared names: pixels, lsize, block (arguments) and t0 (scratch).
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT
    pxor            m4, m4                          ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                            ; lines 0-1
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 32                           ; lines 2-3
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 64                           ; lines 4-5
    lea             pixelsq, [pixelsq + lsizeq * 2]
    ADD_PIXELS_CLAMPED 96                           ; lines 6-7
RET
libavcodec/x86/simple_idct.c
deleted
100644 → 0
View file @
87bddba4
/*
* Simple IDCT MMX
*
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/idctdsp.h"
#include "libavcodec/x86/idctdsp.h"
#include "idctdsp.h"
#include "simple_idct.h"
#if HAVE_INLINE_ASM
/*
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
/* Integer cosine constants for the table below. C4 is rounded down (note the
 * "- 0.5"); the others are rounded to nearest. */
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* ROW_SHIFT feeds the rounding entries at the start of coeffs[]. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
/* 64-bit mask that keeps 16-bit words 1 and 3. The inline asm loads it via
 * MANGLE(wm1010) in DC_COND_IDCT and ANDs it with the first input qword. */
DECLARE_ASM_CONST(8, uint64_t, wm1010) = 0xFFFF0000FFFF0000ULL;
/* 64-bit constant whose low dword is 4 << 16 and whose high dword is 0. */
DECLARE_ASM_CONST(8, uint64_t, d40000) = 0x0000000000040000ULL;
/* Constant table for the inline-asm IDCT. The asm addresses it by byte
 * offset (e.g. "16(%2)" for the C4 row); every line of four int16_t below is
 * 8 bytes, so the rows sit at offsets 0, 8, 16, ... 104.
 * NOTE(review): the asm operand list is outside this view, so "%2 is this
 * table" is an assumption - confirm. */
DECLARE_ALIGNED(8, static const int16_t, coeffs)[] = {
    /* offset 0: rounding constants */
    1 << (ROW_SHIFT - 1), 0, 1 << (ROW_SHIFT - 1), 0,
//  1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//  0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
    /* offset 8: rounding constants; note the extra 1 in the second slot */
    1 << (ROW_SHIFT - 1), 1, 1 << (ROW_SHIFT - 1), 0,
    // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//  0, 0, 0, 0,
//  0, 0, 0, 0,

    C4,  C4,  C4,  C4,      /* offset 16 */
    C4, -C4,  C4, -C4,      /* offset 24 */

    C2,  C6,  C2,  C6,      /* offset 32 */
    C6, -C2,  C6, -C2,      /* offset 40 */

    C1,  C3,  C1,  C3,      /* offset 48 */
    C5,  C7,  C5,  C7,      /* offset 56 */

    C3, -C7,  C3, -C7,      /* offset 64 */
    -C1, -C5, -C1, -C5,     /* offset 72 */

    C5, -C1,  C5, -C1,      /* offset 80 */
    C7,  C3,  C7,  C3,      /* offset 88 */

    C7, -C5,  C7, -C5,      /* offset 96 */
    C3, -C1,  C3, -C1       /* offset 104 */
};
static
inline
void
idct
(
int16_t
*
block
)
{
LOCAL_ALIGNED_8
(
int64_t
,
align_tmp
,
[
16
]);
int16_t
*
const
temp
=
(
int16_t
*
)
align_tmp
;
__asm__
volatile
(
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
"pand %%mm0, %%mm4 \n\t"\
"por %%mm1, %%mm4 \n\t"\
"por %%mm2, %%mm4 \n\t"\
"por %%mm3, %%mm4 \n\t"\
"packssdw %%mm4,%%mm4 \n\t"\
"movd %%mm4, %%eax \n\t"\
"orl %%eax, %%eax \n\t"\
"jz 1f \n\t"\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq 56(%2), %%mm5 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t"
/* A1 a1 */
\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t"
/* A2 a2 */
\
"pmaddwd 64(%2), %%mm2 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm5, %%mm7 \n\t"
/* B0 b0 */
\
"movq 72(%2), %%mm5 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"paddd %%mm2, %%mm5 \n\t"
/* B1 b1 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm5, %%mm1 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm5, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t"
/* A1+B1 a1+b1 A0+B0 a0+b0 */
\
"packssdw %%mm4, %%mm2 \n\t"
/* A0-B0 a0-b0 A1-B1 a1-b1 */
\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t"
/* R3 R1 r3 r1 */
\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"movq 88(%2), %%mm7 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd 96(%2), %%mm1 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm0, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm7, %%mm4 \n\t"
/* B2 b2 */
\
"paddd %%mm4, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm0 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm1, %%mm3 \n\t"
/* B3 b3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t"
/* A3+B3 a3+b3 A2+B2 a2+b2 */
\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t"
/* A2-B2 a2-b2 A3-B3 a3-b3 */
\
"movq %%mm4, 16+" #dst " \n\t"\
"jmp 2f \n\t"\
"1: \n\t"\
"pslld $16, %%mm0 \n\t"\
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
"psrad $13, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movq %%mm0, " #dst " \n\t"\
"movq %%mm0, 8+" #dst " \n\t"\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 24+" #dst " \n\t"\
"2: \n\t"
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq %%mm0, %%mm4 \n\t"\
"por %%mm1, %%mm4 \n\t"\
"por %%mm2, %%mm4 \n\t"\
"por %%mm3, %%mm4 \n\t"\
"packssdw %%mm4,%%mm4 \n\t"\
"movd %%mm4, %%eax \n\t"\
"orl %%eax, %%eax \n\t"\
"jz " #bt " \n\t"\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq 56(%2), %%mm5 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t"
/* A1 a1 */
\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t"
/* A2 a2 */
\
"pmaddwd 64(%2), %%mm2 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm5, %%mm7 \n\t"
/* B0 b0 */
\
"movq 72(%2), %%mm5 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"paddd %%mm2, %%mm5 \n\t"
/* B1 b1 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm5, %%mm1 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm5, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t"
/* A1+B1 a1+b1 A0+B0 a0+b0 */
\
"packssdw %%mm4, %%mm2 \n\t"
/* A0-B0 a0-b0 A1-B1 a1-b1 */
\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t"
/* R3 R1 r3 r1 */
\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"movq 88(%2), %%mm7 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd 96(%2), %%mm1 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm0, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm7, %%mm4 \n\t"
/* B2 b2 */
\
"paddd %%mm4, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm0 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm1, %%mm3 \n\t"
/* B3 b3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t"
/* A3+B3 a3+b3 A2+B2 a2+b2 */
\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t"
/* A2-B2 a2-b2 A3-B3 a3-b3 */
\
"movq %%mm4, 16+" #dst " \n\t"\
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq 56(%2), %%mm5 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t"
/* A1 a1 */
\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t"
/* A2 a2 */
\
"pmaddwd 64(%2), %%mm2 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm5, %%mm7 \n\t"
/* B0 b0 */
\
"movq 72(%2), %%mm5 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm5 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"paddd %%mm2, %%mm5 \n\t"
/* B1 b1 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm5, %%mm1 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm5, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t"
/* A1+B1 a1+b1 A0+B0 a0+b0 */
\
"packssdw %%mm4, %%mm2 \n\t"
/* A0-B0 a0-b0 A1-B1 a1-b1 */
\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t"
/* R3 R1 r3 r1 */
\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"movq 88(%2), %%mm7 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd 96(%2), %%mm1 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm0, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm7, %%mm4 \n\t"
/* B2 b2 */
\
"paddd %%mm4, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm0 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm1, %%mm3 \n\t"
/* B3 b3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t"
/* A3+B3 a3+b3 A2+B2 a2+b2 */
\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t"
/* A2-B2 a2-b2 A3-B3 a3-b3 */
\
"movq %%mm4, 16+" #dst " \n\t"\
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
/* Row pass. Each of the four invocations below reads four quadwords (32 bytes)
   from block (%0), stores four quadwords to the same offsets in temp (%1) and
   uses a right shift of 11. */
/* First 32 bytes: DC_COND_IDCT, rounder taken from 8(%2). When everything
   except the words masked out by wm1010 is zero it takes its internal
   shortcut (local labels 1/2 inside the macro) instead of the full path. */
DC_COND_IDCT
(
0
(
%
0
),
8
(
%
0
),
16
(
%
0
),
24
(
%
0
),
0
(
%
1
),
paddd
8
(
%
2
),
11
)
/* Bytes 32..63: rounder from (%2); if all four source quadwords OR to zero,
   branch forward to label 4 without storing anything for this group. */
Z_COND_IDCT
(
32
(
%
0
),
40
(
%
0
),
48
(
%
0
),
56
(
%
0
),
32
(
%
1
),
paddd
(
%
2
),
11
,
4
f
)
/* Bytes 64..95: same test, branching forward to label 2 when all zero. */
Z_COND_IDCT
(
64
(
%
0
),
72
(
%
0
),
80
(
%
0
),
88
(
%
0
),
64
(
%
1
),
paddd
(
%
2
),
11
,
2
f
)
/* Bytes 96..127: same test, branching forward to label 1 when all zero. */
Z_COND_IDCT
(
96
(
%
0
),
104
(
%
0
),
112
(
%
0
),
120
(
%
0
),
96
(
%
1
),
paddd
(
%
2
),
11
,
1
f
)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1 a1 */
\
"psubd %%mm1, %%mm5 \n\t"
/* A2 a2 */
\
"movq 56(%2), %%mm1 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
"pmaddwd 64(%2), %%mm2 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm1, %%mm7 \n\t"
/* B0 b0 */
\
"movq 72(%2), %%mm1 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"paddd %%mm2, %%mm1 \n\t"
/* B1 b1 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm1, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t"
/* R3 R1 r3 r1 */
\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"movq 88(%2), %%mm7 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd 96(%2), %%mm0 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm5, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm7, %%mm4 \n\t"
/* B2 b2 */
\
"paddd %%mm4, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm0, %%mm3 \n\t"
/* B3 b3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"4:
\n\t
"
Z_COND_IDCT
(
64
(
%
0
),
72
(
%
0
),
80
(
%
0
),
88
(
%
0
),
64
(
%
1
),
paddd
(
%
2
),
11
,
6
f
)
Z_COND_IDCT
(
96
(
%
0
),
104
(
%
0
),
112
(
%
0
),
120
(
%
0
),
96
(
%
1
),
paddd
(
%
2
),
11
,
5
f
)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1 a1 */
\
"psubd %%mm1, %%mm5 \n\t"
/* A2 a2 */
\
"movq 56(%2), %%mm1 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
"movq 72(%2), %%mm7 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm1 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm1, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm7, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm7, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm5, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm1, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm1, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t"
/* A3 a3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm1 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"6:
\n\t
"
Z_COND_IDCT
(
96
(
%
0
),
104
(
%
0
),
112
(
%
0
),
120
(
%
0
),
96
(
%
1
),
paddd
(
%
2
),
11
,
7
f
)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 56(%2), %%mm1 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
"movq 72(%2), %%mm7 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm1 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm1, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm7, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm7, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm5, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm1, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm1, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t"
/* A3 a3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm1 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"2:
\n\t
"
Z_COND_IDCT
(
96
(
%
0
),
104
(
%
0
),
112
(
%
0
),
120
(
%
0
),
96
(
%
1
),
paddd
(
%
2
),
11
,
3
f
)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq " #src5 ", %%mm3 \n\t"
/* R7 R5 r7 r5 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 56(%2), %%mm1 \n\t"
/* C7 C5 C7 C5 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* C7R7+C5R5 C7r7+C5r5 */
\
"pmaddwd 64(%2), %%mm2 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm1, %%mm7 \n\t"
/* B0 b0 */
\
"movq 72(%2), %%mm1 \n\t"
/* -C5 -C1 -C5 -C1 */
\
"pmaddwd %%mm3, %%mm1 \n\t"
/* -C5R7-C1R5 -C5r7-C1r5 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"paddd %%mm2, %%mm1 \n\t"
/* B1 b1 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t"
/* A1 a1 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm1, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t"
/* R3 R1 r3 r1 */
\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"movq 88(%2), %%mm7 \n\t"
/* C3 C7 C3 C7 */
\
"pmaddwd 96(%2), %%mm0 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C3R7+C7R5 C3r7+C7r5 */
\
"movq %%mm5, %%mm2 \n\t"
/* A2 a2 */
\
"pmaddwd 104(%2), %%mm3 \n\t"
/* -C1R7+C3R5 -C1r7+C3r5 */
\
"paddd %%mm7, %%mm4 \n\t"
/* B2 b2 */
\
"paddd %%mm4, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm0, %%mm3 \n\t"
/* B3 b3 */
\
"paddd %%mm3, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm3, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t"
/* A2+B2 a2+b2 */
\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"3:
\n\t
"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 64(%2), %%mm3 \n\t"\
"pmaddwd %%mm2, %%mm3 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm1 \n\t"
/* A1 a1 */
\
"paddd %%mm3, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm3, %%mm1 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm7, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm1, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"pmaddwd %%mm2, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"pmaddwd 96(%2), %%mm2 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"movq %%mm5, %%mm1 \n\t"
/* A2 a2 */
\
"paddd %%mm4, %%mm1 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm2, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm2, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm1, %%mm1 \n\t"
/* A2+B2 a2+b2 */
\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm1, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"5:
\n\t
"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1 a1 */
\
"psubd %%mm1, %%mm5 \n\t"
/* A2 a2 */
\
"movq 8+" #src0 ", %%mm2 \n\t"
/* R4 R0 r4 r0 */
\
"movq 8+" #src4 ", %%mm3 \n\t"
/* R6 R2 r6 r2 */
\
"movq 16(%2), %%mm1 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm2, %%mm1 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm7 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm7, %%mm2 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm7 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm3, %%mm7 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"pmaddwd 40(%2), %%mm3 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"paddd %%mm1, %%mm7 \n\t"
/* A0 a0 */
\
"paddd %%mm1, %%mm1 \n\t"
/* 2C0 2c0 */
\
"psubd %%mm7, %%mm1 \n\t"
/* A3 a3 */
\
"paddd %%mm2, %%mm3 \n\t"
/* A1 a1 */
\
"paddd %%mm2, %%mm2 \n\t"
/* 2C1 2c1 */
\
"psubd %%mm3, %%mm2 \n\t"
/* A2 a2 */
\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm4 \n\t"
/* A0 a0 */
\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"packssdw %%mm3, %%mm0 \n\t"
/* A1 a1 */
\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movq %%mm5, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movq %%mm6, 48+" #dst " \n\t"\
"movq %%mm6, 64+" #dst " \n\t"\
"movq %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
0
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
\
"1:
\n\t
"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq " #src4 ", %%mm1 \n\t"
/* R6 R2 r6 r2 */
\
"movq " #src1 ", %%mm2 \n\t"
/* R3 R1 r3 r1 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm5 \n\t"
/* C6 C2 C6 C2 */
\
"pmaddwd %%mm1, %%mm5 \n\t"
/* C6R6+C2R2 C6r6+C2r2 */
\
"movq 40(%2), %%mm6 \n\t"
/* -C2 C6 -C2 C6 */
\
"pmaddwd %%mm6, %%mm1 \n\t"
/* -C2R6+C6R2 -C2r6+C6r2 */
\
"movq %%mm4, %%mm6 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 48(%2), %%mm7 \n\t"
/* C3 C1 C3 C1 */
\
"pmaddwd %%mm2, %%mm7 \n\t"
/* C3R3+C1R1 C3r3+C1r1 */
\
"paddd %%mm5, %%mm4 \n\t"
/* A0 a0 */
\
"psubd %%mm5, %%mm6 \n\t"
/* A3 a3 */
\
"movq %%mm0, %%mm5 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1 a1 */
\
"psubd %%mm1, %%mm5 \n\t"
/* A2 a2 */
\
"movq 64(%2), %%mm1 \n\t"\
"pmaddwd %%mm2, %%mm1 \n\t"
/* -C7R3+C3R1 -C7r3+C3r1 */
\
"paddd %%mm4, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"paddd %%mm4, %%mm4 \n\t"
/* 2A0 2a0 */
\
"psubd %%mm7, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm3 \n\t"
/* A1 a1 */
\
"paddd %%mm1, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"psubd %%mm1, %%mm3 \n\t"
/* A1-B1 a1-b1 */
\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm7 \n\t"
/* A0+B0 a0+b0 */
\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t"
/* A1+B1 a1+b1 */
\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm3, %%mm3 \n\t"
/* A1-B1 a1-b1 */
\
"movd %%mm3, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A0-B0 a0-b0 */
\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t"
/* -C1 C5 -C1 C5 */
\
"pmaddwd %%mm2, %%mm4 \n\t"
/* -C1R3+C5R1 -C1r3+C5r1 */
\
"pmaddwd 96(%2), %%mm2 \n\t"
/* -C5R3+C7R1 -C5r3+C7r1 */
\
"movq %%mm5, %%mm3 \n\t"
/* A2 a2 */
\
"paddd %%mm4, %%mm3 \n\t"
/* A2+B2 a2+b2 */
\
"psubd %%mm4, %%mm5 \n\t"
/* a2-B2 a2-b2 */
\
"psrad $" #shift ", %%mm3 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t"
/* A3 a3 */
\
"paddd %%mm2, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"psubd %%mm2, %%mm4 \n\t"
/* a3-B3 a3-b3 */
\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm3, %%mm3 \n\t"
/* A2+B2 a2+b2 */
\
"movd %%mm3, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm6, %%mm6 \n\t"
/* A3+B3 a3+b3 */
\
"movd %%mm6, 48+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t"
/* A3-B3 a3-b3 */
\
"packssdw %%mm5, %%mm5 \n\t"
/* A2-B2 a2-b2 */
\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
IDCT
(
8
(
%
1
),
72
(
%
1
),
40
(
%
1
),
104
(
%
1
),
4
(
%
0
),
20
)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
IDCT
(
24
(
%
1
),
88
(
%
1
),
56
(
%
1
),
120
(
%
1
),
12
(
%
0
),
20
)
"jmp 9f
\n\t
"
"# .p2align 4
\n\t
"
"7:
\n\t
"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t"
/* R4 R0 r4 r0 */
\
"movq 16(%2), %%mm4 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm0, %%mm4 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm5 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm5, %%mm0 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq 8+" #src0 ", %%mm2 \n\t"
/* R4 R0 r4 r0 */
\
"movq 16(%2), %%mm1 \n\t"
/* C4 C4 C4 C4 */
\
"pmaddwd %%mm2, %%mm1 \n\t"
/* C4R4+C4R0 C4r4+C4r0 */
\
"movq 24(%2), %%mm7 \n\t"
/* -C4 C4 -C4 C4 */
\
"pmaddwd %%mm7, %%mm2 \n\t"
/* -C4R4+C4R0 -C4r4+C4r0 */
\
"movq 32(%2), %%mm7 \n\t"
/* C6 C2 C6 C2 */
\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm4 \n\t"
/* A0 a0 */
\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm0 \n\t"
/* A1 a1 */
\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
"movq %%mm0, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT
(
0
(
%
1
),
64
(
%
1
),
32
(
%
1
),
96
(
%
1
),
0
(
%
0
),
20
)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT
(
16
(
%
1
),
80
(
%
1
),
48
(
%
1
),
112
(
%
1
),
8
(
%
0
),
20
)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
Input
00 40 04 44 20 60 24 64
10 30 14 34 50 70 54 74
01 41 03 43 21 61 23 63
11 31 13 33 51 71 53 73
02 42 06 46 22 62 26 66
12 32 16 36 52 72 56 76
05 45 07 47 25 65 27 67
15 35 17 37 55 75 57 77
Temp
00 04 10 14 20 24 30 34
40 44 50 54 60 64 70 74
01 03 11 13 21 23 31 33
41 43 51 53 61 63 71 73
02 06 12 16 22 26 32 36
42 46 52 56 62 66 72 76
05 07 15 17 25 27 35 37
45 47 55 57 65 67 75 77
*/
"9:
\n\t
"
::
"r"
(
block
),
"r"
(
temp
),
"r"
(
coeffs
)
NAMED_CONSTRAINTS_ADD
(
wm1010
,
d40000
)
:
"%eax"
);
}
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards directly to the file-local idct(), which operates on the
 * coefficient block it is given.
 *
 * @param block  coefficient block handed to idct()
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
//FIXME merge add/put into the idct
/**
 * Run the MMX simple IDCT on a block, then hand the result to the MMX
 * "put pixels clamped" helper.
 *
 * @param dest       destination passed to ff_put_pixels_clamped_mmx()
 * @param line_size  line size passed to ff_put_pixels_clamped_mmx()
 *                   (presumably the destination stride in bytes; confirm
 *                   against that helper's declaration)
 * @param block      coefficient block; first processed by idct(), then
 *                   read by the put helper
 */
void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Step 1: transform the coefficients. */
    idct(block);

    /* Step 2: pass the transformed block on to the put helper. */
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
/**
 * Run the MMX simple IDCT on a block, then hand the result to the MMX
 * "add pixels clamped" helper.
 *
 * @param dest       destination passed to ff_add_pixels_clamped_mmx()
 * @param line_size  line size passed to ff_add_pixels_clamped_mmx()
 *                   (presumably the destination stride in bytes; confirm
 *                   against that helper's declaration)
 * @param block      coefficient block; first processed by idct(), then
 *                   read by the add helper
 */
void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Step 1: transform the coefficients. */
    idct(block);

    /* Step 2: pass the transformed block on to the add helper. */
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
/**
 * SSE2-named "put" variant of the simple IDCT.
 *
 * The transform itself is the same file-local idct() used by the MMX
 * entry points; only the final helper differs
 * (ff_put_pixels_clamped_sse2 instead of the MMX one).
 *
 * @param dest       destination passed to ff_put_pixels_clamped_sse2()
 * @param line_size  line size passed to ff_put_pixels_clamped_sse2()
 *                   (presumably the destination stride in bytes; confirm
 *                   against that helper's declaration)
 * @param block      coefficient block; first processed by idct(), then
 *                   read by the put helper
 */
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Step 1: transform the coefficients. */
    idct(block);

    /* Step 2: pass the transformed block on to the SSE2 put helper. */
    ff_put_pixels_clamped_sse2(block, dest, line_size);
}
/**
 * SSE2-named "add" variant of the simple IDCT.
 *
 * The transform itself is the same file-local idct() used by the MMX
 * entry points; only the final helper differs
 * (ff_add_pixels_clamped_sse2 instead of the MMX one).
 *
 * @param dest       destination passed to ff_add_pixels_clamped_sse2()
 * @param line_size  line size passed to ff_add_pixels_clamped_sse2()
 *                   (presumably the destination stride in bytes; confirm
 *                   against that helper's declaration)
 * @param block      coefficient block; first processed by idct(), then
 *                   read by the add helper
 */
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Step 1: transform the coefficients. */
    idct(block);

    /* Step 2: pass the transformed block on to the SSE2 add helper. */
    ff_add_pixels_clamped_sse2(block, dest, line_size);
}
#endif
/* HAVE_INLINE_ASM */
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment