bswapdsp.asm 3.18 KB
Newer Older
1
;******************************************************************************
2
;* optimized bswap buffer functions
3
;* Copyright (c) 2008 Loren Merritt
4 5
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
21
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 23
;******************************************************************************

24
%include "libavutil/x86/x86util.asm"
25

26
SECTION_RODATA
27
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
28

29 30
cextern pb_80

31
SECTION .text
32

33
;------------------------------------------------------------------------------
; BSWAP_LOOPS  %1 = mov suffix: 'a' (aligned) or 'u' (unaligned)
;
; Byte-swaps dwords in chunks of eight, then a possible chunk of four.
; In:  r0 = dst, r1 = src, r2d = dword count.
;      SSSE3 builds expect the pb_bswap32 shuffle mask preloaded in m2.
; Out: r0/r1 advanced past the processed data; r2d = original count
;      (its low two bits select the scalar tail in the caller).
; Clobbers: r3, m0-m1 (plus m2-m3 as scratch on the SSE2 path), flags.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS  1
    mov      r3d, r2d               ; save full count for the tail tests
    sar      r2d, 3                 ; r2d = number of 8-dword (32-byte) chunks
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2                 ; one-instruction byte swap via mask
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    ; SSE2 fallback: first swap the two 16-bit words of each dword with
    ; pshuflw/pshufhw (imm 10110001b swaps adjacent word pairs), then swap
    ; the two bytes inside every word via shift-by-8 and OR.
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2d
    jnz      .loop8_%1
.left4_%1:
    mov      r2d, r3d               ; restore full count for the caller's tail
    test     r3d, 4                 ; one 4-dword (16-byte) block left?
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

;------------------------------------------------------------------------------
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Byte-swaps w 32-bit words from src to dst. Neither pointer needs to be
; aligned: an aligned fast path is taken only when dst and src are both
; 16-byte aligned, otherwise unaligned loads/stores are used. The scalar
; tail handles the final 1-3 words with the bswap instruction.
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]       ; shuffle mask shared by all pshufb uses
%else
cglobal bswap32_buf, 3,4,5          ; SSE2 path needs two extra scratch xmm regs
    mov      r3, r1
%endif
    or       r3, r0
    test     r3, 15                 ; are both dst and src 16-byte aligned?
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:
%if cpuflag(ssse3)
    test     r2d, 2                 ; two dwords remaining?
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1                 ; one dword remaining?
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                 ; 0-3 dwords remaining; swap one at a time
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro


; Instantiate both SIMD variants; runtime CPU-flag dispatch picks one.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF