/*
 * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

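// Load the y_offset and y_coeff stack arguments into w9/w10. Darwin's
// AArch64 ABI packs stack arguments to their natural alignment, so the two
// 32-bit values share one 8-byte slot and a single ldp fetches both; the
// standard AAPCS64 rounds each stack argument up to 8 bytes, hence the two
// separate offsets elsewhere.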
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
    ldp                 w9, w10, [sp, #\yoff]
#else
    ldr                 w9,  [sp, #\yoff]
    ldr                 w10, [sp, #\ycoeff]
#endif
.endm

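// Argument registers on entry (see the C prototypes sketched at the end of
// this file): w0 = width, w1 = height, x2 = dst, w3 = dst linesize,
// x4 = srcY, w5 = linesizeY, x6 = srcC/srcU, w7 = linesizeC/linesizeU.
// Any remaining arguments are read from the stack.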
.macro load_args_nv12
    ldr                 x8,  [sp]                                       // table
    load_yoff_ycoeff    8, 16                                           // y_offset, y_coeff
    ld1                 {v1.1D}, [x8]
    dup                 v0.8H, w10
    dup                 v3.8H, w9
    sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
    sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
    sub                 w7, w7, w0                                      // w7 = linesizeC - width     (paddingC)
    neg                 w11, w0                                         // w11 = -width (chroma rewind, see increment_nv12)
.endm

.macro load_args_nv21
    load_args_nv12
.endm

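// The planar formats pass srcV/linesizeV on the stack ahead of the table,
// so the stack offsets differ from the NV12/NV21 layout above.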
.macro load_args_yuv420p
    ldr                 x13, [sp]                                       // srcV
    ldr                 w14, [sp, #8]                                   // linesizeV
    ldr                 x8,  [sp, #16]                                  // table
    load_yoff_ycoeff    24, 32                                          // y_offset, y_coeff
    ld1                 {v1.1D}, [x8]
    dup                 v0.8H, w10
    dup                 v3.8H, w9
    sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
    sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
    sub                 w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
    sub                 w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
    lsr                 w11, w0, #1
    neg                 w11, w11                                        // w11 = -(width / 2) (chroma rewind, see increment_yuv420p)
.endm

.macro load_args_yuv422p
    ldr                 x13, [sp]                                       // srcV
    ldr                 w14, [sp, #8]                                   // linesizeV
    ldr                 x8,  [sp, #16]                                  // table
    load_yoff_ycoeff    24, 32                                          // y_offset, y_coeff
    ld1                 {v1.1D}, [x8]
    dup                 v0.8H, w10
    dup                 v3.8H, w9
    sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
    sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
    sub                 w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
    sub                 w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
.endm

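// Fetch 8 chroma sample pairs and widen U/V to 16 bits with 3 bits of
// headroom (x * (1<<3)); ld2 de-interleaves the semi-planar UV (NV12) or
// VU (NV21) layout. The headroom keeps precision through the sqdmulh
// multiplies below, which return (a * b) >> 15.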
.macro load_chroma_nv12
    ld2                 {v16.8B, v17.8B}, [x6], #16
    ushll               v18.8H, v16.8B, #3
    ushll               v19.8H, v17.8B, #3
.endm

.macro load_chroma_nv21
    ld2                 {v16.8B, v17.8B}, [x6], #16
    ushll               v19.8H, v16.8B, #3
    ushll               v18.8H, v17.8B, #3
.endm

.macro load_chroma_yuv420p
    ld1                 {v16.8B}, [ x6], #8
    ld1                 {v17.8B}, [x13], #8
    ushll               v18.8H, v16.8B, #3
    ushll               v19.8H, v17.8B, #3
.endm

.macro load_chroma_yuv422p
    load_chroma_yuv420p
.endm

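// End-of-row pointer updates for the chroma plane(s). In 4:2:0 each chroma
// row serves two luma rows: when the remaining height (w1) is odd, the
// pointer advances past the row padding; otherwise it rewinds by the amount
// just consumed (w11, set to -width or -width/2 in load_args) so the same
// chroma row is read again.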
.macro increment_nv12
    ands                w15, w1, #1
    csel                w16, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width
    add                 x6,  x6, w16, SXTW                              // srcC += incC
.endm

.macro increment_nv21
    increment_nv12
.endm

.macro increment_yuv420p
    ands                w15, w1, #1
    csel                w16,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2
    csel                w17, w14, w11, ne                               // incV = (h & 1) ? paddingV : -width/2
    add                 x6,  x6,  w16, SXTW                             // srcU += incU
    add                 x13, x13, w17, SXTW                             // srcV += incV
.endm

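// 4:2:2 carries one chroma row per luma row, so srcU/srcV simply step over
// their padding on every row.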
.macro increment_yuv422p
    add                 x6,  x6,  w7, UXTW                              // srcU += incU
    add                 x13, x13, w14, UXTW                             // srcV += incV
.endm

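// Add the scaled luma (v26/v27) to the per-channel chroma terms (v20-v25),
// then narrow with rounding: sqrshrun shifts out the remaining 1 bit of
// headroom and saturates to unsigned 8 bits. Alpha is forced to opaque
// (255). The macro arguments name the destination registers for the first
// and second group of 8 pixels.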
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    add                 v20.8H, v26.8H, v20.8H                          // Y1 + R1
    add                 v21.8H, v27.8H, v21.8H                          // Y2 + R2
    add                 v22.8H, v26.8H, v22.8H                          // Y1 + G1
    add                 v23.8H, v27.8H, v23.8H                          // Y2 + G2
    add                 v24.8H, v26.8H, v24.8H                          // Y1 + B1
    add                 v25.8H, v27.8H, v25.8H                          // Y2 + B2
    sqrshrun            \r1, v20.8H, #1                                 // clip_u8((Y1 + R1) >> 1)
    sqrshrun            \r2, v21.8H, #1                                 // clip_u8((Y2 + R2) >> 1)
    sqrshrun            \g1, v22.8H, #1                                 // clip_u8((Y1 + G1) >> 1)
    sqrshrun            \g2, v23.8H, #1                                 // clip_u8((Y2 + G2) >> 1)
    sqrshrun            \b1, v24.8H, #1                                 // clip_u8((Y1 + B1) >> 1)
    sqrshrun            \b2, v25.8H, #1                                 // clip_u8((Y2 + B2) >> 1)
    movi                \a1, #255
    movi                \a2, #255
.endm

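// Instantiate ff_<ifmt>_to_<ofmt>_neon. The outer loop (1:) walks the rows;
// the inner loop (2:) converts 16 pixels per iteration.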
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
1:
    mov                 w8, w0                                          // w8 = width
2:
    movi                v5.8H, #4, lsl #8                               // 128 * (1<<3)
    load_chroma_\ifmt
    sub                 v18.8H, v18.8H, v5.8H                           // U*(1<<3) - 128*(1<<3)
    sub                 v19.8H, v19.8H, v5.8H                           // V*(1<<3) - 128*(1<<3)
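// Chroma contributions: multiply U/V by the v2r/u2g/v2g/u2b coefficients
// from the table (v1), then duplicate each result with zip1/zip2 so that
// one chroma sample covers two horizontally adjacent luma pixels.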
    sqdmulh             v20.8H, v19.8H, v1.H[0]                         // V * v2r            (R)
    sqdmulh             v22.8H, v18.8H, v1.H[1]                         // U * u2g
    sqdmulh             v19.8H, v19.8H, v1.H[2]                         //           V * v2g
    add                 v22.8H, v22.8H, v19.8H                          // U * u2g + V * v2g  (G)
    sqdmulh             v24.8H, v18.8H, v1.H[3]                         // U * u2b            (B)
    zip2                v21.8H, v20.8H, v20.8H                          // R2
    zip1                v20.8H, v20.8H, v20.8H                          // R1
    zip2                v23.8H, v22.8H, v22.8H                          // G2
    zip1                v22.8H, v22.8H, v22.8H                          // G1
    zip2                v25.8H, v24.8H, v24.8H                          // B2
    zip1                v24.8H, v24.8H, v24.8H                          // B1
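// Luma: load 16 samples, widen with 3 bits of headroom, subtract the
// y_offset bias and scale by y_coeff.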
    ld1                 {v2.16B}, [x4], #16                             // load luma
    ushll               v26.8H, v2.8B,  #3                              // Y1*(1<<3)
    ushll2              v27.8H, v2.16B, #3                              // Y2*(1<<3)
    sub                 v26.8H, v26.8H, v3.8H                           // Y1*(1<<3) - y_offset
    sub                 v27.8H, v27.8H, v3.8H                           // Y2*(1<<3) - y_offset
    sqdmulh             v26.8H, v26.8H, v0.8H                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
    sqdmulh             v27.8H, v27.8H, v0.8H                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

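// Select the register order handed to compute_rgba so that the st4 stores
// below interleave the channels in the requested memory order; the digits
// in the comments give the byte positions of R, G, B and A.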
.ifc \ofmt,argb // 1 2 3 0
    compute_rgba        v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
.endif

.ifc \ofmt,rgba // 0 1 2 3
    compute_rgba        v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
.endif

.ifc \ofmt,abgr // 3 2 1 0
    compute_rgba        v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
.endif

.ifc \ofmt,bgra // 2 1 0 3
    compute_rgba        v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
.endif

    st4                 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
    st4                 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
    subs                w8, w8, #16                                     // width -= 16
    b.gt                2b
    add                 x2, x2, w3, UXTW                                // dst  += padding
    add                 x4, x4, w5, UXTW                                // srcY += paddingY
    increment_\ifmt
    subs                w1, w1, #1                                      // height -= 1
    b.gt                1b
    ret
endfunc
.endm

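// Emit all four output orderings for one input format.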
.macro declare_rgb_funcs ifmt
    declare_func \ifmt, argb
    declare_func \ifmt, rgba
    declare_func \ifmt, abgr
    declare_func \ifmt, bgra
.endm

declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p
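
// For reference, the C-side prototypes these functions are expected to
// match, as inferred from the argument loads above (the authoritative
// declarations live in libswscale's aarch64 wrappers):
//
//   int ff_<ifmt>_to_<ofmt>_neon(int width, int height,
//                                uint8_t *dst, int linesize,
//                                const uint8_t *srcY, int linesizeY,
//                                const uint8_t *srcC, int linesizeC,
//                                const int16_t *table,
//                                int y_offset, int y_coeff);   // nv12/nv21
//
//   int ff_<ifmt>_to_<ofmt>_neon(int width, int height,
//                                uint8_t *dst, int linesize,
//                                const uint8_t *srcY, int linesizeY,
//                                const uint8_t *srcU, int linesizeU,
//                                const uint8_t *srcV, int linesizeV,
//                                const int16_t *table,
//                                int y_offset, int y_coeff);   // yuv420p/yuv422p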