/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#if HAVE_ALTIVEC
#define vzero vec_splat_s32(0)

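/*
 * GET_LS loads 8 signed shorts from s at element offset b. On big-endian,
 * the unaligned access is stitched together from two aligned vec_ld()s
 * and a vec_perm(); on little-endian, VSX's vec_vsx_ld() takes unaligned
 * addresses directly.
 */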
#if HAVE_BIGENDIAN
#define  GET_LS(a,b,c,s) {\
        vector signed short l2  = vec_ld(((b) << 1) + 16, s);\
        ls  = vec_perm(a, l2, c);\
        a = l2;\
    }
#else
#define  GET_LS(a,b,c,s) {\
        ls  = a;\
        a = vec_vsx_ld(((b) << 1)  + 16, s);\
    }
#endif

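/*
 * Multiply-accumulate 8 source samples against the splatted filter
 * coefficient: vec_mule()/vec_mulo() produce the even/odd 32-bit products,
 * and vec_mergeh()/vec_mergel() re-interleave them into element order
 * before they are added to the accumulators d1/d2.
 */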
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        GET_LS(l1, x, perm, src);\
        vector signed int   i1  = vec_mule(filter, ls);\
        vector signed int   i2  = vec_mulo(filter, ls);\
        vector signed int   vf1, vf2;\
        vf1 = vec_mergeh(i1, i2);\
        vf2 = vec_mergel(i1, i2);\
        d1 = vec_add(d1, vf1);\
        d2 = vec_add(d2, vf2);\
    } while (0)

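/*
 * Same pattern as GET_LS: align-and-permute loads on big-endian, plain
 * VSX unaligned loads on little-endian (where the permute vector is unused).
 */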
#if HAVE_BIGENDIAN
#define LOAD_FILTER(vf,f) {\
        vector unsigned char perm0 = vec_lvsl(joffset, f);\
        vf = vec_ld(joffset, f);\
        vf = vec_perm(vf, vf, perm0);\
}
#define LOAD_L1(ll1,s,p){\
        p = vec_lvsl(xoffset, s);\
        ll1   = vec_ld(xoffset, s);\
}
#else
#define LOAD_FILTER(vf,f) {\
        vf = vec_vsx_ld(joffset, f);\
}
#define LOAD_L1(ll1,s,p){\
        ll1  = vec_vsx_ld(xoffset, s);\
}
#endif

static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
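    /* vec_splat_u32() only accepts a 5-bit immediate, so the shift count
     * of 19 has to be built as 10 + 9. */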
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

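    /* Seed the accumulators with the dither values, shifted into the
     * fixed-point domain of the coefficient products (the final shift
     * right by 19 below scales them back). */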
    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset = j << 1;
        unsigned int xoffset = x << 1;
        vector unsigned char perm;
        vector signed short l1, vLumFilter;
        LOAD_FILTER(vLumFilter, filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1, src[j], perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}


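/*
 * Scalar version of yuv2planeX, used for the unaligned head and the
 * leftover tail of each output row.
 */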
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int i, j;

    for (i = x; i < dstW; i++) {
        int t = dither[(i + offset) & 7] << 12;
        for (j = 0; j < filterSize; j++)
            t += src[j][i] * filter[j];
        dest[i] = av_clip_uint8(t >> 19);
    }
}

static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
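    /* number of leading bytes until dest is 16-byte aligned */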
    int dst_u = -(uintptr_t)dest & 15;
    int i;

    yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);

    for (i = dst_u; i < dstW - 15; i += 16)
        yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
                              offset, i);

    yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}

#if HAVE_BIGENDIAN
// The shift by 3 in GET_VF4 below is 2 (filterSize == 4) + 1 (sizeof(int16_t) == 2),
// i.e. 4 coefficients of 2 bytes per output pixel.

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
    vf = vec_ld(a<< 3, f);\
    if ((a << 3) % 16)\
        vf = vec_mergel(vf, (vector signed short)vzero);\
    else\
        vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {\
    sv = vec_ld(pos, s);\
    per = vec_lvsl(pos, s);\
}
#define UPDATE_PTR(s0, d0, s1, d1) {\
    d0 = s0;\
    d1 = s1;\
}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    v1 = vec_ld(pos + a + 16, s);\
    vf = vec_perm(v0, v1, per);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)s + pos) % 16) > 8) {\
        v1 = vec_ld(pos + a + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
    vf  = vec_perm(vf0, vf1, per);\
}
#else /* else of #if HAVE_BIGENDIAN */
#define GET_VF4(a, vf, f) {\
    vf = (vector signed short)vec_vsx_ld(a << 3, f);\
    vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {}
#define UPDATE_PTR(s0, d0, s1, d1) {}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    vf = vec_vsx_ld(pos + a, s);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf  = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
}
#endif /* end of #if HAVE_BIGENDIAN */

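/*
 * Horizontal scaling: for each output pixel, multiply filterSize source
 * bytes by their filter coefficients and store the clipped 15-bit sum.
 * Dedicated vector paths handle 4, 8 and 16 taps; other multiples of 4
 * take the generic loop in the default case, and everything else falls
 * back to scalar code.
 */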
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
                                const uint8_t *src, const int16_t *filter,
                                const int32_t *filterPos, int filterSize)
{
    register int i;
    LOCAL_ALIGNED(16, int, tempo, [4]);

    if (filterSize % 4) {
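        /* scalar fallback for filter sizes that are not a multiple of 4 */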
        for (i = 0; i < dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val    = 0;
            for (j = 0; j < filterSize; j++)
                val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
        }
    } else
        switch (filterSize) {
        case 4:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_vF = unaligned_load(srcPos, src);
                vector signed short src_v, filter_v;
                vector signed int val_vEven, val_s;
                src_v = // vec_unpackh sign-extends...
                        (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                // now put our elements in the even slots
                src_v = vec_mergeh(src_v, (vector signed short)vzero);
                GET_VF4(i, filter_v, filter);
                val_vEven = vec_mule(src_v, filter_v);
                val_s     = vec_sums(val_vEven, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        break;
        case 8:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];
                vector unsigned char src_vF, src_v0, src_v1;
                vector unsigned char permS;
                vector signed short src_v, filter_v;
                vector signed int val_v, val_s;
                FIRST_LOAD(src_v0, srcPos, src, permS);
                LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
                src_v = // vec_unpackh sign-extends...
                        (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                filter_v = vec_ld(i << 4, filter);
                val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
                val_s = vec_sums(val_v, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        break;

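        /* 16 taps: unpack the 16 source bytes into two short vectors and
         * chain two vec_msums() through the accumulator. */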
        case 16:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_vF = unaligned_load(srcPos, src);
                vector signed short src_vA = // vec_unpackh sign-extends...
                                             (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                vector signed short src_vB = // vec_unpackh sign-extends...
                                             (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
                vector signed short filter_v0 = vec_ld(i << 5, filter);
                vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);

                vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
                vector signed int val_v   = vec_msums(src_vB, filter_v1, val_acc);

                vector signed int val_s = vec_sums(val_v, vzero);

                VEC_ST(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        break;

        default:
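            /* generic path: 16 taps per iteration, plus an 8-tap epilogue */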
            for (i = 0; i < dstW; i++) {
                register int j, offset = i * 2 * filterSize;
                register int srcPos = filterPos[i];

                vector signed int val_s, val_v = (vector signed int)vzero;
                vector signed short filter_v0R;
                vector unsigned char permF, src_v0, permS;
                FIRST_LOAD(filter_v0R, offset, filter, permF);
                FIRST_LOAD(src_v0, srcPos, src, permS);

                for (j = 0; j < filterSize - 15; j += 16) {
                    vector unsigned char src_v1, src_vF;
                    vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
                    LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
                    vector signed short src_vA = // vec_unpackh sign-extends...
                                                 (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                    vector signed short src_vB = // vec_unpackh sign-extends...
                                                 (vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
                    GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
                    GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);

                    vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
                    val_v = vec_msums(src_vB, filter_v1, val_acc);
                    UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
                }

                if (j < filterSize - 7) {
                    // no need to reload src_v0, it was already loaded above
                    vector unsigned char src_v1, src_vF;
                    vector signed short src_v, filter_v1R, filter_v;
                    LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
                    src_v = // vec_unpackh sign-extends...
                            (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
                    GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
                    val_v = vec_msums(src_v, filter_v, val_v);
                }
                val_s = vec_sums(val_v, vzero);

                VEC_ST(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        }
}
#endif /* HAVE_ALTIVEC */

av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
    enum AVPixelFormat dstFormat = c->dstFormat;

    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        c->hyScale = c->hcScale = hScale_altivec_real;
    }
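    /* yuv2planeX_altivec only writes 8-bit planar output without alpha */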
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
        !c->needAlpha) {
        c->yuv2planeX = yuv2planeX_altivec;
    }

    /* The following list of supported dstFormat values should
     * match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_ABGR:
            c->yuv2packedX = ff_yuv2abgr_X_altivec;
            break;
        case AV_PIX_FMT_BGRA:
            c->yuv2packedX = ff_yuv2bgra_X_altivec;
            break;
        case AV_PIX_FMT_ARGB:
            c->yuv2packedX = ff_yuv2argb_X_altivec;
            break;
        case AV_PIX_FMT_RGBA:
            c->yuv2packedX = ff_yuv2rgba_X_altivec;
            break;
        case AV_PIX_FMT_BGR24:
            c->yuv2packedX = ff_yuv2bgr24_X_altivec;
            break;
        case AV_PIX_FMT_RGB24:
            c->yuv2packedX = ff_yuv2rgb24_X_altivec;
            break;
        }
    }
#endif /* HAVE_ALTIVEC */
}