/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "dsputil_ppc.h"

#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"

/* AltiVec kernels referenced by dsputil_init_ppc(); not defined in this file. */
extern void fdct_altivec(int16_t *block);
extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
                         int x16, int y16, int rounder);
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Per-module initialisers called from dsputil_init_ppc(). */
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);

void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
void float_init_altivec(DSPContext* c, AVCodecContext *avctx);

#endif /* HAVE_ALTIVEC */
/* CPU feature flags; dsputil_init_ppc() ORs in MM_ALTIVEC when
   has_altivec() returns non-zero. */
int mm_flags = 0;
/**
 * Report which SIMD extensions can be used.
 *
 * @return MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 *         has_altivec() returns non-zero; 0 otherwise.
 */
int mm_support(void)
{
    int result = 0;
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        result |= MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return result;
}

#ifdef CONFIG_POWERPC_PERF
#include <stdio.h>

/* Performance counter storage, indexed as [PMC][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* list below must match enum in dsputil_ppc.h */
/* The entries are string literals that are only ever read (printed with
   "%s"), so the table is an array of const pointers to const char. */
static const char *const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
  "hadamard8_diff16_altivec",
  "avg_pixels8_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc",
  "put_h264_chroma_mc8_altivec",
  "avg_h264_chroma_mc8_altivec",
  "put_h264_qpel16_h_lowpass_altivec",
  "avg_h264_qpel16_h_lowpass_altivec",
  "put_h264_qpel16_v_lowpass_altivec",
  "avg_h264_qpel16_v_lowpass_altivec",
  "put_h264_qpel16_hv_lowpass_altivec",
  "avg_h264_qpel16_hv_lowpass_altivec",
  ""
};
#endif /* CONFIG_POWERPC_PERF */

#ifdef CONFIG_POWERPC_PERF
/**
 * Log the collected PMC statistics through av_log().
 *
 * For every instrumented function and every enabled PMC whose sample
 * count is non-zero, one entry with min, max, average (sum / count) and
 * the sample count is written at AV_LOG_INFO level.
 */
void powerpc_display_perf_report(void)
{
  int i, j;
  av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
  for (i = 0; i < powerpc_perf_total; i++) {
    for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
      const unsigned long long *stats = perfdata[j][i];

      /* Nothing recorded for this function/PMC pair; skipping it also
         keeps the average below from dividing by zero. */
      if (stats[powerpc_data_num] == (unsigned long long)0)
        continue;

      /* The counters are unsigned long long but are printed with PRIu64;
         cast so the argument type matches the conversion on platforms
         where uint64_t is not unsigned long long. */
      av_log(NULL, AV_LOG_INFO,
              " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
              perfname[i],
              j+1,
              (uint64_t)stats[powerpc_data_min],
              (uint64_t)stats[powerpc_data_max],
              (double)stats[powerpc_data_sum] /
              (double)stats[powerpc_data_num],
              (uint64_t)stats[powerpc_data_num]);
    }
  }
}
#endif /* CONFIG_POWERPC_PERF */

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use a 32-byte cache line.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zero 6*64 DCTELEM coefficients using dcbz on 32-byte lines.
 *
 * When bit 4 of the address is set, the first and the last 16 bytes of
 * the area are cleared by hand and the dcbz loop starts 16 bytes in, so
 * every dcbz offset keeps the same 16-byte phase.
 *
 * @param blocks start of the area to clear.
 *               NOTE(review): the alignment test only looks at bit 4 of
 *               the address, so this presumably relies on blocks being at
 *               least 16-byte aligned -- confirm against callers.
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    const int total = sizeof(DCTELEM)*6*64;
    const int misal = ((unsigned long)blocks & 0x00000010);
    int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
        /* Clear the leading 16 bytes by hand. The size is given in bytes
           rather than as four 'unsigned long' stores, which cover 16
           bytes only where long is 4 bytes wide. */
        memset(blocks, 0, 16);
        i += 16;
    }
    for ( ; i < total - 31 ; i += 32) {
#ifndef __MWERKS__
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
        __dcbz( blocks, i );
#endif
    }
    if (misal) {
        /* The loop stops 16 bytes short of the end in the misaligned
           case; clear that tail by hand, staying inside the buffer
           whatever the width of long. */
        memset((char *)blocks + total - 16, 0, 16);
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

/* same as above, when dcbzl clear a whole 128B cache line
   i.e. the PPC970 aka G5 */
178
#ifdef HAVE_DCBZL
179 180
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
181
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
182 183
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
184
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
185 186 187 188 189 190 191 192 193
#if 1
 if (misal) {
   // we could probably also optimize this case,
   // but there's not much point as the machines
   // aren't available yet (2003-06-26)
      memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    else
      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
194
        asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
195 196 197 198
      }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
199
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
200 201 202 203 204 205 206 207
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
  memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif

/* check dcbz report how many bytes are set to 0 by dcbz */
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
/**
 * Probe how many bytes one dcbzl instruction zeroes.
 *
 * A 1024-byte buffer is filled with 0xFF, dcbzl is issued on its middle,
 * and the zero bytes in the buffer are counted.
 *
 * @return the number of zeroed bytes; 0 when built without HAVE_DCBZL or
 *         when the scratch buffer cannot be allocated.
 */
long check_dcbzl_effect(void)
{
  long count = 0;
#ifdef HAVE_DCBZL
  char *fakedata = av_malloc(1024);
  char *fakedata_middle;
  long zero = 0;
  long i;

  if (!fakedata)
    return 0L;

  fakedata_middle = fakedata + 512;

  memset(fakedata, 0xFF, 1024);

  /* below the constraint "b" seems to mean "Address base register"
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
     The "memory" clobber tells the compiler that the buffer contents
     change behind its back, so the counting loop below must re-read
     them instead of assuming they are still 0xFF. */
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

  for (i = 0; i < 1024; i++) {
    if (fakedata[i] == (char)0)
      count++;
  }

  av_free(fakedata);
#endif
  return count;
}
/* Issue one dcbt (data cache block touch) per row: starting at mem,
   touch an address, advance by stride bytes, and repeat until the row
   counter h, decremented after each touch, reaches zero. */
static void prefetch_ppc(void *mem, int stride, int h)
{
    const uint8_t *row = mem;

    for (;;) {
        asm volatile ("dcbt 0,%0" : : "r" (row));
        row += stride;
        if (--h == 0)
            break;
    }
}

/**
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * Always sets c->prefetch. c->clear_blocks is replaced only when
 * check_dcbzl_effect() returns 32 or 128. When built with HAVE_ALTIVEC,
 * the H.264 initialiser is called if ENABLE_H264_DECODER is set, and, if
 * has_altivec() returns non-zero, MM_ALTIVEC is added to mm_flags and the
 * AltiVec routines are installed.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; dct_algo, idct_algo and lowres are read
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether Altivec is available or not
    c->prefetch = prefetch_ppc;
    switch (check_dcbzl_effect()) {
        case 32:
            c->clear_blocks = clear_blocks_dcbz32_ppc;
            break;
        case 128:
            c->clear_blocks = clear_blocks_dcbz128_ppc;
            break;
        default:
            /* any other result: keep whatever clear_blocks is already set */
            break;
    }

#ifdef HAVE_ALTIVEC
    if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);

    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        dsputil_init_altivec(c, avctx);
        if(ENABLE_SNOW_DECODER) snow_init_altivec(c, avctx);
        if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
            vc1dsp_init_altivec(c, avctx);
        float_init_altivec(c, avctx);
        c->gmc1 = gmc1_altivec;

#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC)
        {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        /* The AltiVec IDCT is only installed for full-resolution decoding. */
        if (avctx->lowres == 0 &&
            ((avctx->idct_algo == FF_IDCT_AUTO) ||
             (avctx->idct_algo == FF_IDCT_ALTIVEC)))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
        }

#ifdef CONFIG_POWERPC_PERF
        /* Reset every counter: min starts at the largest value so the
           first sample replaces it; max, sum and count start at zero.
           NOTE(review): this reset only runs on the AltiVec path, while
           the clear_blocks_dcbz*_ppc routines use the POWERPC_PERF macros
           regardless -- confirm whether that is intended. */
        {
          int i, j;
          for (i = 0 ; i < powerpc_perf_total ; i++)
          {
            for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
              {
                perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
              }
          }
        }
#endif /* CONFIG_POWERPC_PERF */
    }
#endif /* HAVE_ALTIVEC */
}