Commit 020865f5 authored by Michael Niedermayer

Merge commit 'c1661484'

* commit 'c1661484':
  dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc

Conflicts:
	libavcodec/dsputil.c
	libavcodec/mpegvideo_enc.c
	libavcodec/x86/dsputilenc.asm
	libavcodec/x86/dsputilenc_mmx.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 462c6cdb c1661484
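
For orientation before the hunks: this merge moves the encoder-only helpers (pix_sum, pix_norm1 and the shrink[] table) out of DSPContext and into MpegvideoEncDSPContext, so encoder code switches from s->dsp.* to s->mpvencdsp.* and fills the new context with ff_mpegvideoencdsp_init(). Below is a minimal caller-side sketch of the new access path; the wrapper function is hypothetical and only illustrates the pattern visible in the hunks that follow.

/* Hypothetical illustration of the access path after this commit; it uses
 * only names that appear in the diff (MpegvideoEncDSPContext, pix_sum,
 * ff_mpegvideoencdsp_init) and builds only inside the FFmpeg tree. */
#include <stdint.h>
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"

static int example_block_sum(AVCodecContext *avctx, uint8_t *pix, int linesize)
{
    MpegvideoEncDSPContext mpvencdsp;

    ff_mpegvideoencdsp_init(&mpvencdsp, avctx); /* picks C/ARM/PPC/x86 versions */

    /* previously: s->dsp.pix_sum(pix, s->linesize) on a DSPContext */
    return mpvencdsp.pix_sum(pix, linesize);    /* sum of one 16x16 block */
}
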
......@@ -22,6 +22,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
......@@ -61,6 +62,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \
arm/idctdsp_armv6.o \
arm/simple_idct_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o
......
......@@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1
pop {r4-r9, pc}
endfunc
function ff_pix_norm1_armv6, export=1
push {r4-r6, lr}
mov r12, #16
mov lr, #0
1:
ldm r0, {r2-r5}
uxtb16 r6, r2
uxtb16 r2, r2, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r3
smlad lr, r2, r2, lr
uxtb16 r3, r3, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r4
smlad lr, r3, r3, lr
uxtb16 r4, r4, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r5
smlad lr, r4, r4, lr
uxtb16 r5, r5, ror #8
smlad lr, r6, r6, lr
subs r12, r12, #1
add r0, r0, r1
smlad lr, r5, r5, lr
bgt 1b
mov r0, lr
pop {r4-r6, pc}
endfunc
function ff_pix_sum_armv6, export=1
push {r4-r7, lr}
mov r12, #16
mov r2, #0
mov r3, #0
mov lr, #0
ldr r4, [r0]
1:
subs r12, r12, #1
ldr r5, [r0, #4]
usada8 r2, r4, lr, r2
ldr r6, [r0, #8]
usada8 r3, r5, lr, r3
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
usada8 r3, r7, lr, r3
add r0, r2, r3
pop {r4-r7, pc}
endfunc
......@@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
int ff_pix_sum_armv6(uint8_t *pix, int line_size);
av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
......@@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
c->sad[1] = ff_pix_abs8_armv6;
c->sse[0] = ff_sse16_armv6;
c->pix_norm1 = ff_pix_norm1_armv6;
c->pix_sum = ff_pix_sum_armv6;
}
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_pix_norm1_armv6, export=1
push {r4-r6, lr}
mov r12, #16
mov lr, #0
1:
ldm r0, {r2-r5}
uxtb16 r6, r2
uxtb16 r2, r2, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r3
smlad lr, r2, r2, lr
uxtb16 r3, r3, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r4
smlad lr, r3, r3, lr
uxtb16 r4, r4, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r5
smlad lr, r4, r4, lr
uxtb16 r5, r5, ror #8
smlad lr, r6, r6, lr
subs r12, r12, #1
add r0, r0, r1
smlad lr, r5, r5, lr
bgt 1b
mov r0, lr
pop {r4-r6, pc}
endfunc
function ff_pix_sum_armv6, export=1
push {r4-r7, lr}
mov r12, #16
mov r2, #0
mov r3, #0
mov lr, #0
ldr r4, [r0]
1:
subs r12, r12, #1
ldr r5, [r0, #4]
usada8 r2, r4, lr, r2
ldr r6, [r0, #8]
usada8 r3, r5, lr, r3
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
usada8 r3, r7, lr, r3
add r0, r2, r3
pop {r4-r7, pc}
endfunc
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
int ff_pix_sum_armv6(uint8_t *pix, int line_size);
av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags)) {
c->pix_norm1 = ff_pix_norm1_armv6;
c->pix_sum = ff_pix_sum_armv6;
}
}
......@@ -323,6 +323,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
ff_blockdsp_init(&ctx->bdsp, avctx);
ff_idctdsp_init(&ctx->m.idsp, avctx);
ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
ff_dct_common_init(&ctx->m);
ff_dct_encode_init(&ctx->m);
......@@ -733,8 +734,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
int varc;
if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {
sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize);
sum = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize);
varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize);
} else {
int bw = FFMIN(avctx->width - 16 * mb_x, 16);
int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
......
......@@ -26,7 +26,6 @@
*/
#include "libavutil/attributes.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
#include "avcodec.h"
#include "copy_block.h"
......@@ -34,8 +33,6 @@
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "imgconvert.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
......@@ -48,74 +45,6 @@ uint32_t ff_square_tab[512] = { 0, };
#define BIT_DEPTH 8
#include "dsputilenc_template.c"
static int pix_sum_c(uint8_t *pix, int line_size)
{
int s = 0, i, j;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
s += pix[0];
s += pix[1];
s += pix[2];
s += pix[3];
s += pix[4];
s += pix[5];
s += pix[6];
s += pix[7];
pix += 8;
}
pix += line_size - 16;
}
return s;
}
static int pix_norm1_c(uint8_t *pix, int line_size)
{
int s = 0, i, j;
uint32_t *sq = ff_square_tab + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
#if 0
s += sq[pix[0]];
s += sq[pix[1]];
s += sq[pix[2]];
s += sq[pix[3]];
s += sq[pix[4]];
s += sq[pix[5]];
s += sq[pix[6]];
s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
register uint64_t x = *(uint64_t *) pix;
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
s += sq[(x >> 32) & 0xff];
s += sq[(x >> 40) & 0xff];
s += sq[(x >> 48) & 0xff];
s += sq[(x >> 56) & 0xff];
#else
register uint32_t x = *(uint32_t *) pix;
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
x = *(uint32_t *) (pix + 4);
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
#endif
#endif
pix += 8;
}
pix += line_size - 16;
}
return s;
}
static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h)
{
......@@ -1094,9 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
c->sum_abs_dctelem = sum_abs_dctelem_c;
c->pix_sum = pix_sum_c;
c->pix_norm1 = pix_norm1_c;
/* TODO [0] 16 [1] 8 */
c->pix_abs[0][0] = pix_abs16_c;
c->pix_abs[0][1] = pix_abs16_x2_c;
......@@ -1141,11 +1067,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
ff_dsputil_init_dwt(c);
#endif
c->shrink[0] = av_image_copy_plane;
c->shrink[1] = ff_shrink22;
c->shrink[2] = ff_shrink44;
c->shrink[3] = ff_shrink88;
c->draw_edges = draw_edges_8_c;
switch (avctx->bits_per_raw_sample) {
......
......@@ -72,9 +72,6 @@ typedef struct DSPContext {
int stride);
int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
int (*pix_sum)(uint8_t *pix, int line_size);
int (*pix_norm1)(uint8_t *pix, int line_size);
me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
me_cmp_func sse[6];
me_cmp_func hadamard8_diff[6];
......@@ -108,9 +105,6 @@ typedef struct DSPContext {
#define EDGE_WIDTH 16
#define EDGE_TOP 1
#define EDGE_BOTTOM 2
void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
int src_wrap, int width, int height);
} DSPContext;
void ff_dsputil_static_init(void);
......
......@@ -903,8 +903,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
/* intra / predictive decision */
pix = c->src[0][0];
sum = s->dsp.pix_sum(pix, s->linesize);
varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500;
sum = s->mpvencdsp.pix_sum(pix, s->linesize);
varc = s->mpvencdsp.pix_norm1(pix, s->linesize) -
(((unsigned) sum * sum) >> 8) + 500;
pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8;
......
......@@ -1010,7 +1010,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
int offset = x + y * stride;
int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
16);
int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8;
int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8;
int sae = get_sae(src + offset, mean, stride);
acc += sae + 500 < sad;
......@@ -1278,15 +1278,21 @@ static int estimate_best_b_count(MpegEncContext *s)
data[2] += INPLACE_OFFSET;
}
s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0],
data[0], pre_input.f->linesize[0],
c->width, c->height);
s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1],
data[1], pre_input.f->linesize[1],
c->width >> 1, c->height >> 1);
s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2],
data[2], pre_input.f->linesize[2],
c->width >> 1, c->height >> 1);
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
s->tmp_frames[i]->linesize[0],
data[0],
pre_input.f->linesize[0],
c->width, c->height);
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
s->tmp_frames[i]->linesize[1],
data[1],
pre_input.f->linesize[1],
c->width >> 1, c->height >> 1);
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
s->tmp_frames[i]->linesize[2],
data[2],
pre_input.f->linesize[2],
c->width >> 1, c->height >> 1);
}
}
......@@ -2585,9 +2591,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
int yy = mb_y * 16;
uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx;
int varc;
int sum = s->dsp.pix_sum(pix, s->linesize);
int sum = s->mpvencdsp.pix_sum(pix, s->linesize);
varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8;
varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) -
(((unsigned) sum * sum) >> 8) + 500 + 128) >> 8;
s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
......
......@@ -22,7 +22,10 @@
#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/attributes.h"
#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "dsputil.h"
#include "imgconvert.h"
#include "mpegvideoencdsp.h"
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
......@@ -54,12 +57,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
(BASIS_SHIFT - RECON_SHIFT);
}
static int pix_sum_c(uint8_t *pix, int line_size)
{
int s = 0, i, j;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
s += pix[0];
s += pix[1];
s += pix[2];
s += pix[3];
s += pix[4];
s += pix[5];
s += pix[6];
s += pix[7];
pix += 8;
}
pix += line_size - 16;
}
return s;
}
static int pix_norm1_c(uint8_t *pix, int line_size)
{
int s = 0, i, j;
uint32_t *sq = ff_square_tab + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
#if 0
s += sq[pix[0]];
s += sq[pix[1]];
s += sq[pix[2]];
s += sq[pix[3]];
s += sq[pix[4]];
s += sq[pix[5]];
s += sq[pix[6]];
s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
register uint64_t x = *(uint64_t *) pix;
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
s += sq[(x >> 32) & 0xff];
s += sq[(x >> 40) & 0xff];
s += sq[(x >> 48) & 0xff];
s += sq[(x >> 56) & 0xff];
#else
register uint32_t x = *(uint32_t *) pix;
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
x = *(uint32_t *) (pix + 4);
s += sq[x & 0xff];
s += sq[(x >> 8) & 0xff];
s += sq[(x >> 16) & 0xff];
s += sq[(x >> 24) & 0xff];
#endif
#endif
pix += 8;
}
pix += line_size - 16;
}
return s;
}
av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
c->shrink[0] = av_image_copy_plane;
c->shrink[1] = ff_shrink22;
c->shrink[2] = ff_shrink44;
c->shrink[3] = ff_shrink88;
c->pix_sum = pix_sum_c;
c->pix_norm1 = pix_norm1_c;
if (ARCH_ARM)
ff_mpegvideoencdsp_init_arm(c, avctx);
if (ARCH_PPC)
ff_mpegvideoencdsp_init_ppc(c, avctx);
if (ARCH_X86)
ff_mpegvideoencdsp_init_x86(c, avctx);
}
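
The reference implementations above spell out the semantics: pix_sum_c() adds all 256 samples of a 16x16 block and pix_norm1_c() adds their squares via ff_square_tab. The encoder hunks later in this diff combine the two into a per-macroblock mean and variance estimate; the standalone sketch below reproduces that arithmetic outside FFmpeg, with hypothetical parameter names standing in for the DSP results.

/* Standalone sketch of how the encoder uses the two sums (compare
 * ff_estimate_p_frame_motion() and mb_var_thread() further down).
 * block_sum / block_sum_sq stand in for pix_sum()/pix_norm1() results. */
static int mb_mean(int block_sum)
{
    return (block_sum + 128) >> 8;        /* rounded average of 256 samples */
}

static int mb_variance_estimate(int block_sum, int block_sum_sq)
{
    /* 256 * (E[x^2] - E[x]^2), plus the +500 bias the encoder applies */
    return block_sum_sq - (((unsigned) block_sum * block_sum) >> 8) + 500;
}
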
......@@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext {
int16_t basis[64], int scale);
void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
int (*pix_sum)(uint8_t *pix, int line_size);
int (*pix_norm1)(uint8_t *pix, int line_size);
void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
int src_wrap, int width, int height);
} MpegvideoEncDSPContext;
void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
......
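
The shrink[] table declared in this header is indexed by how many times each dimension is halved: ff_mpegvideoencdsp_init() above sets shrink[0] = av_image_copy_plane, shrink[1] = ff_shrink22, shrink[2] = ff_shrink44 and shrink[3] = ff_shrink88, and estimate_best_b_count() selects the entry with its scale variable. A small sketch of that dispatch follows; the wrapper name is illustrative, and the width/height arguments simply follow the call sites shown earlier.

/* Illustrative wrapper over the shrink[] table (not part of the patch):
 * 'scale' halvings per dimension select the entry, mirroring the
 * estimate_best_b_count() calls in the mpegvideo_enc.c hunk above. */
static void downscale_plane(MpegvideoEncDSPContext *c, int scale,
                            uint8_t *dst, int dst_linesize,
                            const uint8_t *src, int src_linesize,
                            int width, int height)
{
    c->shrink[scale](dst, dst_linesize, src, src_linesize, width, height);
}

With scale == 0 this degenerates to a plain plane copy, which is why av_image_copy_plane fills slot 0.
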
......@@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \
ppc/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += ppc/mpegvideoencdsp.o
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
......
......@@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
return s;
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i, s = 0;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
vector signed int sum;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned pixels. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char pixv = vec_perm(pixl, pixr, perm);
/* Square the values, and add them to our sum. */
sv = vec_msum(pixv, pixv, sv);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sum = vec_sums((vector signed int) sv, (vector signed int) zero);
sum = vec_splat(sum, 3);
vec_ste(sum, 0, &s);
return s;
}
/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
* It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
......@@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
return s;
}
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
int i, s;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
vector signed int sumdiffs;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned 16 pixels into t1. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char t1 = vec_perm(pixl, pixr, perm);
/* Add each 4 pixel group together and put 4 results into sad. */
sad = vec_sum4s(t1, sad);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
sumdiffs = vec_splat(sumdiffs, 3);
vec_ste(sumdiffs, 0, &s);
return s;
}
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
int line_size)
{
......@@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
c->sse[0] = sse16_altivec;
c->sse[1] = sse8_altivec;
c->pix_norm1 = pix_norm1_altivec;
c->pix_sum = pix_sum_altivec;
c->diff_pixels = diff_pixels_altivec;
if (!high_bit_depth) {
......
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include <stdint.h>
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/mpegvideoencdsp.h"
#if HAVE_ALTIVEC
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i, s = 0;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
vector signed int sum;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned pixels. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char pixv = vec_perm(pixl, pixr, perm);
/* Square the values, and add them to our sum. */
sv = vec_msum(pixv, pixv, sv);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sum = vec_sums((vector signed int) sv, (vector signed int) zero);
sum = vec_splat(sum, 3);
vec_ste(sum, 0, &s);
return s;
}
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
int i, s;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
vector signed int sumdiffs;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned 16 pixels into t1. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char t1 = vec_perm(pixl, pixr, perm);
/* Add each 4 pixel group together and put 4 results into sad. */
sad = vec_sum4s(t1, sad);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
sumdiffs = vec_splat(sumdiffs, 3);
vec_ste(sumdiffs, 0, &s);
return s;
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
if (!PPC_ALTIVEC(av_get_cpu_flags()))
return;
c->pix_norm1 = pix_norm1_altivec;
c->pix_sum = pix_sum_altivec;
#endif /* HAVE_ALTIVEC */
}
......@@ -517,6 +517,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
ff_dsputil_init(&s->dsp, avctx);
ff_hpeldsp_init(&s->hdsp, avctx->flags);
ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
avctx->coded_frame = av_frame_alloc();
s->current_picture = av_frame_alloc();
......
......@@ -109,6 +109,7 @@ YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
......
......@@ -23,10 +23,6 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_1
SECTION .text
%macro DIFF_PIXELS_1 4
......@@ -465,113 +461,6 @@ cglobal diff_pixels, 4, 5, 5
jne .loop
RET
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
; %3 = number of GPRs used
%macro PIX_SUM16 4
cglobal pix_sum16, 2, %3, %1
movsxdifnidn r1, r1d
mov r2, %2
%if cpuflag(xop)
lea r3, [r1*3]
%else
pxor m5, m5
%endif
pxor m4, m4
.loop:
%if cpuflag(xop)
vphaddubq m0, [r0]
vphaddubq m1, [r0+r1]
vphaddubq m2, [r0+r1*2]
vphaddubq m3, [r0+r3]
%else
mova m0, [r0]
%if mmsize == 8
mova m1, [r0+8]
%else
mova m1, [r0+r1]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m4, m3
%if mmsize == 8
add r0, r1
%else
lea r0, [r0+r1*%4]
%endif
dec r2
jne .loop
%if cpuflag(xop)
pshufd m0, m4, q0032
paddd m4, m0
%else
HADDW m4, m5
%endif
movd eax, m4
RET
%endmacro
INIT_MMX mmx
PIX_SUM16 0, 16, 3, 0
INIT_XMM sse2
PIX_SUM16 6, 8, 3, 2
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16 5, 4, 4, 4
%endif
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
mov r2, %2
pxor m0, m0
pxor m5, m5
.loop:
mova m2, [r0+0]
%if mmsize == 8
mova m3, [r0+8]
%else
mova m3, [r0+r1]
%endif
punpckhbw m1, m2, m0
punpcklbw m2, m0
punpckhbw m4, m3, m0
punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m5, m2
paddd m5, m4
%if mmsize == 8
add r0, r1
%else
lea r0, [r0+r1*2]
%endif
dec r2
jne .loop
HADDD m5, m1
movd eax, m5
RET
%endmacro
INIT_MMX mmx
PIX_NORM1 0, 16
INIT_XMM sse2
PIX_NORM1 6, 8
;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
......
......@@ -37,11 +37,6 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride);
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
......@@ -364,8 +359,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels = ff_diff_pixels_mmx;
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_SSE2(cpu_flags))
......@@ -431,8 +424,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
c->diff_pixels = ff_diff_pixels_sse2;
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
......@@ -448,9 +439,5 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif
}
if (EXTERNAL_XOP(cpu_flags)) {
c->pix_sum = ff_pix_sum16_xop;
}
ff_dsputil_init_pix_mmx(c, avctx);
}
;*****************************************************************************
;* SIMD-optimized MPEG encoding functions
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_1
SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
; %3 = number of GPRs used
%macro PIX_SUM16 4
cglobal pix_sum16, 2, %3, %1
movsxdifnidn r1, r1d
mov r2, %2
%if cpuflag(xop)
lea r3, [r1*3]
%else
pxor m5, m5
%endif
pxor m4, m4
.loop:
%if cpuflag(xop)
vphaddubq m0, [r0]
vphaddubq m1, [r0+r1]
vphaddubq m2, [r0+r1*2]
vphaddubq m3, [r0+r3]
%else
mova m0, [r0]
%if mmsize == 8
mova m1, [r0+8]
%else
mova m1, [r0+r1]
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m4, m3
%if mmsize == 8
add r0, r1
%else
lea r0, [r0+r1*%4]
%endif
dec r2
jne .loop
%if cpuflag(xop)
pshufd m0, m4, q0032
paddd m4, m0
%else
HADDW m4, m5
%endif
movd eax, m4
RET
%endmacro
INIT_MMX mmx
PIX_SUM16 0, 16, 3, 0
INIT_XMM sse2
PIX_SUM16 6, 8, 3, 2
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16 5, 4, 4, 4
%endif
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
mov r2, %2
pxor m0, m0
pxor m5, m5
.loop:
mova m2, [r0+0]
%if mmsize == 8
mova m3, [r0+8]
%else
mova m3, [r0+r1]
%endif
punpckhbw m1, m2, m0
punpcklbw m2, m0
punpckhbw m4, m3, m0
punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m5, m2
paddd m5, m4
%if mmsize == 8
add r0, r1
%else
lea r0, [r0+r1*2]
%endif
dec r2
jne .loop
HADDD m5, m1
movd eax, m5
RET
%endmacro
INIT_MMX mmx
PIX_NORM1 0, 16
INIT_XMM sse2
PIX_NORM1 6, 8
......@@ -22,6 +22,12 @@
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
#define PHADDD(a, t) \
......@@ -95,9 +101,24 @@
av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
}
if (EXTERNAL_XOP(cpu_flags)) {
c->pix_sum = ff_pix_sum16_xop;
}
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_mmx;
......