Commit f6a80d6e authored by Michael Niedermayer

Merge remote-tracking branch 'qatar/master'

* qatar/master:
  dsputilenc: x86: Convert pixel inline asm to yasm
  libgsm: detect libgsm header path
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents cf4515ec 9f00b1cb
......@@ -1381,6 +1381,7 @@ HAVE_LIST="
gettimeofday
glob
gnu_as
gsm_h
ibm_asm
inet_aton
io_h
......@@ -3839,7 +3840,9 @@ enabled libfdk_aac && require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-
flite_libs="-lflite_cmu_time_awb -lflite_cmu_us_awb -lflite_cmu_us_kal -lflite_cmu_us_kal16 -lflite_cmu_us_rms -lflite_cmu_us_slt -lflite_usenglish -lflite_cmulex -lflite"
enabled libflite && require2 libflite "flite/flite.h" flite_init $flite_libs
enabled libfreetype && require_pkg_config freetype2 "ft2build.h freetype/freetype.h" FT_Init_FreeType
enabled libgsm && require libgsm gsm/gsm.h gsm_create -lgsm
enabled libgsm && { for gsm_hdr in "gsm.h" "gsm/gsm.h"; do
check_lib "${gsm_hdr}" gsm_create -lgsm && break;
done || die "ERROR: libgsm not found"; }
enabled libilbc && require libilbc ilbc.h WebRtcIlbcfix_InitDecode -lilbc
enabled libmodplug && require libmodplug libmodplug/modplug.h ModPlug_Load -lmodplug
enabled libmp3lame && require "libmp3lame >= 3.98.3" lame/lame.h lame_set_VBR_quality -lmp3lame
......
......@@ -27,7 +27,12 @@
// The idiosyncrasies of GSM-in-WAV are explained at http://kbs.cs.tu-berlin.de/~jutta/toast.html
#include "config.h"
#if HAVE_GSM_H
#include <gsm.h>
#else
#include <gsm/gsm.h>
#endif
#include "libavutil/channel_layout.h"
#include "libavutil/common.h"
......
......@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
paddd m7, m1
movd eax, m7 ; return value
RET
INIT_MMX mmx
; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
; Reads 8 rows of 8 unsigned bytes from pixels (rows line_size bytes apart)
; and stores them zero-extended to 16-bit words as 128 contiguous bytes
; at block.
; Registers: r0 = block, r1 = pixels, r2 = line_size,
;            r3 = negative byte offset into the output (scratch).
cglobal get_pixels, 3,4
movsxdifnidn r2, r2d ; widen the int stride so it can be used in addressing
add r0, 128 ; r0 = end of the 128-byte output ...
mov r3, -128 ; ... indexed with an offset running from -128 up to 0
pxor m7, m7 ; m7 = 0, interleaved as the high byte of every word
.loop: ; one iteration = two source rows = 32 output bytes
mova m0, [r1] ; row n
mova m2, [r1+r2] ; row n+1
mova m1, m0 ; copies, so low and high halves can be unpacked separately
mova m3, m2
punpcklbw m0, m7 ; row n,   bytes 0-3 -> words
punpckhbw m1, m7 ; row n,   bytes 4-7 -> words
punpcklbw m2, m7 ; row n+1, bytes 0-3 -> words
punpckhbw m3, m7 ; row n+1, bytes 4-7 -> words
mova [r0+r3+ 0], m0
mova [r0+r3+ 8], m1
mova [r0+r3+16], m2
mova [r0+r3+24], m3
lea r1, [r1+r2*2] ; advance source by two rows (lea leaves flags alone)
add r3, 32
js .loop ; continue while the output offset is still negative
REP_RET
INIT_XMM sse2
; SSE2 version of get_pixels: same arguments as the MMX version
; (r0 = block, r1 = pixels, r2 = line_size), fully unrolled.
; Each of the 8 source rows (8 bytes) becomes one 16-byte row of words.
; Stores use mova, so block is expected to be 16-byte aligned.
cglobal get_pixels, 3, 4
movsxdifnidn r2, r2d ; widen the int stride so it can be used in addressing
lea r3, [r2*3] ; r3 = 3 * line_size, to address the 4th row of each group
pxor m4, m4 ; m4 = 0, interleaved as the high byte of every word
; rows 0-3
movh m0, [r1]
movh m1, [r1+r2]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
lea r1, [r1+r2*4] ; r1 -> row 4
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0], m0
mova [r0+0x10], m1
mova [r0+0x20], m2
mova [r0+0x30], m3
; rows 4-7
movh m0, [r1]
movh m1, [r1+r2*1]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0+0x40], m0
mova [r0+0x50], m1
mova [r0+0x60], m2
mova [r0+0x70], m3
RET
INIT_MMX mmx
; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
; For 8 rows of 8 bytes, stores s1[x] - s2[x] as 16-bit words into the
; 128 contiguous bytes at block. Both sources advance by stride per row.
; Registers: r0 = block, r1 = s1, r2 = s2, r3 = stride,
;            r4 = negative byte offset into the output (scratch).
cglobal diff_pixels, 4,5
movsxdifnidn r3, r3d ; widen the int stride so it can be added to pointers
pxor m7, m7 ; m7 = 0, interleaved as the high byte of every word
add r0, 128 ; r0 = end of the output, indexed by r4 = -128 .. 0
mov r4, -128
.loop: ; one iteration = one row = 16 output bytes
mova m0, [r1] ; 8 bytes of s1
mova m2, [r2] ; 8 bytes of s2
mova m1, m0 ; copies, so low and high halves can be unpacked separately
mova m3, m2
punpcklbw m0, m7 ; s1 bytes 0-3 -> words
punpckhbw m1, m7 ; s1 bytes 4-7 -> words
punpcklbw m2, m7 ; s2 bytes 0-3 -> words
punpckhbw m3, m7 ; s2 bytes 4-7 -> words
psubw m0, m2 ; s1 - s2, low half
psubw m1, m3 ; s1 - s2, high half
mova [r0+r4+0], m0
mova [r0+r4+8], m1
add r1, r3
add r2, r3
add r4, 16 ; must stay last: its zero flag drives the branch below
jne .loop
REP_RET
INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
; Returns in eax the sum of 16 rows of 16 unsigned bytes starting at pix,
; masked to 16 bits.
; Registers: r0 = pix (rebased below), r1 = line_size,
;            r2 = negative byte offset of the current row (scratch).
cglobal pix_sum16, 2, 3
movsxdifnidn r1, r1d ; widen the int stride so it can be used in addressing
mov r2, r1
neg r2
shl r2, 4 ; r2 = -16 * line_size
sub r0, r2 ; r0 = pix + 16 * line_size, so [r0+r2] is the first row
pxor m7, m7 ; m7 = 0, interleaved as the high byte of every word
pxor m6, m6 ; m6 = four 16-bit running sums
.loop: ; one iteration = one 16-byte row
mova m0, [r0+r2+0] ; bytes 0-7, loaded twice (low and high halves)
mova m1, [r0+r2+0]
mova m2, [r0+r2+8] ; bytes 8-15, loaded twice
mova m3, [r0+r2+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m1, m0
paddw m3, m2
paddw m3, m1 ; m3 = the row folded into four words
paddw m6, m3
add r2, r1 ; next row; sign flag drives the branch below
js .loop ; continue while the row offset is still negative
; horizontal add of the four words in m6 into its lowest word
mova m5, m6
psrlq m6, 32
paddw m6, m5 ; words 0,1 = w0+w2, w1+w3
mova m5, m6
psrlq m6, 16
paddw m6, m5 ; word 0 = w0+w1+w2+w3
movd eax, m6
and eax, 0xffff ; keep only word 0; the upper word holds a partial sum
RET
INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
; Returns in eax the sum of squares of 16 rows of 16 unsigned bytes
; starting at pix (rows line_size bytes apart).
; Registers: r0 = pix (advanced one row per iteration), r1 = line_size,
;            r2 = remaining row count (scratch).
; Only three GPRs (r0-r2) are used, so only three are requested; asking for
; a fourth makes x86inc save/restore an extra register on x86-32.
cglobal pix_norm1, 2, 3
movsxdifnidn r1, r1d ; widen the int stride so it can be added to the pointer
mov r2, 16 ; 16 rows
pxor m0, m0 ; m0 = 0, interleaved as the high byte of every word
pxor m7, m7 ; m7 = two 32-bit running sums
.loop:
mova m2, [r0+0] ; m2 = pix[0-7]
mova m3, [r0+8] ; m3 = pix[8-15]
mova m1, m2
punpckhbw m1, m0 ; m1 = pix[4-7]   as words
punpcklbw m2, m0 ; m2 = pix[0-3]   as words
mova m4, m3
punpckhbw m3, m0 ; m3 = pix[12-15] as words
punpcklbw m4, m0 ; m4 = pix[8-11]  as words
pmaddwd m1, m1 ; square each word and add adjacent pairs -> 2 dwords
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m7, m2
add r0, r1 ; next row
paddd m7, m4
dec r2 ; zero flag drives the branch (MMX ops do not touch flags)
jne .loop
; add the high dword of the accumulator to the low dword
mova m1, m7
psrlq m7, 32
paddd m1, m7
movd eax, m1
RET
......@@ -30,181 +30,14 @@
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"
void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
/**
 * Inline-asm MMX version of get_pixels: reads 8-byte rows from pixels,
 * zero-extends each byte to a 16-bit word and stores the result in block.
 * The offset register runs from -128 to 0 in steps of 32 (two rows per
 * iteration), so 8 rows / 128 output bytes are processed.
 *
 * Operands: %0 = pixels (updated), %1 = block+64 (end of the output),
 *           %2 = line_size, %3 = 2*line_size.
 * NOTE(review): the asm stores through %1 but lists no "memory" clobber.
 */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
"mov $-128, %%"REG_a" \n\t" /* output offset, counts up to 0 */
"pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for byte->word unpack */
".p2align 4 \n\t"
"1: \n\t"
"movq (%0), %%mm0 \n\t" /* row n */
"movq (%0, %2), %%mm2 \n\t" /* row n+1 */
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"movq %%mm0, (%1, %%"REG_a") \n\t"
"movq %%mm1, 8(%1, %%"REG_a") \n\t"
"movq %%mm2, 16(%1, %%"REG_a") \n\t"
"movq %%mm3, 24(%1, %%"REG_a") \n\t"
"add %3, %0 \n\t" /* advance source by two rows */
"add $32, %%"REG_a" \n\t"
"js 1b \n\t" /* loop while the offset is negative */
: "+r" (pixels)
: "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
: "%"REG_a
);
}
/**
 * Inline-asm SSE2 version of get_pixels, fully unrolled: loads eight
 * 8-byte rows from pixels, zero-extends each byte to a 16-bit word and
 * stores eight 16-byte rows to block with movdqa (so block is expected
 * to be 16-byte aligned).
 *
 * Operands: %0 = pixels (updated), %1 = block,
 *           %2 = line_size, %3 = 3*line_size.
 * NOTE(review): no clobber list is given although xmm0-xmm4 are written
 * and memory is stored through %1.
 */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
"pxor %%xmm4, %%xmm4 \n\t" /* xmm4 = 0 for byte->word unpack */
/* rows 0-3 */
"movq (%0), %%xmm0 \n\t"
"movq (%0, %2), %%xmm1 \n\t"
"movq (%0, %2,2), %%xmm2 \n\t"
"movq (%0, %3), %%xmm3 \n\t"
"lea (%0,%2,4), %0 \n\t" /* pixels += 4 rows */
"punpcklbw %%xmm4, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm1 \n\t"
"punpcklbw %%xmm4, %%xmm2 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"movdqa %%xmm0, (%1) \n\t"
"movdqa %%xmm1, 16(%1) \n\t"
"movdqa %%xmm2, 32(%1) \n\t"
"movdqa %%xmm3, 48(%1) \n\t"
/* rows 4-7 */
"movq (%0), %%xmm0 \n\t"
"movq (%0, %2), %%xmm1 \n\t"
"movq (%0, %2,2), %%xmm2 \n\t"
"movq (%0, %3), %%xmm3 \n\t"
"punpcklbw %%xmm4, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm1 \n\t"
"punpcklbw %%xmm4, %%xmm2 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"movdqa %%xmm0, 64(%1) \n\t"
"movdqa %%xmm1, 80(%1) \n\t"
"movdqa %%xmm2, 96(%1) \n\t"
"movdqa %%xmm3, 112(%1) \n\t"
: "+r" (pixels)
: "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
);
}
/**
 * Inline-asm MMX version of diff_pixels: for each 8-byte row, stores
 * s1[x] - s2[x] as 16-bit words into block. The offset register runs from
 * -128 to 0 in steps of 16 (one row per iteration), so 8 rows are
 * processed; both sources advance by stride per row.
 *
 * Operands: %0 = s1 (updated), %1 = s2 (updated),
 *           %2 = block+64 (end of the output), %3 = stride.
 * NOTE(review): the asm stores through %2 but lists no "memory" clobber.
 */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for byte->word unpack */
"mov $-128, %%"REG_a" \n\t" /* output offset, counts up to 0 */
".p2align 4 \n\t"
"1: \n\t"
"movq (%0), %%mm0 \n\t" /* 8 bytes of s1 */
"movq (%1), %%mm2 \n\t" /* 8 bytes of s2 */
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"psubw %%mm2, %%mm0 \n\t" /* s1 - s2, bytes 0-3 */
"psubw %%mm3, %%mm1 \n\t" /* s1 - s2, bytes 4-7 */
"movq %%mm0, (%2, %%"REG_a") \n\t"
"movq %%mm1, 8(%2, %%"REG_a") \n\t"
"add %3, %0 \n\t"
"add %3, %1 \n\t"
"add $16, %%"REG_a" \n\t" /* last add: its zero flag ends the loop */
"jnz 1b \n\t"
: "+r" (s1), "+r" (s2)
: "r" (block+64), "r" ((x86_reg)stride)
: "%"REG_a
);
}
/**
 * Inline-asm MMX version of pix_sum16: returns the sum of h=16 rows of
 * 16 unsigned bytes starting at pix, masked to 16 bits.
 *
 * index starts at -line_size*h and the base operand is pix - index, so
 * base+index addresses the first row; index is stepped by line_size until
 * it is no longer negative.
 *
 * Operands: %0 = sum (output), %1 = index (updated),
 *           %2 = pix - index, %3 = line_size.
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
const int h=16;
int sum;
x86_reg index= -line_size*h;
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for byte->word unpack */
"pxor %%mm6, %%mm6 \n\t" /* mm6 = four 16-bit running sums */
"1: \n\t"
"movq (%2, %1), %%mm0 \n\t" /* bytes 0-7, loaded twice */
"movq (%2, %1), %%mm1 \n\t"
"movq 8(%2, %1), %%mm2 \n\t" /* bytes 8-15, loaded twice */
"movq 8(%2, %1), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"paddw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm6 \n\t"
"add %3, %1 \n\t" /* next row */
" js 1b \n\t" /* loop while index is negative */
/* horizontal add of the four words in mm6 into its lowest word */
"movq %%mm6, %%mm5 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"movq %%mm6, %%mm5 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
"andl $0xFFFF, %0 \n\t" /* keep only the lowest word */
: "=&r" (sum), "+r" (index)
: "r" (pix - index), "r" ((x86_reg)line_size)
);
return sum;
}
/**
 * Inline-asm MMX version of pix_norm1: returns the sum of squares of
 * 16 rows of 16 unsigned bytes starting at pix (rows line_size apart).
 *
 * Operands: %0 = pix (updated), %1 = tmp (result), %2 = line_size.
 * ecx is used as the row counter and is listed as clobbered.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
int tmp;
__asm__ volatile (
"movl $16,%%ecx\n" /* 16 rows */
"pxor %%mm0,%%mm0\n" /* mm0 = 0 for byte->word unpack */
"pxor %%mm7,%%mm7\n" /* mm7 = two 32-bit running sums */
"1:\n"
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
"pmaddwd %%mm3,%%mm3\n" /* mm3 = (pix12^2+pix13^2,pix14^2+pix15^2) */
"pmaddwd %%mm4,%%mm4\n" /* mm4 = (pix8^2+pix9^2,pix10^2+pix11^2) */
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
pix2^2+pix3^2+pix6^2+pix7^2) */
"paddd %%mm3,%%mm4\n"
"paddd %%mm2,%%mm7\n"
"add %2, %0\n" /* next row */
"paddd %%mm4,%%mm7\n"
"dec %%ecx\n"
"jnz 1b\n"
"movq %%mm7,%%mm1\n"
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%1\n"
: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
return tmp;
}
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
__asm__ volatile (
......@@ -1112,10 +945,23 @@ hadamard_func(ssse3)
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_YASM
if (EXTERNAL_MMX(mm_flags)) {
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels = ff_diff_pixels_mmx;
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_SSE2(mm_flags))
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_sse2;
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
const int dct_algo = avctx->dct_algo;
if (avctx->bits_per_raw_sample <= 8 &&
......@@ -1129,15 +975,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
if (bit_depth <= 8)
c->get_pixels = get_pixels_mmx;
c->diff_pixels = diff_pixels_mmx;
c->pix_sum = pix_sum16_mmx;
c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4]= vsad_intra16_mmx;
......@@ -1167,8 +1008,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if(mm_flags & AV_CPU_FLAG_SSE2){
if (bit_depth <= 8)
c->get_pixels = get_pixels_sse2;
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment