Commit e6e98234 authored by Justin Ruggles's avatar Justin Ruggles

Add apply_window_int16() to DSPContext with x86-optimized versions and use it in the ac3_fixed encoder.
parent e971d813
......@@ -167,7 +167,7 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
static void mdct512(AC3MDCTContext *mdct, CoefType *out, SampleType *in);
static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
const SampleType *window, int n);
const SampleType *window, unsigned int len);
static int normalize_samples(AC3EncodeContext *s);
......
......@@ -252,15 +252,9 @@ static void mdct512(AC3MDCTContext *mdct, int32_t *out, int16_t *in)
* Apply KBD window to input samples prior to MDCT.
*/
static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
const int16_t *window, int n)
const int16_t *window, unsigned int len)
{
int i;
int n2 = n >> 1;
for (i = 0; i < n2; i++) {
output[i] = MUL16(input[i], window[i]) >> 15;
output[n-i-1] = MUL16(input[n-i-1], window[i]) >> 15;
}
dsp->apply_window_int16(output, input, window, len);
}
......
......@@ -83,9 +83,9 @@ static void mdct512(AC3MDCTContext *mdct, float *out, float *in)
* Apply KBD window to input samples prior to MDCT.
*/
static void apply_window(DSPContext *dsp, float *output, const float *input,
const float *window, int n)
const float *window, unsigned int len)
{
dsp->vector_fmul(output, input, window, n);
dsp->vector_fmul(output, input, window, len);
}
......
......@@ -141,7 +141,7 @@ const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
/* AC-3 MDCT window */
/* MDCT window */
const int16_t ff_ac3_window[AC3_WINDOW_SIZE/2] = {
DECLARE_ALIGNED(16, const int16_t, ff_ac3_window)[AC3_WINDOW_SIZE/2] = {
4, 7, 12, 16, 21, 28, 34, 42,
51, 61, 72, 84, 97, 111, 127, 145,
164, 184, 207, 231, 257, 285, 315, 347,
......
......@@ -3890,6 +3890,19 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
return res;
}
/**
 * Apply a symmetric window to 16-bit samples (plain C version).
 *
 * Only the first half of the window is read: coefficient k scales both
 * sample k and its mirror sample len-1-k. Each product is rounded by adding
 * 1 << 14 before the 15-bit right shift.
 *
 * @param output destination array
 * @param input  source array
 * @param window window coefficients; window[0 .. len/2 - 1] are read
 * @param len    full window length
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int k;

    for (k = 0; k < half; k++) {
        const int16_t coef        = window[k];
        const unsigned int mirror = len - k - 1;

        /* lower half first, then the mirrored upper-half sample */
        output[k]      = (MUL16(input[k],      coef) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coef) + (1 << 14)) >> 15;
    }
}
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
......@@ -4364,6 +4377,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->vector_clipf = vector_clipf_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->apply_window_int16 = apply_window_int16_c;
c->scalarproduct_float = scalarproduct_float_c;
c->butterflies_float = butterflies_float_c;
c->vector_fmul_scalar = vector_fmul_scalar_c;
......
......@@ -524,6 +524,20 @@ typedef struct DSPContext {
*/
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
/**
* Apply symmetric window in 16-bit fixed-point.
* @param output destination array
* constraints: 16-byte aligned
* @param input source array
* constraints: 16-byte aligned
* @param window window array
* constraints: 16-byte aligned, at least len/2 elements
* @param len full window length
constraints: multiple of 16 (required by the SSE2/SSSE3 versions), greater than zero
*/
void (*apply_window_int16)(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
/* rv30 functions */
qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
......
......@@ -2388,6 +2388,20 @@ int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int or
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
/*
 * x86 assembly implementations of DSPContext.apply_window_int16.
 * The "_ba" variants are the ones installed when CODEC_FLAG_BITEXACT is set.
 * The plain mmxext/sse2 variants do not round and are therefore not
 * bit-identical to the C version. The ssse3 variants are installed
 * regardless of that flag.
 */
void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
......@@ -2749,6 +2763,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
} else {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
}
#endif
}
if(mm_flags & AV_CPU_FLAG_SSE){
......@@ -2771,13 +2790,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else {
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
c->emulated_edge_mc = emulated_edge_mc_sse;
c->gmc= gmc_sse;
#endif
}
if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_ATOM) {
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
} else {
c->apply_window_int16 = ff_apply_window_int16_ssse3;
}
if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
}
#endif
}
}
if (CONFIG_ENCODERS)
......
......@@ -27,6 +27,8 @@ pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
; pshufb control mask that reverses the order of the eight 16-bit words in a
; 16-byte register (the two bytes inside each word keep their order)
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
; rounding term 1<<14 as packed dwords, added before the >>15 in the
; bit-exact apply_window_int16 variants
pd_16384: times 4 dd 16384
section .text align=16
......@@ -202,6 +204,130 @@ SCALARPRODUCT_LOOP 0
RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
; Reverse the order of the four 16-bit words in %1 (in place).
; An optional second argument is accepted and ignored, so all REVERSE_WORDS
; variants can be invoked with the same argument list.
%macro REVERSE_WORDS_MMXEXT 1-2
; 0x1B = 00 01 10 11b selects source words 3,2,1,0 for destination 0,1,2,3
pshufw %1, %1, 0x1B
%endmacro
; Reverse the order of the eight 16-bit words in %1 (in place) using only
; SSE2 shuffles. An optional second argument is accepted and ignored.
%macro REVERSE_WORDS_SSE2 1-2
; reverse the four words inside the low 64 bits, then inside the high 64 bits
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
; 0x4E = 01 00 11 10b swaps the low and high 64-bit halves
pshufd %1, %1, 0x4E
%endmacro
; Reverse the order of the 16-bit words in %1 (in place) with one byte
; shuffle. %2 must hold the shuffle mask; the caller in this file passes a
; register loaded from pb_revwords.
%macro REVERSE_WORDS_SSSE3 2
pshufb %1, %2
%endmacro
; dst = (dst * src) >> 15   (truncating, no rounding term)
; pmulhw returns only the high 16 bits of each signed 32-bit product, which
; cuts off product bit 15. So the high half is shifted left by 1 and bit 15 is
; OR-ed back in from the pmullw (low 16 bits) result.
; %3 is overwritten as scratch; %2 is left unchanged.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
; isolate product bit 15 in bit 0 of each word
psrlw %3, 15
; make room for it below product bits 30..16
psllw %1, 1
por %1, %3
%endmacro
; dst = ((dst * src) + (1<<14)) >> 15
; A single pmulhrsw performs the rounded multiply. The third argument is not
; used; it is only there so the argument list matches MUL16FIXED_MMXEXT.
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
pmulhrsw %1, %2
%endmacro
; Emit one apply_window_int16_<%1> function.
;
; The 4th C argument (len) is given the register name "offset" and is used
; directly as a BYTE offset. Samples are 2 bytes each, so a byte offset of len
; addresses element len/2: the centre of input/output and the end of the
; half-length window. offset2 starts one register (mmsize bytes) below that.
; Each loop iteration handles one register of samples just below the centre
; (at offset2) and one register just above it (at offset), using the same
; window register for both; the words are reversed for the upper side.
; offset then moves up and offset2 moves down by mmsize, until offset2 would
; go below zero.
;
; m5 holds a per-variant constant: pd_16384 for the bit-exact builds,
; pb_revwords for the "ssse3" build; it is not loaded otherwise.
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
; NOTE(review): the full 64-bit offsetq is read here before any 32-bit write
; to offsetd; presumably the upper half of the len register is zero on
; x86-64 -- confirm.
lea offset2q, [offsetq-mmsize]
%if %2
mova m5, [pd_16384]
%elifidn %1, ssse3
mova m5, [pb_revwords]
; NOTE(review): this ALIGN sits inside the %elifidn branch, so only the
; "ssse3" build aligns .loop -- confirm that is intended.
ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
;
; m0/m2 are zeroed before being interleaved with the window words, so every
; other word in them is 0. m1 is interleaved with the input words WITHOUT
; being cleared: its stale words line up with those zero words, so in pmaddwd
; they are multiplied by 0 and contribute nothing to the sums.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
; upper side: same window register with its words reversed
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. pmulhrsw applies the same (1<<14) rounding as the C version, so
; the result is bit-identical to it.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
; m5 is only meaningful when REVERSE_WORDS is the SSSE3 (pshufb) variant;
; the SSE2 variant ignores its second argument.
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
; m3 is scratch for MUL16FIXED
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
; the borrow from this sub (carry set) ends the loop once offset2 has
; reached 0 and would go negative
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro
; MMX-register builds: plain (no rounding) and bit-exact.
INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
; XMM-register builds. MUL16FIXED is not redefined, so the plain sse2 build
; still expands MUL16FIXED_MMXEXT.
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
; ssse3_atom is instantiated while REVERSE_WORDS is still the SSE2 shuffle
; sequence: it uses pmulhrsw but no pshufb.
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
; from here on REVERSE_WORDS is the single-pshufb variant
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
......
b3a8f0a8809a58b2ece90744f06fff96 *./tests/data/acodec/ac3.rm
346073c97eada69330f61e103a170ca1 *./tests/data/acodec/ac3.rm
98751 ./tests/data/acodec/ac3.rm
7da378131db880bcf2e58305d54418ec *./tests/data/lavf/lavf.rm
7b7ede9548a09346675edad36acfbf19 *./tests/data/lavf/lavf.rm
346706 ./tests/data/lavf/lavf.rm
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret: 0 st:-1 flags:0 ts:-1.000000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret: 0 st:-1 flags:1 ts: 1.894167
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret: 0 st: 0 flags:0 ts: 0.788000
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
ret:-1 st:-1 flags:1 ts: 1.894167
ret:-1 st: 0 flags:0 ts: 0.788000
ret: 0 st: 0 flags:1 ts:-0.317000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret: 0 st:-1 flags:0 ts: 2.576668
ret: 0 st: 0 flags:1 dts:524.800000 pts:524.800000 pos: 6155 size: 244
ret:-1 st:-1 flags:0 ts: 2.576668
ret:-1 st:-1 flags:1 ts: 1.470835
ret: 0 st: 0 flags:0 ts: 0.365000
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
ret:-1 st: 0 flags:0 ts: 0.365000
ret: 0 st: 0 flags:1 ts:-0.741000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st:-1 flags:0 ts: 2.153336
ret: 0 st:-1 flags:1 ts: 1.047503
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st:-1 flags:1 ts: 1.047503
ret: 0 st: 0 flags:0 ts:-0.058000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret: 0 st: 0 flags:1 ts: 2.836000
ret: 0 st: 0 flags:1 dts: 2.681000 pts: 2.681000 pos: 44105 size: 558
ret:-1 st: 0 flags:1 ts: 2.836000
ret:-1 st:-1 flags:0 ts: 1.730004
ret: 0 st:-1 flags:1 ts: 0.624171
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st:-1 flags:1 ts: 0.624171
ret: 0 st: 0 flags:0 ts:-0.482000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st: 0 flags:1 ts: 2.413000
ret:-1 st:-1 flags:0 ts: 1.306672
ret: 0 st:-1 flags:1 ts: 0.200839
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st:-1 flags:1 ts: 0.200839
ret: 0 st: 0 flags:0 ts:-0.905000
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st: 0 flags:1 ts: 1.989000
ret: 0 st:-1 flags:0 ts: 0.883340
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
ret:-1 st:-1 flags:0 ts: 0.883340
ret: 0 st:-1 flags:1 ts:-0.222493
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
ret:-1 st: 0 flags:0 ts: 2.672000
ret:-1 st: 0 flags:1 ts: 1.566000
ret: 0 st:-1 flags:0 ts: 0.460008
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
ret:-1 st:-1 flags:0 ts: 0.460008
ret: 0 st:-1 flags:1 ts:-0.645825
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment