Commit 218d6844 authored by Ben Avison's avatar Ben Avison Committed by Martin Storsjö

h264dsp: Factorize code into a new function, h264_find_start_code_candidate

This performs the start code search which was previously part of
h264_find_frame_end() - the most CPU intensive part of the function.

By itself, this results in a performance regression:
              Before          After
              Mean   StdDev   Mean   StdDev  Change
Overall time  2925.6 26.2     3068.5 31.7    -4.7%

but this can more than be made up for by platform-optimised
implementations of the function.
Signed-off-by: 's avatarMartin Storsjö <martin@martin.st>
parent 7a82022e
......@@ -47,30 +47,9 @@ static int h264_find_frame_end(H264Context *h, const uint8_t *buf,
for (i = 0; i < buf_size; i++) {
if (state == 7) {
#if HAVE_FAST_UNALIGNED
/* we check i < buf_size instead of i + 3 / 7 because it is
* simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
* bytes at the end.
*/
#if HAVE_FAST_64BIT
while (i < buf_size &&
!((~*(const uint64_t *)(buf + i) &
(*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
0x8080808080808080ULL))
i += 8;
#else
while (i < buf_size &&
!((~*(const uint32_t *)(buf + i) &
(*(const uint32_t *)(buf + i) - 0x01010101U)) &
0x80808080U))
i += 4;
#endif
#endif
for (; i < buf_size; i++)
if (!buf[i]) {
state = 2;
break;
}
i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i);
if (i < buf_size)
state = 2;
} else if (state <= 2) {
if (buf[i] == 1)
state ^= 5; // 2->7, 1->4, 0->5
......
......@@ -53,6 +53,34 @@
#include "h264addpx_template.c"
#undef BIT_DEPTH
static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
{
int i = 0;
#if HAVE_FAST_UNALIGNED
/* we check i < size instead of i + 3 / 7 because it is
* simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
* bytes at the end.
*/
#if HAVE_FAST_64BIT
while (i < size &&
!((~*(const uint64_t *)(buf + i) &
(*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
0x8080808080808080ULL))
i += 8;
#else
while (i < size &&
!((~*(const uint32_t *)(buf + i) &
(*(const uint32_t *)(buf + i) - 0x01010101U)) &
0x80808080U))
i += 4;
#endif
#endif
for (; i < size; i++)
if (!buf[i])
break;
return i;
}
av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
......@@ -133,6 +161,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
H264_DSP(8);
break;
}
c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
......
......@@ -105,6 +105,15 @@ typedef struct H264DSPContext {
/* bypass-transform */
void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride);
void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride);
/**
* Search buf from the start for up to size bytes. Return the index
* of a zero byte, or >= size if not found. Ideally, use lookahead
* to filter out any zero bytes that are known to not be followed by
* one or more further zero bytes and a one byte. Better still, filter
* out any bytes that form the trailing_zero_8bits syntax element too.
*/
int (*h264_find_start_code_candidate)(const uint8_t *buf, int size);
} H264DSPContext;
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment