VP8: optimize VP8Context struct ordering

Shaves at least 3KB off code size on x86 and should improve cache utilization. This would probably be useful to do for other decoders/encoders as well.

VP8: optimize VP8Context struct ordering
Shaves at least 3KB off code size on x86 and should improve cache utilization. This would probably be useful to do for other decoders/encoders as well.
1eeca886 · Jason Garrett-Glaser · 3efbe137 · 1eeca886 · 1eeca886
Commit 1eeca886 authored Mar 11, 2011 by Jason Garrett-Glaser
Hide whitespace changes
Inline Side-by-side

Showing with 79 additions and 79 deletions

vp8.c libavcodec/vp8.c +1 -1

vp8.h libavcodec/vp8.h +78 -78

No files found.
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -474,7 +474,7 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
    enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
-    int *sign_bias = s->sign_bias;
+    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -85,83 +85,24 @@ typedef struct {
 typedef struct {
    AVCodecContext *avctx;
-    DSPContext dsp;
-    VP8DSPContext vp8dsp;
-    H264PredContext hpc;
-    vp8_mc_func put_pixels_tab[3][3][3];
-    AVFrame frames[4];
    AVFrame *framep[4];
    uint8_t *edge_emu_buffer;
-    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
-    int profile;
-    int mb_width;   /* number of horizontal MB */
+    uint16_t mb_width;   /* number of horizontal MB */
-    int mb_height;  /* number of vertical MB */
+    uint16_t mb_height;  /* number of vertical MB */
    int linesize;
    int uvlinesize;
-    int keyframe;
+    uint8_t keyframe;
-    int invisible;
+    uint8_t deblock_filter;
-    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
+    uint8_t mbskip_enabled;
-    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
+    uint8_t segment;             ///< segment of the current macroblock
-    int update_altref;
+    uint8_t chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
-    int deblock_filter;
+    uint8_t profile;
-    /**
-     * If this flag is not set, all the probability updates
-     * are discarded after this frame is decoded.
-     */
-    int update_probabilities;
-    /**
-     * All coefficients are contained in separate arith coding contexts.
-     * There can be 1, 2, 4, or 8 of these after the header context.
-     */
-    int num_coeff_partitions;
-    VP56RangeCoder coeff_partition[8];
-    VP8Macroblock *macroblocks;
-    VP8Macroblock *macroblocks_base;
-    VP8FilterStrength *filter_strength;
-    uint8_t *intra4x4_pred_mode_top;
-    uint8_t intra4x4_pred_mode_left[4];
-    uint8_t *segmentation_map;
-    /**
-     * Cache of the top row needed for intra prediction
-     * 16 for luma, 8 for each chroma plane
-     */
-    uint8_t (*top_border)[16+8+8];
-    /**
-     * For coeff decode, we need to know whether the above block had non-zero
-     * coefficients. This means for each macroblock, we need data for 4 luma
-     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
-     * per macroblock. We keep the last row in top_nnz.
-     */
-    uint8_t (*top_nnz)[9];
-    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
-    /**
-     * This is the index plus one of the last non-zero coeff
-     * for each of the blocks in the current macroblock.
-     * So, 0 -> no coeffs
-     *     1 -> dc-only (special transform)
-     *     2+-> full transform
-     */
-    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
-    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
-    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
-    uint8_t intra4x4_pred_mode_mb[16];
-    int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
-    int segment;             ///< segment of the current macroblock
    VP56mv mv_min;
    VP56mv mv_max;
-    int mbskip_enabled;
+    int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type
-    int sign_bias[4]; ///< one state [0, 1] per ref frame type
    int ref_count[3];
    /**
@@ -170,13 +111,26 @@ typedef struct {
     * a frame, since the values persist between interframes.
     */
    struct {
-        int enabled;
+        uint8_t enabled;
-        int absolute_vals;
+        uint8_t absolute_vals;
-        int update_map;
+        uint8_t update_map;
        int8_t base_quant[4];
        int8_t filter_level[4];     ///< base loop filter level
    } segmentation;
+    struct {
+        uint8_t simple;
+        uint8_t level;
+        uint8_t sharpness;
+    } filter;
+    VP8Macroblock *macroblocks;
+    VP8FilterStrength *filter_strength;
+    uint8_t *intra4x4_pred_mode_top;
+    uint8_t intra4x4_pred_mode_left[4];
+    uint8_t *segmentation_map;
    /**
     * Macroblocks can have one of 4 different quants in a frame when
     * segmentation is enabled.
@@ -190,13 +144,7 @@ typedef struct {
    } qmat[4];
    struct {
-        int simple;
+        uint8_t enabled;    ///< whether each mb can have a different strength based on mode/ref
-        int level;
-        int sharpness;
-    } filter;
-    struct {
-        int enabled;    ///< whether each mb can have a different strength based on mode/ref
        /**
         * filter strength adjustment for the following macroblock modes:
@@ -219,6 +167,34 @@ typedef struct {
        int8_t ref[4];
    } lf_delta;
+    /**
+     * Cache of the top row needed for intra prediction
+     * 16 for luma, 8 for each chroma plane
+     */
+    uint8_t (*top_border)[16+8+8];
+    /**
+     * For coeff decode, we need to know whether the above block had non-zero
+     * coefficients. This means for each macroblock, we need data for 4 luma
+     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
+     * per macroblock. We keep the last row in top_nnz.
+     */
+    uint8_t (*top_nnz)[9];
+    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
+    /**
+     * This is the index plus one of the last non-zero coeff
+     * for each of the blocks in the current macroblock.
+     * So, 0 -> no coeffs
+     *     1 -> dc-only (special transform)
+     *     2+-> full transform
+     */
+    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
+    VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
+    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
+    uint8_t intra4x4_pred_mode_mb[16];
    /**
     * These are all of the updatable probabilities for binary decisions.
     * They are only implicitly reset on keyframes, making it quite likely
@@ -236,6 +212,30 @@ typedef struct {
        uint8_t token[4][16][3][NUM_DCT_TOKENS-1];
        uint8_t mvc[2][19];
    } prob[2];
+    VP8Macroblock *macroblocks_base;
+    int invisible;
+    int update_last;    ///< update VP56_FRAME_PREVIOUS with the current one
+    int update_golden;  ///< VP56_FRAME_NONE if not updated, or which frame to copy if so
+    int update_altref;
+    /**
+     * If this flag is not set, all the probability updates
+     * are discarded after this frame is decoded.
+     */
+    int update_probabilities;
+    /**
+     * All coefficients are contained in separate arith coding contexts.
+     * There can be 1, 2, 4, or 8 of these after the header context.
+     */
+    int num_coeff_partitions;
+    VP56RangeCoder coeff_partition[8];
+    DSPContext dsp;
+    VP8DSPContext vp8dsp;
+    H264PredContext hpc;
+    vp8_mc_func put_pixels_tab[3][3][3];
+    AVFrame frames[4];
 } VP8Context;
 #endif