diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index fe768046b161c063370cebe97fc1a4490fbc466c..e40a6096eb17f999a0c043b9f4d5cb8a7a7ecba9 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -26,7 +26,8 @@ typedef struct {
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
-  uint8_t zcoeff_blk[256];
+  uint8_t *zcoeff_blk;
+  int num_4x4_blk;
   int skip;
   int_mv best_ref_mv;
   int_mv second_best_ref_mv;
@@ -177,6 +178,45 @@ struct macroblock {
                          int y_blocks);
 };
 
+// Returns the coding-context slot for |bsize|, selected by the sb_index,
+// mb_index and b_index fields of x->e_mbd.
+// TODO(jingning): reorganize these buffers once partition recurses to 4x4.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // NOTE(review): plain static in a header; consider marking it inline.
+  switch (bsize) {
+    case BLOCK_64X64:
+      return &x->sb64_context;  // single slot, no index needed
+    case BLOCK_64X32:
+      return &x->sb64x32_context[xd->sb_index];
+    case BLOCK_32X64:
+      return &x->sb32x64_context[xd->sb_index];
+    case BLOCK_32X32:
+      return &x->sb32_context[xd->sb_index];
+    case BLOCK_32X16:
+      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X32:
+      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X16:
+      return &x->mb_context[xd->sb_index][xd->mb_index];
+    case BLOCK_16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X8:
+      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_8X4:
+      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_4X8:
+      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_4X4:
+      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);  // unexpected block size
+      return NULL;  // only reachable when assert() is compiled out
+  }
+}
+
 struct rdcost_block_args {
   MACROBLOCK *x;
   ENTROPY_CONTEXT t_above[16];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6e8e1d13d3a071d161f5ef138c26d017a80d94b4..0515db2be3f47c250f17ce9100ec12deeba082d0 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -410,7 +410,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
   x->skip = ctx->skip;
   vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
-             sizeof(ctx->zcoeff_blk));
+             sizeof(uint8_t) * ctx->num_4x4_blk);
 
   if (!output_enabled)
     return;
@@ -690,45 +690,6 @@ static void update_stats(VP9_COMP *cpi) {
   }
 }
 
-// TODO(jingning): the variables used here are little complicated. need further
-// refactoring on organizing the temporary buffers, when recursive
-// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  switch (bsize) {
-    case BLOCK_64X64:
-      return &x->sb64_context;
-    case BLOCK_64X32:
-      return &x->sb64x32_context[xd->sb_index];
-    case BLOCK_32X64:
-      return &x->sb32x64_context[xd->sb_index];
-    case BLOCK_32X32:
-      return &x->sb32_context[xd->sb_index];
-    case BLOCK_32X16:
-      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X32:
-      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X16:
-      return &x->mb_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X8:
-      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X16:
-      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X8:
-      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X4:
-      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X8:
-      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X4:
-      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    default:
-      assert(0);
-      return NULL;
-  }
-}
-
 static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   switch (bsize) {
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index db74df75d8844c81d0e3bdfdac25357118f17734..f6b2a287653e9fdee65d5f6eaf5248b00a46a96d 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1416,6 +1416,94 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
   } while (++i <= MV_MAX);
 }
 
+static void init_pick_mode_context(VP9_COMP *cpi) {
+  int i;
+  MACROBLOCK  *x  = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;  // its sb/mb/b_index are overwritten below
+  VP9_COMMON  *cm = &cpi->common;
+  // Allocates a zeroed zcoeff_blk buffer for each PICK_MODE_CONTEXT slot.
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);  // never below 4
+    if (i < BLOCK_16X16) {  // xd indices drive get_block_context() below
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+            ctx->num_4x4_blk = num_4x4_blk;  // length of zcoeff_blk
+            CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                            vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+          }
+        }
+      }
+    } else if (i < BLOCK_32X32) {  // sb_index x mb_index slots only
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+                               ++xd->mb_index) {
+          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+          ctx->num_4x4_blk = num_4x4_blk;
+          CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                          vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+        }
+      }
+    } else if (i < BLOCK_64X64) {  // sb_index slots only
+      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+        ctx->num_4x4_blk = num_4x4_blk;
+        CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                        vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+      }
+    } else {  // i >= BLOCK_64X64: single lookup, no index loop
+      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+      ctx->num_4x4_blk = num_4x4_blk;
+      CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                      vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+    }
+  }
+}
+
+static void free_pick_mode_context(MACROBLOCK *x) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;  // its sb/mb/b_index are overwritten below
+  // Frees every slot's zcoeff_blk buffer and resets the pointer to NULL.
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);  // as in init
+    if (i < BLOCK_16X16) {  // loop bounds match init_pick_mode_context()
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
+          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+            vpx_free(ctx->zcoeff_blk);
+            ctx->zcoeff_blk = NULL;  // avoid a dangling pointer
+          }
+        }
+      }
+    } else if (i < BLOCK_32X32) {  // sb_index x mb_index slots only
+      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
+        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
+                               ++xd->mb_index) {
+          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+          vpx_free(ctx->zcoeff_blk);
+          ctx->zcoeff_blk = NULL;
+        }
+      }
+    } else if (i < BLOCK_64X64) {  // sb_index slots only
+      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+        vpx_free(ctx->zcoeff_blk);
+        ctx->zcoeff_blk = NULL;
+      }
+    } else {  // i >= BLOCK_64X64: single lookup, no index loop
+      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
+      vpx_free(ctx->zcoeff_blk);
+      ctx->zcoeff_blk = NULL;
+    }
+  }
+}
+
 VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   int i, j;
   volatile union {
@@ -1452,6 +1540,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
 
   init_config((VP9_PTR)cpi, oxcf);
 
+  init_pick_mode_context(cpi);
+
   cm->current_video_frame   = 0;
   cpi->kf_overspend_bits            = 0;
   cpi->kf_bitrate_adjustment        = 0;
@@ -1915,6 +2005,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 #endif
   }
 
+  free_pick_mode_context(&cpi->mb);
   dealloc_compressor_data(cpi);
   vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0fc715299b4baaf58e8527fcd71569b9701010cd..bdc51527b85f5a09925c23002a2f4cb9a1dfcb73 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3577,7 +3577,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
-                   sizeof(ctx->zcoeff_blk));
+                   sizeof(uint8_t) * ctx->num_4x4_blk);
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -4317,7 +4317,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
-                   sizeof(ctx->zcoeff_blk));
+                   sizeof(uint8_t) * ctx->num_4x4_blk);
 
         for (i = 0; i < 4; i++)
           best_bmodes[i] = xd->mi_8x8[0]->bmi[i];