diff --git a/configure b/configure
index f55f798635b2d147a16caa18b7490e65af9c2670..5c8dc8e8430823e2e9b87d7db86416defdf4ef18 100755
--- a/configure
+++ b/configure
@@ -245,6 +245,7 @@ EXPERIMENT_LIST="
     comp_interintra_pred
     enable_6tap
     abovesprefmv
+    code_nonzerocount
 "
 CONFIG_LIST="
     external_build
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b46dd056849d71c34119ad74279115c50fd6705b..9f978ce5ec7154a15fb3989c0412d78286ed801b 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -250,6 +250,9 @@ typedef struct {
   INTERPOLATIONFILTERTYPE interp_filter;
 
   BLOCK_SIZE_TYPE sb_type;
+#if CONFIG_CODE_NONZEROCOUNT
+  uint16_t nzcs[256+64*2];
+#endif
 } MB_MODE_INFO;
 
 typedef struct {
@@ -295,6 +298,9 @@ typedef struct macroblockd {
   DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);
   DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);
+#if CONFIG_CODE_NONZEROCOUNT
+  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
+#endif
 
   /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
   BLOCKD block[24];
@@ -592,4 +598,25 @@ static void update_blockd_bmi(MACROBLOCKD *xd) {
   }
 }
 
+static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
+  TX_SIZE tx_size_uv;
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    if (xd->mode_info_context->mbmi.txfm_size == TX_32X32)
+      tx_size_uv = TX_16X16;
+    else
+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  } else {
+    if (xd->mode_info_context->mbmi.txfm_size == TX_16X16)
+      tx_size_uv = TX_8X8;
+    else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
+             (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+              xd->mode_info_context->mbmi.mode == SPLITMV))
+      tx_size_uv = TX_4X4;
+    else
+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+  }
+  return tx_size_uv;
+}
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h
index ab8d8d9403e5f0014f5b648cf0fe563b70c3812a..ff018e69893eeb2b8b2b4adf4f15a196b71afaea 100644
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -17,4 +17,11 @@
 #define COEF_UPDATE_PROB_8X8 252
 #define COEF_UPDATE_PROB_16X16 252
 
+#if CONFIG_CODE_NONZEROCOUNT
+#define NZC_UPDATE_PROB_4X4     252
+#define NZC_UPDATE_PROB_8X8     252
+#define NZC_UPDATE_PROB_16X16   252
+#define NZC_UPDATE_PROB_32X32   252
+#endif
+
 #endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H__
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 204e65af69216d6be9194d2fbfe9660d54874271..8fb80203153e73e1affc886d283e17039e94eab5 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -695,3 +695,130 @@ static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = {
     }
   }
 };
+
+#if CONFIG_CODE_NONZEROCOUNT
+// TODO(debargha): Replace with probabilities once the stats stabilize
+static const unsigned int default_nzc4x4_counts[MAX_NZC_CONTEXTS]
+                                               [REF_TYPES]
+                                               [BLOCK_TYPES]
+                                               [NZC4X4_TOKENS] = {
+  {
+    {
+      { 967652, 29023, 15039, 6952, 1568, 116 },
+      { 789116, 22938, 4522, 1935, 520, 47 }
+    }, {
+      { 967652, 29023, 15039, 6952, 1568, 116 },
+      { 789116, 22938, 4522, 1935, 520, 47 }
+    },
+  }, {
+    {
+      { 124684, 37167, 15270, 8483, 1777, 102 },
+      { 10405, 12395, 3401, 3574, 2461, 771 }
+    }, {
+      { 124684, 37167, 15270, 8483, 1777, 102 },
+      { 10405, 12395, 3401, 3574, 2461, 771 }
+    }
+  }, {
+    {
+      { 41100, 22976, 15627, 16137, 7982, 793 },
+      { 4249, 3084, 2131, 4081, 6439, 1653 }
+    }, {
+      { 41100, 22976, 15627, 16137, 7982, 793 },
+      { 4249, 3084, 2131, 4081, 6439, 1653 }
+    }
+  }
+};
+
+static const unsigned int default_nzc8x8_counts[MAX_NZC_CONTEXTS]
+                                               [REF_TYPES]
+                                               [BLOCK_TYPES]
+                                               [NZC8X8_TOKENS] = {
+  {
+    {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+    }, {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+    }
+  }, {
+    {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+    }, {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
+    }
+  }
+};
+
+static const unsigned int default_nzc16x16_counts[MAX_NZC_CONTEXTS]
+                                                 [REF_TYPES]
+                                                 [BLOCK_TYPES]
+                                                 [NZC16X16_TOKENS] = {
+  {
+    {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }, {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }
+  }, {
+    {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }, {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }
+  }
+};
+
+static const unsigned int default_nzc32x32_counts[MAX_NZC_CONTEXTS]
+                                                 [REF_TYPES]
+                                                 [BLOCK_TYPES]
+                                                 [NZC32X32_TOKENS] = {
+  {
+    {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }, {
+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+    }
+  }, {
+    {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }, {
+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
+      { 17772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+    }
+  }, {
+    {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
+      { 6612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
+    }
+  }
+};
+#endif
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 1e3a7e17e33b68be179c811751ac70de68192229..c4908e29ac00be7750f866f8153628445fb8fb00 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -1,4 +1,4 @@
-/*
+/*
  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
@@ -186,6 +186,92 @@ static const vp9_prob Pcat6[] = {
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 
+#if CONFIG_CODE_NONZEROCOUNT
+const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  -NZC_3TO4, 8,
+  -NZC_5TO8, -NZC_9TO16,
+};
+struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS];
+
+const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  -NZC_9TO16, 12,
+  -NZC_17TO32, -NZC_33TO64,
+};
+struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+
+const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  12, 14,
+  -NZC_9TO16, -NZC_17TO32,
+  -NZC_33TO64, 16,
+  -NZC_65TO128, -NZC_129TO256,
+};
+struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS];
+
+const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {
+  -NZC_0, 2,
+  4, 6,
+  -NZC_1, -NZC_2,
+  8, 10,
+  -NZC_3TO4, -NZC_5TO8,
+  12, 14,
+  -NZC_9TO16, -NZC_17TO32,
+  16, 18,
+  -NZC_33TO64, -NZC_65TO128,
+  -NZC_129TO256, 20,
+  -NZC_257TO512, -NZC_513TO1024,
+};
+struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS];
+
+const vp9_prob Pcat_nzc[MAX_NZC_CONTEXTS]
+                       [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA] = { {
+    // Bit probabilities are in least to most significance order
+    {176,   0,   0,   0,   0,   0,   0,   0,   0},   // 3 - 4
+    {164, 192,   0,   0,   0,   0,   0,   0,   0},   // 5 - 8
+    {154, 184, 208,   0,   0,   0,   0,   0,   0},   // 9 - 16
+    {144, 176, 200, 216,   0,   0,   0,   0,   0},   // 17 - 32
+    {140, 172, 192, 208, 224,   0,   0,   0,   0},   // 33 - 64
+    {136, 168, 188, 200, 220, 232,   0,   0,   0},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240,   0,   0},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248,   0},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  }, {
+    {168,   0,   0,   0,   0,   0,   0,   0,   0},   // 3 - 4
+    {152, 184,   0,   0,   0,   0,   0,   0,   0},   // 5 - 8
+    {152, 184, 208,   0,   0,   0,   0,   0,   0},   // 9 - 16
+    {144, 176, 200, 216,   0,   0,   0,   0,   0},   // 17 - 32
+    {140, 172, 192, 208, 224,   0,   0,   0,   0},   // 33 - 64
+    {136, 168, 188, 200, 220, 232,   0,   0,   0},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240,   0,   0},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248,   0},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  }, {
+    {160,   0,   0,   0,   0,   0,   0,   0,   0},   // 3 - 4
+    {152, 176,   0,   0,   0,   0,   0,   0,   0},   // 5 - 8
+    {150, 184, 208,   0,   0,   0,   0,   0,   0},   // 9 - 16
+    {144, 176, 200, 216,   0,   0,   0,   0,   0},   // 17 - 32
+    {140, 172, 192, 208, 224,   0,   0,   0,   0},   // 33 - 64
+    {136, 168, 188, 200, 220, 232,   0,   0,   0},   // 65 - 128
+    {132, 164, 184, 196, 216, 228, 240,   0,   0},   // 129 - 256
+    {130, 162, 178, 194, 212, 226, 240, 248,   0},   // 257 - 512
+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
+  },
+};
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
 static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -253,6 +339,9 @@ int vp9_get_coef_context(int * recent_energy, int token) {
 };
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
+#if CONFIG_CODE_NONZEROCOUNT
+  int h, g;
+#endif
   vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
              sizeof(pc->fc.coef_probs_4x4));
   vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
@@ -261,13 +350,1128 @@ void vp9_default_coef_probs(VP9_COMMON *pc) {
              sizeof(pc->fc.coef_probs_16x16));
   vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
              sizeof(pc->fc.coef_probs_32x32));
+#if CONFIG_CODE_NONZEROCOUNT
+  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {
+    for (g = 0; g < REF_TYPES; ++g) {
+      int i;
+      unsigned int branch_ct4x4[NZC4X4_NODES][2];
+      unsigned int branch_ct8x8[NZC8X8_NODES][2];
+      unsigned int branch_ct16x16[NZC16X16_NODES][2];
+      unsigned int branch_ct32x32[NZC32X32_NODES][2];
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC4X4_TOKENS, vp9_nzc4x4_encodings, vp9_nzc4x4_tree,
+          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,
+          default_nzc4x4_counts[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC8X8_TOKENS, vp9_nzc8x8_encodings, vp9_nzc8x8_tree,
+          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,
+          default_nzc8x8_counts[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC16X16_TOKENS, vp9_nzc16x16_encodings, vp9_nzc16x16_tree,
+          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,
+          default_nzc16x16_counts[h][g][i]);
+      }
+      for (i = 0; i < BLOCK_TYPES; ++i) {
+        vp9_tree_probs_from_distribution(
+          NZC32X32_TOKENS, vp9_nzc32x32_encodings, vp9_nzc32x32_tree,
+          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,
+          default_nzc32x32_counts[h][g][i]);
+      }
+    }
+  }
+#endif  // CONFIG_CODE_NONZEROCOUNT
 }
 
 void vp9_coef_tree_initialize() {
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);
+  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);
+  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);
+  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);
+#endif
+}
+
+#if CONFIG_CODE_NONZEROCOUNT
+
+#define mb_in_cur_tile(cm, mb_row, mb_col)      \
+    ((mb_col) >= (cm)->cur_tile_mb_col_start && \
+     (mb_col) <= (cm)->cur_tile_mb_col_end   && \
+     (mb_row) >= 0)
+
+#define choose_nzc_context(nzc_exp, t2, t1)     \
+    ((nzc_exp) >= ((t2) << 6) ? 2 : (nzc_exp) >= ((t1) << 6) ? 1 : 0)
+
+#define NZC_T2_32X32    32
+#define NZC_T1_32X32     8
+#define NZC_T2_16X16    16
+#define NZC_T1_16X16     4
+#define NZC_T2_8X8       8
+#define NZC_T1_8X8       2
+#define NZC_T2_4X4       4
+#define NZC_T1_4X4       1
+
+// Transforms a mb16 block index to a sb64 block index
+static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {
+  int r = (mb_row & 3);
+  int c = (mb_col & 3);
+  int b;
+  if (block < 16) {  // Y
+    int ib = block >> 2;
+    int jb = block & 3;
+    ib += r * 4;
+    jb += c * 4;
+    b = ib * 16 + jb;
+    assert(b < 256);
+    return b;
+  } else {  // UV
+    int base = block - (block & 3);
+    int ib = (block - base) >> 1;
+    int jb = (block - base) & 1;
+    ib += r * 2;
+    jb += c * 2;
+    b = base * 16 + ib * 8 + jb;
+    assert(b >= 256 && b < 384);
+    return b;
+  }
+}
+
+// Transforms a mb16 block index to a sb32 block index
+static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {
+  int r = (mb_row & 1);
+  int c = (mb_col & 1);
+  int b;
+  if (block < 16) {  // Y
+    int ib = block >> 2;
+    int jb = block & 3;
+    ib += r * 4;
+    jb += c * 4;
+    b = ib * 8 + jb;
+    assert(b < 64);
+    return b;
+  } else {  // UV
+    int base = block - (block & 3);
+    int ib = (block - base) >> 1;
+    int jb = (block - base) & 1;
+    ib += r * 2;
+    jb += c * 2;
+    b = base * 4 + ib * 4 + jb;
+    assert(b >= 64 && b < 96);
+    return b;
+  }
+}
+
+static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {
+  // s is the log of the number of 4x4 blocks in each row/col of larger block
+  int b, ib, jb, nb;
+  ib = block >> s;
+  jb = block - (ib << s);
+  ib >>= tx_size;
+  jb >>= tx_size;
+  nb = 1 << (s - tx_size);
+  b = (ib * nb + jb) << (2 * tx_size);
+  return b;
+}
+
+/* BEGIN - Helper functions to get the y nzcs */
+static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 256);
+  b = block_to_txfm_index(block, mi->txfm_size, 4);
+  assert(b < 256);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+
+static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 64);
+  b = block_to_txfm_index(block, mi->txfm_size, 3);
+  assert(b < 64);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+
+static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {
+  int b;
+  assert(block < 16);
+  b = block_to_txfm_index(block, mi->txfm_size, 2);
+  assert(b < 16);
+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
+}
+/* END - Helper functions to get the y nzcs */
+
+/* Function to get y nzc where block index is in mb16 terms */
+static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,
+                                  int mb_row, int mb_col, int block) {
+  // NOTE: All values returned are at 64 times the true value at 4x4 scale
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const int mis = cm->mode_info_stride;
+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+    return 0;
+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
+    int r = mb_row & 3;
+    int c = mb_col & 3;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_y_sb64(
+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
+    int r = mb_row & 1;
+    int c = mb_col & 1;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_y_sb32(
+          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
+  } else {
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+      return 0;
+    return get_nzc_4x4_y_mb16(mi, block);
+  }
+}
+
+/* BEGIN - Helper functions to get the uv nzcs */
+static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 256 && block < 384);
+  uvtxfm_size = mi->txfm_size;
+  base = 256 + (block & 64);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 3);
+  assert(b >= 256 && b < 384);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+
+static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 64 && block < 96);
+  if (mi->txfm_size == TX_32X32)
+    uvtxfm_size = TX_16X16;
+  else
+    uvtxfm_size = mi->txfm_size;
+  base = 64 + (block & 16);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 2);
+  assert(b >= 64 && b < 96);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+
+static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {
+  int b;
+  int base, uvtxfm_size;
+  assert(block >= 16 && block < 24);
+  if (mi->txfm_size == TX_8X8 &&
+      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))
+    uvtxfm_size = TX_4X4;
+  else if (mi->txfm_size == TX_16X16)
+    uvtxfm_size = TX_8X8;
+  else
+    uvtxfm_size = mi->txfm_size;
+  base = 16 + (block & 4);
+  block -= base;
+  b = base + block_to_txfm_index(block, uvtxfm_size, 1);
+  assert(b >= 16 && b < 24);
+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
+}
+/* END - Helper functions to get the uv nzcs */
+
+/* Function to get uv nzc where block index is in mb16 terms */
+static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,
+                                   int mb_row, int mb_col, int block) {
+  // NOTE: All values returned are at 64 times the true value at 4x4 scale
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const int mis = cm->mode_info_stride;
+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
+    return 0;
+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
+    int r = mb_row & 3;
+    int c = mb_col & 3;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_uv_sb64(
+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
+    int r = mb_row & 1;
+    int c = mb_col & 1;
+    m -= c + r * mis;
+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
+      return 0;
+    else
+      return get_nzc_4x4_uv_sb32(
+          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
+  } else {
+    return get_nzc_4x4_uv_mb16(mi, block);
+  }
+}
+
+int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 256);
+  switch (txfm_size) {
+    case TX_32X32:
+      assert((block & 63) == 0);
+      if (block < 128) {
+        int o = (block >> 6) * 2;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o + 1,
+                          mb_row - 1, mb_col + o + 1, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;
+      }
+      if ((block & 127) == 0) {
+        int o = (block >> 7) * 2;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
+                          mb_row + o + 1, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;
+      }
+      nzc_exp <<= 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+      break;
+
+    case TX_16X16:
+      assert((block & 15) == 0);
+      if (block < 64) {
+        int o = block >> 4;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;
+      }
+      if ((block & 63) == 0) {
+        int o = block >> 6;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+      break;
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (block < 32) {
+        int o = block >> 3;
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;
+      }
+      if ((block & 31) == 0) {
+        int o = block >> 6;
+        int p = ((block >> 5) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+      break;
+
+    case TX_4X4:
+      if (block < 16) {
+        int o = block >> 2;
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);
+      }
+      if ((block & 15) == 0) {
+        int o = block >> 6;
+        int p = (block >> 4) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+      break;
+
+    default:
+      return 0;
+  }
+}
+
+int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 64);
+  switch (txfm_size) {
+    case TX_32X32:
+      assert(block == 0);
+      nzc_exp =
+          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +
+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +
+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+      break;
+
+    case TX_16X16:
+      assert((block & 15) == 0);
+      if (block < 32) {
+        int o = (block >> 4) & 1;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
+      }
+      if ((block & 31) == 0) {
+        int o = block >> 5;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+      break;
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (block < 16) {
+        int o = block >> 3;
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
+      }
+      if ((block & 15) == 0) {
+        int o = block >> 5;
+        int p = ((block >> 4) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+      break;
+
+    case TX_4X4:
+      if (block < 8) {
+        int o = block >> 2;
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
+      }
+      if ((block & 7) == 0) {
+        int o = block >> 5;
+        int p = (block >> 3) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+      break;
+
+    default:
+      return 0;
+      break;
+  }
+}
+
+// Returns the NZC (non-zero count) coding context for luma block `block`
+// of a plain 16x16 macroblock.  Neighbors inside the same mb are read from
+// cur->mbmi.nzcs[]; neighbors in the mb above / to the left go through
+// get_nzc_4x4_y().  The accumulated nzc_exp is kept at a common "64x the
+// expected 4x4-scale average" fixed-point scale via the shifts below.
+int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  assert(block < 16);
+  switch (txfm_size) {
+    case TX_16X16:
+      assert(block == 0);
+      // Bottom row (12..15) of the mb above plus right column (3,7,11,15)
+      // of the mb to the left.
+      nzc_exp =
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+
+    case TX_8X8:
+      // 8x8 transform blocks live at indices 0, 4, 8, 12 (raster 2x2 grid).
+      assert((block & 3) == 0);
+      if (block < 8) {
+        // Top row of 8x8 blocks: read the bottom 4x4 row of the mb above.
+        int p = ((block >> 2) & 1) ? 14 : 12;
+        nzc_exp =
+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +
+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
+      }
+      if ((block & 7) == 0) {
+        // Left column of 8x8 blocks: read the right 4x4 column of the left mb.
+        int p = ((block >> 3) & 1) ? 11 : 3;
+        nzc_exp +=
+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +
+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (block < 4) {
+        int p = block & 3;
+        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,
+                                12 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
+      }
+      if ((block & 3) == 0) {
+        int p = (block >> 2) & 3;
+        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,
+                                 3 + 4 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+      break;
+  }
+}
+
+// Returns the NZC coding context for chroma block `block` (256 <= block < 384)
+// of a 64x64 superblock.  The uv plane is 32x32, spanning the 4x4 grid of
+// 16x16 macroblocks; block indices are in 4x4 coefficient units, U first
+// (256..319) then V (320..383).  For SB64 the uv transform size equals the
+// luma transform size.
+int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 63);
+  const int boff = (block & 63);
+  const int base_mb16 = base >> 4;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 256 && block < 384);
+  txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_32X32:
+      assert(block == 256 || block == 320);
+      // Walk all four mbs above (bottom uv row, base_mb16+2/+3) and all four
+      // mbs to the left (right uv column, base_mb16+1/+3).
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
+                         base_mb16 + 3);
+      nzc_exp <<= 2;
+      // Note nzc_exp is 64 times the average value expected at 32x32 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
+
+    case TX_16X16:
+      // uv txfm_size 16x16
+      assert((block & 15) == 0);
+      // NOTE(review): a 16x16 uv block in column o spans mb columns
+      // 2*o..2*o+1, yet the above/left lookups below use offsets o and
+      // o + 1 — for o == 1 that samples mbs 1 and 2 instead of 2 and 3.
+      // Looks inconsistent with the TX_32X32 case above; confirm intent
+      // against the encoder before changing.
+      if (boff < 32) {
+        int o = (boff >> 4) & 1;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3) +
+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
+      }
+      if ((boff & 31) == 0) {
+        int o = boff >> 5;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
+                           mb_row + o, mb_col - 1, base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
+                           mb_row + o, mb_col - 1, base_mb16 + 3) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
+                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
+                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
+      }
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      // 8x8 uv blocks form a 4x4 grid; each spans exactly one mb column.
+      if (boff < 16) {
+        int o = boff >> 2;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
+      }
+      if ((boff & 15) == 0) {
+        int o = boff >> 4;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 8) {
+        int o = boff >> 1;
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
+      }
+      if ((boff & 7) == 0) {
+        int o = boff >> 4;
+        int p = (boff >> 3) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+// Returns the NZC coding context for chroma block `block` (64 <= block < 96)
+// of a 32x32 superblock.  The uv plane is 16x16 and spans a 2x2 grid of
+// 16x16 macroblocks; indices are in 4x4 coefficient units, U (64..79) then
+// V (80..95).  A 32x32 luma transform still implies a 16x16 uv transform.
+int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 15);
+  const int boff = (block & 15);
+  const int base_mb16 = base >> 2;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 64 && block < 96);
+  if (txfm_size == TX_32X32)
+    txfm_size_uv = TX_16X16;
+  else
+    txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_16X16:
+      // uv txfm_size 16x16
+      assert(block == 64 || block == 80);
+      // Sum the bottom uv rows of the two mbs above and the right uv
+      // columns of the two mbs to the left.  The first left neighbor is at
+      // (mb_row, mb_col - 1), the second at (mb_row + 1, mb_col - 1);
+      // the mode_info pointer must match the (mb_row, mb_col) passed in.
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                         base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
+                         base_mb16 + 3);
+      nzc_exp <<= 1;
+      // Note nzc_exp is 64 times the average value expected at 16x16 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
+
+    case TX_8X8:
+      assert((block & 3) == 0);
+      if (boff < 8) {
+        int o = boff >> 2;
+        nzc_exp =
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 2) +
+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
+      }
+      if ((boff & 7) == 0) {
+        int o = boff >> 3;
+        nzc_exp +=
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 1) +
+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                           base_mb16 + 3);
+      } else {
+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
+      }
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 4) {
+        int o = boff >> 1;
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
+      }
+      if ((boff & 3) == 0) {
+        int o = boff >> 3;
+        int p = (boff >> 2) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+// Returns the NZC coding context for chroma block `block` (16 <= block < 24)
+// of a plain 16x16 macroblock (U: 16..19, V: 20..23, in 4x4 units).  The uv
+// transform size is derived from the luma size: TX_16X16 luma -> TX_8X8 uv,
+// and TX_8X8 luma with I8X8_PRED/SPLITMV -> TX_4X4 uv.
+int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block) {
+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
+  // neighboring blocks are
+  int mis = cm->mode_info_stride;
+  int nzc_exp = 0;
+  const int base = block - (block & 3);
+  const int boff = (block & 3);
+  const int base_mb16 = base;
+  TX_SIZE txfm_size = cur->mbmi.txfm_size;
+  TX_SIZE txfm_size_uv;
+
+  assert(block >= 16 && block < 24);
+  if (txfm_size == TX_16X16)
+    txfm_size_uv = TX_8X8;
+  else if (txfm_size == TX_8X8 &&
+           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))
+    txfm_size_uv = TX_4X4;
+  else
+    txfm_size_uv = txfm_size;
+
+  switch (txfm_size_uv) {
+    case TX_8X8:
+      assert((block & 3) == 0);
+      // Bottom uv row of the mb above plus right uv column of the left mb.
+      nzc_exp =
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +
+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +
+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);
+      // Note nzc_exp is 64 times the average value expected at 8x8 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
+
+    case TX_4X4:
+      if (boff < 2) {
+        int p = boff & 1;
+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
+                                 base_mb16 + 2 + p);
+      } else {
+        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);
+      }
+      if ((boff & 1) == 0) {
+        int p = (boff >> 1) & 1;
+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
+                                  base_mb16 + 1 + 2 * p);
+      } else {
+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
+      }
+      nzc_exp >>= 1;
+      // Note nzc_exp is 64 times the average value expected at 4x4 scale
+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
+
+    default:
+      return 0;
+  }
+}
+
+// Dispatches to the appropriate per-plane NZC context function based on
+// the current block's superblock type.  Block indices below the y/uv
+// threshold (256, 64, 16 respectively) are luma; the rest are chroma.
+int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    assert(block < 384);
+    if (block < 256)
+      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    assert(block < 96);
+    if (block < 64)
+      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  } else {
+    assert(block < 64);
+    if (block < 16)
+      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,
+                                        get_mb_row(xd), get_mb_col(xd), block);
+    else
+      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,
+                                         get_mb_row(xd), get_mb_col(xd), block);
+  }
+}
+
+// Accumulates one observation into the frame-context NZC histograms:
+// maps the raw non-zero count `nzc` to its token class via codenzc(),
+// then bumps the counter indexed by (context, ref, type) in the table
+// for the given transform size.  `ref` is inter(1)/intra(0); `type` is
+// luma(0)/chroma(1).
+static void update_nzc(VP9_COMMON *cm,
+                       uint16_t nzc,
+                       int nzc_context,
+                       TX_SIZE tx_size,
+                       int ref,
+                       int type) {
+  int c;
+  c = codenzc(nzc);
+  if (tx_size == TX_32X32)
+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_16X16)
+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_8X8)
+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  else if (tx_size == TX_4X4)
+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  else
+    assert(0);
+  // TODO(debargha): Handle extra bits later if needed
+}
+
+// Accumulates NZC token counts for every transform block of a 64x64
+// superblock.  The loop stride equals the transform block size in 4x4
+// coefficient units (64 for 32x32, 16 for 16x16, 4 for 8x8, 1 for 4x4);
+// y blocks are indices 0..255, uv blocks 256..383.  Skipped mbs
+// contribute no counts.
+static void update_nzcs_sb64(VP9_COMMON *cm,
+                             MACROBLOCKD *xd,
+                             int mb_row,
+                             int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+// Accumulates NZC token counts for a 32x32 superblock (y indices 0..63,
+// uv 64..95).  For a 32x32 luma transform the uv plane still uses 16x16
+// transforms, hence the TX_16X16 uv loop inside the TX_32X32 case.
+static void update_nzcs_sb32(VP9_COMMON *cm,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      // Single 32x32 y block (loop runs once).
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
 }
 
+// Accumulates NZC token counts for a plain 16x16 macroblock (y indices
+// 0..15, uv 16..23).  The uv transform size follows the same derivation
+// as vp9_get_nzc_context_uv_mb16: TX_16X16 luma -> TX_8X8 uv, and TX_8X8
+// luma with I8X8_PRED/SPLITMV -> TX_4X4 uv.
+static void update_nzcs_mb16(VP9_COMMON *cm,
+                             MACROBLOCKD *xd,
+                             int mb_row,
+                             int mb_col) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      // Single 16x16 y block (loop runs once).
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+// Entry point for NZC count accumulation: dispatches to the per-size
+// helper based on the current block's superblock type.
+void vp9_update_nzc_counts(VP9_COMMON *cm,
+                           MACROBLOCKD *xd,
+                           int mb_row,
+                           int mb_col) {
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)
+    update_nzcs_sb64(cm, xd, mb_row, mb_col);
+  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)
+    update_nzcs_sb32(cm, xd, mb_row, mb_col);
+  else
+    update_nzcs_mb16(cm, xd, mb_row, mb_col);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 // #define COEF_COUNT_TESTING
 
 #define COEF_COUNT_SAT 24
@@ -277,10 +1481,10 @@ void vp9_coef_tree_initialize() {
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,
-                              vp9_coeff_probs *pre_coef_probs,
-                              int block_types, vp9_coeff_count *coef_counts,
-                              int count_sat, int update_factor) {
+static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs,
+                             vp9_coeff_probs *pre_coef_probs,
+                             int block_types, vp9_coeff_count *coef_counts,
+                             int count_sat, int update_factor) {
   int t, i, j, k, l, count;
   unsigned int branch_ct[ENTROPY_NODES][2];
   vp9_prob coef_probs[ENTROPY_NODES];
@@ -308,9 +1512,6 @@ static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
-#ifdef COEF_COUNT_TESTING
-  int t, i, j, k;
-#endif
   int count_sat;
   int update_factor; /* denominator 256 */
 
@@ -326,16 +1527,121 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
     count_sat = COEF_COUNT_SAT;
   }
 
-  update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
-                    BLOCK_TYPES, cm->fc.coef_counts_4x4,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
-                    BLOCK_TYPES, cm->fc.coef_counts_8x8,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
-                    BLOCK_TYPES, cm->fc.coef_counts_16x16,
-                    count_sat, update_factor);
-  update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
-                    BLOCK_TYPES, cm->fc.coef_counts_32x32,
-                    count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
+                   BLOCK_TYPES, cm->fc.coef_counts_4x4,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
+                   BLOCK_TYPES, cm->fc.coef_counts_8x8,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
+                   BLOCK_TYPES, cm->fc.coef_counts_16x16,
+                   count_sat, update_factor);
+  adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
+                   BLOCK_TYPES, cm->fc.coef_counts_32x32,
+                   count_sat, update_factor);
+}
+
+#if CONFIG_CODE_NONZEROCOUNT
+// Adapts the NZC probabilities for one transform size (block_size in
+// {4, 8, 16, 32}) from the counts collected this frame: for each
+// (context, ref, block-type) cell, derive per-node probabilities from the
+// count distribution and blend them into the previous-frame probabilities
+// with a weight proportional to the saturated branch count.  The
+// multi-dimensional tables are addressed through their [0][0][0] element
+// with manually computed flat offsets so one loop serves all sizes.
+static void adapt_nzc_probs(VP9_COMMON *cm,
+                            int block_size,
+                            int count_sat,
+                            int update_factor) {
+  int c, r, b, n;
+  int count, factor;
+  // Sized for the largest alphabet (32x32); smaller sizes use a prefix.
+  unsigned int nzc_branch_ct[NZC32X32_NODES][2];
+  vp9_prob nzc_probs[NZC32X32_NODES];
+  int tokens, nodes;
+  const vp9_tree_index *nzc_tree;
+  const struct vp9_token_struct *nzc_encodings;
+  vp9_prob *dst_nzc_probs;
+  vp9_prob *pre_nzc_probs;
+  unsigned int *nzc_counts;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_tree = vp9_nzc32x32_tree;
+    nzc_encodings = vp9_nzc32x32_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_tree = vp9_nzc16x16_tree;
+    nzc_encodings = vp9_nzc16x16_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_tree = vp9_nzc8x8_tree;
+    nzc_encodings = vp9_nzc8x8_encodings;
+    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
+  } else {
+    nzc_tree = vp9_nzc4x4_tree;
+    nzc_encodings = vp9_nzc4x4_encodings;
+    tokens = NZC4X4_TOKENS;
+    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
+  }
+  nodes = tokens - 1;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
+    for (r = 0; r < REF_TYPES; ++r)
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        int offset_tokens = offset * tokens;
+        vp9_tree_probs_from_distribution(tokens,
+                                         nzc_encodings, nzc_tree,
+                                         nzc_probs, nzc_branch_ct,
+                                         nzc_counts + offset_tokens);
+        for (n = 0; n < nodes; ++n) {
+          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          dst_nzc_probs[offset_nodes + n] =
+              weighted_prob(pre_nzc_probs[offset_nodes + n],
+                            nzc_probs[n], factor);
+        }
+      }
+}
+
+// #define NZC_COUNT_TESTING
+// Backward adaptation of all NZC probability tables at the end of a frame.
+// Mirrors vp9_adapt_coef_probs: key frames and the frame following a key
+// frame use faster adaptation parameters; then each transform size is
+// adapted from its accumulated counts.
+void vp9_adapt_nzc_probs(VP9_COMMON *cm) {
+  int count_sat;
+  int update_factor; /* denominator 256 */
+#ifdef NZC_COUNT_TESTING
+  // Debug-only dump of the raw 4x4 NZC count tables.
+  int c, r, b, t;
+  printf("\n");
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        printf("    {");
+        for (t = 0; t < NZC4X4_TOKENS; ++t) {
+          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);
+        }
+        printf("}\n");
+      }
+      printf("\n");
+    }
+#endif
+
+  if (cm->frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  } else {
+    update_factor = COEF_MAX_UPDATE_FACTOR;
+    count_sat = COEF_COUNT_SAT;
+  }
+
+  adapt_nzc_probs(cm, 4, count_sat, update_factor);
+  adapt_nzc_probs(cm, 8, count_sat, update_factor);
+  adapt_nzc_probs(cm, 16, count_sat, update_factor);
+  adapt_nzc_probs(cm, 32, count_sat, update_factor);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 8d28b00583ed7d99a7c4f6bdf6c68b85afe67d4d..25ba3c08dd403befd5485b1ecfff4b9923ab5067 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -142,4 +142,80 @@ static int get_coef_band(TX_SIZE tx_size, int coef_index) {
 }
 extern int vp9_get_coef_context(int * recent_energy, int token);
 
+#if CONFIG_CODE_NONZEROCOUNT
+/* Alphabet for number of non-zero symbols in block */
+#define NZC_0                   0       /* Used for all blocks */
+#define NZC_1                   1       /* Used for all blocks */
+#define NZC_2                   2       /* Used for all blocks */
+#define NZC_3TO4                3       /* Used for all blocks */
+#define NZC_5TO8                4       /* Used for all blocks */
+#define NZC_9TO16               5       /* Used for all blocks */
+#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */
+#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */
+#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */
+#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */
+#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */
+#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */
+
+/* Number of tokens for each block size */
+#define NZC4X4_TOKENS           6
+#define NZC8X8_TOKENS           8
+#define NZC16X16_TOKENS        10
+#define NZC32X32_TOKENS        12
+
+/* Number of nodes for each block size */
+#define NZC4X4_NODES            5
+#define NZC8X8_NODES            7
+#define NZC16X16_NODES          9
+#define NZC32X32_NODES         11
+
+/* Max number of tokens with extra bits */
+#define NZC_TOKENS_EXTRA        9
+
+/* Max number of extra bits */
+#define NZC_BITS_EXTRA          9
+
+#define MAX_NZC_CONTEXTS        3
+
+/* nzc trees */
+extern const vp9_tree_index    vp9_nzc4x4_tree[];
+extern const vp9_tree_index    vp9_nzc8x8_tree[];
+extern const vp9_tree_index    vp9_nzc16x16_tree[];
+extern const vp9_tree_index    vp9_nzc32x32_tree[];
+
+/* nzc encodings */
+extern struct vp9_token_struct  vp9_nzc4x4_encodings[NZC4X4_TOKENS];
+extern struct vp9_token_struct  vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+extern struct vp9_token_struct  vp9_nzc16x16_encodings[NZC16X16_TOKENS];
+extern struct vp9_token_struct  vp9_nzc32x32_encodings[NZC32X32_TOKENS];
+
+#define codenzc(x) (\
+  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \
+  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\
+  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)
+#define extranzcbits(c) ((c) <= 2 ? 0 : (c) - 2)
+#define basenzcvalue(c) ((c) <= 2 ? (c) : (1 << ((c) - 2)) + 1)
+
+int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,
+                               int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,
+                                int mb_row, int mb_col, int block);
+int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);
+void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,
+                           int mb_row, int mb_col);
+void vp9_adapt_nzc_probs(struct VP9Common *cm);
+
+/* Extra bit probabilities - block size agnostic */
+extern const vp9_prob Pcat_nzc[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
+                              [NZC_BITS_EXTRA];
+
+#endif  // CONFIG_CODE_NONZEROCOUNT
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 48d19a332c49a83fad5fc3848cc990ffa593e401..2cdb9c4689404a5f5c5bfc1b214e89ed3286ab0e 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -58,10 +58,21 @@ typedef struct frame_contexts {
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                        [NZC4X4_NODES];
+  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                        [NZC8X8_NODES];
+  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC16X16_NODES];
+  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                          [NZC32X32_NODES];
+#endif
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
@@ -84,11 +95,31 @@ typedef struct frame_contexts {
   vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                            [NZC4X4_NODES];
+  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                            [NZC8X8_NODES];
+  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                              [NZC16X16_NODES];
+  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                              [NZC32X32_NODES];
+#endif
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC4X4_TOKENS];
+  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                             [NZC8X8_TOKENS];
+  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                               [NZC16X16_TOKENS];
+  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
+                               [NZC32X32_TOKENS];
+#endif
 
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@@ -300,4 +331,31 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
   buf[new_idx]++;
 }
 
+// TODO(debargha): merge the two functions
+static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mb_row, int block_size) {
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->up_available    = (mb_row != 0);
+}
+
+static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mb_col, int block_size) {
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
+  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+}
+
+static int get_mb_row(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_top_edge) >> 7);
+}
+
+static int get_mb_col(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_left_edge) >> 7);
+}
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 326c80239ee4c49893fe65c8b15e2f7a7418fbc0..89dcdc09d9ab283bb9ddf264ec10733206395310 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -173,7 +173,6 @@ static void kfread_modes(VP9D_COMP *pbi,
       m->mbmi.mb_skip_coeff = 0;
   }
 
-
   y_mode = m->mbmi.sb_type ?
       read_kf_sb_ymode(bc,
           pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):
@@ -677,22 +676,23 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   mbmi->need_to_clamp_secondmv = 0;
   mbmi->second_ref_frame = NONE;
 
+  // Make sure the MACROBLOCKD mode info pointer is pointed at the
+  // correct entry for the current macroblock.
+  xd->mode_info_context = mi;
+  xd->prev_mode_info_context = prev_mi;
+
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
   // that are in 1/8th pel units
-  xd->mb_to_left_edge = mb_to_left_edge
-                      = -((mb_col * 16) << 3);
+  set_mb_row(cm, xd, mb_row, mb_size);
+  set_mb_col(cm, xd, mb_col, mb_size);
+
+  mb_to_left_edge = xd->mb_to_left_edge;
   mb_to_left_edge -= LEFT_TOP_MARGIN;
 
-  xd->mb_to_right_edge = mb_to_right_edge
-                       = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
+  mb_to_right_edge = xd->mb_to_right_edge;
   mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
 
-  // Make sure the MACROBLOCKD mode info pointer is pointed at the
-  // correct entry for the current macroblock.
-  xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
-
   // Read the macroblock segment id.
   read_mb_segment_id(pbi, mb_row, mb_col, bc);
 
@@ -750,17 +750,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
       vp9_mv_ref_probs(&pbi->common, mv_ref_p,
                        mbmi->mb_mode_context[ref_frame]);
-      /*
-      if (pbi->common.current_video_frame == 1) {
-	int k = mbmi->mb_mode_context[ref_frame];
-	printf("vp9_mode_contexts: [%d %d %d %d] %d %d %d %d\n",
-	       mb_row, mb_col, ref_frame, k,
-	       cm->fc.vp9_mode_contexts[k][0],
-	       cm->fc.vp9_mode_contexts[k][1],
-	       cm->fc.vp9_mode_contexts[k][2],
-	       cm->fc.vp9_mode_contexts[k][3]);
-      }
-      */
 
       // If the segment level skip mode enabled
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -1176,20 +1165,270 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) {
   vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
   if (pbi->common.mb_no_coeff_skip) {
     int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
       cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+    }
   }
 
   mb_mode_mv_init(pbi, bc);
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static uint16_t read_nzc(VP9_COMMON *const cm,
+                         int nzc_context,
+                         TX_SIZE tx_size,
+                         int ref,
+                         int type,
+                         BOOL_DECODER* const bc) {
+  int c, e;
+  uint16_t nzc;
+  if (tx_size == TX_32X32) {
+    c = treed_read(bc, vp9_nzc32x32_tree,
+                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);
+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_16X16) {
+    c = treed_read(bc, vp9_nzc16x16_tree,
+                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);
+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_8X8) {
+    c = treed_read(bc, vp9_nzc8x8_tree,
+                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);
+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_4X4) {
+    c = treed_read(bc, vp9_nzc4x4_tree,
+                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);
+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  } else {
+    assert(0);
+  }
+  nzc = basenzcvalue(c);
+  if ((e = extranzcbits(c))) {
+    int x = 0;
+    while (e--)
+      x |= (vp9_read(bc, Pcat_nzc[nzc_context][c - 3][e]) << e);
+    nzc += x;
+  }
+  if (tx_size == TX_32X32)
+    assert(nzc <= 1024);
+  else if (tx_size == TX_16X16)
+    assert(nzc <= 256);
+  else if (tx_size == TX_8X8)
+    assert(nzc <= 64);
+  else if (tx_size == TX_4X4)
+    assert(nzc <= 16);
+  return nzc;
+}
+
+static void read_nzcs_sb64(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void read_nzcs_sb32(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void read_nzcs_mb16(VP9_COMMON *const cm,
+                           MACROBLOCKD* xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
                            int mb_row,
                            int mb_col,
                            BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
   MODE_INFO *prev_mi = xd->prev_mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (pbi->common.frame_type == KEY_FRAME) {
     kfread_modes(pbi, mi, mb_row, mb_col, bc);
@@ -1199,4 +1438,28 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                       mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
                       pbi->common.active_ref_scale);
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)
+    read_nzcs_sb64(cm, xd, mb_row, mb_col, bc);
+  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)
+    read_nzcs_sb32(cm, xd, mb_row, mb_col, bc);
+  else
+    read_nzcs_mb16(cm, xd, mb_row, mb_col, bc);
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
+  if (mbmi->sb_type) {
+    const int n_mbs = 1 << mbmi->sb_type;
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int mis = cm->mode_info_stride;
+    int x, y;
+
+    for (y = 0; y < y_mbs; y++) {
+      for (x = !y; x < x_mbs; x++) {
+        mi[y * mis + x] = *mi;
+      }
+    }
+  } else {
+    update_blockd_bmi(xd);
+  }
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b44d6595b2253586165f35d001fa7135d12491c4..8dfb3e851698ddc6ac573073fd8935049f4ce4d1 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -39,7 +39,7 @@
 
 #define COEFCOUNT_TESTING
 
-//#define DEC_DEBUG
+// #define DEC_DEBUG
 #ifdef DEC_DEBUG
 int dec_debug = 0;
 #endif
@@ -246,7 +246,7 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
     int i;
     printf("\n");
     printf("qcoeff 8x8\n");
-    for (i = 0; i < 400; i++) {
+    for (i = 0; i < 384; i++) {
       printf("%3d ", xd->qcoeff[i]);
       if (i % 16 == 15) printf("\n");
     }
@@ -862,14 +862,9 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
    * values that are in 1/8th pel units
    */
   block_size >>= 4;  // in mb units
-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
 
-  xd->up_available    = (mb_row != 0);
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  set_mb_row(cm, xd, mb_row, block_size);
+  set_mb_col(cm, xd, mb_col, block_size);
 
   xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
   xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
@@ -910,20 +905,6 @@ static void set_refs(VP9D_COMP *pbi, int block_size,
       xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
     }
   }
-
-  if (mbmi->sb_type) {
-    const int n_mbs = 1 << mbmi->sb_type;
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
-    const int mis = cm->mode_info_stride;
-    int x, y;
-
-    for (y = 0; y < y_mbs; y++) {
-      for (x = !y; x < x_mbs; x++) {
-        mi[y * mis + x] = *mi;
-      }
-    }
-  }
 }
 
 /* Decode a row of Superblocks (2x2 region of MBs) */
@@ -938,6 +919,11 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
   for (mb_col = pc->cur_tile_mb_col_start;
        mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
     if (vp9_read(bc, pc->sb64_coded)) {
+#ifdef DEC_DEBUG
+      dec_debug = (pc->current_video_frame == 1 && mb_row == 0 && mb_col == 0);
+      if (dec_debug)
+        printf("Debug\n");
+#endif
       set_offsets(pbi, 64, mb_row, mb_col);
       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
       set_refs(pbi, 64, mb_row, mb_col);
@@ -958,6 +944,10 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
         xd->sb_index = j;
 
         if (vp9_read(bc, pc->sb32_coded)) {
+#ifdef DEC_DEBUG
+          dec_debug = (pc->current_video_frame == 1 &&
+                       mb_row + y_idx_sb == 0 && mb_col + x_idx_sb == 0);
+#endif
           set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
           vp9_decode_mb_mode_mv(pbi,
                                 xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
@@ -978,11 +968,14 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
               // MB lies outside frame, skip on to next
               continue;
             }
+#ifdef DEC_DEBUG
+            dec_debug = (pc->current_video_frame == 1 &&
+                         mb_row + y_idx == 0 && mb_col + x_idx == 0);
+#endif
 
             set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
             xd->mb_index = i;
             vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
-            update_blockd_bmi(xd);
             set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
             decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
 
@@ -1073,6 +1066,63 @@ static void init_frame(VP9D_COMP *pbi) {
     xd->fullpixel_mask = 0xfffffff8;
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void read_nzc_probs_common(VP9_COMMON *cm,
+                                  BOOL_DECODER* const bc,
+                                  int block_size) {
+  int c, r, b, t;
+  int tokens, nodes;
+  vp9_prob *nzc_probs;
+  vp9_prob upd;
+
+  if (!vp9_read_bit(bc)) return;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    upd = NZC_UPDATE_PROB_32X32;
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    upd = NZC_UPDATE_PROB_16X16;
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    upd = NZC_UPDATE_PROB_8X8;
+  } else {
+    tokens = NZC4X4_TOKENS;
+    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    upd = NZC_UPDATE_PROB_4X4;
+  }
+  nodes = tokens - 1;
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        for (t = 0; t < nodes; ++t) {
+          vp9_prob *p = &nzc_probs[offset_nodes + t];
+          if (vp9_read(bc, upd)) {
+            *p = read_prob_diff_update(bc, *p);
+          }
+        }
+      }
+    }
+  }
+}
+
+static void read_nzc_probs(VP9_COMMON *cm,
+                           BOOL_DECODER* const bc) {
+  read_nzc_probs_common(cm, bc, 4);
+  if (cm->txfm_mode != ONLY_4X4)
+    read_nzc_probs_common(cm, bc, 8);
+  if (cm->txfm_mode > ALLOW_8X8)
+    read_nzc_probs_common(cm, bc, 16);
+  if (cm->txfm_mode > ALLOW_16X16)
+    read_nzc_probs_common(cm, bc, 32);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static void read_coef_probs_common(BOOL_DECODER* const bc,
                                    vp9_coeff_probs *coef_probs,
                                    int block_types) {
@@ -1085,7 +1135,7 @@ static void read_coef_probs_common(BOOL_DECODER* const bc,
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
             if (l >= 3 && k == 0)
               continue;
-            for (m = 0; m < ENTROPY_NODES; m++) {
+            for (m = CONFIG_CODE_NONZEROCOUNT; m < ENTROPY_NODES; m++) {
               vp9_prob *const p = coef_probs[i][j][k][l] + m;
 
               if (vp9_read(bc, COEF_UPDATE_PROB)) {
@@ -1539,6 +1589,17 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;
 #endif
   pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(pbi->common.fc.pre_nzc_probs_4x4,
+           pbi->common.fc.nzc_probs_4x4);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_8x8,
+           pbi->common.fc.nzc_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_16x16,
+           pbi->common.fc.nzc_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_nzc_probs_32x32,
+           pbi->common.fc.nzc_probs_32x32);
+#endif
+
   vp9_zero(pbi->common.fc.coef_counts_4x4);
   vp9_zero(pbi->common.fc.coef_counts_8x8);
   vp9_zero(pbi->common.fc.coef_counts_16x16);
@@ -1555,8 +1616,17 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 #if CONFIG_COMP_INTERINTRA_PRED
   vp9_zero(pbi->common.fc.interintra_counts);
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(pbi->common.fc.nzc_counts_4x4);
+  vp9_zero(pbi->common.fc.nzc_counts_8x8);
+  vp9_zero(pbi->common.fc.nzc_counts_16x16);
+  vp9_zero(pbi->common.fc.nzc_counts_32x32);
+#endif
 
   read_coef_probs(pbi, &header_bc);
+#if CONFIG_CODE_NONZEROCOUNT
+  read_nzc_probs(&pbi->common, &header_bc);
+#endif
 
   /* Initialize xd pointers. Any reference should do for xd->pre, so use 0. */
   vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
@@ -1700,8 +1770,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   }
 
   if (!pc->error_resilient_mode &&
-      !pc->frame_parallel_decoding_mode)
+      !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
+#if CONFIG_CODE_NONZEROCOUNT
+    vp9_adapt_nzc_probs(pc);
+#endif
+  }
   if (pc->frame_type != KEY_FRAME) {
     if (!pc->error_resilient_mode &&
         !pc->frame_parallel_decoding_mode) {
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index a192266efc8f6cd0361b834c52737e5f516a3d5b..a53edfc3cda38d7ca4341571cb875f4684bc1a14 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -69,13 +69,24 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) {
     pt = vp9_get_coef_context(&recent_energy, token);         \
   } while (0)
 
+#if CONFIG_CODE_NONZEROCOUNT
 #define WRITE_COEF_CONTINUE(val, token)                       \
   {                                                           \
-    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);        \
+    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);      \
     INCREMENT_COUNT(token);                                   \
     c++;                                                      \
+    nzc++;                                           \
     continue;                                                 \
   }
+#else
+#define WRITE_COEF_CONTINUE(val, token)                       \
+  {                                                           \
+    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);      \
+    INCREMENT_COUNT(token);                                   \
+    c++;                                                      \
+    continue;                                                 \
+  }
+#endif  // CONFIG_CODE_NONZEROCOUNT
 
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
@@ -99,6 +110,10 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
   const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+#if CONFIG_CODE_NONZEROCOUNT
+  uint16_t nzc = 0;
+  uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx];
+#endif
 
   if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
     aidx = vp9_block2above_sb64[txfm_size][block_idx];
@@ -170,12 +185,24 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
 
     if (c >= seg_eob)
       break;
+#if CONFIG_CODE_NONZEROCOUNT
+    if (nzc == nzc_expected)
+      break;
+#endif
     prob = coef_probs[type][ref][get_coef_band(txfm_size, c)][pt];
+#if CONFIG_CODE_NONZEROCOUNT == 0
     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
       break;
+#endif
 SKIP_START:
     if (c >= seg_eob)
       break;
+#if CONFIG_CODE_NONZEROCOUNT
+    if (nzc == nzc_expected)
+      break;
+    // decode zero node only if there are zeros left
+    if (seg_eob - nzc_expected - c + nzc > 0)
+#endif
     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
@@ -242,8 +269,10 @@ SKIP_START:
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   if (c < seg_eob)
     coef_counts[type][ref][get_coef_band(txfm_size, c)][pt][DCT_EOB_TOKEN]++;
+#endif
 
   A0[aidx] = L0[lidx] = c > 0;
   if (txfm_size >= TX_8X8) {
@@ -272,7 +301,6 @@ SKIP_START:
       }
     }
   }
-
   return c;
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b05da870c8183066e7871ba174ab3cb086986c39..c6267f172e1c4d7ce96c80c8a07772ea9ef16ac9 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -720,10 +720,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV
   // values that are in 1/8th pel units
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-  xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3;
-  xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3;
+
+  set_mb_row(pc, xd, mb_row, mb_size);
+  set_mb_col(pc, xd, mb_col, mb_size);
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -752,18 +751,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
   } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
-    const int nmbs = mb_size;
-    const int xmbs = MIN(nmbs, mb_cols_left);
-    const int ymbs = MIN(nmbs, mb_rows_left);
-    int x, y;
-
-    skip_coeff = 1;
-    for (y = 0; y < ymbs; y++) {
-      for (x = 0; x < xmbs; x++) {
-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
-      }
-    }
-
+    skip_coeff = m->mbmi.mb_skip_coeff;
     vp9_write(bc, skip_coeff,
               vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
   }
@@ -967,7 +955,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
-                              const MODE_INFO *m,
+                              MODE_INFO *m,
                               vp9_writer *bc,
                               int mb_rows_left, int mb_cols_left) {
   const VP9_COMMON *const c = &cpi->common;
@@ -986,18 +974,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
   } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
-    const int nmbs = 1 << m->mbmi.sb_type;
-    const int xmbs = MIN(nmbs, mb_cols_left);
-    const int ymbs = MIN(nmbs, mb_rows_left);
-    int x, y;
-
-    skip_coeff = 1;
-    for (y = 0; y < ymbs; y++) {
-      for (x = 0; x < xmbs; x++) {
-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
-      }
-    }
-
+    skip_coeff = m->mbmi.mb_skip_coeff;
     vp9_write(bc, skip_coeff,
               vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
@@ -1055,30 +1032,275 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
   }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void write_nzc(VP9_COMMON *const cm,
+                      uint16_t nzc,
+                      int nzc_context,
+                      TX_SIZE tx_size,
+                      int ref,
+                      int type,
+                      vp9_writer* const bc) {
+  int c, e;
+  c = codenzc(nzc);
+  if (tx_size == TX_32X32) {
+    write_token(bc, vp9_nzc32x32_tree,
+                cm->fc.nzc_probs_32x32[nzc_context][ref][type],
+                vp9_nzc32x32_encodings + c);
+    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_16X16) {
+    write_token(bc, vp9_nzc16x16_tree,
+                cm->fc.nzc_probs_16x16[nzc_context][ref][type],
+                vp9_nzc16x16_encodings + c);
+    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_8X8) {
+    write_token(bc, vp9_nzc8x8_tree,
+                cm->fc.nzc_probs_8x8[nzc_context][ref][type],
+                vp9_nzc8x8_encodings + c);
+    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
+  } else if (tx_size == TX_4X4) {
+    write_token(bc, vp9_nzc4x4_tree,
+                cm->fc.nzc_probs_4x4[nzc_context][ref][type],
+                vp9_nzc4x4_encodings + c);
+    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+  } else {
+    assert(0);
+  }
+
+  if ((e = extranzcbits(c))) {
+    int x = nzc - basenzcvalue(c);
+    while (e--)
+      vp9_write(bc, (x >> e) & 1, Pcat_nzc[nzc_context][c - 3][e]);
+  }
+}
+
+static void write_nzcs_sb64(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 256; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 64) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 256; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 256; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 256; j < 384; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 256; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 256; j < 384; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void write_nzcs_sb32(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_32X32:
+      for (j = 0; j < 64; j += 64) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_16X16:
+      for (j = 0; j < 64; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 16) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 64; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      for (j = 64; j < 96; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 64; ++j) {
+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 64; j < 96; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void write_nzcs_mb16(VP9_COMP *cpi,
+                            MACROBLOCKD *xd,
+                            int mb_row,
+                            int mb_col,
+                            vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO *m = xd->mode_info_context;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  int j, nzc_context;
+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+
+  assert(mb_col == get_mb_col(xd));
+  assert(mb_row == get_mb_row(xd));
+
+  if (mi->mb_skip_coeff)
+    return;
+
+  switch (mi->txfm_size) {
+    case TX_16X16:
+      for (j = 0; j < 16; j += 16) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
+      }
+      for (j = 16; j < 24; j += 4) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+      }
+      break;
+
+    case TX_8X8:
+      for (j = 0; j < 16; j += 4) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
+      }
+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
+        for (j = 16; j < 24; ++j) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+        }
+      } else {
+        for (j = 16; j < 24; j += 4) {
+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
+        }
+      }
+      break;
+
+    case TX_4X4:
+      for (j = 0; j < 16; ++j) {
+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+      }
+      for (j = 16; j < 24; ++j) {
+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+#endif
+
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
                           int mb_row, int mb_col) {
-  VP9_COMMON *const c = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
   xd->mode_info_context = m;
-  xd->left_available = mb_col > c->cur_tile_mb_col_start;
-  xd->right_available =
-      (mb_col + (1 << m->mbmi.sb_type)) < c->cur_tile_mb_col_end;
-  xd->up_available = mb_row > 0;
-  if (c->frame_type == KEY_FRAME) {
+  set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type));
+  set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type));
+  if (cm->frame_type == KEY_FRAME) {
     write_mb_modes_kf(cpi, m, bc,
-                      c->mb_rows - mb_row, c->mb_cols - mb_col);
+                      cm->mb_rows - mb_row, cm->mb_cols - mb_col);
 #ifdef ENTROPY_STATS
     active_section = 8;
 #endif
   } else {
     pack_inter_mode_mvs(cpi, m, bc,
-                        c->mb_rows - mb_row, c->mb_cols - mb_col);
+                        cm->mb_rows - mb_row, cm->mb_cols - mb_col);
 #ifdef ENTROPY_STATS
     active_section = 1;
 #endif
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)
+    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);
+  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)
+    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);
+  else
+    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);
+#endif
 
   assert(*tok < tok_end);
   pack_mb_tokens(bc, tok, tok_end);
@@ -1232,6 +1454,157 @@ static void build_coeff_contexts(VP9_COMP *cpi) {
                           cpi->frame_branch_ct_32x32, BLOCK_TYPES);
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void update_nzc_probs_common(VP9_COMP* cpi,
+                                    vp9_writer* const bc,
+                                    int block_size) {
+  VP9_COMMON *cm = &cpi->common;
+  int c, r, b, t;
+  int update[2] = {0, 0};
+  int savings = 0;
+  int tokens, nodes;
+  const vp9_tree_index *nzc_tree;
+  const struct vp9_token_struct *nzc_encodings;
+  vp9_prob *new_nzc_probs;
+  vp9_prob *old_nzc_probs;
+  unsigned int *nzc_counts;
+  unsigned int (*nzc_branch_ct)[2];
+  vp9_prob upd;
+
+  if (block_size == 32) {
+    tokens = NZC32X32_TOKENS;
+    nzc_tree = vp9_nzc32x32_tree;
+    nzc_encodings = vp9_nzc32x32_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];
+    upd = NZC_UPDATE_PROB_32X32;
+  } else if (block_size == 16) {
+    tokens = NZC16X16_TOKENS;
+    nzc_tree = vp9_nzc16x16_tree;
+    nzc_encodings = vp9_nzc16x16_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];
+    upd = NZC_UPDATE_PROB_16X16;
+  } else if (block_size == 8) {
+    tokens = NZC8X8_TOKENS;
+    nzc_tree = vp9_nzc8x8_tree;
+    nzc_encodings = vp9_nzc8x8_encodings;
+    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];
+    upd = NZC_UPDATE_PROB_8X8;
+  } else {
+    nzc_tree = vp9_nzc4x4_tree;
+    nzc_encodings = vp9_nzc4x4_encodings;
+    tokens = NZC4X4_TOKENS;
+    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
+    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];
+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
+    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];
+    upd = NZC_UPDATE_PROB_4X4;
+  }
+  nodes = tokens - 1;
+  // Get the new probabilities and the branch counts
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        int offset_tokens = offset * tokens;
+        vp9_tree_probs_from_distribution(tokens,
+                                         nzc_encodings, nzc_tree,
+                                         new_nzc_probs + offset_nodes,
+                                         nzc_branch_ct + offset_nodes,
+                                         nzc_counts + offset_tokens);
+      }
+    }
+  }
+
+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+        int offset_nodes = offset * nodes;
+        for (t = 0; t < nodes; ++t) {
+          vp9_prob newp = new_nzc_probs[offset_nodes + t];
+          vp9_prob oldp = old_nzc_probs[offset_nodes + t];
+          int s, u = 0;
+#if defined(SEARCH_NEWP)
+          s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
+                                              oldp, &newp, upd);
+          if (s > 0 && newp != oldp)
+            u = 1;
+          if (u)
+            savings += s - (int)(vp9_cost_zero(upd));
+          else
+            savings -= (int)(vp9_cost_zero(upd));
+#else
+          s = prob_update_savings(nzc_branch_ct[offset_nodes],
+                                  oldp, newp, upd);
+          if (s > 0)
+            u = 1;
+          if (u)
+            savings += s;
+#endif
+          update[u]++;
+        }
+      }
+    }
+  }
+  if (update[1] == 0 || savings < 0) {
+    vp9_write_bit(bc, 0);
+  } else {
+    vp9_write_bit(bc, 1);
+    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
+      for (r = 0; r < REF_TYPES; ++r) {
+        for (b = 0; b < BLOCK_TYPES; ++b) {
+          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
+          int offset_nodes = offset * nodes;
+          for (t = 0; t < nodes; ++t) {
+            vp9_prob newp = new_nzc_probs[offset_nodes + t];
+            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];
+            int s, u = 0;
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
+                                                *oldp, &newp, upd);
+            if (s > 0 && newp != *oldp)
+              u = 1;
+#else
+            s = prob_update_savings(nzc_branch_ct[offset_nodes],
+                                    *oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+#endif
+            vp9_write(bc, u, upd);
+            if (u) {
+              /* send/use new probability */
+              write_prob_diff_update(bc, newp, *oldp);
+              *oldp = newp;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void update_nzc_probs(VP9_COMP* cpi,
+                             vp9_writer* const bc) {
+  update_nzc_probs_common(cpi, bc, 4);
+  if (cpi->common.txfm_mode != ONLY_4X4)
+    update_nzc_probs_common(cpi, bc, 8);
+  if (cpi->common.txfm_mode > ALLOW_8X8)
+    update_nzc_probs_common(cpi, bc, 16);
+  if (cpi->common.txfm_mode > ALLOW_16X16)
+    update_nzc_probs_common(cpi, bc, 32);
+}
+#endif  // CONFIG_CODE_NONZEROCOUNT
+
 static void update_coef_probs_common(vp9_writer* const bc,
 #ifdef ENTROPY_STATS
                                      VP9_COMP *cpi,
@@ -1253,7 +1626,7 @@ static void update_coef_probs_common(vp9_writer* const bc,
       for (k = 0; k < COEF_BANDS; ++k) {
         int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          for (t = 0; t < ENTROPY_NODES; ++t) {
+          for (t = CONFIG_CODE_NONZEROCOUNT; t < ENTROPY_NODES; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
             const vp9_prob upd = COEF_UPDATE_PROB;
@@ -1299,7 +1672,7 @@ static void update_coef_probs_common(vp9_writer* const bc,
           int prev_coef_savings[ENTROPY_NODES] = {0};
           for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
             // calc probs and branch cts for this frame only
-            for (t = 0; t < ENTROPY_NODES; ++t) {
+            for (t = CONFIG_CODE_NONZEROCOUNT; t < ENTROPY_NODES; ++t) {
               vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
               vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
               const vp9_prob upd = COEF_UPDATE_PROB;
@@ -1900,6 +2273,24 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
            cpi->common.fc.coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
            cpi->common.fc.coef_probs_32x32);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,
+           cpi->common.fc.nzc_probs_4x4);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,
+           cpi->common.fc.nzc_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,
+           cpi->common.fc.nzc_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,
+           cpi->common.fc.nzc_probs_32x32);
+  // NOTE that if the counts are reset, we also need to uncomment
+  // the count updates in the write_nzc function
+  /*
+  vp9_zero(cpi->common.fc.nzc_counts_4x4);
+  vp9_zero(cpi->common.fc.nzc_counts_8x8);
+  vp9_zero(cpi->common.fc.nzc_counts_16x16);
+  vp9_zero(cpi->common.fc.nzc_counts_32x32);
+  */
+#endif
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
@@ -1916,6 +2307,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   vp9_zero(cpi->common.fc.mv_ref_ct)
 
   update_coef_probs(cpi, &header_bc);
+#if CONFIG_CODE_NONZEROCOUNT
+  update_nzc_probs(cpi, &header_bc);
+#endif
 
 #ifdef ENTROPY_STATS
   active_section = 2;
@@ -1927,8 +2321,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
     int k;
 
     vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
       vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
+    }
   }
 
   if (pc->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 560c37171c078f15d7e5e1c5e12edb8c447b7706..439006156ecbadfb929fc36b71d5a6a256abc816 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -151,6 +151,12 @@ struct macroblock {
   unsigned char *active_ptr;
 
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+#if CONFIG_CODE_NONZEROCOUNT
+  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];
+  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];
+  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];
+  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];
+#endif
 
   int optimize;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a4dbdc5a87685f6c29f4141ab6cd91a890626f9f..f2be96dd77598b5cc4b7edc70ec00eab1a120a53 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -630,10 +630,6 @@ static void set_offsets(VP9_COMP *cpi,
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   const int idx_str = xd->mode_info_stride * mb_row + mb_col;
 
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 2 &&
-               mb_row == 4 && mb_col == 5);
-#endif
   // entropy context structures
   xd->above_context = cm->above_context + mb_col;
   xd->left_context  = cm->left_context + (mb_row & 3);
@@ -668,15 +664,8 @@ static void set_offsets(VP9_COMP *cpi,
   // Set up distance of MB to edge of frame in 1/8th pel units
   block_size >>= 4;  // in macroblock units
   assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
-
-  // Are edges available for intra prediction?
-  xd->up_available    = (mb_row != 0);
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  set_mb_row(cm, xd, mb_row, block_size);
+  set_mb_col(cm, xd, mb_col, block_size);
 
   /* set up source buffers */
   setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
@@ -891,7 +880,7 @@ static void pick_sb64_modes(VP9_COMP *cpi,
   }
 }
 
-static void update_stats(VP9_COMP *cpi) {
+static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -945,6 +934,9 @@ static void update_stats(VP9_COMP *cpi) {
     if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
       cpi->inter_zz_count++;
   }
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);
+#endif
 }
 
 static void encode_sb(VP9_COMP *cpi,
@@ -963,8 +955,9 @@ static void encode_sb(VP9_COMP *cpi,
 
     encode_superblock32(cpi, tp,
                         output_enabled, mb_row, mb_col);
-    if (output_enabled)
-      update_stats(cpi);
+    if (output_enabled) {
+      update_stats(cpi, mb_row, mb_col);
+    }
 
     if (output_enabled) {
       (*tp)->Token = EOSB_TOKEN;
@@ -992,12 +985,13 @@ static void encode_sb(VP9_COMP *cpi,
 
       encode_macroblock(cpi, tp,
                         output_enabled, mb_row + y_idx, mb_col + x_idx);
-      if (output_enabled)
-        update_stats(cpi);
+      if (output_enabled) {
+        update_stats(cpi, mb_row + y_idx, mb_col + x_idx);
+      }
 
       if (output_enabled) {
         (*tp)->Token = EOSB_TOKEN;
-        (*tp)++;
+        (*tp)++;
         if (mb_row + y_idx < cm->mb_rows)
           cpi->tplist[mb_row + y_idx].stop = *tp;
       }
@@ -1029,7 +1023,7 @@ static void encode_sb64(VP9_COMP *cpi,
     update_state(cpi, &x->sb64_context, 64, 1);
     encode_superblock64(cpi, tp,
                         1, mb_row, mb_col);
-    update_stats(cpi);
+    update_stats(cpi, mb_row, mb_col);
 
     (*tp)->Token = EOSB_TOKEN;
     (*tp)++;
@@ -1286,6 +1280,12 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_zero(cpi->coef_counts_8x8);
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->coef_counts_32x32);
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(cm->fc.nzc_counts_4x4);
+  vp9_zero(cm->fc.nzc_counts_8x8);
+  vp9_zero(cm->fc.nzc_counts_16x16);
+  vp9_zero(cm->fc.nzc_counts_32x32);
+#endif
 #if CONFIG_NEW_MVREF
   vp9_zero(cpi->mb_mv_ref_count);
 #endif
@@ -1327,30 +1327,34 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
     {
       // Take tiles into account and give start/end MB
-      int tile_col;
+      int tile_col, tile_row;
       TOKENEXTRA *tp = cpi->tok;
-      for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
-        TOKENEXTRA *tp_old = tp;
-        // For each row of SBs in the frame
-        vp9_get_tile_col_offsets(cm, tile_col);
 
-        for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
-          encode_sb_row(cpi, mb_row, &tp, &totalrate);
+      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        vp9_get_tile_row_offsets(cm, tile_row);
+
+        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+          TOKENEXTRA *tp_old = tp;
+
+          // For each row of SBs in the frame
+          vp9_get_tile_col_offsets(cm, tile_col);
+          for (mb_row = cm->cur_tile_mb_row_start;
+               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {
+            encode_sb_row(cpi, mb_row, &tp, &totalrate);
+          }
+          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
         }
-        cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
       }
     }
 
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
   }
 
   // 256 rate units to the bit,
   // projected_frame_size in units of BYTES
   cpi->projected_frame_size = totalrate >> 8;
 
-
 #if 0
   // Keep record of the total distortion this time around for future use
   cpi->last_frame_distortion = cpi->frame_distortion;
@@ -1930,6 +1934,135 @@ static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
   }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void gather_nzcs_mb16(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i;
+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 24; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 16; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+          xd->mode_info_context->mbmi.mode == SPLITMV) {
+        for (i = 16; i < 24; ++i) {
+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+        }
+      } else {
+        for (i = 16; i < 24; i += 4) {
+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+        }
+      }
+      break;
+
+    case TX_16X16:
+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
+      for (i = 16; i < 24; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+}
+
+static void gather_nzcs_sb32(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i, j;
+  MODE_INFO *m = xd->mode_info_context;
+  int mis = cm->mode_info_stride;
+  vpx_memset(m->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 96; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 96; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_16X16:
+      for (i = 0; i < 96; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_32X32:
+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
+      for (i = 64; i < 96; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+  for (i = 0; i < 2; ++i)
+    for (j = 0; j < 2; ++j) {
+      if (i == 0 && j == 0) continue;
+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
+                 384 * sizeof(m->mbmi.nzcs[0]));
+    }
+}
+
+static void gather_nzcs_sb64(VP9_COMMON *const cm,
+                             MACROBLOCKD *xd) {
+  int i, j;
+  MODE_INFO *m = xd->mode_info_context;
+  int mis = cm->mode_info_stride;
+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_4X4:
+      for (i = 0; i < 384; ++i) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_8X8:
+      for (i = 0; i < 384; i += 4) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_16X16:
+      for (i = 0; i < 384; i += 16) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    case TX_32X32:
+      for (i = 0; i < 384; i += 64) {
+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
+      }
+      break;
+
+    default:
+      break;
+  }
+  for (i = 0; i < 4; ++i)
+    for (j = 0; j < 4; ++j) {
+      if (i == 0 && j == 0) continue;
+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
+                 384 * sizeof(m->mbmi.nzcs[0]));
+    }
+}
+#endif
+
 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled,
                               int mb_row, int mb_col) {
@@ -1944,8 +2077,8 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
   assert(!xd->mode_info_context->mbmi.sb_type);
 
 #ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 2 &&
-               mb_row == 5 && mb_col == 18);
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
   if (enc_debug)
     printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
 #endif
@@ -1997,14 +2130,14 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     }
 #endif
     if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(x);
+      vp9_encode_intra16x16mbuv(cm, x);
       vp9_encode_intra4x4mby(x);
     } else if (mbmi->mode == I8X8_PRED) {
       vp9_encode_intra8x8mby(x);
       vp9_encode_intra8x8mbuv(x);
     } else {
-      vp9_encode_intra16x16mbuv(x);
-      vp9_encode_intra16x16mby(x);
+      vp9_encode_intra16x16mbuv(cm, x);
+      vp9_encode_intra16x16mby(cm, x);
     }
 
     if (output_enabled)
@@ -2051,7 +2184,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     }
 
     if (!x->skip) {
-      vp9_encode_inter16x16(x, mb_row, mb_col);
+      vp9_encode_inter16x16(cm, x, mb_row, mb_col);
 
       // Clear mb_skip_coeff if mb_no_coeff_skip is not set
       if (!cpi->common.mb_no_coeff_skip)
@@ -2079,12 +2212,12 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
   }
 
   if (!x->skip) {
-#if 0  // def ENC_DEBUG
+#ifdef ENC_DEBUG
     if (enc_debug) {
       int i, j;
       printf("\n");
       printf("qcoeff\n");
-      for (i = 0; i < 400; i++) {
+      for (i = 0; i < 384; i++) {
         printf("%3d ", xd->qcoeff[i]);
         if (i % 16 == 15) printf("\n");
       }
@@ -2131,6 +2264,9 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     }
 #endif
 
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_mb16(cm, xd);
+#endif
     vp9_tokenize_mb(cpi, xd, t, !output_enabled);
 
   } else {
@@ -2197,6 +2333,12 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
   unsigned int segment_id = mi->mbmi.segment_id;
   const int mis = cm->mode_info_stride;
 
+#ifdef ENC_DEBUG
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
+  if (enc_debug)
+    printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled);
+#endif
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
@@ -2294,8 +2436,8 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sby_32x32(x);
         vp9_quantize_sbuv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sby_32x32(x);
-          vp9_optimize_sbuv_16x16(x);
+          vp9_optimize_sby_32x32(cm, x);
+          vp9_optimize_sbuv_16x16(cm, x);
         }
         vp9_inverse_transform_sby_32x32(xd);
         vp9_inverse_transform_sbuv_16x16(xd);
@@ -2306,8 +2448,8 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sby_16x16(x);
         vp9_quantize_sbuv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sby_16x16(x);
-          vp9_optimize_sbuv_16x16(x);
+          vp9_optimize_sby_16x16(cm, x);
+          vp9_optimize_sbuv_16x16(cm, x);
         }
         vp9_inverse_transform_sby_16x16(xd);
         vp9_inverse_transform_sbuv_16x16(xd);
@@ -2318,8 +2460,8 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sby_8x8(x);
         vp9_quantize_sbuv_8x8(x);
         if (x->optimize) {
-          vp9_optimize_sby_8x8(x);
-          vp9_optimize_sbuv_8x8(x);
+          vp9_optimize_sby_8x8(cm, x);
+          vp9_optimize_sbuv_8x8(cm, x);
         }
         vp9_inverse_transform_sby_8x8(xd);
         vp9_inverse_transform_sbuv_8x8(xd);
@@ -2330,8 +2472,8 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sby_4x4(x);
         vp9_quantize_sbuv_4x4(x);
         if (x->optimize) {
-          vp9_optimize_sby_4x4(x);
-          vp9_optimize_sbuv_4x4(x);
+          vp9_optimize_sby_4x4(cm, x);
+          vp9_optimize_sbuv_4x4(cm, x);
         }
         vp9_inverse_transform_sby_4x4(xd);
         vp9_inverse_transform_sbuv_4x4(xd);
@@ -2340,6 +2482,9 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
     }
     vp9_recon_sby_s_c(xd, dst);
     vp9_recon_sbuv_s_c(xd, udst, vdst);
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_sb32(cm, xd);
+#endif
 
     vp9_tokenize_sb(cpi, xd, t, !output_enabled);
   } else {
@@ -2407,6 +2552,12 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
   unsigned int segment_id = mi->mbmi.segment_id;
   const int mis = cm->mode_info_stride;
 
+#ifdef ENC_DEBUG
+  enc_debug = (cpi->common.current_video_frame == 1 &&
+               mb_row == 0 && mb_col == 0 && output_enabled);
+  if (enc_debug)
+    printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled);
+#endif
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
@@ -2502,8 +2653,8 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sb64y_32x32(x);
         vp9_quantize_sb64uv_32x32(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_32x32(x);
-          vp9_optimize_sb64uv_32x32(x);
+          vp9_optimize_sb64y_32x32(cm, x);
+          vp9_optimize_sb64uv_32x32(cm, x);
         }
         vp9_inverse_transform_sb64y_32x32(xd);
         vp9_inverse_transform_sb64uv_32x32(xd);
@@ -2514,8 +2665,8 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sb64y_16x16(x);
         vp9_quantize_sb64uv_16x16(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_16x16(x);
-          vp9_optimize_sb64uv_16x16(x);
+          vp9_optimize_sb64y_16x16(cm, x);
+          vp9_optimize_sb64uv_16x16(cm, x);
         }
         vp9_inverse_transform_sb64y_16x16(xd);
         vp9_inverse_transform_sb64uv_16x16(xd);
@@ -2526,8 +2677,8 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sb64y_8x8(x);
         vp9_quantize_sb64uv_8x8(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_8x8(x);
-          vp9_optimize_sb64uv_8x8(x);
+          vp9_optimize_sb64y_8x8(cm, x);
+          vp9_optimize_sb64uv_8x8(cm, x);
         }
         vp9_inverse_transform_sb64y_8x8(xd);
         vp9_inverse_transform_sb64uv_8x8(xd);
@@ -2538,8 +2689,8 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
         vp9_quantize_sb64y_4x4(x);
         vp9_quantize_sb64uv_4x4(x);
         if (x->optimize) {
-          vp9_optimize_sb64y_4x4(x);
-          vp9_optimize_sb64uv_4x4(x);
+          vp9_optimize_sb64y_4x4(cm, x);
+          vp9_optimize_sb64uv_4x4(cm, x);
         }
         vp9_inverse_transform_sb64y_4x4(xd);
         vp9_inverse_transform_sb64uv_4x4(xd);
@@ -2548,7 +2699,9 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
     }
     vp9_recon_sb64y_s_c(xd, dst);
     vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);
-
+#if CONFIG_CODE_NONZEROCOUNT
+    gather_nzcs_sb64(cm, &x->e_mbd);
+#endif
     vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 75c8ea8f3160252b2456794129eed071b520bdfe..d3b595bd8e0d1abc1c8831d70b847f505855c6df 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -25,7 +25,7 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
-    vp9_encode_intra16x16mby(x);
+    vp9_encode_intra16x16mby(&cpi->common, x);
   } else {
     int i;
 
@@ -72,7 +72,7 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
     vp9_encode_intra4x4block(mb, i);
 }
 
-void vp9_encode_intra16x16mby(MACROBLOCK *x) {
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -86,21 +86,21 @@ void vp9_encode_intra16x16mby(MACROBLOCK *x) {
       vp9_transform_mby_16x16(x);
       vp9_quantize_mby_16x16(x);
       if (x->optimize)
-        vp9_optimize_mby_16x16(x);
+        vp9_optimize_mby_16x16(cm, x);
       vp9_inverse_transform_mby_16x16(xd);
       break;
     case TX_8X8:
       vp9_transform_mby_8x8(x);
       vp9_quantize_mby_8x8(x);
       if (x->optimize)
-        vp9_optimize_mby_8x8(x);
+        vp9_optimize_mby_8x8(cm, x);
       vp9_inverse_transform_mby_8x8(xd);
       break;
     default:
       vp9_transform_mby_4x4(x);
       vp9_quantize_mby_4x4(x);
       if (x->optimize)
-        vp9_optimize_mby_4x4(x);
+        vp9_optimize_mby_4x4(cm, x);
       vp9_inverse_transform_mby_4x4(xd);
       break;
   }
@@ -108,7 +108,7 @@ void vp9_encode_intra16x16mby(MACROBLOCK *x) {
   vp9_recon_mby(xd);
 }
 
-void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {
+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -122,14 +122,14 @@ void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {
       vp9_transform_mbuv_4x4(x);
       vp9_quantize_mbuv_4x4(x);
       if (x->optimize)
-        vp9_optimize_mbuv_4x4(x);
+        vp9_optimize_mbuv_4x4(cm, x);
       vp9_inverse_transform_mbuv_4x4(xd);
       break;
     default:  // 16x16 or 8x8
       vp9_transform_mbuv_8x8(x);
       vp9_quantize_mbuv_8x8(x);
       if (x->optimize)
-        vp9_optimize_mbuv_8x8(x);
+        vp9_optimize_mbuv_8x8(cm, x);
       vp9_inverse_transform_mbuv_8x8(xd);
       break;
     }
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index b017673ee92433f7d0afe508e155d1464342e45c..0b19b5652c95bfe9890bd88d6466cc6909ad337d 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,8 +14,8 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb);
 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
 void vp9_encode_intra8x8mby(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b2ee800cd4f462a0b2d062d26552860bebb8f41a..c0386459df5ed02baf3a29412953982c029dd509 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -491,7 +491,8 @@ static int trellis_get_coeff_context(int token) {
   return vp9_get_coef_context(&recent_energy, token);
 }
 
-static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
+static void optimize_b(VP9_COMMON *const cm,
+                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
                        const int16_t *dequant_ptr,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
@@ -512,26 +513,73 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
   int default_eob;
   int const *scan;
   const int mul = 1 + (tx_size == TX_32X32);
+  TX_TYPE tx_type;
+#if CONFIG_CODE_NONZEROCOUNT
+  // TODO(debargha): the dynamic programming approach used in this function
+  // is not compatible with the true rate cost when nzcs are used. Note
+// the total rate is the sum of the nzc rate and the individual token
+  // rates. The latter part can be optimized in this function, but because
+  // the nzc rate is a function of all the other tokens without a Markov
+  // relationship this rate cannot be considered correctly.
+  // The current implementation uses a suboptimal approach to account for
+  // the nzc rates somewhat, but in reality the optimization approach needs
+  // to change substantially.
+  uint16_t nzc = xd->nzcs[ib];
+  uint16_t nzc0, nzc1;
+  uint16_t final_nzc = 0, final_nzc_exp;
+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
+  unsigned int *nzc_cost;
+  nzc0 = nzc1 = nzc;
+#endif
 
   switch (tx_size) {
     default:
     case TX_4X4:
       default_eob = 16;
-      // FIXME(rbultje): although optimize_b currently isn't called for
-      // intra4x4, this should be changed to be adst-compatible
-      scan = vp9_default_zig_zag1d_4x4;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
+#endif
+      // NOTE: this isn't called (for intra4x4 modes), but will be left in
+      // since it could be used later
+      tx_type = get_tx_type_4x4(&mb->e_mbd, &xd->block[ib]);
+      if (tx_type != DCT_DCT) {
+        switch (tx_type) {
+          case ADST_DCT:
+            scan = vp9_row_scan_4x4;
+            break;
+
+          case DCT_ADST:
+            scan = vp9_col_scan_4x4;
+            break;
+
+          default:
+            scan = vp9_default_zig_zag1d_4x4;
+            break;
+        }
+      } else {
+        scan = vp9_default_zig_zag1d_4x4;
+      }
       break;
     case TX_8X8:
       scan = vp9_default_zig_zag1d_8x8;
       default_eob = 64;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
+#endif
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
       default_eob = 256;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
+#endif
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
+#endif
       break;
   }
 
@@ -542,7 +590,11 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
   rddiv = mb->rddiv;
   memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
+#if CONFIG_CODE_NONZEROCOUNT
+  tokens[eob][0].rate = nzc_cost[nzc];
+#else
   tokens[eob][0].rate = 0;
+#endif
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
   tokens[eob][0].token = DCT_EOB_TOKEN;
@@ -551,6 +603,9 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
   next = eob;
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
+#if CONFIG_CODE_NONZEROCOUNT
+    int new_nzc0, new_nzc1;
+#endif
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -584,6 +639,10 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
       best_index[i][0] = best;
+#if CONFIG_CODE_NONZEROCOUNT
+      new_nzc0 = (best ? nzc1 : nzc0);
+#endif
+
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
@@ -609,6 +668,12 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
              DCT_EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
              DCT_EOB_TOKEN : ZERO_TOKEN;
+#if CONFIG_CODE_NONZEROCOUNT
+        // Account for rate drop because of the nzc change.
+        // TODO(debargha): Find a better solution
+        rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];
+        rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];
+#endif
       } else {
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
       }
@@ -641,6 +706,11 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
       best_index[i][1] = best;
+#if CONFIG_CODE_NONZEROCOUNT
+      new_nzc1 = (best ? nzc1 : nzc0) - (!x);
+      nzc0 = new_nzc0;
+      nzc1 = new_nzc1;
+#endif
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -679,11 +749,18 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
   rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
+#if CONFIG_CODE_NONZEROCOUNT
+  final_nzc_exp = (best ? nzc1 : nzc0);
+#endif
   final_eob = i0 - 1;
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
-    if (x)
+    if (x) {
       final_eob = i;
+#if CONFIG_CODE_NONZEROCOUNT
+      ++final_nzc;
+#endif
+    }
     rc = scan[i];
     qcoeff_ptr[rc] = x;
     dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
@@ -695,9 +772,13 @@ static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
 
   xd->eobs[ib] = final_eob;
   *a = *l = (final_eob > 0);
+#if CONFIG_CODE_NONZEROCOUNT
+  assert(final_nzc == final_nzc_exp);
+  xd->nzcs[ib] = final_nzc;
+#endif
 }
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x) {
+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -713,13 +794,13 @@ void vp9_optimize_mby_4x4(MACROBLOCK *x) {
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 0; b < 16; b++) {
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 }
 
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {
+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -735,18 +816,18 @@ void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 16; b < 24; b++) {
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 }
 
-static void optimize_mb_4x4(MACROBLOCK *x) {
-  vp9_optimize_mby_4x4(x);
-  vp9_optimize_mbuv_4x4(x);
+static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_4x4(cm, x);
+  vp9_optimize_mbuv_4x4(cm, x);
 }
 
-void vp9_optimize_mby_8x8(MACROBLOCK *x) {
+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -765,14 +846,14 @@ void vp9_optimize_mby_8x8(MACROBLOCK *x) {
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[1] = a[0] = above_ec;
     l[1] = l[0] = left_ec;
   }
 }
 
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
   ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
@@ -785,17 +866,17 @@ void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
                &above_ec, &left_ec, TX_8X8);
   }
 }
 
-static void optimize_mb_8x8(MACROBLOCK *x) {
-  vp9_optimize_mby_8x8(x);
-  vp9_optimize_mbuv_8x8(x);
+static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_8x8(cm, x);
+  vp9_optimize_mbuv_8x8(cm, x);
 }
 
-void vp9_optimize_mby_16x16(MACROBLOCK *x) {
+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;
   ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;
   ENTROPY_CONTEXT ta, tl;
@@ -805,16 +886,16 @@ void vp9_optimize_mby_16x16(MACROBLOCK *x) {
 
   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
              &ta, &tl, TX_16X16);
 }
 
-static void optimize_mb_16x16(MACROBLOCK *x) {
-  vp9_optimize_mby_16x16(x);
-  vp9_optimize_mbuv_8x8(x);
+static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  vp9_optimize_mby_16x16(cm, x);
+  vp9_optimize_mbuv_8x8(cm, x);
 }
 
-void vp9_optimize_sby_32x32(MACROBLOCK *x) {
+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -823,11 +904,11 @@ void vp9_optimize_sby_32x32(MACROBLOCK *x) {
 
   ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
   tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
              &ta, &tl, TX_32X32);
 }
 
-void vp9_optimize_sby_16x16(MACROBLOCK *x) {
+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -842,12 +923,12 @@ void vp9_optimize_sby_16x16(MACROBLOCK *x) {
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
 
-    optimize_b(x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_16X16);
   }
 }
 
-void vp9_optimize_sby_8x8(MACROBLOCK *x) {
+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
@@ -866,12 +947,12 @@ void vp9_optimize_sby_8x8(MACROBLOCK *x) {
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
 
-    optimize_b(x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_8X8);
   }
 }
 
-void vp9_optimize_sby_4x4(MACROBLOCK *x) {
+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT ta[8], tl[8];
   int n;
 
@@ -882,12 +963,12 @@ void vp9_optimize_sby_4x4(MACROBLOCK *x) {
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
 
-    optimize_b(x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_4X4);
   }
 }
 
-void vp9_optimize_sbuv_16x16(MACROBLOCK *x) {
+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
@@ -901,12 +982,12 @@ void vp9_optimize_sbuv_16x16(MACROBLOCK *x) {
     l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
     left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_16X16);
   }
 }
 
-void vp9_optimize_sbuv_8x8(MACROBLOCK *x) {
+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -921,14 +1002,14 @@ void vp9_optimize_sbuv_8x8(MACROBLOCK *x) {
     l = tl + vp9_block2left_sb[TX_8X8][b];
     above_ec = (a[0] + a[1]) != 0;
     left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[0] = a[1] = above_ec;
     l[0] = l[1] = left_ec;
   }
 }
 
-void vp9_optimize_sbuv_4x4(MACROBLOCK *x) {
+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -941,12 +1022,12 @@ void vp9_optimize_sbuv_4x4(MACROBLOCK *x) {
     const int cidx = b >= 80 ? 20 : 16;
     a = ta + vp9_block2above_sb[TX_4X4][b];
     l = tl + vp9_block2left_sb[TX_4X4][b];
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                a, l, TX_4X4);
   }
 }
 
-void vp9_optimize_sb64y_32x32(MACROBLOCK *x) {
+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -965,12 +1046,12 @@ void vp9_optimize_sb64y_32x32(MACROBLOCK *x) {
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
 
-    optimize_b(x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_32X32);
   }
 }
 
-void vp9_optimize_sb64y_16x16(MACROBLOCK *x) {
+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -993,12 +1074,12 @@ void vp9_optimize_sb64y_16x16(MACROBLOCK *x) {
   for (n = 0; n < 16; n++) {
     const int x_idx = n & 3, y_idx = n >> 2;
 
-    optimize_b(x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_16X16);
   }
 }
 
-void vp9_optimize_sb64y_8x8(MACROBLOCK *x) {
+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
   ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
@@ -1029,12 +1110,12 @@ void vp9_optimize_sb64y_8x8(MACROBLOCK *x) {
   for (n = 0; n < 64; n++) {
     const int x_idx = n & 7, y_idx = n >> 3;
 
-    optimize_b(x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_8X8);
   }
 }
 
-void vp9_optimize_sb64y_4x4(MACROBLOCK *x) {
+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT ta[16], tl[16];
   int n;
 
@@ -1049,12 +1130,12 @@ void vp9_optimize_sb64y_4x4(MACROBLOCK *x) {
   for (n = 0; n < 256; n++) {
     const int x_idx = n & 15, y_idx = n >> 4;
 
-    optimize_b(x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
                ta + x_idx, tl + y_idx, TX_4X4);
   }
 }
 
-void vp9_optimize_sb64uv_32x32(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
@@ -1072,12 +1153,12 @@ void vp9_optimize_sb64uv_32x32(MACROBLOCK *x) {
     l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;
     l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &a_ec, &l_ec, TX_32X32);
   }
 }
 
-void vp9_optimize_sb64uv_16x16(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1094,14 +1175,14 @@ void vp9_optimize_sb64uv_16x16(MACROBLOCK *x) {
     l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
     above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
     left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_16X16);
     a[0] = a[1] = a1[0] = a1[1] = above_ec;
     l[0] = l[1] = l1[0] = l1[1] = left_ec;
   }
 }
 
-void vp9_optimize_sb64uv_8x8(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1116,14 +1197,14 @@ void vp9_optimize_sb64uv_8x8(MACROBLOCK *x) {
     l = tl + vp9_block2left_sb64[TX_8X8][b];
     above_ec = (a[0] + a[1]) != 0;
     left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                &above_ec, &left_ec, TX_8X8);
     a[0] = a[1] = above_ec;
     l[0] = l[1] = left_ec;
   }
 }
 
-void vp9_optimize_sb64uv_4x4(MACROBLOCK *x) {
+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
   ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
   ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
@@ -1136,12 +1217,12 @@ void vp9_optimize_sb64uv_4x4(MACROBLOCK *x) {
     const int cidx = b >= 320 ? 20 : 16;
     a = ta + vp9_block2above_sb64[TX_4X4][b];
     l = tl + vp9_block2left_sb64[TX_4X4][b];
-    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
                a, l, TX_4X4);
   }
 }
 
-void vp9_fidct_mb(MACROBLOCK *x) {
+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -1149,7 +1230,7 @@ void vp9_fidct_mb(MACROBLOCK *x) {
     vp9_transform_mb_16x16(x);
     vp9_quantize_mb_16x16(x);
     if (x->optimize)
-      optimize_mb_16x16(x);
+      optimize_mb_16x16(cm, x);
     vp9_inverse_transform_mb_16x16(xd);
   } else if (tx_size == TX_8X8) {
     if (xd->mode_info_context->mbmi.mode == SPLITMV) {
@@ -1159,8 +1240,8 @@ void vp9_fidct_mb(MACROBLOCK *x) {
       vp9_quantize_mby_8x8(x);
       vp9_quantize_mbuv_4x4(x);
       if (x->optimize) {
-        vp9_optimize_mby_8x8(x);
-        vp9_optimize_mbuv_4x4(x);
+        vp9_optimize_mby_8x8(cm, x);
+        vp9_optimize_mbuv_4x4(cm, x);
       }
       vp9_inverse_transform_mby_8x8(xd);
       vp9_inverse_transform_mbuv_4x4(xd);
@@ -1168,24 +1249,25 @@ void vp9_fidct_mb(MACROBLOCK *x) {
       vp9_transform_mb_8x8(x);
       vp9_quantize_mb_8x8(x);
       if (x->optimize)
-        optimize_mb_8x8(x);
+        optimize_mb_8x8(cm, x);
       vp9_inverse_transform_mb_8x8(xd);
     }
   } else {
     transform_mb_4x4(x);
     vp9_quantize_mb_4x4(x);
     if (x->optimize)
-      optimize_mb_4x4(x);
+      optimize_mb_4x4(cm, x);
     vp9_inverse_transform_mb_4x4(xd);
   }
 }
 
-void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col) {
+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                           int mb_row, int mb_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   subtract_mb(x);
-  vp9_fidct_mb(x);
+  vp9_fidct_mb(cm, x);
   vp9_recon_mb(xd);
 }
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 8164bbac25c05ada0f6a3c1bd0e29d174bbea74e..242afbeae9ab87e5761233eaf9513860bd1df162 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/common/vp9_onyxc_int.h"
 
 typedef struct {
   MB_PREDICTION_MODE mode;
@@ -23,58 +24,59 @@ typedef struct {
 
 
 struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col);
+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                           int mb_row, int mb_col);
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x);
 void vp9_transform_mby_4x4(MACROBLOCK *x);
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x);
+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
 void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_mby_8x8(MACROBLOCK *x);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x);
+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_mb_16x16(MACROBLOCK *mb);
 void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x);
+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_sby_32x32(MACROBLOCK *x);
-void vp9_optimize_sby_32x32(MACROBLOCK *x);
+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_16x16(MACROBLOCK *x);
-void vp9_optimize_sby_16x16(MACROBLOCK *x);
+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_8x8(MACROBLOCK *x);
-void vp9_optimize_sby_8x8(MACROBLOCK *x);
+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sby_4x4(MACROBLOCK *x);
-void vp9_optimize_sby_4x4(MACROBLOCK *x);
+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_16x16(MACROBLOCK *x);
-void vp9_optimize_sbuv_16x16(MACROBLOCK *x);
+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_sbuv_8x8(MACROBLOCK *x);
+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sbuv_4x4(MACROBLOCK *x);
-void vp9_optimize_sbuv_4x4(MACROBLOCK *x);
+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_transform_sb64y_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64y_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64y_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64y_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64y_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64y_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64uv_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64uv_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64uv_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_transform_sb64uv_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64uv_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
 
-void vp9_fidct_mb(MACROBLOCK *x);
+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 23e4f8acfd15782a490549815175c36b330c8ee4..31f847399138c2ef5055c13a6804bf740a4c1d05 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1697,6 +1697,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_zero(cm->fc.nzc_counts_4x4);
+  vp9_zero(cm->fc.nzc_counts_8x8);
+  vp9_zero(cm->fc.nzc_counts_16x16);
+  vp9_zero(cm->fc.nzc_counts_32x32);
+#endif
 
   return (VP9_PTR) cpi;
 }
@@ -3340,8 +3346,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
   vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
   if (!cpi->common.error_resilient_mode &&
-      !cpi->common.frame_parallel_decoding_mode)
+      !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
+#if CONFIG_CODE_NONZEROCOUNT
+    vp9_adapt_nzc_probs(&cpi->common);
+#endif
+  }
   if (cpi->common.frame_type != KEY_FRAME) {
     vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 75331755b3920e8ab0428010b18e9b962a1e27b4..13d043a142d6a686ce71b6223e8d9679cccd426e 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -112,6 +112,16 @@ typedef struct {
   int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
 
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob nzc_probs_4x4
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
+  vp9_prob nzc_probs_8x8
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
+  vp9_prob nzc_probs_16x16
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
+  vp9_prob nzc_probs_32x32
+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
+#endif
 } CODING_CONTEXT;
 
 typedef struct {
@@ -481,6 +491,25 @@ typedef struct VP9_COMP {
   vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
 
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_prob frame_nzc_probs_4x4
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
+  unsigned int frame_nzc_branch_ct_4x4
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];
+  vp9_prob frame_nzc_probs_8x8
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
+  unsigned int frame_nzc_branch_ct_8x8
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];
+  vp9_prob frame_nzc_probs_16x16
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
+  unsigned int frame_nzc_branch_ct_16x16
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];
+  vp9_prob frame_nzc_probs_32x32
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
+  unsigned int frame_nzc_branch_ct_32x32
+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];
+#endif
+
   int gfu_boost;
   int last_boost;
   int kf_boost;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 75f22fac05b447fb2b5332204dd9d44c3958c5c7..66ee248400a3fed6eda3ce4cf4e194bd61d1634d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -40,6 +40,9 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   int zbin_oq_value        = b->zbin_extra;
 
   int const *pt_scan ;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
   switch (tx_type) {
     case ADST_DCT:
@@ -81,6 +84,9 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
 
         if (y) {
           eob = i;                                // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
         }
       }
@@ -88,6 +94,9 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
   }
 
   xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  xd->nzcs[b_idx] = nzc;
+#endif
 }
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
@@ -107,6 +116,9 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
   int16_t *dqcoeff_ptr     = d->dqcoeff;
   int16_t *dequant_ptr     = d->dequant;
   int zbin_oq_value        = b->zbin_extra;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
   vpx_memset(qcoeff_ptr, 0, 32);
   vpx_memset(dqcoeff_ptr, 0, 32);
@@ -135,6 +147,9 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
 
         if (y) {
           eob = i;                                // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
         }
       }
@@ -142,6 +157,9 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
   }
 
   xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  xd->nzcs[b_idx] = nzc;
+#endif
 }
 
 void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
@@ -192,6 +210,9 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) {
     uint8_t *quant_shift_ptr = b->quant_shift;
     int16_t *dequant_ptr = d->dequant;
     int zbin_oq_value = b->zbin_extra;
+#if CONFIG_CODE_NONZEROCOUNT
+    int nzc = 0;
+#endif
 
     eob = -1;
 
@@ -215,6 +236,9 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) {
 
         if (y) {
           eob = 0;                                   // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                  // number of nonzero coeffs
+#endif
           zero_run = 0;
         }
       }
@@ -241,19 +265,33 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) {
 
         if (y) {
           eob = i;                                   // last nonzero coeffs
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                     // number of nonzero coeffs
+#endif
           zero_run = 0;
         }
       }
     }
     xd->eobs[b_idx] = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+    xd->nzcs[b_idx] = nzc;
+#endif
   } else {
     xd->eobs[b_idx] = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    xd->nzcs[b_idx] = 0;
+#endif
   }
 }
 
 void vp9_quantize_mby_8x8(MACROBLOCK *x) {
   int i;
 
+#if CONFIG_CODE_NONZEROCOUNT
+  for (i = 0; i < 16; i ++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   for (i = 0; i < 16; i += 4) {
     x->quantize_b_8x8(x, i);
   }
@@ -262,6 +300,11 @@ void vp9_quantize_mby_8x8(MACROBLOCK *x) {
 void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
+#if CONFIG_CODE_NONZEROCOUNT
+  for (i = 16; i < 24; i ++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   for (i = 16; i < 24; i += 4)
     x->quantize_b_8x8(x, i);
 }
@@ -272,6 +315,12 @@ void vp9_quantize_mb_8x8(MACROBLOCK *x) {
 }
 
 void vp9_quantize_mby_16x16(MACROBLOCK *x) {
+#if CONFIG_CODE_NONZEROCOUNT
+  int i;
+  for (i = 0; i < 16; i++) {
+    x->e_mbd.nzcs[i] = 0;
+  }
+#endif
   x->quantize_b_16x16(x, 0);
 }
 
@@ -286,12 +335,19 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
-                     uint16_t *eob_ptr, const int *scan, int mul) {
+                     uint16_t *eob_ptr,
+#if CONFIG_CODE_NONZEROCOUNT
+                     uint16_t *nzc_ptr,
+#endif
+                     const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc = 0;
+#endif
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -320,12 +376,18 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
         if (y) {
           eob = i;                                  // last nonzero coeffs
           zero_run = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+          ++nzc;                                    // number of nonzero coeffs
+#endif
         }
       }
     }
   }
 
   *eob_ptr = eob + 1;
+#if CONFIG_CODE_NONZEROCOUNT
+  *nzc_ptr = nzc;
+#endif
 }
 
 void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) {
@@ -340,7 +402,11 @@ void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) {
            d->dqcoeff,
            d->dequant,
            b->zbin_extra,
-           &xd->eobs[b_idx], vp9_default_zig_zag1d_16x16, 1);
+           &xd->eobs[b_idx],
+#if CONFIG_CODE_NONZEROCOUNT
+           &xd->nzcs[b_idx],
+#endif
+           vp9_default_zig_zag1d_16x16, 1);
 }
 
 void vp9_quantize_sby_32x32(MACROBLOCK *x) {
@@ -358,6 +424,9 @@ void vp9_quantize_sby_32x32(MACROBLOCK *x) {
            d->dequant,
            b->zbin_extra,
            &xd->eobs[0],
+#if CONFIG_CODE_NONZEROCOUNT
+           &xd->nzcs[0],
+#endif
            vp9_default_zig_zag1d_32x32, 2);
 }
 
@@ -378,6 +447,9 @@ void vp9_quantize_sby_16x16(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n * 16],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n * 16],
+#endif
              vp9_default_zig_zag1d_16x16, 1);
 }
 
@@ -398,6 +470,9 @@ void vp9_quantize_sby_8x8(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n * 4],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n * 4],
+#endif
              vp9_default_zig_zag1d_8x8, 1);
 }
 
@@ -418,6 +493,9 @@ void vp9_quantize_sby_4x4(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n],
+#endif
              vp9_default_zig_zag1d_4x4, 1);
 }
 
@@ -437,6 +515,9 @@ void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_16x16, 1);
   }
 }
@@ -457,6 +538,9 @@ void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_8x8, 1);
   }
 }
@@ -477,6 +561,9 @@ void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_4x4, 1);
   }
 }
@@ -498,6 +585,9 @@ void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n * 64],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n * 64],
+#endif
              vp9_default_zig_zag1d_32x32, 2);
 }
 
@@ -518,6 +608,9 @@ void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n * 16],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n * 16],
+#endif
              vp9_default_zig_zag1d_16x16, 1);
 }
 
@@ -538,6 +631,9 @@ void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n * 4],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n * 4],
+#endif
              vp9_default_zig_zag1d_8x8, 1);
 }
 
@@ -558,6 +654,9 @@ void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {
              d->dequant,
              b->zbin_extra,
              &xd->eobs[n],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[n],
+#endif
              vp9_default_zig_zag1d_4x4, 1);
 }
 
@@ -577,6 +676,9 @@ void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_32x32, 2);
   }
 }
@@ -597,6 +699,9 @@ void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_16x16, 1);
   }
 }
@@ -617,6 +722,9 @@ void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_8x8, 1);
   }
 }
@@ -637,6 +745,9 @@ void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {
              xd->block[cidx].dequant,
              x->block[cidx].zbin_extra,
              &xd->eobs[i],
+#if CONFIG_CODE_NONZEROCOUNT
+             &xd->nzcs[i],
+#endif
              vp9_default_zig_zag1d_4x4, 1);
   }
 }
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index d679aaf7ae3b372a0d6cb865a043f5b5d0b74ea0..a7415af122f8cb36eff9877d71ebbde902bbd5dc 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -185,6 +185,12 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
 #if CONFIG_COMP_INTERINTRA_PRED
   cc->interintra_prob = cm->fc.interintra_prob;
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);
+  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);
+  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);
+  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);
+#endif
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -240,6 +246,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
 #if CONFIG_COMP_INTERINTRA_PRED
   cm->fc.interintra_prob = cc->interintra_prob;
 #endif
+#if CONFIG_CODE_NONZEROCOUNT
+  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);
+  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);
+  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);
+  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);
+#endif
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 61379b84d0e5dc1f03b85e8b61209c830f157b09..1b83091b306c14e6a5c5a5235eb9bd14318dd6d1 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -156,6 +156,12 @@ static void fill_token_costs(vp9_coeff_count *c,
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
         for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+#if CONFIG_CODE_NONZEROCOUNT
+          // All costs are without the EOB node
+          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
+                               p[i][j][k][l],
+                               vp9_coef_tree);
+#else
           if (l == 0 && k > 0)
             vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
                                  p[i][j][k][l],
@@ -164,8 +170,63 @@ static void fill_token_costs(vp9_coeff_count *c,
             vp9_cost_tokens((int *)(c[i][j][k][l]),
                             p[i][j][k][l],
                             vp9_coef_tree);
+#endif
+        }
+}
+
+#if CONFIG_CODE_NONZEROCOUNT
+static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {
+  int nzc_context, r, b, nzc, values;
+  int cost[16];
+  values = block_size * block_size + 1;
+
+  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        if (block_size == 4)
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
+                          vp9_nzc4x4_tree);
+        else if (block_size == 8)
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
+                          vp9_nzc8x8_tree);
+        else if (block_size == 16)
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
+                          vp9_nzc16x16_tree);
+        else
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
+                          vp9_nzc32x32_tree);
+
+        for (nzc = 0; nzc < values; ++nzc) {
+          int e, c, totalcost = 0;
+          c = codenzc(nzc);
+          totalcost = cost[c];
+          if ((e = extranzcbits(c))) {
+            int x = nzc - basenzcvalue(c);
+            while (e--) {
+              if ((x >> e) & 1)
+                totalcost += vp9_cost_one(Pcat_nzc[nzc_context][c - 3][e]);
+              else
+                totalcost += vp9_cost_zero(Pcat_nzc[nzc_context][c - 3][e]);
+            }
+          }
+          if (block_size == 4)
+            cpi->mb.nzc_costs_4x4[nzc_context][r][b][nzc] = totalcost;
+          else if (block_size == 8)
+            cpi->mb.nzc_costs_8x8[nzc_context][r][b][nzc] = totalcost;
+          else if (block_size == 16)
+            cpi->mb.nzc_costs_16x16[nzc_context][r][b][nzc] = totalcost;
+          else
+            cpi->mb.nzc_costs_32x32[nzc_context][r][b][nzc] = totalcost;
         }
+      }
+    }
+  }
 }
+#endif
 
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -274,6 +335,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
                    cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
                    cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
+#if CONFIG_CODE_NONZEROCOUNT
+  fill_nzc_costs(cpi, 4);
+  fill_nzc_costs(cpi, 8);
+  fill_nzc_costs(cpi, 16);
+  fill_nzc_costs(cpi, 32);
+#endif
 
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
@@ -379,7 +446,7 @@ int vp9_uvsse(MACROBLOCK *x) {
   return sse2;
 }
 
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
                               int ib, PLANE_TYPE type,
                               ENTROPY_CONTEXT *a,
                               ENTROPY_CONTEXT *l,
@@ -390,8 +457,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   int pt;
   const int eob = xd->eobs[ib];
   int c = 0;
-  int cost = 0, seg_eob;
-  const int segment_id = mbmi->segment_id;
+  int cost = 0;
   const int *scan;
   const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
@@ -406,12 +472,32 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   ENTROPY_CONTEXT *const l1 = l +
       sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
 
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
+  unsigned int *nzc_cost;
+#else
+  int seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+#endif
+
+  // Check for consistency of tx_size with mode info
+  if (type == PLANE_TYPE_Y_WITH_DC) {
+    assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
+  } else {
+    TX_SIZE tx_size_uv = get_uv_tx_size(xd);
+    assert(tx_size == tx_size_uv);
+  }
+
   switch (tx_size) {
     case TX_4X4:
       a_ec = *a;
       l_ec = *l;
       scan = vp9_default_zig_zag1d_4x4;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
+#else
       seg_eob = 16;
+#endif
       if (type == PLANE_TYPE_Y_WITH_DC) {
         if (tx_type == ADST_DCT) {
           scan = vp9_row_scan_4x4;
@@ -424,11 +510,19 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
       scan = vp9_default_zig_zag1d_8x8;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
+#else
       seg_eob = 64;
+#endif
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
+#else
       seg_eob = 256;
+#endif
       if (type == PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
@@ -439,7 +533,11 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
+#else
       seg_eob = 1024;
+#endif
       if (type == PLANE_TYPE_UV) {
         ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
         a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
@@ -464,21 +562,33 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
+#endif
 
   {
     int recent_energy = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    int nzc = 0;
+#endif
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc += (v != 0);
+#endif
       cost += token_costs[get_coef_band(tx_size, c)][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
       pt = vp9_get_coef_context(&recent_energy, t);
     }
+#if CONFIG_CODE_NONZEROCOUNT
+    cost += nzc_cost[nzc];
+#else
     if (c < seg_eob)
       cost += mb->token_costs[tx_size][type][ref][get_coef_band(tx_size, c)]
           [pt][DCT_EOB_TOKEN];
+#endif
   }
 
   // is eob first coefficient;
@@ -501,7 +611,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
+static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -521,7 +631,7 @@ static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
   }
 
   for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -529,7 +639,8 @@ static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
   return cost;
 }
 
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
+static void macro_block_yrd_4x4(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
                                 int *skippable, int backup) {
@@ -540,11 +651,11 @@ static void macro_block_yrd_4x4(MACROBLOCK *mb,
   vp9_quantize_mby_4x4(mb);
 
   *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_4x4(mb, backup);
+  *Rate = rdcost_mby_4x4(cm, mb, backup);
   *skippable = vp9_mby_is_skippable_4x4(xd);
 }
 
-static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
+static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -564,7 +675,7 @@ static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
   }
 
   for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b],
                         TX_8X8);
@@ -572,7 +683,8 @@ static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
   return cost;
 }
 
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
+static void macro_block_yrd_8x8(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
                                 int *skippable, int backup) {
@@ -583,11 +695,11 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
   vp9_quantize_mby_8x8(mb);
 
   *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_8x8(mb, backup);
+  *Rate = rdcost_mby_8x8(cm, mb, backup);
   *skippable = vp9_mby_is_skippable_8x8(xd);
 }
 
-static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
+static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int cost;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
@@ -604,11 +716,12 @@ static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
     tl = (ENTROPY_CONTEXT *)xd->left_context;
   }
 
-  cost = cost_coeffs(mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
+  cost = cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
   return cost;
 }
 
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
+                                  int *Rate, int *Distortion,
                                   int *skippable, int backup) {
   MACROBLOCKD *xd = &mb->e_mbd;
 
@@ -620,10 +733,10 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
   //                optimization in the rate-distortion optimization loop?
   if (mb->optimize &&
       xd->mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb);
+    vp9_optimize_mby_16x16(cm, mb);
 
   *Distortion = vp9_mbblock_error(mb) >> 2;
-  *Rate = rdcost_mby_16x16(mb, backup);
+  *Rate = rdcost_mby_16x16(cm, mb, backup);
   *skippable = vp9_mby_is_skippable_16x16(xd);
 }
 
@@ -715,15 +828,16 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
+  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
+  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
                            txfm_cache, TX_16X16);
@@ -738,8 +852,8 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
   d[12] = p[12];
 }
 
-static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
-  MACROBLOCKD * const xd = &x->e_mbd;
+static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
+  MACROBLOCKD * xd = &x->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
@@ -754,7 +868,7 @@ static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
     tl = (ENTROPY_CONTEXT *) xd->left_context;
   }
 
-  return cost_coeffs(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
 }
 
 static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
@@ -771,13 +885,14 @@ static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
 }
 
 #define DEBUG_ERROR 0
-static void super_block_yrd_32x32(MACROBLOCK *x,
+static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 #if DEBUG_ERROR
   int16_t out[1024];
 #endif
+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
 
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
@@ -791,7 +906,7 @@ static void super_block_yrd_32x32(MACROBLOCK *x,
   printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
          vp9_block_error_c(x->src_diff, out, 1024), *distortion);
 #endif
-  *rate       = rdcost_sby_32x32(x, backup);
+  *rate       = rdcost_sby_32x32(cm, x, backup);
   *skippable  = vp9_sby_is_skippable_32x32(xd);
 }
 
@@ -818,7 +933,8 @@ static void super_block_yrd(VP9_COMP *cpi,
 
   vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
                        dst, dst_y_stride);
-  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
+  super_block_yrd_32x32(&cpi->common, x,
+                        &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
 
 #if DEBUG_ERROR
   int err[3] = { 0, 0, 0 };
@@ -835,7 +951,7 @@ static void super_block_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_16X16][x_idx];
     xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_16x16(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_16X16] += d_tmp;
     r[TX_16X16][0] += r_tmp;
     s[TX_16X16] = s[TX_16X16] && s_tmp;
@@ -846,7 +962,7 @@ static void super_block_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_4X4][x_idx];
     xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_4x4(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_4X4] += d_tmp;
     r[TX_4X4][0] += r_tmp;
     s[TX_4X4] = s[TX_4X4] && s_tmp;
@@ -857,7 +973,7 @@ static void super_block_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_8X8][x_idx];
     xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_8x8(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_8X8] += d_tmp;
     r[TX_8X8][0] += r_tmp;
     s[TX_8X8] = s[TX_8X8] && s_tmp;
@@ -910,7 +1026,7 @@ static void super_block_64_yrd(VP9_COMP *cpi,
                          src_y_stride,
                          dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
                          dst_y_stride);
-    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    super_block_yrd_32x32(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     r[TX_32X32][0] += r_tmp;
     d[TX_32X32] += d_tmp;
     s[TX_32X32] = s[TX_32X32] && s_tmp;
@@ -931,7 +1047,7 @@ static void super_block_64_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_16X16][x_idx];
     xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_16x16(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_16X16] += d_tmp;
     r[TX_16X16][0] += r_tmp;
     s[TX_16X16] = s[TX_16X16] && s_tmp;
@@ -942,7 +1058,7 @@ static void super_block_64_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_4X4][x_idx];
     xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_4x4(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_4X4] += d_tmp;
     r[TX_4X4][0] += r_tmp;
     s[TX_4X4] = s[TX_4X4] && s_tmp;
@@ -953,7 +1069,7 @@ static void super_block_64_yrd(VP9_COMP *cpi,
 
     xd->above_context = &t_above[TX_8X8][x_idx];
     xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    macro_block_yrd_8x8(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_8X8] += d_tmp;
     r[TX_8X8][0] += r_tmp;
     s[TX_8X8] = s[TX_8X8] && s_tmp;
@@ -1006,6 +1122,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
+  VP9_COMMON *const cm = &cpi->common;
 
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -1022,6 +1139,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 #if CONFIG_NEWBINTRAMODES
   b->bmi.as_mode.context = vp9_find_bpred_context(b);
 #endif
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
     int64_t this_rd;
     int ratey;
@@ -1060,7 +1178,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(x, b - xd->block,
+    ratey = cost_coeffs(cm, x, b - xd->block,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
     rate += ratey;
     distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
@@ -1311,6 +1429,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
                                      int *bestdistortion) {
+  VP9_COMMON *const cm = &cpi->common;
   MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
@@ -1365,7 +1484,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       ta1 = ta0 + 1;
       tl1 = tl0 + 1;
 
-      rate_t = cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                            ta0, tl0, TX_8X8);
 
       rate += rate_t;
@@ -1398,12 +1517,12 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           x->quantize_b_4x4(x, ib + iblock[i]);
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
-        rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                               i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
         if (do_two) {
           i++;
-          rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                                 i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                                 TX_4X4);
         }
@@ -1491,7 +1610,7 @@ static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
+static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -1510,7 +1629,7 @@ static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
   }
 
   for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -1525,14 +1644,14 @@ static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   vp9_transform_mbuv_4x4(x);
   vp9_quantize_mbuv_4x4(x);
 
-  *rate       = rd_cost_mbuv_4x4(x, do_ctx_backup);
+  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -1551,7 +1670,7 @@ static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_8X8);
 
@@ -1564,14 +1683,14 @@ static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   vp9_transform_mbuv_8x8(x);
   vp9_quantize_mbuv_8x8(x);
 
-  *rate       = rd_cost_mbuv_8x8(x, do_ctx_backup);
+  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
+static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1590,22 +1709,22 @@ static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, b * 4, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_16X16);
 
   return cost;
 }
 
-static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip,
+static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
                                    int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sbuv_16x16(x);
   vp9_quantize_sbuv_16x16(x);
 
-  *rate       = rd_cost_sbuv_16x16(x, backup);
+  *rate       = rd_cost_sbuv_16x16(cm, x, backup);
   *distortion = vp9_block_error_c(x->coeff + 1024,
                                   xd->dqcoeff + 1024, 512) >> 2;
   *skip       = vp9_sbuv_is_skippable_16x16(xd);
@@ -1623,7 +1742,7 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
     vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
+    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
   } else {
     int n, r = 0, d = 0;
     int skippable = 1;
@@ -1671,23 +1790,14 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x, int *rate,
+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
                                 int *distortion, int *skip);
 static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
-  super_block_64_uvrd(x, rate, distortion, skip);
+  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skip, int fullpixel,
-                              int mb_row, int mb_col) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
-}
-
 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
                                     MACROBLOCK *x,
                                     int *rate,
@@ -1702,6 +1812,7 @@ static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
   int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int rate;
     int distortion;
@@ -1715,7 +1826,7 @@ static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
     vp9_transform_mbuv_4x4(x);
     vp9_quantize_mbuv_4x4(x);
 
-    rate_to = rd_cost_mbuv_4x4(x, 1);
+    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
     rate = rate_to
            + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
@@ -1754,6 +1865,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
   int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int rate;
     int distortion;
@@ -1767,7 +1879,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
 
     vp9_quantize_mbuv_8x8(x);
 
-    rate_to = rd_cost_mbuv_8x8(x, 1);
+    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
     rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
     distortion = vp9_mbuverror(x) / 4;
@@ -1789,7 +1901,8 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
 }
 
 // TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMMON *const cm,
+                             MACROBLOCK *x,
                              int *rate,
                              int *distortion,
                              int *skippable) {
@@ -1803,7 +1916,7 @@ static void super_block_uvrd(MACROBLOCK *x,
     vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
+    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
   } else {
     int d = 0, r = 0, n, s = 1;
     ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
@@ -1837,9 +1950,9 @@ static void super_block_uvrd(MACROBLOCK *x,
       xd->above_context = t_above + x_idx;
       xd->left_context = t_left + y_idx;
       if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(x, 0);
+        r += rd_cost_mbuv_4x4(cm, x, 0);
       } else {
-        r += rd_cost_mbuv_8x8(x, 0);
+        r += rd_cost_mbuv_8x8(cm, x, 0);
       }
     }
 
@@ -1852,7 +1965,8 @@ static void super_block_uvrd(MACROBLOCK *x,
   }
 }
 
-static int rd_cost_sb64uv_32x32(MACROBLOCK *x, int backup) {
+static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1871,28 +1985,28 @@ static int rd_cost_sb64uv_32x32(MACROBLOCK *x, int backup) {
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, b * 16, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_32X32);
 
   return cost;
 }
 
-static void rd_inter64x64_uv_32x32(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip,
+static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
                                    int backup) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_transform_sb64uv_32x32(x);
   vp9_quantize_sb64uv_32x32(x);
 
-  *rate       = rd_cost_sb64uv_32x32(x, backup);
+  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
   *distortion = vp9_block_error_c(x->coeff + 4096,
                                   xd->dqcoeff + 4096, 2048);
   *skip       = vp9_sb64uv_is_skippable_32x32(xd);
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x,
+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                                 int *rate,
                                 int *distortion,
                                 int *skippable) {
@@ -1913,7 +2027,7 @@ static void super_block_64_uvrd(MACROBLOCK *x,
   if (mbmi->txfm_size == TX_32X32) {
     vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
                             udst, vdst, dst_uv_stride);
-    rd_inter64x64_uv_32x32(x, &r, &d, &s, 1);
+    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);
   } else if (mbmi->txfm_size == TX_16X16) {
     int n;
 
@@ -1931,7 +2045,7 @@ static void super_block_64_uvrd(MACROBLOCK *x,
                             dst_uv_stride);
       xd->above_context = t_above + x_idx * 2;
       xd->left_context = t_left + y_idx * 2;
-      rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);
       r += r_tmp;
       d += d_tmp;
       s = s && s_tmp;
@@ -1961,9 +2075,9 @@ static void super_block_64_uvrd(MACROBLOCK *x,
       xd->left_context = t_left + y_idx;
       d += vp9_mbuverror(x) >> 2;
       if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(x, 0);
+        r += rd_cost_mbuv_4x4(cm, x, 0);
       } else {
-        r += rd_cost_mbuv_8x8(x, 0);
+        r += rd_cost_mbuv_8x8(cm, x, 0);
       }
     }
   }
@@ -1992,7 +2106,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-    super_block_uvrd(x, &this_rate_tokenonly,
+    super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
                      &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
@@ -2029,7 +2143,7 @@ static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
 
-    super_block_64_uvrd(x, &this_rate_tokenonly,
+    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,
                         &this_distortion, &s);
     this_rate = this_rate_tokenonly +
     x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
@@ -2186,7 +2300,8 @@ static int labels2mode(
   return cost;
 }
 
-static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
+                                       MACROBLOCK *x,
                                        int const *labels,
                                        int which_label,
                                        int *labelyrate,
@@ -2225,7 +2340,7 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
       x->quantize_b_4x4(x, i);
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + vp9_block2above[TX_4X4][i],
                                  tl + vp9_block2left[TX_4X4][i], TX_4X4);
     }
@@ -2234,7 +2349,8 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
+                                           MACROBLOCK *x,
                                            int const *labels,
                                            int which_label,
                                            int *labelyrate,
@@ -2288,10 +2404,12 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
           x->quantize_b_8x8(x, idx);
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
-          othercost += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                                    tacp + vp9_block2above[TX_8X8][idx],
                                    tlcp + vp9_block2left[TX_8X8][idx],
                                    TX_8X8);
+          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
         }
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
@@ -2300,15 +2418,17 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
-          *labelyrate += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                           ta + vp9_block2above[TX_4X4][ib + iblock[j]],
-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
-          *labelyrate += cost_coeffs(x, ib + iblock[j] + 1,
-                           PLANE_TYPE_Y_WITH_DC,
-                           ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
+          *labelyrate +=
+              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                          ta + vp9_block2above[TX_4X4][ib + iblock[j]],
+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                          TX_4X4);
+          *labelyrate +=
+              cost_coeffs(cm, x, ib + iblock[j] + 1,
+                          PLANE_TYPE_Y_WITH_DC,
+                          ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                          TX_4X4);
         }
       } else /* 8x8 */ {
         if (otherrd) {
@@ -2319,22 +2439,26 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
-            othercost += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
-            othercost += cost_coeffs(x, ib + iblock[j] + 1,
-                           PLANE_TYPE_Y_WITH_DC,
-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                           TX_4X4);
+            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+            othercost +=
+                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                            TX_4X4);
+            othercost +=
+                cost_coeffs(cm, x, ib + iblock[j] + 1,
+                            PLANE_TYPE_Y_WITH_DC,
+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                            TX_4X4);
+            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
           }
         }
         x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
         x->quantize_b_8x8(x, idx);
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
                                    ta + vp9_block2above[TX_8X8][idx],
                                    tl + vp9_block2left[TX_8X8][idx], TX_8X8);
       }
@@ -2574,11 +2698,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         continue;
 
       if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+        this_rd = encode_inter_mb_segment(&cpi->common,
+                                          x, labels, i, &labelyrate,
                                           &distortion, ta_s, tl_s);
         other_rd = this_rd;
       } else {
-        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
+                                              x, labels, i, &labelyrate,
                                               &distortion, &other_rd,
                                               ta_s, tl_s);
       }
@@ -3146,7 +3272,9 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x,
   // UV cost and distortion
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&
+      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&
+      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
     rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
                          cpi->common.full_pixel, &uv_skippable, 1);
   else
@@ -3933,7 +4061,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_COMP_INTERINTRA_PRED
   int is_best_interintra = 0;
   int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
+  int best_intra16_mode = DC_PRED;
+#if SEPARATE_INTERINTRA_UV
+  int best_intra16_uv_mode = DC_PRED;
+#endif
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
@@ -4015,6 +4146,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   cpi->zbin_mode_boost = 0;
   vp9_update_zbin_extra(cpi, x);
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+
   rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
                           &uv_intra_rate_tokenonly, &uv_intra_distortion,
                           &uv_intra_skippable);
@@ -4330,6 +4463,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       this_rd_thresh =
           (mbmi->ref_frame == GOLDEN_FRAME) ?
           cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
 
       for (switchable_filter_index = 0;
            switchable_filter_index < VP9_SWITCHABLE_FILTERS;
@@ -4421,8 +4555,11 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (tmp_rd < best_yrd) {
         int uv_skippable;
 
-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel, mb_row, mb_col);
+        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
+        vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                          x->e_mbd.predictor, x->src.uv_stride);
+        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,
+                             cpi->common.full_pixel, &uv_skippable, 1);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -4543,8 +4680,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         (this_rd < best_intra16_rd)) {
       best_intra16_rd = this_rd;
       best_intra16_mode = this_mode;
+#if SEPARATE_INTERINTRA_UV
       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
                               uv_intra_mode_8x8 : uv_intra_mode);
+#endif
     }
 #endif
 
@@ -4792,6 +4931,7 @@ void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t txfm_cache[NB_TXFM_MODES], err;
   int i;
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4826,6 +4966,7 @@ void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t txfm_cache[NB_TXFM_MODES], err;
   int i;
 
+  xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                  &dist_y, &y_skip, txfm_cache);
   rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4873,6 +5014,7 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int i;
 
   mbmi->ref_frame = INTRA_FRAME;
+  mbmi->mode = DC_PRED;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
                           &uv_intra_skippable);
   modeuv = mbmi->uv_mode;
@@ -5002,7 +5144,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_COMP_INTERINTRA_PRED
   int is_best_interintra = 0;
   int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
+  int best_intra16_mode = DC_PRED;
+#if SEPARATE_INTERINTRA_UV
+  int best_intra16_uv_mode = DC_PRED;
+#endif
 #endif
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
@@ -5334,8 +5479,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         (this_rd < best_intra16_rd)) {
       best_intra16_rd = this_rd;
       best_intra16_mode = this_mode;
+#if SEPARATE_INTERINTRA_UV
       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
                               mode_uv_8x8 : mode_uv_4x4);
+#endif
     }
 #endif
 
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index cfaf5f592473365f520703238fecb8b65cfadd81..a04a20c29a68683f8f0e3bc3af7a755a577640d8 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -219,10 +219,8 @@ static void count_segs(VP9_COMP *cpi,
   const int segment_id = mi->mbmi.segment_id;
 
   xd->mode_info_context = mi;
-  xd->mb_to_top_edge = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - mb_size - mb_col) * 16) << 3;
+  set_mb_row(cm, xd, mb_row, mb_size);
+  set_mb_col(cm, xd, mb_col, mb_size);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index d115fe80e62efd9e74b23a9e96debfa62ddd4eeb..0fad9b0328709988c259f4a5b094bafb8c51707b 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -119,6 +119,11 @@ static void tokenize_b(VP9_COMP *cpi,
                           get_tx_type(xd, &xd->block[ib]) : DCT_DCT;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
+#if CONFIG_CODE_NONZEROCOUNT
+  int zerosleft, nzc = 0;
+  if (eob == 0)
+    assert(xd->nzcs[ib] == 0);
+#endif
 
   if (sb_type == BLOCK_SIZE_SB64X64) {
     a = (ENTROPY_CONTEXT *)xd->above_context +
@@ -207,29 +212,47 @@ static void tokenize_b(VP9_COMP *cpi,
   do {
     const int band = get_coef_band(tx_size, c);
     int token;
-
+    int v = 0;
+#if CONFIG_CODE_NONZEROCOUNT
+    zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;
+#endif
     if (c < eob) {
       const int rc = scan[c];
-      const int v = qcoeff_ptr[rc];
+      v = qcoeff_ptr[rc];
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
       t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
       token    = vp9_dct_value_tokens_ptr[v].Token;
     } else {
+#if CONFIG_CODE_NONZEROCOUNT
+      break;
+#else
       token = DCT_EOB_TOKEN;
+#endif
     }
 
     t->Token = token;
     t->context_tree = probs[type][ref][band][pt];
+#if CONFIG_CODE_NONZEROCOUNT
+    // Skip zero node if there are no zeros left
+    t->skip_eob_node = 1 + (zerosleft == 0);
+#else
     t->skip_eob_node = (pt == 0) && (band > 0);
+#endif
     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
     }
+#if CONFIG_CODE_NONZEROCOUNT
+    nzc += (v != 0);
+#endif
 
     pt = vp9_get_coef_context(&recent_energy, token);
     ++t;
   } while (c < eob && ++c < seg_eob);
+#if CONFIG_CODE_NONZEROCOUNT
+  assert(nzc == xd->nzcs[ib]);
+#endif
 
   *tp = t;
   a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
@@ -903,13 +926,15 @@ static void stuff_b(VP9_COMP *cpi,
                     PLANE_TYPE type,
                     TX_SIZE tx_size,
                     int dry_run) {
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#if CONFIG_CODE_NONZEROCOUNT == 0
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
   int pt, band;
   TOKENEXTRA *t = *tp;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const int ref = mbmi->ref_frame != INTRA_FRAME;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#endif
   ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
 
   if (sb_type == BLOCK_SIZE_SB32X32) {
@@ -939,14 +964,18 @@ static void stuff_b(VP9_COMP *cpi,
     case TX_4X4:
       a_ec = a[0];
       l_ec = l[0];
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_4x4;
       probs = cpi->common.fc.coef_probs_4x4;
+#endif
       break;
     case TX_8X8:
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_8x8;
       probs = cpi->common.fc.coef_probs_8x8;
+#endif
       break;
     case TX_16X16:
       if (type != PLANE_TYPE_UV) {
@@ -956,8 +985,10 @@ static void stuff_b(VP9_COMP *cpi,
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_16x16;
       probs = cpi->common.fc.coef_probs_16x16;
+#endif
       break;
     case TX_32X32:
       if (type != PLANE_TYPE_UV) {
@@ -971,19 +1002,25 @@ static void stuff_b(VP9_COMP *cpi,
         l_ec = (l[0] + l[1] + l1[0] + l1[1] +
                 l2[0] + l2[1] + l3[0] + l3[1]) != 0;
       }
+#if CONFIG_CODE_NONZEROCOUNT == 0
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
+#endif
       break;
   }
 
+#if CONFIG_CODE_NONZEROCOUNT == 0
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-
   band = get_coef_band(tx_size, 0);
   t->Token = DCT_EOB_TOKEN;
   t->context_tree = probs[type][ref][band][pt];
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
+  if (!dry_run) {
+    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
+  }
+#endif
   *a = *l = 0;
   if (tx_size == TX_8X8) {
     a[1] = 0;
@@ -1009,10 +1046,6 @@ static void stuff_b(VP9_COMP *cpi,
       l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
     }
   }
-
-  if (!dry_run) {
-    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
-  }
 }
 
 static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c
index 8e252813cc0cebd4e96d17cb4a607949027fcb61..951ffa798d164b71cc5b7f546e7e02e3c5362ec4 100644
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -35,5 +35,6 @@ void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
 }
 
 void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
+  c[0] = 0;
   cost(c, t, p, 2, 0);
 }