diff --git a/configure b/configure
index 04090786f7ec415434319b0714f8b938d919165b..dc441754a53b086caf2ac395393970db91302a64 100755
--- a/configure
+++ b/configure
@@ -250,6 +250,7 @@ EXPERIMENT_LIST="
     enable_6tap
     abovesprefmv
     intht
+    intht4x4
 "
 CONFIG_LIST="
     external_build
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 0d51f06142431bacd4188528940d867984059787..b2c15fc768ca723c99a5b75649a92a87f1c2fa79 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -408,7 +408,7 @@ typedef struct macroblockd {
 
 #define ACTIVE_HT8  300
 
-#define ACTIVE_HT16 300
+#define ACTIVE_HT16 0
 
 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE
 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 680a206274ae71c41a8f0fb1ada90e96aeda22ce..01e8ea3c255a0c0d9119d00bff78fafcac8e5f0e 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -50,6 +50,14 @@ static const int cospi_29_64 = 2404;
 static const int cospi_30_64 = 1606;
 static const int cospi_31_64 = 804;
 
+#if CONFIG_INTHT4X4
+// sinpi_k_9 = 16384 * sqrt(2) * sin(k * Pi / 9) * 2 / 3, for k = 1..4
+static const int sinpi_1_9 = 5283;
+static const int sinpi_2_9 = 9929;
+static const int sinpi_3_9 = 13377;
+static const int sinpi_4_9 = 15212;
+#endif
+
 static INLINE int dct_const_round_shift(int input) {
   int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
   assert((rv <= INT16_MAX) && (rv >= INT16_MIN));
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index b27b34cf2c70096f03b7d55aa02bb794ee90ceaa..2f847dc780147310637cc5aacb37f287493eab14 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -494,7 +494,6 @@ void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
 }
 #endif
 
-
 void idct4_1d(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
@@ -651,6 +650,100 @@ void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
     }
 }
 
+#if CONFIG_INTHT4X4
+static void iadst4_1d(int16_t *input, int16_t *output) {
+  int x0, x1, x2, x3;
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  // 4-point 1-D inverse ADST: reads input[0..3], writes output[0..3].
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+  // All-zero input gives all-zero output; skip the multiplications.
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+  // Scale the inputs by the sinpi_k_9 constants; s7 is a plain sum for now.
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+  // Combine the partial products; s7 gets its sinpi_3_9 scaling here.
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+  // Final additions producing the four values that are rounded below.
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+  // dct_const_round_shift() rounds and shifts right by DCT_CONST_BITS.
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
+                        int pitch, TX_TYPE tx_type) {
+  int16_t out[16];
+  int16_t *outptr = &out[0];
+  const int short_pitch = pitch >> 1;  // output row stride, in int16_t units
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+  // 4x4 inverse hybrid transform: reads input[0..15], writes a 4x4 output.
+  void (*invr)(int16_t*, int16_t*);
+  void (*invc)(int16_t*, int16_t*);
+  // tx_type selects the 1-D kernels: invr runs on rows, invc on columns.
+  switch (tx_type) {
+    case ADST_ADST:
+      invc = &iadst4_1d;
+      invr = &iadst4_1d;
+      break;
+    case ADST_DCT:
+      invc = &iadst4_1d;
+      invr = &idct4_1d;
+      break;
+    case DCT_ADST:
+      invc = &idct4_1d;
+      invr = &iadst4_1d;
+      break;
+    case DCT_DCT:
+      invc = &idct4_1d;
+      invr = &idct4_1d;
+      break;
+    default:
+      assert(0);
+      return;  // with asserts compiled out, invr/invc would be unset
+  }
+  // inverse transform row vectors
+  for (i = 0; i < 4; ++i) {
+    invr(input, outptr);
+    input  += 4;
+    outptr += 4;
+  }
+  // inverse transform column vectors; each result is rounded with
+  // (x + 8) >> 4 and stored at output[j * short_pitch + i]
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    invc(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j * short_pitch + i] = (temp_out[j] + 8) >> 4;
+  }
+}
+#endif
+
 #if CONFIG_INTHT
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int x0, x1, x2, x3, x4, x5, x6, x7;
@@ -733,7 +826,7 @@ static void iadst8_1d(int16_t *input, int16_t *output) {
 }
 
 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        TX_TYPE tx_type, int pitch) {
+                        int pitch, TX_TYPE tx_type) {
   int16_t out[8 * 8];
   int16_t *outptr = &out[0];
   const int short_pitch = pitch >> 1;
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index c81fe2d0d3825626ac3e84510155012f4f21cc6d..e7cfe207b2f83a89804d9630b7067b094ad4a823 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -51,8 +51,13 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
   for (i = 0; i < 16; i++) {
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
+#if CONFIG_INTHT4X4
+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff,
+                       32, tx_type);
+#else
       vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,
                    tx_type, 4, xd->block[i].eob);
+#endif
     } else {
       vp9_inverse_transform_b_4x4(xd, i, 32);
     }
@@ -93,7 +98,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
     if (tx_type != DCT_DCT) {
 #if CONFIG_INTHT
       vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff,
-                           tx_type, 32);
+                           32, tx_type);
 #else
       vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
                  xd->block[i].eob);
@@ -108,7 +113,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
     if (tx_type != DCT_DCT) {
 #if CONFIG_INTHT
       vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
-                           tx_type, 32);
+                           32, tx_type);
 #else
       vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,
                  xd->block[i + 2].eob);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 4dce0c9626fc842be2a12812718190bb13ca9b36..066989272d0be8ca570a56d7e5e781a85ec24626 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -300,10 +300,15 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
 #if CONFIG_INTHT
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int tx_type, int pitch"
+prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
 #endif
 
+#if CONFIG_INTHT4X4
+prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
+specialize vp9_short_iht4x4
+#endif
+
 prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm
 
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 839a918fb06e7e28805acc578fa6b567a37c8c50..1f64767fa657335cdc4a5da281e1ef87ced6f5f2 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -69,7 +69,11 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
     input[i] = dq[i] * input[i];
   }
 
+#if CONFIG_INTHT4X4
+  vp9_short_iht4x4(input, output, 8, tx_type);
+#else
   vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);
+#endif
 
   vpx_memset(input, 0, 32);
 
@@ -93,7 +97,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
     }
 
 #if CONFIG_INTHT
-    vp9_short_iht8x8(input, output, tx_type, 16);
+    vp9_short_iht8x8(input, output, 16, tx_type);
 #else
     vp9_ihtllm(input, output, 16, tx_type, 8, eobs);
 #endif
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index fa72297141abef34057657197286f558420c8c8f..1dd30130a7f06b19f0a8a40872f5950c19f54a6c 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -56,7 +56,11 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   if (tx_type != DCT_DCT) {
     vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
     vp9_ht_quantize_b_4x4(be, b, tx_type);
+#if CONFIG_INTHT4X4
+    vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+#else
     vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
+#endif
   } else {
     x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
@@ -155,7 +159,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 
 #if CONFIG_INTHT
       vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
-                            tx_type, 32);
+                            32, tx_type);
 #else
       vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
                    tx_type, 8, xd->block[idx].eob);
@@ -173,7 +177,11 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
       if (tx_type != DCT_DCT) {
         vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
         vp9_ht_quantize_b_4x4(be, b, tx_type);
+#if CONFIG_INTHT4X4
+        vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+#else
         vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
+#endif
       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
         x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(be, be + 1, b, b + 1);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 317209be7f1dad77d10ac6201cd6489d8c5cdaac..29893b8191113d8d0dac2a84e170984bb87a0b6b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1170,7 +1170,11 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
+#if CONFIG_INTHT4X4
+    vp9_short_iht4x4(best_dqcoeff, b->diff, 32, best_tx_type);
+#else
     vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
+#endif
   else
     xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);