diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 5336f5ab0d645265852285a3ef7f1e3b24f306e3..2a410c34ee1f7e3091b2fd177acfea84b1e88e3e 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -111,9 +111,10 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_lossless_c);
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c);
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
 #include "vp8/common/blockd.h"
-void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+void vp8_ihtllm_c(short *input, short *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim);
 #endif
 
 
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 616e4938ecd970c1f825d0de7b4b5ed2ee159e9c..5c7bf78d2984c22f2b9745004ee7989d761c8d89 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -93,120 +93,17 @@ float iadst_8[64] = {
 };
 #endif
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
-  int i, j, k;
-  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in
-                            // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += 4;
-    ip  += 4;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = &iadst_4[0];
-      break;
-
-    default :
-      ptv = &idct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfb[i] = 0 ;
-      for(k = 0; k < 4; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<2)];
-      }
-      pfa += 1;
-    }
-
-    pfb += 4;
-    ptv += 4;
-    pfa = &bufa[0];
-  }
-
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = &iadst_4[0];
-      break;
-
-    default :
-      pth = &idct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfa[i] += pfb[k] * pth[k];
-      }
-      pth += 4;
-     }
-
-    pfa += 4;
-    pfb += 4;
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = &iadst_4[0];
-        break;
-
-      default :
-        pth = &idct_4[0];
-        break;
-    }
-  }
-
-  // convert to short integer format and load BLOCKD buffer
-  op  = output;
-  pfa = &bufa[0];
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-                             -(short)( - pfa[i] / 8 + 0.49);
-    }
-    op  += shortpitch;
-    pfa += 4;
-  }
-}
-#endif
-
-#if CONFIG_HYBRIDTRANSFORM8X8
-void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_ihtllm_c(short *input, short *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim) {
   int i, j, k;
   float bufa[64], bufb[64]; // buffers are for floating-point test purpose
                             // the implementation could be simplified in
                             // conjunction with integer transform
+
+                            // Note: this one function serves both the 4x4 and
+                            // 8x8 transforms, so the temporary buffers are
+                            // sized for the larger case (64 entries).
   short *ip = input;
   short *op = output;
   int shortpitch = pitch >> 1;
@@ -218,12 +115,12 @@ void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   float *ptv, *pth;
 
   // load and convert residual array into floating-point
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = (float)ip[i];
     }
-    pfa += 8;
-    ip  += 8;
+    pfa += tx_dim;
+    ip  += tx_dim;
   }
 
   // vertical transformation
@@ -233,25 +130,25 @@ void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   switch(tx_type) {
     case ADST_ADST :
     case ADST_DCT  :
-      ptv = &iadst_8[0];
+      ptv = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
       break;
 
     default :
-      ptv = &idct_8[0];
+      ptv = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfb[i] = 0 ;
-      for(k = 0; k < 8; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<3)];
+      for(k = 0; k < tx_dim; k++) {
+        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
       }
       pfa += 1;
     }
 
-    pfb += 8;
-    ptv += 8;
+    pfb += tx_dim;
+    ptv += tx_dim;
     pfa = &bufa[0];
   }
 
@@ -262,34 +159,34 @@ void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   switch(tx_type) {
     case ADST_ADST :
     case  DCT_ADST :
-      pth = &iadst_8[0];
+      pth = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
       break;
 
     default :
-      pth = &idct_8[0];
+      pth = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = 0;
-      for(k = 0; k < 8; k++) {
+      for(k = 0; k < tx_dim; k++) {
         pfa[i] += pfb[k] * pth[k];
       }
-      pth += 8;
+      pth += tx_dim;
      }
 
-    pfa += 8;
-    pfb += 8;
+    pfa += tx_dim;
+    pfb += tx_dim;
 
     switch(tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
-        pth = &iadst_8[0];
+        pth = (tx_dim == 4) ? &iadst_4[0] : &iadst_8[0];
         break;
 
       default :
-        pth = &idct_8[0];
+        pth = (tx_dim == 4) ? &idct_4[0] : &idct_8[0];
         break;
     }
   }
@@ -298,13 +195,14 @@ void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   op  = output;
   pfa = &bufa[0];
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
                              -(short)( - pfa[i] / 8 + 0.49);
     }
+
     op  += shortpitch;
-    pfa += 8;
+    pfa += tx_dim;
   }
 }
 #endif
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index de9aad58d96358865aa90f068eff5b70b7c18a7d..1357839c791a743f1038e91eec13bdc6e7f2f686 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -33,7 +33,7 @@ static void recon_dcblock_8x8(MACROBLOCKD *x) {
 
 #if CONFIG_HYBRIDTRANSFORM
 void vp8_inverse_htransform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) {
-  vp8_iht4x4llm_c(b->dqcoeff, b->diff, pitch, b->bmi.as_mode.tx_type);
+  vp8_ihtllm_c(b->dqcoeff, b->diff, pitch, b->bmi.as_mode.tx_type, 4);
 }
 #endif
 
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 057104f497de828b1985c35bce56698164210854..59f453edff0547a8bb36b705f12307da2074b89a 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -392,7 +392,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
       txfm_map(b, pred_mode_conv(i8x8mode));
       vp8_ht_dequant_idct_add_8x8_c(b->bmi.as_mode.tx_type,
                                     q, dq, pre, dst, 16, stride);
-      // vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
       q += 64;
 #else
       for (j = 0; j < 4; j++) {
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index bf44fd61ac109dd8c310a76170ffc14238b7162b..6164c44d534130725b556f2261bf0a927d4f00b5 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -55,7 +55,7 @@ void vp8_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
     input[i] = dq[i] * input[i];
   }
 
-  vp8_iht4x4llm_c( input, output, 4 << 1, tx_type );
+  vp8_ihtllm_c(input, output, 4 << 1, tx_type, 4);
 
   vpx_memset(input, 0, 32);
 
@@ -95,7 +95,7 @@ void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
     input[i] = dq[1] * input[i];
   }
 
-  vp8_iht8x8llm_c(input, output, 16, tx_type);
+  vp8_ihtllm_c(input, output, 16, tx_type, 8);
 
   vpx_memset(input, 0, 128);
 
@@ -117,9 +117,10 @@ void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
       diff_ptr += 8;
       pred += pitch;
     }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+    // shift buffer pointers to next 4x4 block in the submacroblock
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
   }
 }
 #endif
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index ad525855265f723df2e3fadfd0541bc528e181ea..ae1912903c951e8942b8c0e6e4cdef74e513db63 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -329,114 +329,9 @@ void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) { // pitch = 8
 
 }
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
-  int i, j, k;
-  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                             // the implementation could be simplified in
-                             // conjunction with integer transform
-  short *ip = input;
-  short *op = output;
-
-  float *pfa = &bufa[0];
-  float *pfb = &bufb[0];
-
-  // pointers to vertical and horizontal transforms
-  float *ptv, *pth;
-
-  // load and convert residual array into floating-point
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = (float)ip[i];
-    }
-    pfa += 4;
-    ip  += pitch / 2;
-  }
-
-  // vertical transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case ADST_DCT  :
-      ptv = &adst_4[0];
-      break;
-
-    default :
-      ptv = &dct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfb[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<2)];
-      }
-      pfa += 1;
-    }
-    pfb += 4;
-    ptv += 4;
-    pfa = &bufa[0];
-  }
-
-  // horizontal transformation
-  pfa = &bufa[0];
-  pfb = &bufb[0];
-
-  switch(tx_type) {
-    case ADST_ADST :
-    case  DCT_ADST :
-      pth = &adst_4[0];
-      break;
-
-    default :
-      pth = &dct_4[0];
-      break;
-  }
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      pfa[i] = 0;
-      for(k = 0; k < 4; k++) {
-        pfa[i] += pfb[k] * pth[k];
-      }
-      pth += 4;
-     }
-
-    pfa += 4;
-    pfb += 4;
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = &adst_4[0];
-        break;
-
-      default :
-        pth = &dct_4[0];
-        break;
-    }
-  }
-
-  // convert to short integer format and load BLOCKD buffer
-  op  = output ;
-  pfa = &bufa[0] ;
-
-  for(j = 0; j < 4; j++) {
-    for(i = 0; i < 4; i++) {
-      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-                                   -(short)(- 8 * pfa[i] + 0.49);
-    }
-    op  += 4;
-    pfa += 4;
-  }
-}
-#endif
-
-#if CONFIG_HYBRIDTRANSFORM8X8
-void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_fht_c(short *input, short *output, int pitch,
+               TX_TYPE tx_type, int tx_dim) {
   int i, j, k;
   float bufa[64], bufb[64]; // buffers are for floating-point test purpose
                              // the implementation could be simplified in
@@ -451,11 +346,11 @@ void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   float *ptv, *pth;
 
   // load and convert residual array into floating-point
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = (float)ip[i];
     }
-    pfa += 8;
+    pfa += tx_dim;
     ip  += pitch / 2;
   }
 
@@ -466,24 +361,24 @@ void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   switch(tx_type) {
     case ADST_ADST :
     case ADST_DCT  :
-      ptv = &adst_8[0];
+      ptv = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
       break;
 
     default :
-      ptv = &dct_8[0];
+      ptv = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfb[i] = 0;
-      for(k = 0; k < 8; k++) {
-        pfb[i] += ptv[k] * pfa[(k<<3)];
+      for(k = 0; k < tx_dim; k++) {
+        pfb[i] += ptv[k] * pfa[(k * tx_dim)];
       }
       pfa += 1;
     }
-    pfb += 8;
-    ptv += 8;
+    pfb += tx_dim;
+    ptv += tx_dim;
     pfa = &bufa[0];
   }
 
@@ -494,34 +389,34 @@ void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   switch(tx_type) {
     case ADST_ADST :
     case  DCT_ADST :
-      pth = &adst_8[0];
+      pth = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
       break;
 
     default :
-      pth = &dct_8[0];
+      pth = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
       break;
   }
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       pfa[i] = 0;
-      for(k = 0; k < 8; k++) {
+      for(k = 0; k < tx_dim; k++) {
         pfa[i] += pfb[k] * pth[k];
       }
-      pth += 8;
+      pth += tx_dim;
      }
 
-    pfa += 8;
-    pfb += 8;
+    pfa += tx_dim;
+    pfb += tx_dim;
 
     switch(tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
-        pth = &adst_8[0];
+        pth = (tx_dim == 4) ? &adst_4[0] : &adst_8[0];
         break;
 
       default :
-        pth = &dct_8[0];
+        pth = (tx_dim == 4) ? &dct_4[0] : &dct_8[0];
         break;
     }
   }
@@ -530,13 +425,13 @@ void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   op  = output ;
   pfa = &bufa[0] ;
 
-  for(j = 0; j < 8; j++) {
-    for(i = 0; i < 8; i++) {
+  for(j = 0; j < tx_dim; j++) {
+    for(i = 0; i < tx_dim; i++) {
       op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
                                    -(short)(- 8 * pfa[i] + 0.49);
     }
-    op  += 8;
-    pfa += 8;
+    op  += tx_dim;
+    pfa += tx_dim;
   }
 }
 #endif
@@ -582,14 +477,6 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch) {
   }
 }
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht8x4_c(short *input, short *output, int pitch,
-                  TX_TYPE tx_type) {
-  vp8_fht4x4_c(input,     output,      pitch, tx_type);
-  vp8_fht4x4_c(input + 4, output + 16, pitch, tx_type);
-}
-#endif
-
 void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
 {
     vp8_short_fdct4x4_c(input,   output,    pitch);
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index 9936969d5a42c23a8051cb6b3cc54c38e84141d3..2d7b61754585200d552a53872595571db3243db3 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -23,9 +23,9 @@
 #endif
 
 
-#if CONFIG_HYBRIDTRANSFORM
-void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
-void vp8_fht8x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+void vp8_fht_c(short *input, short *output, int pitch,
+               TX_TYPE tx_type, int tx_dim);
 #endif
 
 #if CONFIG_TX16X16
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 964046d926f3646ac6a6895c2f8201abdbf263db..c4049016362727543c09eb9a1378a517a1c8aed5 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -91,8 +91,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
     if(active_ht) {
       b->bmi.as_mode.test = b->bmi.as_mode.first;
       txfm_map(b, b->bmi.as_mode.first);
-
-      vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+      vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4);
       vp8_ht_quantize_b(be, b);
       vp8_inverse_htransform_b(IF_RTCD(&rtcd->common->idct), b, 32) ;
     } else {
@@ -317,16 +316,11 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd,
     vp8_subtract_4b_c(be, b, 16);
 
     txfm_map(b, pred_mode_conv(b->bmi.as_mode.first));
-
-    vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32,
-                 b->bmi.as_mode.tx_type);
+    vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
+              b->bmi.as_mode.tx_type, 8);
     x->quantize_b_8x8(x->block + idx, xd->block + idx);
-    vp8_iht8x8llm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                    b->bmi.as_mode.tx_type);
-
-//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-//    x->quantize_b_8x8(x->block + idx, xd->block + idx);
-//    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                 b->bmi.as_mode.tx_type, 8);
 
     // reconstruct submacroblock
     for (i = 0; i < 4; i++) {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3f2b8e85c6870a5a8743515d4a005e544529795f..67bf33d6fdbbdba21bdbe2b338193bd57df55abf 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -612,20 +612,20 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type,
         if((type == PLANE_TYPE_Y_WITH_DC) && active_ht) {
           switch (b->bmi.as_mode.tx_type) {
             case ADST_DCT:
-              pt_scan = vp8_row_scan;
+              scan = vp8_row_scan;
               break;
 
             case DCT_ADST:
-              pt_scan = vp8_col_scan;
+              scan = vp8_col_scan;
               break;
 
             default:
-              pt_scan = vp8_default_zig_zag1d;
+              scan = vp8_default_zig_zag1d;
               break;
           }
 
         } else
-          pt_scan = vp8_default_zig_zag1d;
+          scan = vp8_default_zig_zag1d;
       }
 #endif
       break;
@@ -937,8 +937,7 @@ static int64_t rd_pick_intra4x4block(
       if(active_ht) {
         b->bmi.as_mode.test = mode;
         txfm_map(b, mode);
-
-        vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+        vp8_fht_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type, 4);
         vp8_ht_quantize_b(be, b);
       } else {
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
@@ -991,7 +990,7 @@ static int64_t rd_pick_intra4x4block(
 
   // inverse transform
   if(active_ht) {
-    vp8_iht4x4llm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type );
+    vp8_ihtllm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type, 4);
   } else {
     IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
                                                                 b->diff, 32);
@@ -1230,8 +1229,8 @@ static int64_t rd_pick_intra8x8block(
 
 #if CONFIG_HYBRIDTRANSFORM8X8
       txfm_map(b, pred_mode_conv(mode));
-      vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32, b->bmi.as_mode.tx_type);
-//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
+                b->bmi.as_mode.tx_type, 8);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
       // compute quantization mse of 8x8 block