diff --git a/configure b/configure
index d1d25b3d5725a9a4d3775c8e2cb6fd74dddfbd32..c93ffd75fcc614ec2a64f77a42dafdf879b6a7bc 100755
--- a/configure
+++ b/configure
@@ -247,6 +247,8 @@ EXPERIMENT_LIST="
     implicit_segmentation
     newbintramodes
     comp_interintra_pred
+    tx32x32
+    dwt32x32hybrid
 "
 CONFIG_LIST="
     external_build
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f089c6839b02a36997c1cb3d69f6c5ea3f4288f
--- /dev/null
+++ b/test/dct32x32_test.cc
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+extern "C" {
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
+  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+}
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+#if !CONFIG_DWT32X32HYBRID
+static const double kPi = 3.141592653589793238462643383279502884;
+static void reference2_32x32_idct_2d(double *input, double *output) {
+  double x;
+  for (int l = 0; l < 32; ++l) {
+    for (int k = 0; k < 32; ++k) {
+      double s = 0;
+      for (int i = 0; i < 32; ++i) {
+        for (int j = 0; j < 32; ++j) {
+          x = cos(kPi * j * (l + 0.5) / 32.0) *
+              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
+          if (i != 0)
+            x *= sqrt(2.0);
+          if (j != 0)
+            x *= sqrt(2.0);
+          s += x;
+        }
+      }
+      output[k * 32 + l] = s / 4;
+    }
+  }
+}
+
+static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 32; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 32; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+  // First transform columns
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = input[j*32 + i];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 32; ++j)
+      output[j * 32 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i*32];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    // Divide by 4 to normalize the 2-D 32-point DCT (matches the reference inverse)
+    for (int j = 0; j < 32; ++j)
+      output[j + i * 32] = temp_out[j] / 4;
+  }
+}
+
+
+TEST(VP9Idct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t in[1024], coeff[1024];
+    int16_t out_c[1024];
+    double out_r[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      in[j] = rnd.Rand8() - rnd.Rand8();
+
+    reference_32x32_dct_2d(in, out_r);
+    for (int j = 0; j < 1024; j++)
+      coeff[j] = round(out_r[j]);
+    vp9_short_idct32x32_c(coeff, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const int diff = out_c[j] - in[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 32x32 IDCT has error " << error
+          << " at index " << j;
+    }
+
+    vp9_short_fdct32x32_c(in, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const double diff = coeff[j] - out_c[j];
+      const double error = diff * diff;
+      EXPECT_GE(1.0, error)
+          << "Error: 32x32 FDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+#else  // CONFIG_DWT32X32HYBRID
+  // TODO(rbultje/debargha): add DWT-specific tests
+#endif  // CONFIG_DWT32X32HYBRID
+TEST(VP9Fdct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  unsigned int max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[1024];
+    int16_t test_temp_block[1024];
+    int16_t test_output_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+
+    for (int j = 0; j < 1024; ++j) {
+      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block/10, total_error)
+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
+}
+
+TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input_block[1024], input_extreme_block[1024];
+    int16_t output_block[1024], output_extreme_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 1024; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 32;
+    vp9_short_fdct32x32_c(input_block, output_block, pitch);
+    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 1024; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 32x32 FDCT extreme has coefficient larger than "
+             "4*DCT_MAX_VALUE";
+    }
+  }
+}
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index 4fb464e643e93fabb2f3b7b7e9c8d04561907a44..919cf04387bbea5a08115f673bf3302c8b53b808 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -64,6 +64,9 @@ endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
+LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
+endif
 LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
 LIBVPX_TEST_SRCS-yes += variance_test.cc
 endif # VP9
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 27ef22fffd3fe956d37919da701c210ea2c7d20c..11efd44759a25b4d497cdc54bdb3ae6a902471ed 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -129,7 +129,13 @@ typedef enum {
   TX_4X4,                      // 4x4 dct transform
   TX_8X8,                      // 8x8 dct transform
   TX_16X16,                    // 16x16 dct transform
-  TX_SIZE_MAX                  // Number of different transforms available
+  TX_SIZE_MAX_MB,              // Number of transforms available to MBs
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  TX_32X32 = TX_SIZE_MAX_MB,   // 32x32 dct transform
+  TX_SIZE_MAX_SB,              // Number of transforms available to SBs
+#else
+  TX_SIZE_MAX_SB = TX_SIZE_MAX_MB,
+#endif
 } TX_SIZE;
 
 typedef enum {
@@ -302,6 +308,15 @@ typedef struct blockd {
   union b_mode_info bmi;
 } BLOCKD;
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+typedef struct superblockd {
+  /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
+  DECLARE_ALIGNED(16, short, diff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, short, qcoeff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, short, dqcoeff[32*32+16*16*2]);
+} SUPERBLOCKD;
+#endif
+
 typedef struct macroblockd {
   DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
   DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
@@ -309,6 +324,10 @@ typedef struct macroblockd {
   DECLARE_ALIGNED(16, short, dqcoeff[400]);
   DECLARE_ALIGNED(16, unsigned short,  eobs[25]);
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  SUPERBLOCKD sb_coeff_data;
+#endif
+
   /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
   BLOCKD block[25];
   int fullpixel_mask;
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 52fb02f36af34ef5fbcfb8c421fb992544264eb3..31103adb7fe7cd9906d7681eea55a65a6b674c3b 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -1375,3 +1375,5 @@ static const vp9_prob
     }
   }
 };
+
+#define default_coef_probs_32x32 default_coef_probs_16x16
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 4832b4e9c7b973a1b96d9df74c5bbf91e92090fe..321fa8c57504a6e335c6db9dca0ece3ee0880d61 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -132,6 +132,109 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
     250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
 };
 
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+    0,    1,   32,   64,   33,    2,    3,   34,   65,   96,  128,   97,   66,   35,    4,    5,   36,   67,   98,  129,  160,  192,  161,  130,   99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224,  256,  225,  194,  163,  132,  101,   70,   39,    8,    9,   40,   71,  102,  133,  164,  195,  226,  257,  288,  320,  289,  258,  227,  196,  165,  134,  103,   72,
+   41,   10,   11,   42,   73,  104,  135,  166,  197,  228,  259,  290,  321,  352,  384,  353,  322,  291,  260,  229,  198,  167,  136,  105,   74,   43,   12,   13,   44,   75,  106,  137,
+  168,  199,  230,  261,  292,  323,  354,  385,  416,  448,  417,  386,  355,  324,  293,  262,  231,  200,  169,  138,  107,   76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,
+  263,  294,  325,  356,  387,  418,  449,  480,  512,  481,  450,  419,  388,  357,  326,  295,  264,  233,  202,  171,  140,  109,   78,   47,   16,   17,   48,   79,  110,  141,  172,  203,
+  234,  265,  296,  327,  358,  389,  420,  451,  482,  513,  544,  576,  545,  514,  483,  452,  421,  390,  359,  328,  297,  266,  235,  204,  173,  142,  111,   80,   49,   18,   19,   50,
+   81,  112,  143,  174,  205,  236,  267,  298,  329,  360,  391,  422,  453,  484,  515,  546,  577,  608,  640,  609,  578,  547,  516,  485,  454,  423,  392,  361,  330,  299,  268,  237,
+  206,  175,  144,  113,   82,   51,   20,   21,   52,   83,  114,  145,  176,  207,  238,  269,  300,  331,  362,  393,  424,  455,  486,  517,  548,  579,  610,  641,  672,  704,  673,  642,
+  611,  580,  549,  518,  487,  456,  425,  394,  363,  332,  301,  270,  239,  208,  177,  146,  115,   84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+  395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,  768,  737,  706,  675,  644,  613,  582,  551,  520,  489,  458,  427,  396,  365,  334,  303,  272,  241,  210,  179,
+  148,  117,   86,   55,   24,   25,   56,   87,  118,  149,  180,  211,  242,  273,  304,  335,  366,  397,  428,  459,  490,  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,  832,
+  801,  770,  739,  708,  677,  646,  615,  584,  553,  522,  491,  460,  429,  398,  367,  336,  305,  274,  243,  212,  181,  150,  119,   88,   57,   26,   27,   58,   89,  120,  151,  182,
+  213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,  896,  865,  834,  803,  772,  741,  710,  679,  648,  617,
+  586,  555,  524,  493,  462,  431,  400,  369,  338,  307,  276,  245,  214,  183,  152,  121,   90,   59,   28,   29,   60,   91,  122,  153,  184,  215,  246,  277,  308,  339,  370,  401,
+  432,  463,  494,  525,  556,  587,  618,  649,  680,  711,  742,  773,  804,  835,  866,  897,  928,  960,  929,  898,  867,  836,  805,  774,  743,  712,  681,  650,  619,  588,  557,  526,
+  495,  464,  433,  402,  371,  340,  309,  278,  247,  216,  185,  154,  123,   92,   61,   30,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+  527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,  993,  962,  931,  900,  869,  838,  807,  776,  745,  714,  683,  652,  621,  590,  559,  528,
+  497,  466,  435,  404,  373,  342,  311,  280,  249,  218,  187,  156,  125,   94,   63,   95,  126,  157,  188,  219,  250,  281,  312,  343,  374,  405,  436,  467,  498,  529,  560,  591,
+  622,  653,  684,  715,  746,  777,  808,  839,  870,  901,  932,  963,  994,  995,  964,  933,  902,  871,  840,  809,  778,  747,  716,  685,  654,  623,  592,  561,  530,  499,  468,  437,
+  406,  375,  344,  313,  282,  251,  220,  189,  158,  127,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+  841,  872,  903,  934,  965,  996,  997,  966,  935,  904,  873,  842,  811,  780,  749,  718,  687,  656,  625,  594,  563,  532,  501,  470,  439,  408,  377,  346,  315,  284,  253,  222,
+  191,  223,  254,  285,  316,  347,  378,  409,  440,  471,  502,  533,  564,  595,  626,  657,  688,  719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  999,  968,  937,  906,  875,
+  844,  813,  782,  751,  720,  689,  658,  627,  596,  565,  534,  503,  472,  441,  410,  379,  348,  317,  286,  255,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+  659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000, 1001,  970,  939,  908,  877,  846,  815,  784,  753,  722,  691,  660,  629,  598,  567,  536,  505,  474,  443,  412,
+  381,  350,  319,  351,  382,  413,  444,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,  816,  847,  878,  909,  940,  971, 1002, 1003,  972,  941,  910,  879,  848,  817,
+  786,  755,  724,  693,  662,  631,  600,  569,  538,  507,  476,  445,  414,  383,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+  973, 1004, 1005,  974,  943,  912,  881,  850,  819,  788,  757,  726,  695,  664,  633,  602,  571,  540,  509,  478,  447,  479,  510,  541,  572,  603,  634,  665,  696,  727,  758,  789,
+  820,  851,  882,  913,  944,  975, 1006, 1007,  976,  945,  914,  883,  852,  821,  790,  759,  728,  697,  666,  635,  604,  573,  542,  511,  543,  574,  605,  636,  667,  698,  729,  760,
+  791,  822,  853,  884,  915,  946,  977, 1008, 1009,  978,  947,  916,  885,  854,  823,  792,  761,  730,  699,  668,  637,  606,  575,  607,  638,  669,  700,  731,  762,  793,  824,  855,
+  886,  917,  948,  979, 1010, 1011,  980,  949,  918,  887,  856,  825,  794,  763,  732,  701,  670,  639,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012, 1013,  982,
+  951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
+  923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
+};
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
@@ -160,10 +263,11 @@ static const Prob Pcat2[] = { 165, 145};
 static const Prob Pcat3[] = { 173, 148, 140};
 static const Prob Pcat4[] = { 176, 155, 140, 135};
 static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const Prob Pcat6[] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+static const Prob Pcat6[] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
 static void init_bit_tree(vp9_tree_index *p, int n) {
   int i = 0;
@@ -182,7 +286,7 @@ static void init_bit_trees() {
   init_bit_tree(cat3, 3);
   init_bit_tree(cat4, 4);
   init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 13);
+  init_bit_tree(cat6, 14);
 }
 
 vp9_extra_bit_struct vp9_extra_bits[12] = {
@@ -196,7 +300,7 @@ vp9_extra_bit_struct vp9_extra_bits[12] = {
   { cat3, Pcat3, 3, 11},
   { cat4, Pcat4, 4, 19},
   { cat5, Pcat5, 5, 35},
-  { cat6, Pcat6, 13, 67},
+  { cat6, Pcat6, 14, 67},
   { 0, 0, 0, 0}
 };
 
@@ -218,6 +322,11 @@ void vp9_default_coef_probs(VP9_COMMON *pc) {
   vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
              default_hybrid_coef_probs_16x16,
              sizeof(pc->fc.hybrid_coef_probs_16x16));
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
+             sizeof(pc->fc.coef_probs_32x32));
+#endif
 }
 
 void vp9_coef_tree_initialize() {
@@ -444,4 +553,28 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
           else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
         }
       }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  for (i = 0; i < BLOCK_TYPES_32X32; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts_32x32[i][j][k], 256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs_32x32[i][j][k][t] *
+                  (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs_32x32[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs_32x32[i][j][k][t] = 255;
+          else cm->fc.coef_probs_32x32[i][j][k][t] = prob;
+        }
+      }
+#endif
 }
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 3c74de7be9b73dabd5e9e8ddce10b6c45f1e9e1f..96d964448f143b42bf880566445863cb6c3e95a8 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -55,7 +55,7 @@ extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
 #define PROB_UPDATE_BASELINE_COST   7
 
 #define MAX_PROB                255
-#define DCT_MAX_VALUE           8192
+#define DCT_MAX_VALUE           16384
 
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
@@ -66,6 +66,10 @@ extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
 
 #define BLOCK_TYPES_16X16 4
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+#define BLOCK_TYPES_32X32 4
+#endif
+
 /* Middle dimension is a coarsening of the coefficient's
    position within the 4x4 DCT. */
 
@@ -73,6 +77,9 @@ extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
 extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
 extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
 extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
+#endif
 
 /* Inside dimension is 3-valued measure of nearby complexity, that is,
    the extent to which nearby coefficients are nonzero.  For the first
@@ -106,9 +113,13 @@ extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
 
 extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
+#endif
+
 void vp9_coef_tree_initialize(void);
 
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
 void vp9_adapt_coef_probs(struct VP9Common *);
 
 #endif
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 9622dfdee1335b596afa7b88f53b60b77b630903..cc685b99e68a3e52387b7c65c37a1827d43b31e5 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -1774,3 +1774,465 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
 #undef RIGHT_SHIFT
 #undef RIGHT_ROUNDING
 #endif
+
+#if CONFIG_TX32X32
+#if !CONFIG_DWT32X32HYBRID
+#define DownshiftMultiplyBy2(x) x * 2
+#define DownshiftMultiply(x) x
+static void idct16(double *input, double *output, int stride) {
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
+  double step[16];
+  double intermediate[16];
+  double temp1, temp2;
+
+  // step 1 and 2
+  step[ 0] = input[stride*0] + input[stride*8];
+  step[ 1] = input[stride*0] - input[stride*8];
+
+  temp1 = input[stride*4]*C12;
+  temp2 = input[stride*12]*C4;
+
+  temp1 -= temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+
+  step[ 2] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*4]*C4;
+  temp2 = input[stride*12]*C12;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  step[ 3] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*2]*C8;
+  temp1 = DownshiftMultiplyBy2(temp1);
+  temp2 = input[stride*6] + input[stride*10];
+
+  step[ 4] = temp1 + temp2;
+  step[ 5] = temp1 - temp2;
+
+  temp1 = input[stride*14]*C8;
+  temp1 = DownshiftMultiplyBy2(temp1);
+  temp2 = input[stride*6] - input[stride*10];
+
+  step[ 6] = temp2 - temp1;
+  step[ 7] = temp2 + temp1;
+
+  // for odd input
+  temp1 = input[stride*3]*C12;
+  temp2 = input[stride*13]*C4;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  intermediate[ 8] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*3]*C4;
+  temp2 = input[stride*13]*C12;
+  temp2 -= temp1;
+  temp2 = DownshiftMultiply(temp2);
+  temp2 *= C8;
+  intermediate[ 9] = DownshiftMultiplyBy2(temp2);
+
+  intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
+  intermediate[11] = input[stride*15] - input[stride*1];
+  intermediate[12] = input[stride*15] + input[stride*1];
+  intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
+
+  temp1 = input[stride*11]*C12;
+  temp2 = input[stride*5]*C4;
+  temp2 -= temp1;
+  temp2 = DownshiftMultiply(temp2);
+  temp2 *= C8;
+  intermediate[14] = DownshiftMultiplyBy2(temp2);
+
+  temp1 = input[stride*11]*C4;
+  temp2 = input[stride*5]*C12;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  intermediate[15] = DownshiftMultiplyBy2(temp1);
+
+  step[ 8] = intermediate[ 8] + intermediate[14];
+  step[ 9] = intermediate[ 9] + intermediate[15];
+  step[10] = intermediate[10] + intermediate[11];
+  step[11] = intermediate[10] - intermediate[11];
+  step[12] = intermediate[12] + intermediate[13];
+  step[13] = intermediate[12] - intermediate[13];
+  step[14] = intermediate[ 8] - intermediate[14];
+  step[15] = intermediate[ 9] - intermediate[15];
+
+  // step 3
+  output[stride*0] = step[ 0] + step[ 3];
+  output[stride*1] = step[ 1] + step[ 2];
+  output[stride*2] = step[ 1] - step[ 2];
+  output[stride*3] = step[ 0] - step[ 3];
+
+  temp1 = step[ 4]*C14;
+  temp2 = step[ 7]*C2;
+  temp1 -= temp2;
+  output[stride*4] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 4]*C2;
+  temp2 = step[ 7]*C14;
+  temp1 += temp2;
+  output[stride*7] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 5]*C10;
+  temp2 = step[ 6]*C6;
+  temp1 -= temp2;
+  output[stride*5] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 5]*C6;
+  temp2 = step[ 6]*C10;
+  temp1 += temp2;
+  output[stride*6] =  DownshiftMultiply(temp1);
+
+  output[stride*8] = step[ 8] + step[11];
+  output[stride*9] = step[ 9] + step[10];
+  output[stride*10] = step[ 9] - step[10];
+  output[stride*11] = step[ 8] - step[11];
+  output[stride*12] = step[12] + step[15];
+  output[stride*13] = step[13] + step[14];
+  output[stride*14] = step[13] - step[14];
+  output[stride*15] = step[12] - step[15];
+
+  // output 4
+  step[ 0] = output[stride*0] + output[stride*7];
+  step[ 1] = output[stride*1] + output[stride*6];
+  step[ 2] = output[stride*2] + output[stride*5];
+  step[ 3] = output[stride*3] + output[stride*4];
+  step[ 4] = output[stride*3] - output[stride*4];
+  step[ 5] = output[stride*2] - output[stride*5];
+  step[ 6] = output[stride*1] - output[stride*6];
+  step[ 7] = output[stride*0] - output[stride*7];
+
+  temp1 = output[stride*8]*C7;
+  temp2 = output[stride*15]*C9;
+  temp1 -= temp2;
+  step[ 8] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*9]*C11;
+  temp2 = output[stride*14]*C5;
+  temp1 += temp2;
+  step[ 9] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*10]*C3;
+  temp2 = output[stride*13]*C13;
+  temp1 -= temp2;
+  step[10] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*11]*C15;
+  temp2 = output[stride*12]*C1;
+  temp1 += temp2;
+  step[11] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*11]*C1;
+  temp2 = output[stride*12]*C15;
+  temp2 -= temp1;
+  step[12] = DownshiftMultiply(temp2);
+
+  temp1 = output[stride*10]*C13;
+  temp2 = output[stride*13]*C3;
+  temp1 += temp2;
+  step[13] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*9]*C5;
+  temp2 = output[stride*14]*C11;
+  temp2 -= temp1;
+  step[14] = DownshiftMultiply(temp2);
+
+  temp1 = output[stride*8]*C9;
+  temp2 = output[stride*15]*C7;
+  temp1 += temp2;
+  step[15] = DownshiftMultiply(temp1);
+
+  // step 5
+  output[stride*0] = step[0] + step[15];
+  output[stride*1] = step[1] + step[14];
+  output[stride*2] = step[2] + step[13];
+  output[stride*3] = step[3] + step[12];
+  output[stride*4] = step[4] + step[11];
+  output[stride*5] = step[5] + step[10];
+  output[stride*6] = step[6] + step[ 9];
+  output[stride*7] = step[7] + step[ 8];
+
+  output[stride*15] = step[0] - step[15];
+  output[stride*14] = step[1] - step[14];
+  output[stride*13] = step[2] - step[13];
+  output[stride*12] = step[3] - step[12];
+  output[stride*11] = step[4] - step[11];
+  output[stride*10] = step[5] - step[10];
+  output[stride*9] = step[6] - step[ 9];
+  output[stride*8] = step[7] - step[ 8];
+}
+static void butterfly_32_idct_1d(double *input, double *output, int stride) {
+  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
+  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
+  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
+  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
+  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
+  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
+  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
+  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
+  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
+  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
+  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
+  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
+  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
+  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
+  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
+  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
+  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
+
+  double step1[32];
+  double step2[32];
+
+  step1[ 0] = input[stride*0];
+  step1[ 1] = input[stride*2];
+  step1[ 2] = input[stride*4];
+  step1[ 3] = input[stride*6];
+  step1[ 4] = input[stride*8];
+  step1[ 5] = input[stride*10];
+  step1[ 6] = input[stride*12];
+  step1[ 7] = input[stride*14];
+  step1[ 8] = input[stride*16];
+  step1[ 9] = input[stride*18];
+  step1[10] = input[stride*20];
+  step1[11] = input[stride*22];
+  step1[12] = input[stride*24];
+  step1[13] = input[stride*26];
+  step1[14] = input[stride*28];
+  step1[15] = input[stride*30];
+
+  step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);
+  step1[17] = (input[stride*3] + input[stride*1]);
+  step1[18] = (input[stride*5] + input[stride*3]);
+  step1[19] = (input[stride*7] + input[stride*5]);
+  step1[20] = (input[stride*9] + input[stride*7]);
+  step1[21] = (input[stride*11] + input[stride*9]);
+  step1[22] = (input[stride*13] + input[stride*11]);
+  step1[23] = (input[stride*15] + input[stride*13]);
+  step1[24] = (input[stride*17] + input[stride*15]);
+  step1[25] = (input[stride*19] + input[stride*17]);
+  step1[26] = (input[stride*21] + input[stride*19]);
+  step1[27] = (input[stride*23] + input[stride*21]);
+  step1[28] = (input[stride*25] + input[stride*23]);
+  step1[29] = (input[stride*27] + input[stride*25]);
+  step1[30] = (input[stride*29] + input[stride*27]);
+  step1[31] = (input[stride*31] + input[stride*29]);
+
+  idct16(step1, step2, 1);
+  idct16(step1 + 16, step2 + 16, 1);
+
+  step2[16] = DownshiftMultiply(step2[16] / (2*C1));
+  step2[17] = DownshiftMultiply(step2[17] / (2*C3));
+  step2[18] = DownshiftMultiply(step2[18] / (2*C5));
+  step2[19] = DownshiftMultiply(step2[19] / (2*C7));
+  step2[20] = DownshiftMultiply(step2[20] / (2*C9));
+  step2[21] = DownshiftMultiply(step2[21] / (2*C11));
+  step2[22] = DownshiftMultiply(step2[22] / (2*C13));
+  step2[23] = DownshiftMultiply(step2[23] / (2*C15));
+  step2[24] = DownshiftMultiply(step2[24] / (2*C17));
+  step2[25] = DownshiftMultiply(step2[25] / (2*C19));
+  step2[26] = DownshiftMultiply(step2[26] / (2*C21));
+  step2[27] = DownshiftMultiply(step2[27] / (2*C23));
+  step2[28] = DownshiftMultiply(step2[28] / (2*C25));
+  step2[29] = DownshiftMultiply(step2[29] / (2*C27));
+  step2[30] = DownshiftMultiply(step2[30] / (2*C29));
+  step2[31] = DownshiftMultiply(step2[31] / (2*C31));
+
+  output[stride* 0] = step2[ 0] + step2[16];
+  output[stride* 1] = step2[ 1] + step2[17];
+  output[stride* 2] = step2[ 2] + step2[18];
+  output[stride* 3] = step2[ 3] + step2[19];
+  output[stride* 4] = step2[ 4] + step2[20];
+  output[stride* 5] = step2[ 5] + step2[21];
+  output[stride* 6] = step2[ 6] + step2[22];
+  output[stride* 7] = step2[ 7] + step2[23];
+  output[stride* 8] = step2[ 8] + step2[24];
+  output[stride* 9] = step2[ 9] + step2[25];
+  output[stride*10] = step2[10] + step2[26];
+  output[stride*11] = step2[11] + step2[27];
+  output[stride*12] = step2[12] + step2[28];
+  output[stride*13] = step2[13] + step2[29];
+  output[stride*14] = step2[14] + step2[30];
+  output[stride*15] = step2[15] + step2[31];
+  output[stride*16] = step2[15] - step2[(31 - 0)];
+  output[stride*17] = step2[14] - step2[(31 - 1)];
+  output[stride*18] = step2[13] - step2[(31 - 2)];
+  output[stride*19] = step2[12] - step2[(31 - 3)];
+  output[stride*20] = step2[11] - step2[(31 - 4)];
+  output[stride*21] = step2[10] - step2[(31 - 5)];
+  output[stride*22] = step2[ 9] - step2[(31 - 6)];
+  output[stride*23] = step2[ 8] - step2[(31 - 7)];
+  output[stride*24] = step2[ 7] - step2[(31 - 8)];
+  output[stride*25] = step2[ 6] - step2[(31 - 9)];
+  output[stride*26] = step2[ 5] - step2[(31 - 10)];
+  output[stride*27] = step2[ 4] - step2[(31 - 11)];
+  output[stride*28] = step2[ 3] - step2[(31 - 12)];
+  output[stride*29] = step2[ 2] - step2[(31 - 13)];
+  output[stride*30] = step2[ 1] - step2[(31 - 14)];
+  output[stride*31] = step2[ 0] - step2[(31 - 15)];
+}
+
+void vp9_short_idct32x32_c(short *input, short *output, int pitch) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    double out[32*32], out2[32*32];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+    // First transform rows
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_32_idct_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        out[j + i*32] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j*32 + i];
+      butterfly_32_idct_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        out2[j*32 + i] = temp_out[j];
+    }
+    for (i = 0; i < 32*32; ++i)
+      output[i] = round(out2[i]/128);
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+#else  // CONFIG_DWT32X32HYBRID
+
+#define MAX_BLOCK_LENGTH   64
+#define ENH_PRECISION_BITS 1
+#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2)
+
+// Note: block length must be even for this implementation
+static void synthesis_53_row(int length, short *lowpass, short *highpass,
+                             short *x) {
+  short r, * a, * b;
+  int n;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *x++ = ((r = *a++) + 1) >> 1;
+    *x++ = *b++ + ((r + (*a) + 2) >> 2);
+  }
+  *x++ = ((r = *a) + 1)>>1;
+  *x++ = *b + ((r+1)>>1);
+}
+
+static void synthesis_53_col(int length, short *lowpass, short *highpass,
+                             short *x) {
+  short r, * a, * b;
+  int n;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *x++ = r = *a++;
+    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
+  }
+  *x++ = r = *a;
+  *x++ = ((*b) << 1) + r;
+}
+
+// NOTE: Using a 5/3 integer wavelet for now. Explore using a wavelet
+// with a better response later
+void dyadic_synthesize(int levels, int width, int height, short *c, int pitch_c,
+                       short *x, int pitch_x) {
+  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
+  short buffer[2 * MAX_BLOCK_LENGTH];
+
+  th[0] = hh;
+  tw[0] = hw;
+  for (i = 1; i <= levels; i++) {
+    th[i] = (th[i - 1] + 1) >> 1;
+    tw[i] = (tw[i - 1] + 1) >> 1;
+  }
+  for (lv = levels - 1; lv >= 0; lv--) {
+    nh = th[lv];
+    nw = tw[lv];
+    hh = th[lv + 1];
+    hw = tw[lv + 1];
+    if ((nh < 2) || (nw < 2)) continue;
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i] = c[i * pitch_c + j];
+      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i + nh];
+    }
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
+      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
+    }
+  }
+  for (i = 0; i < height; i++)
+    for (j = 0; j < width; j++)
+      x[i * pitch_x + j] = (c[i * pitch_c + j] + ENH_PRECISION_RND) >>
+      ENH_PRECISION_BITS;
+}
+
+void vp9_short_idct32x32_c(short *input, short *output, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[16 * 16];
+  short buffer2[32 * 32];
+  const int short_pitch = pitch >> 1;
+  int i;
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c(input, buffer, pitch);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(short) * 16);
+    vpx_memcpy(buffer2 + i * 32 + 16, input + i * short_pitch + 16,
+               sizeof(short) * 16);
+  }
+  for (; i < 32; ++i) {
+    vpx_memcpy(buffer2 + i * 32, input + i * short_pitch,
+               sizeof(short) * 32);
+  }
+  dyadic_synthesize(1, 32, 32, buffer2, 32, output, 32);
+}
+#endif  // CONFIG_DWT32X32HYBRID
+#endif  // CONFIG_TX32X32
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index c78f1ad3c6ab26801d54627c7ad206e08787922e..3abf328940edcad0eede716a125a569f048b614b 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -143,3 +143,16 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
   vp9_inverse_transform_mby_16x16(xd);
   vp9_inverse_transform_mbuv_8x8(xd);
 }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {
+  vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);
+}
+
+void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {
+  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,
+                                xd_sb->diff + 1024, 32);
+  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,
+                                xd_sb->diff + 1280, 32);
+}
+#endif
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index b012834f37337e4d93111d2738aa1e7afbe16bf6..94593f8cc60eb440d6889bd76de273934daab4da 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -38,4 +38,9 @@ extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
 
 extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
+extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
+#endif
+
 #endif  // __INC_INVTRANS_H
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 8d4d014ba7cfc912697b3ae2739b5f3bc0b54bef..1139fb5d13161b0fb56fab80682758263eb8f0c8 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -192,6 +192,9 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
 
   /* Point at base of Mb MODE_INFO list */
   const MODE_INFO *mode_info_context = cm->mi;
+#if CONFIG_SUPERBLOCKS
+  const int mis = cm->mode_info_stride;
+#endif
 
   /* Initialize the loop filter for this frame. */
   vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
@@ -226,14 +229,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
           if (mb_col > 0
 #if CONFIG_SUPERBLOCKS
               && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
+                   ((mode_info_context[0].mbmi.mb_skip_coeff &&
+                     mode_info_context[-1].mbmi.mb_skip_coeff)
+#if CONFIG_TX32X32
+                    || mode_info_context[-1].mbmi.txfm_size == TX_32X32
+#endif
+                    ))
 #endif
               )
             vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
                                 post->uv_stride, &lfi);
 
-          if (!skip_lf && tx_type != TX_16X16) {
+          if (!skip_lf && tx_type < TX_16X16) {
             if (tx_type == TX_8X8)
               vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
                                     post->uv_stride, &lfi);
@@ -247,14 +254,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
           if (mb_row > 0
 #if CONFIG_SUPERBLOCKS
               && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+                   ((mode_info_context[0].mbmi.mb_skip_coeff &&
+                     mode_info_context[-mis].mbmi.mb_skip_coeff)
+#if CONFIG_TX32X32
+                    || mode_info_context[-mis].mbmi.txfm_size == TX_32X32
+#endif
+                    ))
 #endif
               )
             vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
                                 post->uv_stride, &lfi);
 
-          if (!skip_lf && tx_type != TX_16X16) {
+          if (!skip_lf && tx_type < TX_16X16) {
             if (tx_type == TX_8X8)
               vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
                                     post->uv_stride, &lfi);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 0b6de7f82a2360e742b2c902b64c8f725c7bd206..d80498df1ad4e18dd0fd32a46e9547e607adea50 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -58,6 +58,9 @@ typedef struct frame_contexts {
   vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
   vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
   vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_prob coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#endif
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
@@ -95,6 +98,11 @@ typedef struct frame_contexts {
   vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_prob pre_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+#endif
+
   unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
   unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
@@ -110,6 +118,11 @@ typedef struct frame_contexts {
   unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
       [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
@@ -139,8 +152,11 @@ typedef enum {
   ONLY_4X4            = 0,
   ALLOW_8X8           = 1,
   ALLOW_16X16         = 2,
-  TX_MODE_SELECT      = 3,
-  NB_TXFM_MODES       = 4,
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  ALLOW_32X32         = 3,
+#endif
+  TX_MODE_SELECT      = 3 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS),
+  NB_TXFM_MODES       = 4 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS),
 } TXFM_MODE;
 
 typedef struct VP9Common {
@@ -268,7 +284,7 @@ typedef struct VP9Common {
   vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
 
   // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX - 1];
+  vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
 
   vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
 
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 1f8dfce34b2f2e6be18293a2ba4d36e5b9d3fc69..e567bac8d54cec696bd7c52d09155ec8d5bf920b 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -168,6 +168,53 @@ void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
     }
   }
 }
+
+#if CONFIG_TX32X32
+void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y, stride = xd->block[0].dst_stride;
+  short *diff = xd->sb_coeff_data.diff;
+
+  for (y = 0; y < 32; y++) {
+    for (x = 0; x < 32; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 32;
+  }
+}
+
+void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, stride = xd->block[16].dst_stride;
+  short *udiff = xd->sb_coeff_data.diff + 1024;
+  short *vdiff = xd->sb_coeff_data.diff + 1280;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int u = udst[x] + udiff[x];
+      int v = vdst[x] + vdiff[x];
+      if (u < 0)
+        u = 0;
+      else if (u > 255)
+        u = 255;
+      if (v < 0)
+        v = 0;
+      else if (v > 255)
+        v = 255;
+      udst[x] = u;
+      vdst[x] = v;
+    }
+    udst += stride;
+    vdst += stride;
+    udiff += 16;
+    vdiff += 16;
+  }
+}
+#endif
 #endif
 
 void vp9_recon_mby_c(MACROBLOCKD *xd) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 5b7af100b636f9efb2665cec789b45b63b79b5c1..49a3a8595ed7b02871b4c1b9e56edae7ec6aa80a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -361,6 +361,9 @@ specialize vp9_short_idct16x16
 prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
 specialize vp9_short_idct10_16x16
 
+prototype void vp9_short_idct32x32 "short *input, short *output, int pitch"
+specialize vp9_short_idct32x32
+
 prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim"
 specialize vp9_ihtllm
 
@@ -640,6 +643,9 @@ specialize vp9_short_fdct8x4
 prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
 specialize vp9_short_walsh4x4
 
+prototype void vp9_short_fdct32x32 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct32x32
+
 prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
 specialize vp9_short_fdct16x16
 
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 46a6ee45444f25203b7a6142f5e7ac6a72699fee..89c1e458dd851b3fc4aa80b247dc34f8b3470409 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -14,7 +14,7 @@
 
 static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
 static const int seg_feature_data_max[SEG_LVL_MAX] =
-                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX - 1};
+                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 7e53884f78fd4282e27d7715c344e93a16d85a03..b9f411dd2959cbcb5ba65a8681168390980b9403 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -209,8 +209,17 @@ static void kfread_modes(VP9D_COMP *pbi,
       m->mbmi.mode <= I8X8_PRED) {
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
+    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
       m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.encoded_as_sb)
+        m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
+#endif
+    }
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.encoded_as_sb) {
+    m->mbmi.txfm_size = TX_32X32;
+#endif
   } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
@@ -1219,8 +1228,17 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
     if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV)
+        mbmi->mode != SPLITMV) {
       mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb && mbmi->txfm_size != TX_8X8)
+        mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
+#endif
+    }
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  } else if (mbmi->encoded_as_sb && cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
+#endif
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 76349ad86737c01359808754a23656000acad37d..7f851a18a48cde74e24e7d73f458789a6c55f0d7 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -693,6 +693,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *orig_mi = xd->mode_info_context;
+  const int mis = pc->mode_info_stride;
 
   assert(xd->mode_info_context->mbmi.encoded_as_sb);
 
@@ -733,6 +734,30 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
   }
 
   /* dequantization and idct */
+#if CONFIG_TX32X32
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
+    if (eobtotal == 0) {  // skip loopfilter
+      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+      if (mb_col + 1 < pc->mb_cols)
+        xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
+      if (mb_row + 1 < pc->mb_rows) {
+        xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
+        if (mb_col + 1 < pc->mb_cols)
+          xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
+      }
+    } else {
+      vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
+                                 xd->dst.y_buffer, xd->dst.y_buffer,
+                                 xd->dst.y_stride, xd->dst.y_stride,
+                                 xd->eobs[0]);
+      vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
+                                            xd->block[16].dequant,
+                                            xd->dst.u_buffer, xd->dst.v_buffer,
+                                            xd->dst.uv_stride, xd->eobs + 16);
+    }
+  } else {
+#endif
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -742,7 +767,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
 
     xd->above_context = pc->above_context + mb_col + x_idx;
     xd->left_context = pc->left_context + y_idx;
-    xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
+    xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
       xd->eobs[i] = 0;
@@ -766,6 +791,9 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
   xd->above_context = pc->above_context + mb_col;
   xd->left_context = pc->left_context;
   xd->mode_info_context = orig_mi;
+#if CONFIG_TX32X32
+  }
+#endif
 }
 #endif
 
@@ -1244,6 +1272,11 @@ static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
     read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
     read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
   }
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (pbi->common.txfm_mode > ALLOW_16X16) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_32x32);
+  }
+#endif
 }
 
 int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
@@ -1433,9 +1466,16 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   /* Read the loop filter level and type */
   pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (pc->txfm_mode == 3)
+    pc->txfm_mode += vp9_read_bit(&header_bc);
+#endif
   if (pc->txfm_mode == TX_MODE_SELECT) {
     pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
     pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
+#endif
   }
 
   pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
@@ -1591,6 +1631,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
            pbi->common.fc.coef_probs_16x16);
   vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
            pbi->common.fc.hybrid_coef_probs_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
+           pbi->common.fc.coef_probs_32x32);
+#endif
   vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
 #if CONFIG_SUPERBLOCKS
   vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
@@ -1610,6 +1654,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
   vp9_zero(pbi->common.fc.coef_counts_16x16);
   vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_zero(pbi->common.fc.coef_counts_32x32);
+#endif
   vp9_zero(pbi->common.fc.ymode_counts);
 #if CONFIG_SUPERBLOCKS
   vp9_zero(pbi->common.fc.sb_ymode_counts);
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 79114d58ca1a3cfb2e9ee426f5d43887f7410650..22a66716fd2de62325d43f80eba1276f88f3e1cb 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -352,3 +352,30 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
     add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   }
 }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_32x32(int16_t *input, const int16_t *dq,
+                                uint8_t *pred, uint8_t *dest, int pitch,
+                                int stride, uint16_t eobs) {
+  short output[1024];
+  int i;
+
+  input[0]= input[0] * dq[0] / 2;
+  for (i = 1; i < 1024; i++)
+    input[i] = input[i] * dq[1] / 2;
+  vp9_short_idct32x32_c(input, output, 64);
+  vpx_memset(input, 0, 2048);
+
+  add_residual(output, pred, pitch, dest, stride, 32, 32);
+}
+
+void vp9_dequant_idct_add_uv_block_16x16_c(short *q, const short *dq,
+                                           unsigned char *dstu,
+                                           unsigned char *dstv,
+                                           int stride,
+                                           unsigned short *eobs) {
+  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);
+  vp9_dequant_idct_add_16x16_c(q + 256, dq,
+                               dstv, dstv, stride, stride, eobs[4]);
+}
+#endif
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 897ad5204ac0dbbc162357c8a126a7695dea197c..35a26477ad6818eef10cf03e8aa41488f04a8484 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -55,8 +55,9 @@
 #define CAT5_PROB3 157
 #define CAT5_PROB4 180
 
-static const unsigned char cat6_prob[14] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const unsigned char cat6_prob[15] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
+};
 
 void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
   /* Clear entropy contexts */
@@ -161,6 +162,12 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
         coef_counts = fc->hybrid_coef_counts_16x16[type];
       }
       break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      coef_probs = fc->coef_probs_32x32[type];
+      coef_counts = fc->coef_counts_32x32[type];
+      break;
+#endif
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -256,6 +263,54 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
   return eob;
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
+  unsigned short* const eobs = xd->eobs;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int c, i, eobtotal = 0, seg_eob;
+
+  // Luma block
+  eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC,
+                             DCT_DCT, get_eob(xd, segment_id, 1024),
+                             xd->sb_coeff_data.qcoeff,
+                             vp9_default_zig_zag1d_32x32,
+                             TX_32X32, vp9_coef_bands_32x32);
+  A[1] = A[2] = A[3] = A[0];
+  L[1] = L[2] = L[3] = L[0];
+  eobtotal += c;
+
+  // 16x16 chroma blocks
+  seg_eob = get_eob(xd, segment_id, 256);
+  for (i = 16; i < 24; i += 4) {
+    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
+
+    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+                               DCT_DCT, seg_eob,
+                               xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+                               vp9_default_zig_zag1d_16x16,
+                               TX_16X16, vp9_coef_bands_16x16);
+    a[1] = a[0];
+    l[1] = l[0];
+    eobtotal += c;
+  }
+
+  // no Y2 block
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+
+  vpx_memcpy(xd->above_context + 1, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(xd->left_context + 1, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
+
+  return eobtotal;
+}
+#endif
 
 static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
                                       MACROBLOCKD* const xd,
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 9b319d4a951c856940a10ddffcc79b763d32c8c2..09d354ea6c5ddefdd151e6e86e60590948a63b33 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -23,6 +23,12 @@ int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
 int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
                          BOOL_DECODER* const);
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc);
+#endif
+
 int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
                                 BOOL_DECODER* const bc);
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 0adaeee0a14d6ee008b1bef942a320dbd3cb60b2..847815f50849dbff0c37d9446d72f1c8c4410748 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1200,8 +1200,13 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
           TX_SIZE sz = mi->txfm_size;
           // FIXME(rbultje) code ternary symbol once all experiments are merged
           vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
+          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
             vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+            if (mi->encoded_as_sb && sz != TX_8X8)
+              vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+#endif
+          }
         }
 
 #ifdef ENTROPY_STATS
@@ -1337,8 +1342,13 @@ static void write_mb_modes_kf(const VP9_COMMON  *c,
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED)
+    if (sz != TX_4X4 && ym <= TM_PRED) {
       vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      if (m->mbmi.encoded_as_sb && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
+#endif
+    }
   }
 }
 
@@ -1547,29 +1557,54 @@ static void build_coeff_contexts(VP9_COMP *cpi) {
           if (!cpi->dummy_packing)
             for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
               context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+#endif
+        }
+      }
+    }
+    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_hybrid_coef_probs_16x16[i][j][k],
+            cpi->frame_hybrid_branch_ct_16x16[i][j][k],
+            cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              hybrid_context_counters_16x16[i][j][k][t] +=
+                cpi->hybrid_coef_counts_16x16[i][j][k][t];
 #endif
         }
       }
     }
   }
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_hybrid_coef_probs_16x16[i][j][k],
-          cpi->frame_hybrid_branch_ct_16x16[i][j][k],
-          cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (cpi->common.txfm_mode > ALLOW_16X16) {
+    for (i = 0; i < BLOCK_TYPES_32X32; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_coef_probs_32x32[i][j][k],
+            cpi->frame_branch_ct_32x32[i][j][k],
+            cpi->coef_counts_32x32[i][j][k], 256, 1);
 #ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_32x32[i][j][k][t] +=
+                cpi->coef_counts_32x32[i][j][k][t];
 #endif
+        }
       }
     }
   }
+#endif
 }
 
 static void update_coef_probs_common(
@@ -1714,6 +1749,15 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
                              cpi->common.fc.hybrid_coef_probs_16x16,
                              cpi->frame_hybrid_branch_ct_16x16);
   }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (cpi->common.txfm_mode > ALLOW_16X16) {
+    update_coef_probs_common(bc,
+                             cpi->frame_coef_probs_32x32,
+                             cpi->common.fc.coef_probs_32x32,
+                             cpi->frame_branch_ct_32x32);
+  }
+#endif
 }
 
 #ifdef PACKET_TESTING
@@ -1955,18 +1999,53 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
 
   {
     if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
-                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
-                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
+      pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
+                                cpi->txfm_count_16x16p[TX_4X4] +
+                                cpi->txfm_count_8x8p[TX_4X4],
+                                cpi->txfm_count_32x32p[TX_4X4] +
+                                cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_32x32p[TX_16X16] +
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+                                cpi->txfm_count_32x32p[TX_32X32] +
+#endif
+                                cpi->txfm_count_16x16p[TX_4X4] +
+                                cpi->txfm_count_16x16p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_16X16] +
+                                cpi->txfm_count_8x8p[TX_4X4] +
+                                cpi->txfm_count_8x8p[TX_8X8]);
+      pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_8X8],
+                                cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_32x32p[TX_16X16] +
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+                                cpi->txfm_count_32x32p[TX_32X32] +
+#endif
+                                cpi->txfm_count_16x16p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_16X16]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16],
+                                cpi->txfm_count_32x32p[TX_16X16] +
+                                cpi->txfm_count_32x32p[TX_32X32]);
+#endif
     } else {
       pc->prob_tx[0] = 128;
       pc->prob_tx[1] = 128;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      pc->prob_tx[2] = 128;
+#endif
+    }
+    vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? pc->txfm_mode : 3, 2);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    if (pc->txfm_mode > ALLOW_16X16) {
+      vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT);
     }
-    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
+#endif
     if (pc->txfm_mode == TX_MODE_SELECT) {
       vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
       vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      vp9_write_literal(&header_bc, pc->prob_tx[2], 8);
+#endif
     }
   }
 
@@ -2150,6 +2229,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
   vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
   vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
   vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
+           cpi->common.fc.coef_probs_32x32);
+#endif
 #if CONFIG_SUPERBLOCKS
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
 #endif
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 4669d2ed6945dbccff7d75213c748f30c699cf37..82dc5edc1670b227db75def2cca47b73b0a10d87 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -36,9 +36,15 @@ typedef struct block {
   short *zbin;
   short *zbin_8x8;
   short *zbin_16x16;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  short *zbin_32x32;
+#endif
   short *zrun_zbin_boost;
   short *zrun_zbin_boost_8x8;
   short *zrun_zbin_boost_16x16;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  short *zrun_zbin_boost_32x32;
+#endif
   short *round;
 
   // Zbin Over Quant value
@@ -52,6 +58,9 @@ typedef struct block {
   int eob_max_offset;
   int eob_max_offset_8x8;
   int eob_max_offset_16x16;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  int eob_max_offset_32x32;
+#endif
 } BLOCK;
 
 typedef struct {
@@ -83,6 +92,13 @@ typedef struct {
   int64_t txfm_rd_diff[NB_TXFM_MODES];
 } PICK_MODE_CONTEXT;
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+typedef struct superblock {
+  DECLARE_ALIGNED(16, short, src_diff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, short, coeff[32*32+16*16*2]);
+} SUPERBLOCK;
+#endif
+
 typedef struct macroblock {
   DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
   DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
@@ -95,6 +111,10 @@ typedef struct macroblock {
   // 1 DC 2nd order block each with 16 entries
   BLOCK block[25];
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+  SUPERBLOCK sb_coeff_data;
+#endif
+
   YV12_BUFFER_CONFIG src;
 
   MACROBLOCKD e_mbd;
@@ -153,9 +173,9 @@ typedef struct macroblock {
 
   unsigned char *active_ptr;
 
-  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+  unsigned int token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS]
     [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+  unsigned int hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS]
     [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
 
   int optimize;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 6753f2462883d9c9b272efb8387f79b999b5c5c4..0fc8fa35edb416c177395f372ea2151dc6e7ece5 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1330,3 +1330,461 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
 #undef RIGHT_SHIFT
 #undef ROUNDING
 #endif
+
+#if CONFIG_TX32X32
+#if !CONFIG_DWT32X32HYBRID
+static void dct32_1d(double *input, double *output, int stride) {
+  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
+  static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)
+  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
+  static const double C4 = 0.980785280403;  // cos(pi * 4 / 64)
+  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
+  static const double C6 = 0.956940335732;  // cos(pi * 6 / 64)
+  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
+  static const double C8 = 0.923879532511;  // cos(pi * 8 / 64)
+  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
+  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)
+  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
+  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)
+  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
+  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)
+  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
+  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
+  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
+  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)
+  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
+  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)
+  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
+  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)
+  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
+  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)
+  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
+  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)
+  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
+  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)
+  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
+  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)
+  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
+
+  double step[32];
+
+  // Stage 1
+  step[0] = input[stride*0] + input[stride*(32 - 1)];
+  step[1] = input[stride*1] + input[stride*(32 - 2)];
+  step[2] = input[stride*2] + input[stride*(32 - 3)];
+  step[3] = input[stride*3] + input[stride*(32 - 4)];
+  step[4] = input[stride*4] + input[stride*(32 - 5)];
+  step[5] = input[stride*5] + input[stride*(32 - 6)];
+  step[6] = input[stride*6] + input[stride*(32 - 7)];
+  step[7] = input[stride*7] + input[stride*(32 - 8)];
+  step[8] = input[stride*8] + input[stride*(32 - 9)];
+  step[9] = input[stride*9] + input[stride*(32 - 10)];
+  step[10] = input[stride*10] + input[stride*(32 - 11)];
+  step[11] = input[stride*11] + input[stride*(32 - 12)];
+  step[12] = input[stride*12] + input[stride*(32 - 13)];
+  step[13] = input[stride*13] + input[stride*(32 - 14)];
+  step[14] = input[stride*14] + input[stride*(32 - 15)];
+  step[15] = input[stride*15] + input[stride*(32 - 16)];
+  step[16] = -input[stride*16] + input[stride*(32 - 17)];
+  step[17] = -input[stride*17] + input[stride*(32 - 18)];
+  step[18] = -input[stride*18] + input[stride*(32 - 19)];
+  step[19] = -input[stride*19] + input[stride*(32 - 20)];
+  step[20] = -input[stride*20] + input[stride*(32 - 21)];
+  step[21] = -input[stride*21] + input[stride*(32 - 22)];
+  step[22] = -input[stride*22] + input[stride*(32 - 23)];
+  step[23] = -input[stride*23] + input[stride*(32 - 24)];
+  step[24] = -input[stride*24] + input[stride*(32 - 25)];
+  step[25] = -input[stride*25] + input[stride*(32 - 26)];
+  step[26] = -input[stride*26] + input[stride*(32 - 27)];
+  step[27] = -input[stride*27] + input[stride*(32 - 28)];
+  step[28] = -input[stride*28] + input[stride*(32 - 29)];
+  step[29] = -input[stride*29] + input[stride*(32 - 30)];
+  step[30] = -input[stride*30] + input[stride*(32 - 31)];
+  step[31] = -input[stride*31] + input[stride*(32 - 32)];
+
+  // Stage 2
+  output[stride*0] = step[0] + step[16 - 1];
+  output[stride*1] = step[1] + step[16 - 2];
+  output[stride*2] = step[2] + step[16 - 3];
+  output[stride*3] = step[3] + step[16 - 4];
+  output[stride*4] = step[4] + step[16 - 5];
+  output[stride*5] = step[5] + step[16 - 6];
+  output[stride*6] = step[6] + step[16 - 7];
+  output[stride*7] = step[7] + step[16 - 8];
+  output[stride*8] = -step[8] + step[16 - 9];
+  output[stride*9] = -step[9] + step[16 - 10];
+  output[stride*10] = -step[10] + step[16 - 11];
+  output[stride*11] = -step[11] + step[16 - 12];
+  output[stride*12] = -step[12] + step[16 - 13];
+  output[stride*13] = -step[13] + step[16 - 14];
+  output[stride*14] = -step[14] + step[16 - 15];
+  output[stride*15] = -step[15] + step[16 - 16];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17];
+  output[stride*18] = step[18];
+  output[stride*19] = step[19];
+
+  output[stride*20] = (-step[20] + step[27])*C16;
+  output[stride*21] = (-step[21] + step[26])*C16;
+  output[stride*22] = (-step[22] + step[25])*C16;
+  output[stride*23] = (-step[23] + step[24])*C16;
+
+  output[stride*24] = (step[24] + step[23])*C16;
+  output[stride*25] = (step[25] + step[22])*C16;
+  output[stride*26] = (step[26] + step[21])*C16;
+  output[stride*27] = (step[27] + step[20])*C16;
+
+  output[stride*28] = step[28];
+  output[stride*29] = step[29];
+  output[stride*30] = step[30];
+  output[stride*31] = step[31];
+
+  // Stage 3
+  step[0] = output[stride*0] + output[stride*(8 - 1)];
+  step[1] = output[stride*1] + output[stride*(8 - 2)];
+  step[2] = output[stride*2] + output[stride*(8 - 3)];
+  step[3] = output[stride*3] + output[stride*(8 - 4)];
+  step[4] = -output[stride*4] + output[stride*(8 - 5)];
+  step[5] = -output[stride*5] + output[stride*(8 - 6)];
+  step[6] = -output[stride*6] + output[stride*(8 - 7)];
+  step[7] = -output[stride*7] + output[stride*(8 - 8)];
+  step[8] = output[stride*8];
+  step[9] = output[stride*9];
+  step[10] = (-output[stride*10] + output[stride*13])*C16;
+  step[11] = (-output[stride*11] + output[stride*12])*C16;
+  step[12] = (output[stride*12] + output[stride*11])*C16;
+  step[13] = (output[stride*13] + output[stride*10])*C16;
+  step[14] = output[stride*14];
+  step[15] = output[stride*15];
+
+  step[16] = output[stride*16] + output[stride*23];
+  step[17] = output[stride*17] + output[stride*22];
+  step[18] = output[stride*18] + output[stride*21];
+  step[19] = output[stride*19] + output[stride*20];
+  step[20] = -output[stride*20] + output[stride*19];
+  step[21] = -output[stride*21] + output[stride*18];
+  step[22] = -output[stride*22] + output[stride*17];
+  step[23] = -output[stride*23] + output[stride*16];
+  step[24] = -output[stride*24] + output[stride*31];
+  step[25] = -output[stride*25] + output[stride*30];
+  step[26] = -output[stride*26] + output[stride*29];
+  step[27] = -output[stride*27] + output[stride*28];
+  step[28] = output[stride*28] + output[stride*27];
+  step[29] = output[stride*29] + output[stride*26];
+  step[30] = output[stride*30] + output[stride*25];
+  step[31] = output[stride*31] + output[stride*24];
+
+  // Stage 4
+  output[stride*0] = step[0] + step[3];
+  output[stride*1] = step[1] + step[2];
+  output[stride*2] = -step[2] + step[1];
+  output[stride*3] = -step[3] + step[0];
+  output[stride*4] = step[4];
+  output[stride*5] = (-step[5] + step[6])*C16;
+  output[stride*6] = (step[6] + step[5])*C16;
+  output[stride*7] = step[7];
+  output[stride*8] = step[8] + step[11];
+  output[stride*9] = step[9] + step[10];
+  output[stride*10] = -step[10] + step[9];
+  output[stride*11] = -step[11] + step[8];
+  output[stride*12] = -step[12] + step[15];
+  output[stride*13] = -step[13] + step[14];
+  output[stride*14] = step[14] + step[13];
+  output[stride*15] = step[15] + step[12];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17];
+  output[stride*18] = step[18]*-C8 + step[29]*C24;
+  output[stride*19] = step[19]*-C8 + step[28]*C24;
+  output[stride*20] = step[20]*-C24 + step[27]*-C8;
+  output[stride*21] = step[21]*-C24 + step[26]*-C8;
+  output[stride*22] = step[22];
+  output[stride*23] = step[23];
+  output[stride*24] = step[24];
+  output[stride*25] = step[25];
+  output[stride*26] = step[26]*C24 + step[21]*-C8;
+  output[stride*27] = step[27]*C24 + step[20]*-C8;
+  output[stride*28] = step[28]*C8 + step[19]*C24;
+  output[stride*29] = step[29]*C8 + step[18]*C24;
+  output[stride*30] = step[30];
+  output[stride*31] = step[31];
+
+  // Stage 5
+  step[0] = (output[stride*0] + output[stride*1]) * C16;
+  step[1] = (-output[stride*1] + output[stride*0]) * C16;
+  step[2] = output[stride*2]*C24 + output[stride*3] * C8;
+  step[3] = output[stride*3]*C24 - output[stride*2] * C8;
+  step[4] = output[stride*4] + output[stride*5];
+  step[5] = -output[stride*5] + output[stride*4];
+  step[6] = -output[stride*6] + output[stride*7];
+  step[7] = output[stride*7] + output[stride*6];
+  step[8] = output[stride*8];
+  step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
+  step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
+  step[11] = output[stride*11];
+  step[12] = output[stride*12];
+  step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
+  step[14] = output[stride*14]*C8 + output[stride*9]*C24;
+  step[15] = output[stride*15];
+
+  step[16] = output[stride*16] + output[stride*19];
+  step[17] = output[stride*17] + output[stride*18];
+  step[18] = -output[stride*18] + output[stride*17];
+  step[19] = -output[stride*19] + output[stride*16];
+  step[20] = -output[stride*20] + output[stride*23];
+  step[21] = -output[stride*21] + output[stride*22];
+  step[22] = output[stride*22] + output[stride*21];
+  step[23] = output[stride*23] + output[stride*20];
+  step[24] = output[stride*24] + output[stride*27];
+  step[25] = output[stride*25] + output[stride*26];
+  step[26] = -output[stride*26] + output[stride*25];
+  step[27] = -output[stride*27] + output[stride*24];
+  step[28] = -output[stride*28] + output[stride*31];
+  step[29] = -output[stride*29] + output[stride*30];
+  step[30] = output[stride*30] + output[stride*29];
+  step[31] = output[stride*31] + output[stride*28];
+
+  // Stage 6
+  output[stride*0] = step[0];
+  output[stride*1] = step[1];
+  output[stride*2] = step[2];
+  output[stride*3] = step[3];
+  output[stride*4] = step[4]*C28 + step[7]*C4;
+  output[stride*5] = step[5]*C12 + step[6]*C20;
+  output[stride*6] = step[6]*C12 + step[5]*-C20;
+  output[stride*7] = step[7]*C28 + step[4]*-C4;
+  output[stride*8] = step[8] + step[9];
+  output[stride*9] = -step[9] + step[8];
+  output[stride*10] = -step[10] + step[11];
+  output[stride*11] = step[11] + step[10];
+  output[stride*12] = step[12] + step[13];
+  output[stride*13] = -step[13] + step[12];
+  output[stride*14] = -step[14] + step[15];
+  output[stride*15] = step[15] + step[14];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17]*-C4 + step[30]*C28;
+  output[stride*18] = step[18]*-C28 + step[29]*-C4;
+  output[stride*19] = step[19];
+  output[stride*20] = step[20];
+  output[stride*21] = step[21]*-C20 + step[26]*C12;
+  output[stride*22] = step[22]*-C12 + step[25]*-C20;
+  output[stride*23] = step[23];
+  output[stride*24] = step[24];
+  output[stride*25] = step[25]*C12 + step[22]*-C20;
+  output[stride*26] = step[26]*C20 + step[21]*C12;
+  output[stride*27] = step[27];
+  output[stride*28] = step[28];
+  output[stride*29] = step[29]*C28 + step[18]*-C4;
+  output[stride*30] = step[30]*C4 + step[17]*C28;
+  output[stride*31] = step[31];
+
+  // Stage 7
+  step[0] = output[stride*0];
+  step[1] = output[stride*1];
+  step[2] = output[stride*2];
+  step[3] = output[stride*3];
+  step[4] = output[stride*4];
+  step[5] = output[stride*5];
+  step[6] = output[stride*6];
+  step[7] = output[stride*7];
+  step[8] = output[stride*8]*C30 + output[stride*15]*C2;
+  step[9] = output[stride*9]*C14 + output[stride*14]*C18;
+  step[10] = output[stride*10]*C22 + output[stride*13]*C10;
+  step[11] = output[stride*11]*C6 + output[stride*12]*C26;
+  step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
+  step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
+  step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
+  step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
+
+  step[16] = output[stride*16] + output[stride*17];
+  step[17] = -output[stride*17] + output[stride*16];
+  step[18] = -output[stride*18] + output[stride*19];
+  step[19] = output[stride*19] + output[stride*18];
+  step[20] = output[stride*20] + output[stride*21];
+  step[21] = -output[stride*21] + output[stride*20];
+  step[22] = -output[stride*22] + output[stride*23];
+  step[23] = output[stride*23] + output[stride*22];
+  step[24] = output[stride*24] + output[stride*25];
+  step[25] = -output[stride*25] + output[stride*24];
+  step[26] = -output[stride*26] + output[stride*27];
+  step[27] = output[stride*27] + output[stride*26];
+  step[28] = output[stride*28] + output[stride*29];
+  step[29] = -output[stride*29] + output[stride*28];
+  step[30] = -output[stride*30] + output[stride*31];
+  step[31] = output[stride*31] + output[stride*30];
+
+  // Final stage --- output indices are bit-reversed.
+  output[stride*0] = step[0];
+  output[stride*16] = step[1];
+  output[stride*8] = step[2];
+  output[stride*24] = step[3];
+  output[stride*4] = step[4];
+  output[stride*20] = step[5];
+  output[stride*12] = step[6];
+  output[stride*28] = step[7];
+  output[stride*2] = step[8];
+  output[stride*18] = step[9];
+  output[stride*10] = step[10];
+  output[stride*26] = step[11];
+  output[stride*6] = step[12];
+  output[stride*22] = step[13];
+  output[stride*14] = step[14];
+  output[stride*30] = step[15];
+
+  output[stride*1] = step[16]*C31 + step[31]*C1;
+  output[stride*17] = step[17]*C15 + step[30]*C17;
+  output[stride*9] = step[18]*C23 + step[29]*C9;
+  output[stride*25] = step[19]*C7 + step[28]*C25;
+  output[stride*5] = step[20]*C27 + step[27]*C5;
+  output[stride*21] = step[21]*C11 + step[26]*C21;
+  output[stride*13] = step[22]*C19 + step[25]*C13;
+  output[stride*29] = step[23]*C3 + step[24]*C29;
+  output[stride*3] = step[24]*C3 + step[23]*-C29;
+  output[stride*19] = step[25]*C19 + step[22]*-C13;
+  output[stride*11] = step[26]*C11 + step[21]*-C21;
+  output[stride*27] = step[27]*C27 + step[20]*-C5;
+  output[stride*7] = step[28]*C7 + step[19]*-C25;
+  output[stride*23] = step[29]*C23 + step[18]*-C9;
+  output[stride*15] = step[30]*C15 + step[17]*-C17;
+  output[stride*31] = step[31]*C31 + step[16]*-C1;
+}
+
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int shortpitch = pitch >> 1;
+    int i, j;
+    double output[1024];
+    // First transform columns
+    for (i = 0; i < 32; i++) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; j++)
+        temp_in[j] = input[j*shortpitch + i];
+      dct32_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; j++)
+        output[j*32 + i] = temp_out[j];
+    }
+    // Then transform rows
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = output[j + i*32];
+      dct32_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        output[j + i*32] = temp_out[j];
+    }
+    // Scale output down by 4 (empirically chosen normalization factor)
+    for (i = 0; i < 1024; i++) {
+      out[i] = (short)round(output[i]/4);
+    }
+  }
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+#else  // CONFIG_DWT32X32HYBRID
+
+#define MAX_BLOCK_LENGTH   64
+#define ENH_PRECISION_BITS 1
+#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2)
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int n;
+  short r, * a, * b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++) << 1;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) << 1;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int n;
+  short r, * a, * b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+// NOTE: Using a 5/3 integer wavelet for now. Explore using a wavelet
+// with a better response later
+static void dyadic_analyze(int levels, int width, int height,
+                           short *x, int pitch_x, short *c, int pitch_c) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  short buffer[2 * MAX_BLOCK_LENGTH];
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x[i * pitch_x + j] << ENH_PRECISION_BITS;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[16 * 16];
+  int i;
+  const int short_pitch = pitch >> 1;
+  dyadic_analyze(1, 32, 32, input, short_pitch, out, 32);
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c(out, buffer, 64);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
+}
+#endif  // CONFIG_DWT32X32HYBRID
+#endif  // CONFIG_TX32X32
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6ab23cae0b2f43480dea341bd2e1ea362e5b1703..f504fc53c73bd47be8e252bb7ff6365291160d10 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -456,6 +456,10 @@ static void update_state(VP9_COMP *cpi, MACROBLOCK *x,
       if (xd->mb_to_right_edge >= 0)
         vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
     }
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  } else {
+    ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
+#endif
   }
 #endif
 
@@ -1487,6 +1491,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_zero(cpi->hybrid_coef_counts_8x8);
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->hybrid_coef_counts_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_zero(cpi->coef_counts_32x32);
+#endif
 
   vp9_frame_init_quantizer(cpi);
 
@@ -1507,7 +1514,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
   vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
   vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
+  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
   vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
   vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
   {
@@ -1700,7 +1708,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
      * keyframe's probabilities as an estimate of what the current keyframe's
      * coefficient cost distributions may look like. */
     if (frame_type == 0) {
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      txfm_type = ALLOW_32X32;
+#else
       txfm_type = ALLOW_16X16;
+#endif
     } else
 #if 0
     /* FIXME (rbultje)
@@ -1731,9 +1743,15 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     } else
       txfm_type = ALLOW_8X8;
 #else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+    ALLOW_32X32 : TX_MODE_SELECT;
+#else
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+    cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
     ALLOW_16X16 : TX_MODE_SELECT;
+#endif
 #endif
     cpi->common.txfm_mode = txfm_type;
     if (txfm_type != TX_MODE_SELECT) {
@@ -1753,7 +1771,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       int64_t pd = cpi->rd_tx_select_diff[i];
       int diff;
       if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
+                     2048 * (TX_SIZE_MAX_SB - 1), 0);
       diff = (int)(pd / cpi->common.MBs);
       cpi->rd_tx_select_threshes[frame_type][i] += diff;
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
@@ -1776,19 +1795,37 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     }
 
     if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8 = cpi->txfm_count[TX_8X8];
+      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
+                           cpi->txfm_count_32x32p[TX_4X4] +
+                           cpi->txfm_count_8x8p[TX_4X4];
+      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
+                              cpi->txfm_count_16x16p[TX_8X8];
       const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16 = cpi->txfm_count[TX_16X16];
+      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
+      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];
+#else
+      const int count32x32 = 0;
+#endif
 
-      if (count4x4 == 0 && count16x16 == 0) {
+      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+          count32x32 == 0) {
         cpi->common.txfm_mode = ALLOW_8X8;
         reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
+      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
         cpi->common.txfm_mode = ONLY_4X4;
         reset_skip_txfm_size(cpi, TX_4X4);
-      } else if (count8x8 == 0 && count4x4 == 0) {
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+        cpi->common.txfm_mode = ALLOW_32X32;
+#endif
+      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
         cpi->common.txfm_mode = ALLOW_16X16;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+        reset_skip_txfm_size(cpi, TX_16X16);
+#endif
       }
     }
   } else {
@@ -2087,6 +2124,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
   }
 
+  assert(mbmi->txfm_size <= TX_16X16);
   if (mbmi->ref_frame == INTRA_FRAME) {
 #ifdef ENC_DEBUG
     if (enc_debug) {
@@ -2266,7 +2304,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
            vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
       if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
           mbmi->mode != SPLITMV) {
-        cpi->txfm_count[mbmi->txfm_size]++;
+        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
       } else if (mbmi->mode == I8X8_PRED ||
                  (mbmi->mode == SPLITMV &&
                   mbmi->partitioning != PARTITIONING_4X4)) {
@@ -2308,6 +2346,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   unsigned int segment_id = mi->mbmi.segment_id;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+  const int mis = cm->mode_info_stride;
 
   x->skip = 0;
 
@@ -2397,6 +2436,53 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                                        xd->dst.y_stride, xd->dst.uv_stride);
   }
 
+#if CONFIG_TX32X32
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+                         dst, dst_y_stride);
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    vp9_transform_sby_32x32(x);
+    vp9_transform_sbuv_16x16(x);
+    vp9_quantize_sby_32x32(x);
+    vp9_quantize_sbuv_16x16(x);
+    // TODO(rbultje): trellis optimize
+    vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
+    vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
+    vp9_recon_sby_s_c(&x->e_mbd, dst);
+    vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);
+
+    if (!x->skip) {
+      vp9_tokenize_sb(cpi, &x->e_mbd, t, 0);
+    } else {
+      int mb_skip_context =
+          cpi->common.mb_no_coeff_skip ?
+          (mi - 1)->mbmi.mb_skip_coeff +
+          (mi - mis)->mbmi.mb_skip_coeff :
+          0;
+      mi->mbmi.mb_skip_coeff = 1;
+      if (cm->mb_no_coeff_skip) {
+        cpi->skip_true_count[mb_skip_context]++;
+        vp9_fix_contexts_sb(xd);
+      } else {
+        vp9_stuff_sb(cpi, xd, t, 0);
+        cpi->skip_false_count[mb_skip_context]++;
+      }
+    }
+
+    // copy skip flag on all mb_mode_info contexts in this SB
+    // if this was a skip at this txfm size
+    if (mb_col < cm->mb_cols - 1)
+      mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    if (mb_row < cm->mb_rows - 1) {
+      mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+      if (mb_col < cm->mb_cols - 1)
+        mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    }
+    skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff;
+  } else {
+#endif
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -2405,7 +2491,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
     memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
     memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
     tp[n] = *t;
-    xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+    xd->mode_info_context = mi + x_idx + y_idx * mis;
 
     vp9_subtract_mby_s_c(x->src_diff,
                          src + x_idx * 16 + y_idx * 16 * src_y_stride,
@@ -2433,7 +2519,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
       int mb_skip_context =
         cpi->common.mb_no_coeff_skip ?
           (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+            (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff :
           0;
       xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
       if (cpi->common.mb_no_coeff_skip) {
@@ -2450,20 +2536,29 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
 
   xd->mode_info_context = mi;
   update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+#if CONFIG_TX32X32
+  }
+#endif
   if (cm->txfm_mode == TX_MODE_SELECT &&
       !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
         (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-    cpi->txfm_count[mi->mbmi.txfm_size]++;
+    cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
   } else {
-    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
+    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
+#if CONFIG_TX32X32
+                    TX_32X32 :
+#else
+                    TX_16X16 :
+#endif
+                    cm->txfm_mode;
     mi->mbmi.txfm_size = sz;
     if (mb_col < cm->mb_cols - 1)
       mi[1].mbmi.txfm_size = sz;
     if (mb_row < cm->mb_rows - 1) {
-      mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+      mi[mis].mbmi.txfm_size = sz;
       if (mb_col < cm->mb_cols - 1)
-        mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+        mi[mis + 1].mbmi.txfm_size = sz;
     }
   }
 }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 091f2f0fe9b09053cfa53e2a3f603938ababc6d8..46087c28e53e41ece34856bad43c3e64a416c1d3 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -108,6 +108,52 @@ void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
   }
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride,
+                          const unsigned char *pred, int dst_stride) {
+  int r, c;
+
+  for (r = 0; r < 32; r++) {
+    for (c = 0; c < 32; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 32;
+    pred += dst_stride;
+    src  += src_stride;
+  }
+}
+
+void vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride) {
+  short *udiff = diff + 1024;
+  short *vdiff = diff + 1024 + 256;
+  int r, c;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      udiff[c] = usrc[c] - upred[c];
+    }
+
+    udiff += 16;
+    upred += dst_stride;
+    usrc  += src_stride;
+  }
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      vdiff[c] = vsrc[c] - vpred[c];
+    }
+
+    vdiff += 16;
+    vpred += dst_stride;
+    vsrc  += src_stride;
+  }
+}
+#endif
+
 void vp9_subtract_mby_c(short *diff, unsigned char *src,
                         unsigned char *pred, int stride) {
   vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
@@ -265,6 +311,22 @@ void vp9_transform_mb_16x16(MACROBLOCK *x) {
   vp9_transform_mbuv_8x8(x);
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_transform_sby_32x32(MACROBLOCK *x) {
+  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
+  vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64);
+}
+
+void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
+  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
+  vp9_clear_system_state();
+  x->vp9_short_fdct16x16(x_sb->src_diff + 1024,
+                         x_sb->coeff + 1024, 32);
+  x->vp9_short_fdct16x16(x_sb->src_diff + 1280,
+                         x_sb->coeff + 1280, 32);
+}
+#endif
+
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 4f49647a2c76127c52f14ddaaca513eef423ac4a..3c0a0a5a2376d711ae4f02fcbcb5e35a247b178f 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -47,6 +47,11 @@ void vp9_transform_mb_16x16(MACROBLOCK *mb);
 void vp9_transform_mby_16x16(MACROBLOCK *x);
 void vp9_optimize_mby_16x16(MACROBLOCK *x);
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+void vp9_transform_sby_32x32(MACROBLOCK *x);
+void vp9_transform_sbuv_16x16(MACROBLOCK *x);
+#endif
+
 void vp9_fidct_mb(MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
@@ -59,6 +64,14 @@ void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
 void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
                           int src_stride, const unsigned char *pred,
                           int dst_stride);
+#if CONFIG_TX32X32
+void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride,
+                          const unsigned char *pred, int dst_stride);
+void vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride);
+#endif
 #endif
 
 #endif
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0fe7a14b6e1852a3efeabdb14167b35aa3496e0e..779534bac1b1d9ac5399fbda05729ac16beab90a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1810,7 +1810,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
 #endif
   for (i = 0; i < COMP_PRED_CONTEXTS; i++)
     cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX - 1; i++)
+  for (i = 0; i < TX_SIZE_MAX_SB - 1; i++)
     cm->prob_tx[i]               = 128;
 
   // Prime the recent reference frame useage counters.
@@ -3698,6 +3698,9 @@ static void encode_frame_to_data_rate
   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
   vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
            cpi->hybrid_coef_counts_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
+#endif
   vp9_adapt_coef_probs(&cpi->common);
   if (cpi->common.frame_type != KEY_FRAME) {
 #if CONFIG_SUPERBLOCKS
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 511e62f1c85776012cef5da467438502977f9465..28acc96d47a2a7a09446ef978db6707a66be2a2b 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -109,6 +109,11 @@ typedef struct {
   vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
       [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_prob coef_probs_32x32[BLOCK_TYPES_32X32]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+#endif
+
 #if CONFIG_SUPERBLOCKS
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
 #endif
@@ -435,6 +440,15 @@ typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]);
+#endif
+
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
@@ -483,8 +497,9 @@ typedef struct VP9_COMP {
   int comp_pred_count[COMP_PRED_CONTEXTS];
   int single_pred_count[COMP_PRED_CONTEXTS];
   // FIXME contextualize
-  int txfm_count[TX_SIZE_MAX];
-  int txfm_count_8x8p[TX_SIZE_MAX - 1];
+  int txfm_count_32x32p[TX_SIZE_MAX_SB];
+  int txfm_count_16x16p[TX_SIZE_MAX_MB];
+  int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
   int64_t rd_tx_select_diff[NB_TXFM_MODES];
   int rd_tx_select_threshes[4][NB_TXFM_MODES];
 
@@ -604,6 +619,12 @@ typedef struct VP9_COMP {
   vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
   unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+  unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+#endif
+
   int gfu_boost;
   int last_boost;
   int kf_boost;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index f160edb565e112907345c9321e99b938e9de9587..fcc7d2948b00d011f6fe702e87d8e3d505159f0a 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -323,28 +323,25 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) {
   vp9_quantize_mbuv_8x8(x);
 }
 
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+static void quantize(short *zbin_boost_orig_ptr,
+                     short *coeff_ptr, int n_coeffs, int max_coeffs,
+                     short *zbin_ptr, short *round_ptr, short *quant_ptr,
+                     unsigned char *quant_shift_ptr,
+                     short *qcoeff_ptr, short *dqcoeff_ptr,
+                     short *dequant_ptr, short zbin_oq_value,
+                     int *eob_ptr, const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_16x16;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
+  short *zbin_boost_ptr = zbin_boost_orig_ptr;
 
-  vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(short));
 
   eob = -1;
-  for (i = 0; i < b->eob_max_offset_16x16; i++) {
-    rc   = vp9_default_zig_zag1d_16x16[i];
-    z    = coeff_ptr[rc];
+  for (i = 0; i < max_coeffs; i++) {
+    rc   = scan[i];
+    z    = coeff_ptr[rc] * mul;
 
     zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
     zbin_boost_ptr ++;
@@ -354,22 +351,70 @@ void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
 
     if (x >= zbin) {
       x += (round_ptr[rc!=0]);
-      y  = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
           >> quant_shift_ptr[rc!=0];              // quantize (x)
       x  = (y ^ sz) - sz;                         // get the sign back
       qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];   // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
 
       if (y) {
         eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+        zbin_boost_ptr = zbin_boost_orig_ptr;
       }
     }
   }
 
-  d->eob = eob + 1;
+  *eob_ptr = eob + 1;
+}
+
+void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  quantize(b->zrun_zbin_boost_16x16,
+           b->coeff,
+           256, b->eob_max_offset_16x16,
+           b->zbin_16x16, b->round, b->quant, b->quant_shift,
+           d->qcoeff,
+           d->dqcoeff,
+           d->dequant,
+           b->zbin_extra,
+           &d->eob, vp9_default_zig_zag1d_16x16, 1);
+}
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_quantize_sby_32x32(MACROBLOCK *x) {
+  x->e_mbd.block[0].eob = 0;
+  quantize(x->block[0].zrun_zbin_boost_32x32,
+           x->sb_coeff_data.coeff,
+           1024, x->block[0].eob_max_offset_32x32,
+           x->block[0].zbin_32x32,
+           x->block[0].round, x->block[0].quant, x->block[0].quant_shift,
+           x->e_mbd.sb_coeff_data.qcoeff,
+           x->e_mbd.sb_coeff_data.dqcoeff,
+           x->e_mbd.block[0].dequant,
+           x->block[0].zbin_extra,
+           &x->e_mbd.block[0].eob,
+           vp9_default_zig_zag1d_32x32, 2);
 }
 
+void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
+  int i;
+
+  x->e_mbd.block[16].eob = 0;
+  x->e_mbd.block[20].eob = 0;
+  for (i = 16; i < 24; i += 4)
+    quantize(x->block[i].zrun_zbin_boost_16x16,
+             x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,
+             256, x->block[i].eob_max_offset_16x16,
+             x->block[i].zbin_16x16,
+             x->block[i].round, x->block[i].quant, x->block[i].quant_shift,
+             x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+             x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
+             x->e_mbd.block[i].dequant,
+             x->block[i].zbin_extra,
+             &x->e_mbd.block[i].eob,
+             vp9_default_zig_zag1d_16x16, 1);
+}
+#endif
+
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
@@ -427,6 +472,74 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
     48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
   };
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  static const int zbin_boost_32x32[1024] = {
+    0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+  };
+#endif
   int qrounding_factor = 48;
 
 
@@ -454,7 +567,13 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y1_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_32x32[Q][0] =
+      ((quant_val * zbin_boost_32x32[0]) + 64) >> 7;
+#endif
 
 
     quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
@@ -468,7 +587,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y2_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y2_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
     invert_quant(cpi->UVquant[Q] + 0,
@@ -481,7 +601,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_uv_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_uv_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
@@ -543,16 +664,30 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
 
       quant_val = vp9_ac_yquant(Q);
       cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
 
       quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
       cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
 
       quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
       cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
     }
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    // 32x32 structures. Same comment above applies.
+    for (i = 1; i < 1024; i++) {
+      int rc = vp9_default_zig_zag1d_32x32[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_32x32[Q][i] =
+        ((quant_val * zbin_boost_32x32[i]) + 64) >> 7;
+    }
+#endif
   }
 }
 
@@ -592,11 +727,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
     x->block[i].zbin = cpi->Y1zbin[QIndex];
     x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
     x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex];
+#endif
     x->block[i].round = cpi->Y1round[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
     x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex];
+#endif
     x->block[i].zbin_extra = (short)zbin_extra;
 
     // Segment max eob offset feature.
@@ -607,10 +748,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
       x->block[i].eob_max_offset_16x16 =
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      x->block[i].eob_max_offset_32x32 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+#endif
     } else {
       x->block[i].eob_max_offset = 16;
       x->block[i].eob_max_offset_8x8 = 64;
       x->block[i].eob_max_offset_16x16 = 256;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      x->block[i].eob_max_offset_32x32 = 1024;
+#endif
     }
   }
 
@@ -640,9 +788,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
       x->block[i].eob_max_offset_8x8 =
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_16x16 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
     } else {
       x->block[i].eob_max_offset = 16;
       x->block[i].eob_max_offset_8x8 = 64;
+      x->block[i].eob_max_offset_16x16 = 256;
     }
   }
 
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index dd11e75bab5f0bd2309100a5733d28b6b6462329..832a486f505b4125a21ad026d12dec6fb975dcde 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -78,6 +78,11 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x);
 extern prototype_quantize_block(vp9_quantize_quantb_16x16);
 extern prototype_quantize_mb(vp9_quantize_mby_16x16);
 
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+void vp9_quantize_sby_32x32(MACROBLOCK *x);
+void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
+#endif
+
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b589243e8207398f2bd03d60a999bd12bc638b28..c896e41b1eb9cfd14423ae0d27598bdf9d0278cf 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -175,6 +175,9 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
   vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
   vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
   vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
+#endif
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
   cc->interintra_prob = cm->fc.interintra_prob;
@@ -234,6 +237,9 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
   vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
   vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
   vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
+#endif
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
   cm->fc.interintra_prob = cc->interintra_prob;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9cea189698b47bab3ccbd653fdd947435962f968..60f14f8feaf60e7475cd09bbf5e82d5417a05a67 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -400,12 +400,18 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
     cpi->common.fc.hybrid_coef_probs_16x16,
     BLOCK_TYPES_16X16);
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  fill_token_costs(
+    cpi->mb.token_costs[TX_32X32],
+    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_32x32,
+    BLOCK_TYPES_32X32);
+#endif
+
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
-  if (cpi->common.frame_type != KEY_FRAME)
-  {
+  if (cpi->common.frame_type != KEY_FRAME) {
     vp9_build_nmv_cost_table(
         cpi->mb.nmvjointcost,
         cpi->mb.e_mbd.allow_high_precision_mv ?
@@ -556,7 +562,7 @@ static int cost_coeffs_2x2(MACROBLOCK *mb,
 
 static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
+                       TX_SIZE tx_size) {
   const int eob = b->eob;
   int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
   int cost = 0, default_eob, seg_eob;
@@ -613,9 +619,24 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
       default_eob = 256;
       if (type == PLANE_TYPE_Y_WITH_DC) {
         tx_type = get_tx_type_16x16(xd, b);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      } else if (type == PLANE_TYPE_UV) {
+        int ib = (int)(b - xd->block) - 16;
+
+        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * ib;
+#endif
       }
       break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      scan = vp9_default_zig_zag1d_32x32;
+      band = vp9_coef_bands_32x32;
+      default_eob = 1024;
+      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      break;
+#endif
     default:
+      abort();
       break;
   }
   if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
@@ -813,23 +834,28 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
 }
 
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
-                                     int r[2][TX_SIZE_MAX], int *rate,
-                                     int d[TX_SIZE_MAX], int *distortion,
-                                     int s[TX_SIZE_MAX], int *skip,
-                                     int64_t txfm_cache[NB_TXFM_MODES]) {
+                                     int (*r)[2], int *rate,
+                                     int *d, int *distortion,
+                                     int *s, int *skip,
+                                     int64_t txfm_cache[NB_TXFM_MODES],
+                                     TX_SIZE max_txfm_size) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   vp9_prob skip_prob = cm->mb_no_coeff_skip ?
                        vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int64_t rd[2][TX_SIZE_MAX];
-  int n;
-
-  r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) +
-                   vp9_cost_one(cm->prob_tx[1]);
-  r[1][TX_8X8]   = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) +
-                   vp9_cost_zero(cm->prob_tx[1]);
-  r[1][TX_4X4]   = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]);
+  int64_t rd[TX_SIZE_MAX_SB][2];
+  int n, m;
+
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    r[n][1] = r[n][0];
+    for (m = 0; m <= n - (n == max_txfm_size); m++) {
+      if (m == n)
+        r[n][1] += vp9_cost_zero(cm->prob_tx[m]);
+      else
+        r[n][1] += vp9_cost_one(cm->prob_tx[m]);
+    }
+  }
 
   if (cm->mb_no_coeff_skip) {
     int s0, s1;
@@ -838,64 +864,82 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
     s0 = vp9_cost_bit(skip_prob, 0);
     s1 = vp9_cost_bit(skip_prob, 1);
 
-    for (n = TX_4X4; n <= TX_16X16; n++) {
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
       if (s[n]) {
-        rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
       } else {
-        rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
-        rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
+        rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+        rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
       }
     }
   } else {
-    for (n = TX_4X4; n <= TX_16X16; n++) {
-      rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
-      rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
     }
   }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (max_txfm_size == TX_32X32 &&
+      (cm->txfm_mode == ALLOW_32X32 ||
+       (cm->txfm_mode == TX_MODE_SELECT &&
+        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+        rd[TX_32X32][1] < rd[TX_4X4][1]))) {
+    mbmi->txfm_size = TX_32X32;
+  } else
+#endif
   if ( cm->txfm_mode == ALLOW_16X16 ||
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
+#endif
       (cm->txfm_mode == TX_MODE_SELECT &&
-       rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
+       rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
-           (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
+           (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 ||
-          (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
+    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
   *distortion = d[mbmi->txfm_size];
-  *rate       = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
+  *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
   *skip       = s[mbmi->txfm_size];
 
-  txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
-  txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
-  txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
-  if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
+  txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
+  txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
+  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  if (max_txfm_size == TX_32X32 &&
+      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+      rd[TX_32X32][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
+  else
+#endif
+  if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
-                                 rd[1][TX_4X4] : rd[1][TX_8X8];
+    txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
+                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
+  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
-                        &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache);
+                           txfm_cache, TX_16X16);
 }
 
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
@@ -908,25 +952,91 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
 }
 
 #if CONFIG_SUPERBLOCKS
+#if CONFIG_TX32X32
+static int rdcost_sby_32x32(MACROBLOCK *x) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above,
+                  *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+
+  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+}
+
+static int vp9_sb_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    unsigned int this_diff = coeff[i] - dqcoeff[i];
+    error += this_diff * this_diff;
+  }
+
+  return error > INT_MAX ? INT_MAX : error;
+}
+
+#define DEBUG_ERROR 0
+static void super_block_yrd_32x32(MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
+#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+  short out[1024];
+#endif
+
+  vp9_transform_sby_32x32(x);
+  vp9_quantize_sby_32x32(x);
+#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
+#endif
+
+#if !CONFIG_DWT32X32HYBRID
+  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
+#else
+  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
+#endif
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
+         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
+#endif
+  *rate       = rdcost_sby_32x32(x);
+  *skippable  = vp9_sby_is_skippable_32x32(&x->e_mbd);
+}
+#endif
+
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
                             int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[3][2], *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[3][2], *orig_left = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
+                        *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
+                        *orig_left = xd->left_context;
 
-  for (n = TX_4X4; n <= TX_16X16; n++) {
+  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
     vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
     vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[0][n] = 0;
+    r[n][0] = 0;
     d[n] = 0;
     s[n] = 1;
   }
 
+#if CONFIG_TX32X32
+  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+                       dst, dst_y_stride);
+  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+#endif
+
+#if DEBUG_ERROR
+  int err[3] = { 0, 0, 0 };
+#endif
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
     int r_tmp, d_tmp, s_tmp;
@@ -941,25 +1051,42 @@ static void super_block_yrd(VP9_COMP *cpi,
     xd->left_context = &t_left[TX_16X16][y_idx];
     macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_16X16] += d_tmp;
-    r[0][TX_16X16] += r_tmp;
+    r[TX_16X16][0] += r_tmp;
     s[TX_16X16] = s[TX_16X16] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_16x16(xd);
+    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
 
     xd->above_context = &t_above[TX_4X4][x_idx];
     xd->left_context = &t_left[TX_4X4][y_idx];
     macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_4X4] += d_tmp;
-    r[0][TX_4X4] += r_tmp;
+    r[TX_4X4][0] += r_tmp;
     s[TX_4X4] = s[TX_4X4] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_4x4(xd);
+    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
 
     xd->above_context = &t_above[TX_8X8][x_idx];
     xd->left_context = &t_left[TX_8X8][y_idx];
     macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_8X8] += d_tmp;
-    r[0][TX_8X8] += r_tmp;
+    r[TX_8X8][0] += r_tmp;
     s[TX_8X8] = s[TX_8X8] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_8x8(xd);
+    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
   }
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache);
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
+  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
+  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
+#endif
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
+                           TX_SIZE_MAX_SB - 1);
 
   xd->above_context = orig_above;
   xd->left_context = orig_left;
@@ -1632,14 +1759,59 @@ static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 }
 
 #if CONFIG_SUPERBLOCKS
+#if CONFIG_TX32X32
+static int rd_cost_sbuv_16x16(MACROBLOCK *x) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *) &t_above;
+  tl = (ENTROPY_CONTEXT *) &t_left;
+
+  for (b = 16; b < 24; b += 4)
+    cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above_8x8[b],
+                        tl + vp9_block2left_8x8[b], TX_16X16);
+
+  return cost;
+}
+
+static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
+                                   int *distortion, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  vp9_transform_sbuv_16x16(x);
+  vp9_quantize_sbuv_16x16(x);
+
+  *rate       = rd_cost_sbuv_16x16(x);
+  *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
+                                   xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
+  *skip       = vp9_sbuv_is_skippable_16x16(xd);
+}
+#endif
+
 static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int n, r = 0, d = 0;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+
+#if CONFIG_TX32X32
+  if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skip);
+  } else {
+#endif
+  int n, r = 0, d = 0;
   int skippable = 1;
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
@@ -1680,8 +1852,11 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   xd->above_context = ta;
   memcpy(xd->above_context, t_above, sizeof(t_above));
   memcpy(xd->left_context, t_left, sizeof(t_left));
+#if CONFIG_TX32X32
+  }
+#endif
 
-  return RDCOST(x->rdmult, x->rddiv, r, d);
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 #endif
 
@@ -1818,15 +1993,26 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
 }
 
 #if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
-                                 int *rate,
-                                 int *distortion,
-                                 int *skippable) {
+// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
+static void super_block_uvrd(MACROBLOCK *x,
+                             int *rate,
+                             int *distortion,
+                             int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int d = 0, r = 0, n, s = 1;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+
+#if CONFIG_TX32X32
+  if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skippable);
+  } else {
+#endif
+  int d = 0, r = 0, n, s = 1;
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
   ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
@@ -1844,9 +2030,15 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    s &= vp9_mbuv_is_skippable_8x8(xd);
+    if (mbmi->txfm_size == TX_4X4) {
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mbuv_4x4(x);
+      s &= vp9_mbuv_is_skippable_4x4(xd);
+    } else {
+      vp9_transform_mbuv_8x8(x);
+      vp9_quantize_mbuv_8x8(x);
+      s &= vp9_mbuv_is_skippable_8x8(xd);
+    }
 
     d += vp9_mbuverror(x) >> 2;
     xd->above_context = ta + x_idx;
@@ -1864,6 +2056,9 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
   xd->above_context = ta;
   memcpy(xd->above_context, t_above, sizeof(t_above));
   memcpy(xd->left_context,  t_left,  sizeof(t_left));
+#if CONFIG_TX32X32
+  }
+#endif
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
@@ -1882,8 +2077,8 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-    super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, &s);
+    super_block_uvrd(x, &this_rate_tokenonly,
+                     &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -4141,8 +4336,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int y_skip, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES];
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
   error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                    &dist_y, &y_skip, txfm_cache);
   error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4362,6 +4555,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
   MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
   int switchable_filter_index = 0;
+#if CONFIG_TX32X32
+  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
+  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
+  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+#endif
 
   x->skip = 0;
   xd->mode_info_context->mbmi.segment_id = segment_id;
@@ -4397,6 +4595,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                             &dist_uv_8x8, &uv_skip_8x8);
     mode_uv_8x8 = mbmi->uv_mode;
   }
+#if CONFIG_TX32X32
+  if (cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
+    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
+                            &dist_uv_16x16, &uv_skip_16x16);
+    mode_uv_16x16 = mbmi->uv_mode;
+  }
+#endif
 
   for (mode_index = 0; mode_index < MAX_MODES;
        mode_index += (!switchable_filter_index)) {
@@ -4524,6 +4730,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         distortion_uv = dist_uv_4x4;
         skippable = skippable && uv_skip_4x4;
         mbmi->uv_mode = mode_uv_4x4;
+#if CONFIG_TX32X32
+      } else if (mbmi->txfm_size == TX_32X32) {
+        rate_uv = rate_uv_16x16;
+        distortion_uv = dist_uv_16x16;
+        skippable = skippable && uv_skip_16x16;
+        mbmi->uv_mode = mode_uv_16x16;
+#endif
       } else {
         rate_uv = rate_uv_8x8;
         distortion_uv = dist_uv_8x8;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 44963b2237dfa050ec8351ce3f4c7a713fe195b2..a662e048edae44ebbae2a729000e848f7b6c7630 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -117,7 +117,7 @@ static void tokenize_b(VP9_COMP *cpi,
                        int dry_run) {
   int pt; /* near block/prev token context index */
   int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
-  const int eob = b->eob;     /* one beyond last nonzero coeff */
+  int eob = b->eob;     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   const short *qcoeff_ptr = b->qcoeff;
   int seg_eob;
@@ -177,7 +177,23 @@ static void tokenize_b(VP9_COMP *cpi,
         counts = cpi->coef_counts_16x16;
         probs = cpi->common.fc.coef_probs_16x16;
       }
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+      if (type == PLANE_TYPE_UV) {
+        int uv_idx = (((int) (b - xd->block)) - 16) >> 2;
+        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
+      }
+#endif
+      break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      seg_eob = 1024;
+      bands = vp9_coef_bands_32x32;
+      scan = vp9_default_zig_zag1d_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
       break;
+#endif
   }
 
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
@@ -283,6 +299,79 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
   return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
+  int skip = 1;
+  skip &= !xd->block[0].eob;
+  return skip;
+}
+
+int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
+  return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
+  return vp9_sby_is_skippable_32x32(xd) &&
+         vp9_sbuv_is_skippable_16x16(xd);
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi,
+                     MACROBLOCKD *xd,
+                     TOKENEXTRA **t,
+                     int dry_run) {
+  VP9_COMMON * const cm = &cpi->common;
+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
+  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
+  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
+  const int segment_id = mbmi->segment_id;
+  const int skip_inc =  !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+                        (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);
+  int b;
+
+  mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
+
+  if (mbmi->mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cm->mb_no_coeff_skip) {
+      vp9_stuff_sb(cpi, xd, t, dry_run);
+    } else {
+      vp9_fix_contexts_sb(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+  tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+             A[0], L[0], TX_32X32, dry_run);
+  A[0][1] = A[0][2] = A[0][3] = A[0][0];
+  L[0][1] = L[0][2] = L[0][3] = L[0][0];
+
+  for (b = 16; b < 24; b += 4) {
+    tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+               A[0] + vp9_block2above_8x8[b], L[0] + vp9_block2left_8x8[b],
+               TX_16X16, dry_run);
+    A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
+    L[0][vp9_block2left_8x8[b] + 1]  = L[0][vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
+  vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
+  vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
+
+  if (dry_run)
+    *t = t_backup;
+}
+#endif
+
 void vp9_tokenize_mb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
@@ -717,6 +806,13 @@ static __inline void stuff_b(VP9_COMP *cpi,
         probs = cpi->common.fc.coef_probs_16x16;
       }
       break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      bands = vp9_coef_bands_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      break;
+#endif
   }
   band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
   t->Token = DCT_EOB_TOKEN;
@@ -775,7 +871,8 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
   A[1] = A[2] = A[3] = A[0];
   L[1] = L[2] = L[3] = L[0];
   for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b],
             L + vp9_block2above_8x8[b], TX_8X8, dry_run);
     A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
     L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
@@ -869,6 +966,43 @@ void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
   }
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,
+                               TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
+  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
+  int b;
+
+  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+          A[0], L[0], TX_32X32, dry_run);
+  A[0][1] = A[0][2] = A[0][3] = A[0][0];
+  L[0][1] = L[0][2] = L[0][3] = L[0][0];
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A[0] + vp9_block2above_8x8[b],
+            L[0] + vp9_block2left_8x8[b], TX_16X16, dry_run);
+    A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
+    L[0][vp9_block2left_8x8[b] + 1]  = L[0][vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
+  vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
+  vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
+}
+
+void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TOKENEXTRA * const t_backup = *t;
+
+  stuff_sb_32x32(cpi, xd, t, dry_run);
+
+  if (dry_run) {
+    *t = t_backup;
+  }
+}
+#endif
+
 void vp9_fix_contexts(MACROBLOCKD *xd) {
   /* Clear entropy contexts for blocks */
   if ((xd->mode_info_context->mbmi.mode != B_PRED
@@ -885,3 +1019,10 @@ void vp9_fix_contexts(MACROBLOCKD *xd) {
     xd->left_context->y2 = 1;
   }
 }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_fix_contexts_sb(MACROBLOCKD *xd) {
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+}
+#endif
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 868909be39fe8ff32a1f319df830c446c7b7d496..cfd5db694b420e2aa718561dde600ae9c7a65bd8 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -34,16 +34,29 @@ extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
 extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
+extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
+#endif
 
 struct VP9_COMP;
 
 extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                             TOKENEXTRA **t, int dry_run);
+extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                            TOKENEXTRA **t, int dry_run);
 
 extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run);
+#endif
 
 extern void vp9_fix_contexts(MACROBLOCKD *xd);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);
+#endif
 
 #ifdef ENTROPY_STATS
 void init_context_counters();