loopfilter.c 32.2 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10

Zoe Liu's avatar
Zoe Liu committed
11 12
#include <stdlib.h>

13
#include "./vpx_config.h"
Jingning Han's avatar
Jingning Han committed
14
#include "vpx_dsp/vpx_dsp_common.h"
15
#include "vpx_ports/mem.h"
John Koleszar's avatar
John Koleszar committed
16

17
// Saturates t to the signed 8-bit range [-128, 127] and returns it as int8_t.
static INLINE int8_t signed_char_clamp(int t) {
  const int8_t saturated = (int8_t)clamp(t, -128, 127);
  return saturated;
}

21 22 23 24 25 26 27 28 29 30 31 32 33 34
#if CONFIG_VP9_HIGHBITDEPTH
// Saturates t to the signed range selected by the bit depth bd:
//   bd == 10 -> [-512, 511], bd == 12 -> [-2048, 2047],
//   any other bd (including 8) -> [-128, 127].
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  int lo;
  int hi;

  if (bd == 10) {
    lo = -128 * 4;
    hi = 128 * 4 - 1;
  } else if (bd == 12) {
    lo = -128 * 16;
    hi = 128 * 16 - 1;
  } else {
    lo = -128;
    hi = 128 - 1;
  }
  return (int16_t)clamp(t, lo, hi);
}
#endif

Dmitry Kovalev's avatar
Dmitry Kovalev committed
35
// should we apply any filter at all: 11111111 yes, 00000000 no
36 37 38 39 40
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                 uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1,
                                 uint8_t q2, uint8_t q3) {
41
  int8_t mask = 0;
John Koleszar's avatar
John Koleszar committed
42 43 44 45 46 47 48
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(q2 - q1) > limit) * -1;
  mask |= (abs(q3 - q2) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
Dmitry Kovalev's avatar
Dmitry Kovalev committed
49
  return ~mask;
John Koleszar's avatar
John Koleszar committed
50 51
}

52 53 54 55 56
// Flatness test over four samples on each side.
// Returns -1 (all bits set) when p1, p2, p3 all differ from p0 by at most
// thresh and q1, q2, q3 all differ from q0 by at most thresh; 0 otherwise.
static INLINE int8_t flat_mask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  // Track the largest deviation from the innermost sample on either side;
  // the block is flat exactly when that maximum does not exceed thresh.
  int worst = abs(p1 - p0);
  int d;

  d = abs(q1 - q0);
  if (d > worst) worst = d;
  d = abs(p2 - p0);
  if (d > worst) worst = d;
  d = abs(q2 - q0);
  if (d > worst) worst = d;
  d = abs(p3 - p0);
  if (d > worst) worst = d;
  d = abs(q3 - q0);
  if (d > worst) worst = d;

  return worst > thresh ? 0 : -1;
}

// Flatness test over five samples on each side.
// Returns -1 (all bits set) when flat_mask4() accepts p3..q3 and, in
// addition, p4 and q4 differ from p0 and q0 respectively by at most thresh;
// returns 0 otherwise.
static INLINE int8_t flat_mask5(uint8_t thresh,
                                uint8_t p4, uint8_t p3,
                                uint8_t p2, uint8_t p1,
                                uint8_t p0, uint8_t q0,
                                uint8_t q1, uint8_t q2,
                                uint8_t q3, uint8_t q4) {
  const int8_t inner_flat = flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);

  if (!inner_flat) return 0;
  if (abs(p4 - p0) > thresh) return 0;
  if (abs(q4 - q0) > thresh) return 0;
  return -1;
}

79
// is there high edge variance internal edge: 11111111 yes, 00000000 no
80 81
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
82
  int8_t hev = 0;
John Koleszar's avatar
John Koleszar committed
83 84 85
  hev  |= (abs(p1 - p0) > thresh) * -1;
  hev  |= (abs(q1 - q0) > thresh) * -1;
  return hev;
John Koleszar's avatar
John Koleszar committed
86 87
}

88
// Filters the four samples *op1, *op0, *oq0, *oq1 in place.
//   mask   - 0 or all-ones gate from filter_mask(); 0 zeroes the adjustment.
//   thresh - threshold handed to hev_mask().
// All four samples are read before any of them is written.
static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
  // XOR with 0x80 re-biases the unsigned samples into the signed 8-bit
  // domain; the same XOR maps the results back on the way out.
  const int8_t sp1 = (int8_t) *op1 ^ 0x80;
  const int8_t sp0 = (int8_t) *op0 ^ 0x80;
  const int8_t sq0 = (int8_t) *oq0 ^ 0x80;
  const int8_t sq1 = (int8_t) *oq1 ^ 0x80;
  const uint8_t high_var = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
  int8_t base;
  int8_t q_step;
  int8_t p_step;
  int8_t outer_step;

  // The outer-tap difference contributes only when high_var is set.
  base = signed_char_clamp(sp1 - sq1) & high_var;

  // Add the inner taps, then gate the whole adjustment with mask.
  base = signed_char_clamp(base + 3 * (sq0 - sp0)) & mask;

  // One side rounds with +4 and the other with +3 before the shift by 3.
  q_step = signed_char_clamp(base + 4) >> 3;
  p_step = signed_char_clamp(base + 3) >> 3;

  *oq0 = signed_char_clamp(sq0 - q_step) ^ 0x80;
  *op0 = signed_char_clamp(sp0 + p_step) ^ 0x80;

  // The outer samples move by half of q_step (rounded), and only when
  // high_var is clear.
  outer_step = ROUND_POWER_OF_TWO(q_step, 1) & ~high_var;

  *oq1 = signed_char_clamp(sq1 - outer_step) ^ 0x80;
  *op1 = signed_char_clamp(sp1 + outer_step) ^ 0x80;
}
119

120
// Runs filter4() at 8 * count consecutive positions starting at s.
// At each position the eight samples at offsets -4 * p .. 3 * p are read to
// build the filter mask, and the four at -2 * p .. 1 * p may be rewritten.
// *blimit, *limit and *thresh are read once per position.
void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh, int count) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p3 = px[-4 * p];
    const uint8_t p2 = px[-3 * p];
    const uint8_t p1 = px[-2 * p];
    const uint8_t p0 = px[-p];
    const uint8_t q0 = px[0 * p];
    const uint8_t q1 = px[1 * p];
    const uint8_t q2 = px[2 * p];
    const uint8_t q3 = px[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);

    filter4(mask, *thresh, px - 2 * p, px - 1 * p, px, px + 1 * p);
  }
}

137
// Filters two adjacent 8-position groups with independent parameters:
// positions s .. s + 7 use the *0 set and s + 8 .. s + 15 use the *1 set.
void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const first = s;
  uint8_t *const second = s + 8;

  vpx_lpf_horizontal_4_c(first, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_4_c(second, p, blimit1, limit1, thresh1, 1);
}

145
// Runs filter4() on 8 * count lines, each pitch samples after the previous
// one, starting at s. On each line the eight samples at offsets -4 .. 3 are
// read to build the filter mask, and the four at -2 .. 1 may be rewritten.
// *blimit, *limit and *thresh are read once per line.
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh,
                          int count) {
  const int rows = 8 * count;
  int row;

  for (row = 0; row < rows; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);

    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
  }
}
161

162
// Filters two stacked 8-line groups with independent parameters: the first
// eight lines from s use the *0 set, the eight lines starting at
// s + 8 * pitch use the *1 set.
void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const upper = s;
  uint8_t *const lower = s + 8 * pitch;

  vpx_lpf_vertical_4_c(upper, pitch, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_4_c(lower, pitch, blimit1, limit1, thresh1, 1);
}

171
// Filters up to three samples on each side in place.
// When both flat and mask are non-zero, *op2 .. *oq2 are replaced by a 7-tap
// [1, 1, 1, 2, 1, 1, 1] average (rounded, divided by 8) in which *op3 / *oq3
// stand in for taps that would fall beyond them. Otherwise the work is
// delegated to filter4() on the four innermost samples.
static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                           uint8_t *op3, uint8_t *op2,
                           uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1,
                           uint8_t *oq2, uint8_t *oq3) {
  if (!(flat && mask)) {
    filter4(mask, thresh, op1, op0, oq0, oq1);
    return;
  }

  {
    // Snapshot every input before the first store.
    const uint8_t p3 = *op3;
    const uint8_t p2 = *op2;
    const uint8_t p1 = *op1;
    const uint8_t p0 = *op0;
    const uint8_t q0 = *oq0;
    const uint8_t q1 = *oq1;
    const uint8_t q2 = *oq2;
    const uint8_t q3 = *oq3;

    *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
  }
}
191

192
// Runs filter8() at 8 * count consecutive positions starting at s.
// At each position the eight samples at offsets -4 * p .. 3 * p feed both
// filter_mask() and flat_mask4() (the latter with a threshold of 1) and are
// the samples handed to filter8().
void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p3 = px[-4 * p];
    const uint8_t p2 = px[-3 * p];
    const uint8_t p1 = px[-2 * p];
    const uint8_t p0 = px[-p];
    const uint8_t q0 = px[0 * p];
    const uint8_t q1 = px[1 * p];
    const uint8_t q2 = px[2 * p];
    const uint8_t q3 = px[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat,
            px - 4 * p, px - 3 * p, px - 2 * p, px - 1 * p,
            px,         px + 1 * p, px + 2 * p, px + 3 * p);
  }
}
211

212
// Filters two adjacent 8-position groups with independent parameters:
// positions s .. s + 7 use the *0 set and s + 8 .. s + 15 use the *1 set.
void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const first = s;
  uint8_t *const second = s + 8;

  vpx_lpf_horizontal_8_c(first, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_8_c(second, p, blimit1, limit1, thresh1, 1);
}

220
// Runs filter8() on 8 * count lines, each pitch samples after the previous
// one, starting at s. On each line the eight samples at offsets -4 .. 3 feed
// filter_mask() and flat_mask4() (threshold 1) and are handed to filter8().
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh,
                          int count) {
  const int rows = 8 * count;
  int row;

  for (row = 0; row < rows; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat,
            s - 4, s - 3, s - 2, s - 1,
            s,     s + 1, s + 2, s + 3);
  }
}

237
// Filters two stacked 8-line groups with independent parameters: the first
// eight lines from s use the *0 set, the eight lines starting at
// s + 8 * pitch use the *1 set.
void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const upper = s;
  uint8_t *const lower = s + 8 * pitch;

  vpx_lpf_vertical_8_c(upper, pitch, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_8_c(lower, pitch, blimit1, limit1, thresh1, 1);
}

246
// Filters up to seven samples on each side in place.
// When flat2, flat and mask are all non-zero, *op6 .. *oq6 are replaced by a
// 15-tap [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] average (rounded,
// divided by 16); taps that would fall beyond *op7 / *oq7 reuse *op7 / *oq7.
// Otherwise the work is delegated to filter8() on the eight inner samples.
static INLINE void filter16(int8_t mask, uint8_t thresh,
                            uint8_t flat, uint8_t flat2,
                            uint8_t *op7, uint8_t *op6,
                            uint8_t *op5, uint8_t *op4,
                            uint8_t *op3, uint8_t *op2,
                            uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1,
                            uint8_t *oq2, uint8_t *oq3,
                            uint8_t *oq4, uint8_t *oq5,
                            uint8_t *oq6, uint8_t *oq7) {
  if (!(flat2 && flat && mask)) {
    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
    return;
  }

  {
    // Sample pointers ordered p7 .. p0, q0 .. q7 (indices 0 .. 15).
    uint8_t *const dst[16] = { op7, op6, op5, op4, op3, op2, op1, op0,
                               oq0, oq1, oq2, oq3, oq4, oq5, oq6, oq7 };
    int taps[16];
    int i;
    int k;

    // Snapshot every input before the first store.
    for (i = 0; i < 16; ++i) {
      taps[i] = *dst[i];
    }

    // Outputs are indices 1 .. 14 (p6 .. q6). Each sums the 15 neighbours
    // centred on it, clamping out-of-range indices to the end samples, plus
    // one extra copy of the centre tap for its weight of 2.
    for (i = 1; i <= 14; ++i) {
      int sum = taps[i];
      for (k = i - 7; k <= i + 7; ++k) {
        const int idx = k < 0 ? 0 : (k > 15 ? 15 : k);
        sum += taps[idx];
      }
      *dst[i] = ROUND_POWER_OF_TWO(sum, 4);
    }
  }
}

297
// Runs filter16() at 8 * count consecutive positions starting at s.
// The samples at offsets -4 * p .. 3 * p feed filter_mask() and flat_mask4();
// those at -8 * p .. -5 * p and 4 * p .. 7 * p, together with p0 and q0, feed
// flat_mask5(). Both flatness tests use a threshold of 1.
void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p3 = px[-4 * p];
    const uint8_t p2 = px[-3 * p];
    const uint8_t p1 = px[-2 * p];
    const uint8_t p0 = px[-p];
    const uint8_t q0 = px[0 * p];
    const uint8_t q1 = px[1 * p];
    const uint8_t q2 = px[2 * p];
    const uint8_t q3 = px[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1,
                                    px[-8 * p], px[-7 * p], px[-6 * p],
                                    px[-5 * p], p0,
                                    q0, px[4 * p], px[5 * p], px[6 * p],
                                    px[7 * p]);

    filter16(mask, *thresh, flat, flat2,
             px - 8 * p, px - 7 * p, px - 6 * p, px - 5 * p,
             px - 4 * p, px - 3 * p, px - 2 * p, px - 1 * p,
             px,         px + 1 * p, px + 2 * p, px + 3 * p,
             px + 4 * p, px + 5 * p, px + 6 * p, px + 7 * p);
  }
}
322

323 324 325 326 327
// Runs filter16() on count lines (not 8 * count), each p samples after the
// previous one, starting at s. On each line offsets -4 .. 3 feed
// filter_mask() and flat_mask4(); offsets -8 .. -5 and 4 .. 7, together with
// p0 and q0, feed flat_mask5(). Both flatness tests use a threshold of 1.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh,
                                   int count) {
  int row;

  for (row = 0; row < count; ++row, s += p) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                    q0, s[4], s[5], s[6], s[7]);

    filter16(mask, *thresh, flat, flat2,
             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
  }
}
345

346
// Applies mb_lpf_vertical_edge_w() to 8 lines starting at s.
void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  const int num_rows = 8;

  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_rows);
}

351
// Applies mb_lpf_vertical_edge_w() to 16 lines starting at s, all with the
// same blimit / limit / thresh.
void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  const int num_rows = 16;

  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_rows);
}
355 356 357

#if CONFIG_VP9_HIGHBITDEPTH
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
358 359 360 361 362
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2,
                                        uint16_t p1, uint16_t p0,
                                        uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
363 364 365 366 367 368 369 370 371 372 373 374 375
  int8_t mask = 0;
  int16_t limit16 = (uint16_t)limit << (bd - 8);
  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
  mask |= (abs(p3 - p2) > limit16) * -1;
  mask |= (abs(p2 - p1) > limit16) * -1;
  mask |= (abs(p1 - p0) > limit16) * -1;
  mask |= (abs(q1 - q0) > limit16) * -1;
  mask |= (abs(q2 - q1) > limit16) * -1;
  mask |= (abs(q3 - q2) > limit16) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit16) * -1;
  return ~mask;
}

376 377 378 379 380
// High-bitdepth flatness test over four samples on each side.
// thresh is shifted left by (bd - 8) before use. Returns -1 (all bits set)
// when p1, p2, p3 stay within the scaled threshold of p0 and q1, q2, q3 stay
// within it of q0; returns 0 otherwise.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
                                       uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0,
                                       uint16_t q0, uint16_t q1,
                                       uint16_t q2, uint16_t q3, int bd) {
  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  // The block is flat exactly when the largest deviation from the innermost
  // sample on either side does not exceed the scaled threshold.
  int worst = abs(p1 - p0);
  int d;

  d = abs(q1 - q0);
  if (d > worst) worst = d;
  d = abs(p2 - p0);
  if (d > worst) worst = d;
  d = abs(q2 - q0);
  if (d > worst) worst = d;
  d = abs(p3 - p0);
  if (d > worst) worst = d;
  d = abs(q3 - q0);
  if (d > worst) worst = d;

  return worst > thresh16 ? 0 : -1;
}

392 393 394 395 396 397 398
// High-bitdepth flatness test over five samples on each side.
// Returns -1 (all bits set) when highbd_flat_mask4() accepts p3..q3 and, in
// addition, p4 and q4 stay within thresh << (bd - 8) of p0 and q0
// respectively; returns 0 otherwise.
static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
                                       uint16_t p4, uint16_t p3,
                                       uint16_t p2, uint16_t p1,
                                       uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2,
                                       uint16_t q3, uint16_t q4, int bd) {
  const int8_t inner_flat =
      highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);

  if (!inner_flat) return 0;
  if (abs(p4 - p0) > thresh16) return 0;
  if (abs(q4 - q0) > thresh16) return 0;
  return -1;
}

// Is there high edge variance internal edge:
// 11111111_11111111 yes, 00000000_00000000 no ?
407 408
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
                                      uint16_t q0, uint16_t q1, int bd) {
409 410 411 412 413 414 415
  int16_t hev = 0;
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  hev |= (abs(p1 - p0) > thresh16) * -1;
  hev |= (abs(q1 - q0) > thresh16) * -1;
  return hev;
}

416 417 418
// High-bitdepth version of filter4(): filters *op1, *op0, *oq0, *oq1 in place.
//   mask   - 0 or all-ones gate; 0 zeroes the adjustment.
//   thresh - threshold handed to highbd_hev_mask().
//   bd     - bit depth; selects the bias (0x80 << (bd - 8)) and clamp range.
// All four samples are read before any of them is written.
static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
                                  int bd) {
  // Subtracting the bias centres the samples on zero (the 8-bit code gets the
  // same effect with ^ 0x80); it is added back when storing results.
  const int shift = bd - 8;
  const int bias = 0x80 << shift;
  const int16_t sp1 = (int16_t)*op1 - bias;
  const int16_t sp0 = (int16_t)*op0 - bias;
  const int16_t sq0 = (int16_t)*oq0 - bias;
  const int16_t sq1 = (int16_t)*oq1 - bias;
  const uint16_t high_var = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
  int16_t base;
  int16_t q_step;
  int16_t p_step;
  int16_t outer_step;

  // The outer-tap difference contributes only when high_var is set.
  base = signed_char_clamp_high(sp1 - sq1, bd) & high_var;

  // Add the inner taps, then gate the whole adjustment with mask.
  base = signed_char_clamp_high(base + 3 * (sq0 - sp0), bd) & mask;

  // One side rounds with +4 and the other with +3 before the shift by 3.
  q_step = signed_char_clamp_high(base + 4, bd) >> 3;
  p_step = signed_char_clamp_high(base + 3, bd) >> 3;

  *oq0 = signed_char_clamp_high(sq0 - q_step, bd) + bias;
  *op0 = signed_char_clamp_high(sp0 + p_step, bd) + bias;

  // The outer samples move by half of q_step (rounded), and only when
  // high_var is clear.
  outer_step = ROUND_POWER_OF_TWO(q_step, 1) & ~high_var;

  *oq1 = signed_char_clamp_high(sq1 - outer_step, bd) + bias;
  *op1 = signed_char_clamp_high(sp1 + outer_step, bd) + bias;
}

451
// Runs highbd_filter4() at 8 * count consecutive positions starting at s.
// At each position the eight samples at offsets -4 * p .. 3 * p are read to
// build the filter mask, and the four at -2 * p .. 1 * p may be rewritten.
// bd is forwarded to the mask and filter helpers.
void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int count, int bd) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p3 = px[-4 * p], p2 = px[-3 * p];
    const uint16_t p1 = px[-2 * p], p0 = px[-p];
    const uint16_t q0 = px[0 * p], q1 = px[1 * p];
    const uint16_t q2 = px[2 * p], q3 = px[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter4(mask, *thresh, px - 2 * p, px - 1 * p, px, px + 1 * p, bd);
  }
}

474
// Filters two adjacent 8-position groups with independent parameters:
// positions s .. s + 7 use the *0 set and s + 8 .. s + 15 use the *1 set.
// Both share the same bit depth bd.
void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  uint16_t *const first = s;
  uint16_t *const second = s + 8;

  vpx_highbd_lpf_horizontal_4_c(first, p, blimit0, limit0, thresh0, 1, bd);
  vpx_highbd_lpf_horizontal_4_c(second, p, blimit1, limit1, thresh1, 1, bd);
}

486
// Runs highbd_filter4() on 8 * count lines, each pitch samples after the
// previous one, starting at s. On each line the eight samples at offsets
// -4 .. 3 are read to build the filter mask, and the four at -2 .. 1 may be
// rewritten. bd is forwarded to the mask and filter helpers.
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count, int bd) {
  const int rows = 8 * count;
  int row;

  for (row = 0; row < rows; ++row, s += pitch) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
  }
}

503
// Filters two stacked 8-line groups with independent parameters: the first
// eight lines from s use the *0 set, the eight lines starting at
// s + 8 * pitch use the *1 set. Both share the same bit depth bd.
void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  uint16_t *const upper = s;
  uint16_t *const lower = s + 8 * pitch;

  vpx_highbd_lpf_vertical_4_c(upper, pitch, blimit0, limit0, thresh0, 1, bd);
  vpx_highbd_lpf_vertical_4_c(lower, pitch, blimit1, limit1, thresh1, 1, bd);
}

516 517 518 519 520
// High-bitdepth version of filter8(): filters up to three samples on each
// side in place. When both flat and mask are non-zero, *op2 .. *oq2 are
// replaced by a 7-tap [1, 1, 1, 2, 1, 1, 1] average (rounded, divided by 8)
// in which *op3 / *oq3 stand in for taps that would fall beyond them.
// Otherwise the work is delegated to highbd_filter4().
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                                  uint16_t *op3, uint16_t *op2,
                                  uint16_t *op1, uint16_t *op0,
                                  uint16_t *oq0, uint16_t *oq1,
                                  uint16_t *oq2, uint16_t *oq3, int bd) {
  if (!(flat && mask)) {
    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
    return;
  }

  {
    // Snapshot every input before the first store.
    const uint16_t p3 = *op3;
    const uint16_t p2 = *op2;
    const uint16_t p1 = *op1;
    const uint16_t p0 = *op0;
    const uint16_t q0 = *oq0;
    const uint16_t q1 = *oq1;
    const uint16_t q2 = *oq2;
    const uint16_t q3 = *oq3;

    *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
  }
}

537
// Runs highbd_filter8() at 8 * count consecutive positions starting at s.
// At each position the eight samples at offsets -4 * p .. 3 * p feed
// highbd_filter_mask() and highbd_flat_mask4() (the latter with a threshold
// argument of 1) and are the samples handed to highbd_filter8().
void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count, int bd) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p3 = px[-4 * p];
    const uint16_t p2 = px[-3 * p];
    const uint16_t p1 = px[-2 * p];
    const uint16_t p0 = px[-p];
    const uint16_t q0 = px[0 * p];
    const uint16_t q1 = px[1 * p];
    const uint16_t q2 = px[2 * p];
    const uint16_t q3 = px[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);

    highbd_filter8(mask, *thresh, flat,
                   px - 4 * p, px - 3 * p, px - 2 * p, px - 1 * p,
                   px, px + 1 * p, px + 2 * p, px + 3 * p, bd);
  }
}

559
// Filters two adjacent 8-position groups with independent parameters:
// positions s .. s + 7 use the *0 set and s + 8 .. s + 15 use the *1 set.
// Both share the same bit depth bd.
void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  uint16_t *const first = s;
  uint16_t *const second = s + 8;

  vpx_highbd_lpf_horizontal_8_c(first, p, blimit0, limit0, thresh0, 1, bd);
  vpx_highbd_lpf_horizontal_8_c(second, p, blimit1, limit1, thresh1, 1, bd);
}

571
// Runs highbd_filter8() on 8 * count lines, each pitch samples after the
// previous one, starting at s. On each line the eight samples at offsets
// -4 .. 3 feed highbd_filter_mask() and highbd_flat_mask4() (threshold
// argument 1) and are handed to highbd_filter8().
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count, int bd) {
  const int rows = 8 * count;
  int row;

  for (row = 0; row < rows; ++row, s += pitch) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);

    highbd_filter8(mask, *thresh, flat,
                   s - 4, s - 3, s - 2, s - 1,
                   s, s + 1, s + 2, s + 3,
                   bd);
  }
}

591
// Filters two stacked 8-line groups with independent parameters: the first
// eight lines from s use the *0 set, the eight lines starting at
// s + 8 * pitch use the *1 set. Both share the same bit depth bd.
void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  uint16_t *const upper = s;
  uint16_t *const lower = s + 8 * pitch;

  vpx_highbd_lpf_vertical_8_c(upper, pitch, blimit0, limit0, thresh0, 1, bd);
  vpx_highbd_lpf_vertical_8_c(lower, pitch, blimit1, limit1, thresh1, 1, bd);
}

604 605 606 607 608 609 610 611 612 613
// Wide (16-sample) high-bitdepth filter for one position along an edge.
//
// op7..op0 point at the eight samples on one side of the edge (op0 nearest
// the edge) and oq0..oq7 at the eight samples on the other side.
//
// When flat2, flat and mask are all non-zero, op6..oq6 are overwritten with
// a weighted sum of the neighbouring samples; every weight set below adds up
// to 16 and the sum is reduced with ROUND_POWER_OF_TWO(sum, 4). op7 and oq7
// are only read, never written. In every other case the work is delegated to
// highbd_filter8() on the inner eight samples (op3..oq3); thresh and bd are
// used only on that path.
static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
                                   uint8_t flat, uint8_t flat2,
                                   uint16_t *op7, uint16_t *op6,
                                   uint16_t *op5, uint16_t *op4,
                                   uint16_t *op3, uint16_t *op2,
                                   uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1,
                                   uint16_t *oq2, uint16_t *oq3,
                                   uint16_t *oq4, uint16_t *oq5,
                                   uint16_t *oq6, uint16_t *oq7, int bd) {
  if (flat2 && flat && mask) {
    // Copy every input first so that each output below is computed from the
    // unfiltered samples, regardless of the order of the stores.
    const uint16_t p7 = *op7;
    const uint16_t p6 = *op6;
    const uint16_t p5 = *op5;
    const uint16_t p4 = *op4;
    const uint16_t p3 = *op3;
    const uint16_t p2 = *op2;
    const uint16_t p1 = *op1;
    const uint16_t p0 = *op0;
    const uint16_t q0 = *oq0;
    const uint16_t q1 = *oq1;
    const uint16_t q2 = *oq2;
    const uint16_t q3 = *oq3;
    const uint16_t q4 = *oq4;
    const uint16_t q5 = *oq5;
    const uint16_t q6 = *oq6;
    const uint16_t q7 = *oq7;

    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    // Taps that would fall beyond p7 (or q7) are folded onto p7 (or q7),
    // which is why its weight grows towards the outer samples.
    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0, 4);
    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4, 4);
    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5, 4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
  } else {
    // Not flat enough for the wide filter: let the 8-sample filter decide.
    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
                   bd);
  }
}

667
// High-bitdepth 16-sample loop filter applied across a horizontal edge.
//
// s       points at the first sample below the edge (q0) in the first
//         column; s[-k * p] are the samples above the edge and s[k * p] the
//         samples below it.
// p       distance, in uint16_t elements, between consecutive rows.
// blimit, limit, thresh
//         each is dereferenced once per column; the same values are used for
//         every column.
// count   number of 8-column groups to process (8 * count columns in total).
// bd      bit depth, forwarded to the mask and filter helpers.
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
  int i;

  // Walk along the edge one column at a time; each iteration reads the 16
  // samples of the column that straddle the edge (8 above, 8 below).
  for (i = 0; i < 8 * count; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    // flat2 is computed from the four outer samples on each side (rows
    // -8..-5 and 4..7) together with p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(
        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                    s, s + 1 * p, s + 2 * p, s + 3 * p,
                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
                    bd);

    // Step to the next column along the edge.
    ++s;
  }
}

701 702 703 704 705
// Shared worker for the 16-sample high-bitdepth filter across a vertical
// edge.
//
// s       points at the first sample to the right of the edge (q0) in the
//         first row; s[-8]..s[-1] lie left of the edge, s[0]..s[7] right.
// p       distance, in uint16_t elements, between consecutive rows.
// blimit, limit, thresh
//         each is dereferenced once per row; the same values are used for
//         every row.
// count   number of rows to process (not multiplied by 8 here).
// bd      bit depth, forwarded to the mask and filter helpers.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count, int bd) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    // flat2 is computed from the four outer samples on each side
    // (s[-8]..s[-5] and s[4]..s[7]) together with p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
                    bd);

    // Move down to the next row.
    s += p;
  }
}

732
// 16-sample high-bitdepth filter across a vertical edge, covering 8 rows.
// Thin wrapper around highbd_mb_lpf_vertical_edge_w() with a row count of 8;
// p is the row stride in uint16_t elements and bd is the bit depth.
void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
}

738
// 16-sample high-bitdepth filter across a vertical edge, covering 16 rows.
// Unlike the 8-sample dual variant, a single (blimit, limit, thresh) set is
// used for all 16 rows: this simply calls highbd_mb_lpf_vertical_edge_w()
// with a row count of 16. p is the row stride in uint16_t elements and bd is
// the bit depth.
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh,
                                       int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH