/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

// Limits t to [-128, 127] with clamp() and returns the result as int8_t.
static INLINE int8_t signed_char_clamp(int t) {
  return (int8_t)clamp(t, -128, 127);
}

#if CONFIG_VP9_HIGHBITDEPTH
// High bit-depth counterpart of signed_char_clamp(): the 8-bit limits
// [-128, 127] are multiplied by 4 when bd == 10 and by 16 when bd == 12.
// Every other bd value (8 included) uses the unscaled 8-bit limits.
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  int scale = 1;
  if (bd == 10) {
    scale = 4;
  } else if (bd == 12) {
    scale = 16;
  }
  return (int16_t)clamp(t, -128 * scale, 128 * scale - 1);
}
#endif

// should we apply any filter at all: 11111111 yes, 00000000 no
37 38 39 40 41
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                 uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1,
                                 uint8_t q2, uint8_t q3) {
42
  int8_t mask = 0;
John Koleszar's avatar
John Koleszar committed
43 44 45 46 47 48 49
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(q2 - q1) > limit) * -1;
  mask |= (abs(q3 - q2) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
Dmitry Kovalev's avatar
Dmitry Kovalev committed
50
  return ~mask;
John Koleszar's avatar
John Koleszar committed
51 52
}

53 54 55 56 57
// Flatness test over four samples per side: returns all ones when p1, p2, p3
// each differ from p0, and q1, q2, q3 each differ from q0, by at most thresh.
// Returns zero as soon as any of those six differences exceeds thresh.
static INLINE int8_t flat_mask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  int8_t mask = 0;
  mask |= (abs(p1 - p0) > thresh) * -1;
  mask |= (abs(q1 - q0) > thresh) * -1;
  mask |= (abs(p2 - p0) > thresh) * -1;
  mask |= (abs(q2 - q0) > thresh) * -1;
  mask |= (abs(p3 - p0) > thresh) * -1;
  mask |= (abs(q3 - q0) > thresh) * -1;
  return ~mask;
}

// Five-samples-per-side flatness test: everything flat_mask4() checks, plus
// abs(p4 - p0) and abs(q4 - q0) must be at most thresh.  Returns all ones
// when every check passes, zero otherwise.
static INLINE int8_t flat_mask5(uint8_t thresh,
                                uint8_t p4, uint8_t p3,
                                uint8_t p2, uint8_t p1,
                                uint8_t p0, uint8_t q0,
                                uint8_t q1, uint8_t q2,
                                uint8_t q3, uint8_t q4) {
  // Undo flat_mask4()'s final inversion so further failures can be ORed in.
  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
  mask |= (abs(p4 - p0) > thresh) * -1;
  mask |= (abs(q4 - q0) > thresh) * -1;
  return ~mask;
}

// is there high edge variance internal edge: 11111111 yes, 00000000 no
81 82
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
83
  int8_t hev = 0;
John Koleszar's avatar
John Koleszar committed
84 85 86
  hev  |= (abs(p1 - p0) > thresh) * -1;
  hev  |= (abs(q1 - q0) > thresh) * -1;
  return hev;
John Koleszar's avatar
John Koleszar committed
87 88
}

89
// Applies the 4-tap filter across an edge, rewriting *op1, *op0, *oq0 and
// *oq1 in place.
//   mask   - all ones to filter; zero makes every adjustment 0, so the four
//            samples are written back unchanged.
//   thresh - threshold handed to hev_mask() to detect high edge variance.
static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
  int8_t filter1, filter2;

  // Re-centre the unsigned samples on zero (0..255 -> -128..127).  Subtracting
  // 0x80 gives the same values as the "^ 0x80" idiom, but the intermediate
  // never leaves the int8_t range, so the cast is value-preserving.
  const int8_t ps1 = (int8_t)(*op1 - 0x80);
  const int8_t ps0 = (int8_t)(*op0 - 0x80);
  const int8_t qs0 = (int8_t)(*oq0 - 0x80);
  const int8_t qs1 = (int8_t)(*oq1 - 0x80);
  // Kept signed (-1 or 0) so "& hev" and "& ~hev" are plain all-or-nothing
  // selects after integer promotion, with no out-of-range narrowing.
  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);

  // add outer taps if we have high edge variance
  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;

  // inner taps
  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;

  // The q side is adjusted by (filter + 4) >> 3 and the p side by
  // (filter + 3) >> 3, i.e. the two sides use different rounding offsets.
  filter1 = signed_char_clamp(filter + 4) >> 3;
  filter2 = signed_char_clamp(filter + 3) >> 3;

  // "^ 0x80" moves the clamped signed result back to the 0..255 range.
  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);

  // outer tap adjustments: half of filter1 (rounded), applied only when
  // there is no high edge variance
  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;

  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
}

// Filters 8 consecutive positions s[0] .. s[7].  At each position the eight
// samples at offsets -4 * p .. 3 * p (p is the pitch) feed filter_mask(), and
// filter4() may rewrite the four samples at offsets -2 * p .. 1 * p.
// Only the first element of blimit, limit and thresh is read.
void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
    ++s;
  }
}

// Runs vpx_lpf_horizontal_4_c() twice: at s with (blimit0, limit0, thresh0)
// and at s + 8 with (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
}

// Filters 8 positions, stepping s by pitch after each one.  At each position
// the eight samples s[-4] .. s[3] feed filter_mask(), and filter4() may
// rewrite s[-2] .. s[1].  Only the first element of blimit, limit and thresh
// is read.
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    s += pitch;
  }
}

// Runs vpx_lpf_vertical_4_c() twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 * pitch with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}

// Chooses between the 7-tap smoothing filter and filter4().
// When both flat and mask are non-zero, *op2 .. *oq2 are each replaced by a
// rounded average of weight 8 (>> 3) over the original samples; *op3 and *oq3
// are read but never written.  Otherwise the call is forwarded to filter4()
// on op1, op0, oq0, oq1.
static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                           uint8_t *op3, uint8_t *op2,
                           uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1,
                           uint8_t *oq2, uint8_t *oq3) {
  if (flat && mask) {
    // Snapshot the inputs first: the outputs alias them and every tap must
    // see the unfiltered values.
    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
  } else {
    filter4(mask, thresh, op1,  op0, oq0, oq1);
  }
}

// Filters 8 consecutive positions s[0] .. s[7].  At each position the eight
// samples at offsets -4 * p .. 3 * p feed filter_mask() and flat_mask4()
// (flatness threshold fixed at 1), and filter8() does the actual filtering.
// Only the first element of blimit, limit and thresh is read.
void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                                 s,         s + 1 * p, s + 2 * p, s + 3 * p);
    ++s;
  }
}

// Runs vpx_lpf_horizontal_8_c() twice: at s with (blimit0, limit0, thresh0)
// and at s + 8 with (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
}

// Filters 8 positions, stepping s by pitch after each one.  At each position
// the eight samples s[-4] .. s[3] feed filter_mask() and flat_mask4()
// (flatness threshold fixed at 1), and filter8() does the actual filtering.
// Only the first element of blimit, limit and thresh is read.
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;

  for (i = 0; i < 8; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
                                 s,     s + 1, s + 2, s + 3);
    s += pitch;
  }
}

// Runs vpx_lpf_vertical_8_c() twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 * pitch with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}

// Chooses between the 15-tap smoothing filter and filter8().
// When flat2, flat and mask are all non-zero, *op6 .. *oq6 are each replaced
// by a rounded average of weight 16 (>> 4) over the original samples; *op7
// and *oq7 are read but never written.  Otherwise the call is forwarded to
// filter8() on op3 .. oq3.
static INLINE void filter16(int8_t mask, uint8_t thresh,
                            uint8_t flat, uint8_t flat2,
                            uint8_t *op7, uint8_t *op6,
                            uint8_t *op5, uint8_t *op4,
                            uint8_t *op3, uint8_t *op2,
                            uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1,
                            uint8_t *oq2, uint8_t *oq3,
                            uint8_t *oq4, uint8_t *oq5,
                            uint8_t *oq6, uint8_t *oq7) {
  if (flat2 && flat && mask) {
    // Snapshot the inputs first: the outputs alias them and every tap must
    // see the unfiltered values.
    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;

    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;

    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0, 4);
    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4, 4);
    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5, 4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
  } else {
    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
  }
}

// Wide-filter driver: processes 8 * count consecutive positions starting at
// s.  At each position it reads the 16 samples at offsets -8 * p .. 7 * p,
// derives mask, flat (flat_mask4, threshold 1) and flat2 (flat_mask5,
// threshold 1, using the four outermost samples on each side together with
// p0 / q0), and hands everything to filter16().
// Only the first element of blimit, limit and thresh is read.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  int i;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8 * count; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1,
                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);

    filter16(mask, *thresh, flat, flat2,
             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
             s,         s + 1 * p, s + 2 * p, s + 3 * p,
             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
    ++s;
  }
}

// Wide horizontal-edge filter over one group of 8 positions.
void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
  const int groups_of_8 = 1;
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8);
}

// Wide horizontal-edge filter over two groups of 8 positions (16 in total).
void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
  const int groups_of_8 = 2;
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8);
}

// Wide-filter driver for the vertical direction: processes count positions,
// stepping s by p after each one.  Note that count is the iteration count
// itself here, not a multiple of 8.  At each position it reads s[-8] .. s[7],
// derives mask, flat (flat_mask4, threshold 1) and flat2 (flat_mask5,
// threshold 1), and hands everything to filter16().
// Only the first element of blimit, limit and thresh is read.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh,
                                   int count) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
    const int8_t mask = filter_mask(*limit, *blimit,
                                    p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                    q0, s[4], s[5], s[6], s[7]);

    filter16(mask, *thresh, flat, flat2,
             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
    s += p;
  }
}

// Wide vertical-edge filter over 8 positions (s advanced by p each time).
void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
}

// Wide vertical-edge filter over 16 positions (s advanced by p each time),
// using a single set of blimit / limit / thresh values for all of them.
void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
}

#if CONFIG_VP9_HIGHBITDEPTH
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
364 365 366 367 368
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2,
                                        uint16_t p1, uint16_t p0,
                                        uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
369 370 371 372 373 374 375 376 377 378 379 380 381
  int8_t mask = 0;
  int16_t limit16 = (uint16_t)limit << (bd - 8);
  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
  mask |= (abs(p3 - p2) > limit16) * -1;
  mask |= (abs(p2 - p1) > limit16) * -1;
  mask |= (abs(p1 - p0) > limit16) * -1;
  mask |= (abs(q1 - q0) > limit16) * -1;
  mask |= (abs(q2 - q1) > limit16) * -1;
  mask |= (abs(q3 - q2) > limit16) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit16) * -1;
  return ~mask;
}

382 383 384 385 386
// Flatness test over four 16-bit samples per side: returns all ones when
// p1, p2, p3 each differ from p0, and q1, q2, q3 each differ from q0, by at
// most thresh << (bd - 8); returns zero otherwise.
// The shift count is bd - 8, so bd is expected to be at least 8.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
                                       uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0,
                                       uint16_t q0, uint16_t q1,
                                       uint16_t q2, uint16_t q3, int bd) {
  int8_t mask = 0;
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  mask |= (abs(p1 - p0) > thresh16) * -1;
  mask |= (abs(q1 - q0) > thresh16) * -1;
  mask |= (abs(p2 - p0) > thresh16) * -1;
  mask |= (abs(q2 - q0) > thresh16) * -1;
  mask |= (abs(p3 - p0) > thresh16) * -1;
  mask |= (abs(q3 - q0) > thresh16) * -1;
  return ~mask;
}

// Five-samples-per-side flatness test on 16-bit samples: everything
// highbd_flat_mask4() checks, plus abs(p4 - p0) and abs(q4 - q0) must be at
// most thresh << (bd - 8).  Returns all ones when every check passes.
static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
                                       uint16_t p4, uint16_t p3,
                                       uint16_t p2, uint16_t p1,
                                       uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2,
                                       uint16_t q3, uint16_t q4, int bd) {
  // Undo highbd_flat_mask4()'s final inversion so further failures can be
  // ORed in.
  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  mask |= (abs(p4 - p0) > thresh16) * -1;
  mask |= (abs(q4 - q0) > thresh16) * -1;
  return ~mask;
}

// Is there high edge variance internal edge:
// 11111111_11111111 yes, 00000000_00000000 no ?
413 414
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
                                      uint16_t q0, uint16_t q1, int bd) {
415 416 417 418 419 420 421
  int16_t hev = 0;
  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  hev |= (abs(p1 - p0) > thresh16) * -1;
  hev |= (abs(q1 - q0) > thresh16) * -1;
  return hev;
}

422 423 424
// High bit-depth 4-tap filter: rewrites *op1, *op0, *oq0 and *oq1 in place.
//   mask   - all ones to filter; zero makes every adjustment 0, so the four
//            samples are written back unchanged (given in-range samples).
//   thresh - 8-bit-scale threshold handed to highbd_hev_mask().
//   bd     - bit depth; all 8-bit constants are scaled by << (bd - 8).
static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
                                  int bd) {
  int16_t filter1, filter2;
  // Subtracting (0x80 << shift) re-centres the samples on zero; for bd == 8
  // this maps 0..255 onto -128..127.
  const int shift = bd - 8;
  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
  // Kept signed (-1 or 0) so "& hev" and "& ~hev" are plain all-or-nothing
  // selects after integer promotion, with no out-of-range narrowing.
  const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);

  // Add outer taps if we have high edge variance.
  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;

  // Inner taps.
  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;

  // The q side is adjusted by (filter + 4) >> 3 and the p side by
  // (filter + 3) >> 3, i.e. the two sides use different rounding offsets.
  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;

  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);

  // Outer tap adjustments: half of filter1 (rounded), applied only when
  // there is no high edge variance.
  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;

  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
}

// High bit-depth version of the 4-tap horizontal filter.  Filters 8
// consecutive positions s[0] .. s[7]; at each one the eight 16-bit samples at
// offsets -4 * p .. 3 * p feed highbd_filter_mask(), and highbd_filter4() may
// rewrite the four samples at offsets -2 * p .. 1 * p.
// Only the first element of blimit, limit and thresh is read; bd is passed
// through to the helpers.
void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int i;

  // Samples are 16 bits wide here; the thresholds stay on the 8-bit scale and
  // are rescaled inside the helpers according to bd.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
    ++s;
  }
}

// Runs vpx_highbd_lpf_horizontal_4_c() twice with the same bd: at s with
// (blimit0, limit0, thresh0) and at s + 8 with (blimit1, limit1, thresh1).
void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
}

// High bit-depth version of the 4-tap vertical filter.  Filters 8 positions,
// stepping s by pitch after each one; at each position the eight 16-bit
// samples s[-4] .. s[3] feed highbd_filter_mask(), and highbd_filter4() may
// rewrite s[-2] .. s[1].
// Only the first element of blimit, limit and thresh is read.
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;

  // Samples are 16 bits wide here; the thresholds stay on the 8-bit scale and
  // are rescaled inside the helpers according to bd.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    s += pitch;
  }
}

// Runs vpx_highbd_lpf_vertical_4_c() twice with the same bd: at s with
// (blimit0, limit0, thresh0) and at s + 8 * pitch with
// (blimit1, limit1, thresh1).
void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
}

// High bit-depth choice between the 7-tap smoothing filter and
// highbd_filter4().  When both flat and mask are non-zero, *op2 .. *oq2 are
// each replaced by a rounded average of weight 8 (>> 3) over the original
// samples; *op3 and *oq3 are read but never written.  Otherwise the call is
// forwarded to highbd_filter4() on op1, op0, oq0, oq1.
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
                                  uint16_t *op3, uint16_t *op2,
                                  uint16_t *op1, uint16_t *op0,
                                  uint16_t *oq0, uint16_t *oq1,
                                  uint16_t *oq2, uint16_t *oq3, int bd) {
  if (flat && mask) {
    // Snapshot the inputs first: the outputs alias them and every tap must
    // see the unfiltered values.
    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
  } else {
    highbd_filter4(mask, thresh, op1,  op0, oq0, oq1, bd);
  }
}

// High bit-depth version of the 8-tap horizontal filter.  Filters 8
// consecutive positions s[0] .. s[7]; at each one the eight 16-bit samples at
// offsets -4 * p .. 3 * p feed highbd_filter_mask() and highbd_flat_mask4()
// (flatness threshold fixed at 1 before bd scaling), and highbd_filter8()
// does the actual filtering.
// Only the first element of blimit, limit and thresh is read.
void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  int i;

  // Samples are 16 bits wide here; the thresholds stay on the 8-bit scale and
  // are rescaled inside the helpers according to bd.
  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                   s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                   s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
    ++s;
  }
}

// Runs vpx_highbd_lpf_horizontal_8_c() twice with the same bd: at s with
// (blimit0, limit0, thresh0) and at s + 8 with (blimit1, limit1, thresh1).
void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit0,
                                        const uint8_t *limit0,
                                        const uint8_t *thresh0,
                                        const uint8_t *blimit1,
                                        const uint8_t *limit1,
                                        const uint8_t *thresh1,
                                        int bd) {
  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
}

// High bit-depth version of the 8-tap vertical filter.  Filters 8 positions,
// stepping s by pitch after each one; at each position the eight 16-bit
// samples s[-4] .. s[3] feed highbd_filter_mask() and highbd_flat_mask4()
// (flatness threshold fixed at 1 before bd scaling), and highbd_filter8()
// does the actual filtering.
// Only the first element of blimit, limit and thresh is read.
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;

  for (i = 0; i < 8; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    highbd_filter8(mask, *thresh, flat,
                   s - 4, s - 3, s - 2, s - 1,
                   s, s + 1, s + 2, s + 3,
                   bd);
    s += pitch;
  }
}

// Runs vpx_highbd_lpf_vertical_8_c() twice with the same bd: at s with
// (blimit0, limit0, thresh0) and at s + 8 * pitch with
// (blimit1, limit1, thresh1).
void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1,
                                      int bd) {
  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                              thresh1, bd);
}

static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
                                   uint8_t flat, uint8_t flat2,
                                   uint16_t *op7, uint16_t *op6,
                                   uint16_t *op5, uint16_t *op4,
                                   uint16_t *op3, uint16_t *op2,
                                   uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1,
                                   uint16_t *oq2, uint16_t *oq3,
                                   uint16_t *oq4, uint16_t *oq5,
                                   uint16_t *oq6, uint16_t *oq7, int bd) {
620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
  if (flat2 && flat && mask) {
    const uint16_t p7 = *op7;
    const uint16_t p6 = *op6;
    const uint16_t p5 = *op5;
    const uint16_t p4 = *op4;
    const uint16_t p3 = *op3;
    const uint16_t p2 = *op2;
    const uint16_t p1 = *op1;
    const uint16_t p0 = *op0;
    const uint16_t q0 = *oq0;
    const uint16_t q1 = *oq1;
    const uint16_t q2 = *oq2;
    const uint16_t q3 = *oq3;
    const uint16_t q4 = *oq4;
    const uint16_t q5 = *oq5;
    const uint16_t q6 = *oq6;
    const uint16_t q7 = *oq7;

    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0, 4);
    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1, 4);
    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2, 4);
    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3, 4);
    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4, 4);
    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5, 4);
    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
    *oq6 = ROUND_POWER_OF_TWO(p0 +
                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
  } else {
668 669
    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
                   bd);
670 671 672
  }
}

// Runs highbd_filter16() down 8 * count adjacent columns.
//
// In every column the 16 samples s[-8 * p] .. s[7 * p] are used, i.e. the
// taps are `p` elements apart; s then moves one element to the right.
//
//   s       points at q0 of the first column.
//   p       distance between successive taps, in uint16_t elements.
//   blimit, limit, thresh
//           only the first byte of each is read (once per column).
//   count   number of 8-column groups to process.
//   bd      bit depth, forwarded unchanged to the mask and filter helpers.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh,
                                            int count, int bd) {
  int i;

  // Samples are 16 bits wide on this path; the three masks are still carried
  // as int8_t values.
  for (i = 0; i < 8 * count; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    // flat2 is derived from the four outermost samples on each side together
    // with p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(
        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
                    s, s + 1 * p, s + 2 * p, s + 3 * p,
                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
                    bd);
    ++s;
  }
}

// Wide-filter entry point for a single group of 8 columns: forwards to
// highbd_mb_lpf_horizontal_edge_w() with count = 1.  All other arguments are
// passed through unchanged.
void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
}

// Wide-filter entry point covering 16 columns: forwards to
// highbd_mb_lpf_horizontal_edge_w(), which processes 8 columns per group.
// All other arguments are passed through unchanged.
void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
  // Two groups of 8 columns each.
  const int num_groups = 2;

  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, num_groups, bd);
}

// Runs highbd_filter16() on `count` consecutive rows.
//
// In every row the 16 contiguous samples s[-8] .. s[7] are used; s then
// advances by `p` elements to the next row.
//
//   s       points at q0 of the first row.
//   p       row stride, in uint16_t elements.
//   blimit, limit, thresh
//           only the first byte of each is read (once per row).
//   count   number of rows to process.
//   bd      bit depth, forwarded unchanged to the mask and filter helpers.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count, int bd) {
  int i;

  for (i = 0; i < count; ++i) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask = highbd_filter_mask(*limit, *blimit,
                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
                                          bd);
    // flat2 is derived from the four outermost samples on each side together
    // with p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2,
                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
                    bd);
    s += p;
  }
}

// Wide-filter entry point for 8 rows: forwards to
// highbd_mb_lpf_vertical_edge_w() with a row count of 8.  All other arguments
// are passed through unchanged.
void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
}

// Wide-filter entry point for 16 rows: forwards to
// highbd_mb_lpf_vertical_edge_w() with a row count of 16.  A single set of
// blimit / limit / thresh values is applied to all 16 rows.
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh,
                                       int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH