/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

/*
 * Byte-index tables that the convolve helpers load as shuffle masks.
 * Three rows of 16 entries each:
 *   - row at offset 0 is loaded by the 8-wide (and wider) helpers in this file,
 *   - row at offset 16 is loaded by the 4-wide helpers in this file,
 *   - row at offset 32 is not referenced in this file.
 * Each row lists overlapping byte pairs (0,1), (1,2), (2,3), ...; the 8-tap
 * helpers derive further masks by adding 2, 4 and 6 to every entry.
 * NOTE(review): indices >= 16 in the 4-width rows presumably select bytes
 * from a second source vector so that two rows are filtered per call --
 * confirm against the shuffle macros in the MSA headers.
 * The array is not static, so it has external linkage; presumably other MSA
 * convolve sources share it -- verify before changing its layout.
 */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases (bytes 0..4 of each of two vectors) */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases (bytes 8..12 of each of two vectors) */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
30
31
32
33
34
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
35

36
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
37
38
39
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
40
41
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
42
43
44
45
46

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

47
48
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
49
50
  src += (7 * src_stride);

51
52
53
54
55
56
57
58
59
  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
60

61
62
  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
63

64
65
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
66

67
68
69
70
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);
71

clang-format's avatar
clang-format committed
72
73
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
74
75
76
77
78
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

clang-format's avatar
clang-format committed
79
80
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
81
82
83
84
85
86
87
88
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
89
90
    dst += (4 * dst_stride);

91
    hz_out5 = hz_out9;
92
93
94
95
96
97
98
99
100
101
102
103
    out0 = out2;
    out1 = out3;
    out2 = out4;
  }
}

static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
104
105
106
107
108
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
109
110
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

111
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
112
113
114
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
115
116
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
117
118
119
120
121

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

122
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
123
124
  src += (7 * src_stride);

125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
147
148

  for (loop_cnt = (height >> 2); loop_cnt--;) {
149
    LD_SB4(src, src_stride, src7, src8, src9, src10);
150
151
    src += (4 * src_stride);

152
153
    XORI_B4_128_SB(src7, src8, src9, src10);

clang-format's avatar
clang-format committed
154
155
    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
156
157
158
159
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

clang-format's avatar
clang-format committed
160
161
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
162
163
164
165
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

clang-format's avatar
clang-format committed
166
167
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
168
169
170
171
172
173
174
175
176
177
178
179
180
181
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
182
183
    dst += (4 * dst_stride);

184
    hz_out6 = hz_out10;
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}

/*
 * 16-wide 8-tap/8-tap filtering, done as two adjacent 8-wide column strips.
 * All arguments are forwarded unchanged to the 8-wide helper; only src and
 * dst are offset by the strip's column.
 */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}

/*
 * 32-wide 8-tap/8-tap filtering, done as four adjacent 8-wide column strips.
 * All arguments are forwarded unchanged to the 8-wide helper; only src and
 * dst are offset by the strip's column.
 */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}

/*
 * 64-wide 8-tap/8-tap filtering, done as eight adjacent 8-wide column strips.
 * All arguments are forwarded unchanged to the 8-wide helper; only src and
 * dst are offset by the strip's column.
 */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}

static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
238
239
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
240

241
  mask = LD_SB(&mc_filt_mask_arr[16]);
242
243

  /* rearranging filter */
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
262
263
264
265
266
267
268
269
}

static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
270
271
272
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
273

274
  mask = LD_SB(&mc_filt_mask_arr[16]);
275
276

  /* rearranging filter */
277
278
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
279

280
281
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
282

283
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
284
  src += (8 * src_stride);
285
286
287
288
289
290
291
292
293
294
295
296
297
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
clang-format's avatar
clang-format committed
298
299
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
300
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
clang-format's avatar
clang-format committed
301
302
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
303
304
305
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
306
307
308
309
}

/*
 * Dispatches 4-wide 2-tap/2-tap filtering by block height.
 *
 * Only heights 4 and 8 are handled; for any other height this function
 * returns without writing to dst.
 */
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}

static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
325
326
327
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
328
329
  v8i16 filt;

330
  mask = LD_SB(&mc_filt_mask_arr[0]);
331
332

  /* rearranging filter */
333
334
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
335

336
337
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);
338

339
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
340

341
342
343
344
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
345

346
347
348
  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
349

350
351
352
  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
353

354
355
356
  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
357

358
359
360
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
361
362
363
}

static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
clang-format's avatar
clang-format committed
364
                                          int32_t src_stride, uint8_t *dst,
365
366
                                          int32_t dst_stride,
                                          int8_t *filter_horiz,
clang-format's avatar
clang-format committed
367
                                          int8_t *filter_vert, int32_t height) {
368
  uint32_t loop_cnt;
369
370
371
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
372
373
  v8i16 filt;

374
  mask = LD_SB(&mc_filt_mask_arr[0]);
375
376

  /* rearranging filter */
377
378
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
379

380
381
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);
382

383
  src0 = LD_SB(src);
384
385
  src += src_stride;

386
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
387
388

  for (loop_cnt = (height >> 3); loop_cnt--;) {
389
    LD_SB4(src, src_stride, src1, src2, src3, src4);
390
391
    src += (4 * src_stride);

392
393
394
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
395

396
397
398
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
399

400
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
401

402
403
404
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
405

406
407
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    LD_SB4(src, src_stride, src1, src2, src3, src4);
408
    src += (4 * src_stride);
409
410
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
411

412
413
414
    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
415
416
    dst += (4 * dst_stride);

417
418
419
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
420

421
422
423
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
424

425
426
427
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
428

429
430
431
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
432

433
434
435
    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
    dst += (4 * dst_stride);
  }
}

/*
 * Dispatches 8-wide 2-tap/2-tap filtering by block height: height 4 goes to
 * the fixed 8x4 routine, every other height to the multiple-of-8 routine.
 */
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert);
}

static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
459
460
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
461
462
  v8i16 filt;

463
  mask = LD_SB(&mc_filt_mask_arr[0]);
464
465

  /* rearranging filter */
466
467
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
468

469
470
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);
471

472
  LD_SB2(src, 8, src0, src1);
473
474
  src += src_stride;

475
476
477
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

478
  for (loop_cnt = (height >> 2); loop_cnt--;) {
479
480
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
481
482
    src += (4 * src_stride);

483
484
485
486
487
488
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
489
490
    dst += dst_stride;

491
492
493
494
495
496
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
497
498
    dst += dst_stride;

499
500
501
502
503
504
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
505
506
    dst += dst_stride;

507
508
509
510
511
512
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
    PCKEV_ST_SB(tmp1, tmp2, dst);
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
    dst += dst_stride;
  }
}

/*
 * 32-wide 2-tap/2-tap filtering, done as two adjacent 16-wide column strips.
 * All arguments are forwarded unchanged to the 16-wide helper; only src and
 * dst are offset by the strip's column.
 */
static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}

/*
 * 64-wide 2-tap/2-tap filtering, done as four adjacent 16-wide column strips.
 * All arguments are forwarded unchanged to the 16-wide helper; only src and
 * dst are offset by the strip's column.
 */
static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}

/* Public entry point (external linkage); the helpers above are file-local. */
/*
 * MSA path for combined horizontal + vertical convolution.
 *
 * src, src_stride   source pointer and row pitch.
 * dst, dst_stride   destination pointer and row pitch.
 * filter            kernel table; row x0_q4 supplies the horizontal taps and
 *                   row y0_q4 the vertical taps.
 * x_step_q4,
 * y_step_q4         must both be 16 (asserted).
 * w, h              block width and height.
 *
 * Dispatch:
 *   - both kernels have taps 0 and 1 equal to zero: the 2-tap helpers are
 *     used, with the tap pointers advanced to tap 3;
 *   - exactly one kernel has taps 0 and 1 equal to zero: vpx_convolve8_c;
 *   - otherwise: the 8-tap helpers.
 * Widths other than 4, 8, 16, 32 and 64 always go to vpx_convolve8_c.
 */
void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  /*
   * Only the first 8 entries carry taps. The helpers appear to load a whole
   * vector (filt is a v8i16/v8u16) from these pointers, in the 2-tap case
   * starting at offset 3, so the buffers are zero-padded to keep that load
   * inside the arrays instead of reading adjacent stack memory.
   */
  int8_t filt_hor[32] = { 0 };
  int8_t filt_ver[32] = { 0 };
  int x_lead_taps_zero, y_lead_taps_zero;
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /*
   * Reject kernels whose taps 2 and 3 are (0, 128). This matches the former
   * 32-bit word comparison against 0x800000 on little-endian targets, but is
   * written per element so it does not depend on byte order or on reading
   * int16_t data through an int32_t pointer.
   */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  /* Narrow the 16-bit taps to the 8-bit form the helpers take. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  /* Same condition as testing the first 32-bit word of each kernel for 0. */
  x_lead_taps_zero = (filter_x[0] == 0 && filter_x[1] == 0);
  y_lead_taps_zero = (filter_y[0] == 0 && filter_y[1] == 0);

  if (x_lead_taps_zero && y_lead_taps_zero) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (x_lead_taps_zero || y_lead_taps_zero) {
    /* Mixed 2-tap / 8-tap kernels have no MSA helper here. */
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}