Commit 054013a0 authored by Diego Biurrun
Browse files

dsputil: Move APE-specific bits into apedsp

parent 256da077
......@@ -25,6 +25,7 @@
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/opt.h"
#include "apedsp.h"
#include "avcodec.h"
#include "dsputil.h"
#include "bytestream.h"
......@@ -136,6 +137,7 @@ typedef struct APEContext {
AVClass *class; ///< class for AVOptions
AVCodecContext *avctx;
DSPContext dsp;
APEDSPContext adsp;
int channels;
int samples; ///< samples left to decode in current frame
int bps;
......@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
static void predictor_decode_mono_3950(APEContext *ctx, int count);
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
// TODO: dsputilize
static av_cold int ape_decode_close(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
......@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
return 0;
}
/**
 * Portable C version of the APE filter kernel.
 *
 * For each of the first @p order elements the product v1[i] * v2[i] is added
 * to the scalar product, using the value v1[i] had before it is modified,
 * and then v1[i] += mul * v3[i] is applied in place.
 *
 * The sum is kept in an unsigned accumulator so that large inputs wrap
 * modulo 2^32 instead of invoking signed-overflow undefined behaviour.
 *
 * @param v1    vector that is read for the product and updated in place
 * @param v2    second operand of the scalar product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements to process
 * @param mul   factor applied to each element of v3
 * @return scalar product of the original v1 and v2, wrapped to 32 bits
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    /* A single int16 * int16 product always fits in int; only the running
     * sum can exceed the signed range, hence the unsigned accumulator. */
    unsigned res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return (int32_t)res;
}
static av_cold int ape_decode_init(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
......@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
s->predictor_decode_stereo = predictor_decode_stereo_3950;
}
s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
if (ARCH_ARM)
ff_apedsp_init_arm(&s->adsp);
if (ARCH_PPC)
ff_apedsp_init_ppc(&s->adsp);
if (ARCH_X86)
ff_apedsp_init_x86(&s->adsp);
ff_dsputil_init(&s->dsp, avctx);
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
......@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
while (count--) {
/* round fixedpoint scalar product */
res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
f->adaptcoeffs - order,
order, APESIGN(*data));
res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
f->delay - order,
f->adaptcoeffs - order,
order, APESIGN(*data));
res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
*data++ = res;
......
/*
* Monkey's Audio lossless audio decoder
* Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
* based upon libdemac from Dave Chapman.
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_APEDSP_H
#define AVCODEC_APEDSP_H
#include <stdint.h>
typedef struct APEDSPContext {
    /**
     * Compute the scalar product of v1 and v2 and, in the same pass,
     * update v1 in place with v1[i] += v3[i] * mul.
     *
     * @param v1  vector used for the product and updated in place (align 16)
     * @param v2  second operand of the scalar product
     * @param v3  vector that is multiplied by mul and added to v1
     * @param len length of vectors, should be multiple of 16
     * @param mul factor applied to v3
     * @return the scalar product of v1 and v2
     */
    int32_t (*scalarproduct_and_madd_int16)(
        int16_t *v1 /* align 16 */,
        const int16_t *v2,
        const int16_t *v3,
        int len,
        int mul);
} APEDSPContext;
/**
 * Architecture-specific initialisers.
 *
 * Each one receives the context to update and may replace
 * scalarproduct_and_madd_int16 with an optimised implementation.
 * NOTE(review): callers presumably install a C fallback in the context
 * before calling these, since an initialiser may leave the pointer
 * untouched - confirm against the decoder init code.
 */
void ff_apedsp_init_arm(APEDSPContext *c);
void ff_apedsp_init_ppc(APEDSPContext *c);
void ff_apedsp_init_x86(APEDSPContext *c);
#endif /* AVCODEC_APEDSP_H */
......@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o
......@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
......
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/apedsp.h"
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
/**
 * Install the ARM-optimised APE DSP routines.
 *
 * The NEON scalarproduct_and_madd_int16 is written into the context only
 * when have_neon() accepts the flags reported by av_get_cpu_flags();
 * otherwise the context is left exactly as it was passed in.
 *
 * @param c DSP context to update
 */
av_cold void ff_apedsp_init_arm(APEDSPContext *c)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!have_neon(cpu_flags)) {
        return;
    }

    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
/*
* ARM NEON optimised integer operations
* Copyright (c) 2009 Kostya Shishkov
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
@
@ Returns in r0 the 32-bit scalar product of v0 and v1 and updates v0 in
@ place with v0[i] += v2[i] * mul, handling 16 elements per loop iteration.
@ In:   r0 = v0 (accessed with 128-bit alignment hints), r1 = v1, r2 = v2,
@       r3 = order, [sp] = mul (one 16-bit element is loaded from there).
@ Note: the loop only ends when r3 reaches exactly zero after subtracting
@       16, so order has to be a non-zero multiple of 16.
function ff_scalarproduct_and_madd_int16_neon, export=1
@ q14 = mul replicated into all eight 16-bit lanes
vld1.16 {d28[],d29[]}, [sp]
@ q0-q3 = four independent vectors of 32-bit partial sums, cleared to zero
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
@ r12 = write pointer into v0; r0 stays the read pointer
mov r12, r0
@ load 8 + 8 elements from each of v0 (q8, q11), v1 (q9, q12), v2 (q10, q13)
1: vld1.16 {d16-d17}, [r0,:128]!
vld1.16 {d18-d19}, [r1]!
vld1.16 {d20-d21}, [r2]!
vld1.16 {d22-d23}, [r0,:128]!
vld1.16 {d24-d25}, [r1]!
vld1.16 {d26-d27}, [r2]!
@ q10, q13 = v2 * mul in 16-bit lanes
vmul.s16 q10, q10, q14
vmul.s16 q13, q13, q14
@ widening multiply-accumulate of v0 * v1 into the 32-bit partial sums
vmlal.s16 q0, d16, d18
vmlal.s16 q1, d17, d19
@ q10, q13 = v0 + v2 * mul (the new v0 values)
vadd.s16 q10, q8, q10
vadd.s16 q13, q11, q13
vmlal.s16 q2, d22, d24
vmlal.s16 q3, d23, d25
@ write back the updated v0 and count down 16 elements
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
bne 1b
@ horizontal reduction: 16 partial sums -> 2 lanes in d2
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
@ add the last two lanes into a 64-bit value and return its low 32 bits
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc
......@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
......@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_neon;
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
......@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
vmov.32 r0, d3[0]
bx lr
endfunc
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
@
@ Computes the 32-bit scalar product of v0 and v1 (returned in r0) while
@ applying v0[i] += v2[i] * mul in place, 16 elements per loop iteration.
@ In:   r0 = v0 (loads and stores carry a 128-bit alignment hint),
@       r1 = v1, r2 = v2, r3 = order, [sp] = mul (a single 16-bit element
@       is loaded from the stack slot).
@ Note: termination relies on r3 hitting exactly zero in steps of 16, so
@       order has to be a non-zero multiple of 16.
function ff_scalarproduct_and_madd_int16_neon, export=1
@ broadcast mul to every 16-bit lane of q14
vld1.16 {d28[],d29[]}, [sp]
@ clear the four 32-bit accumulator vectors q0-q3
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
@ keep a separate store pointer for v0 in r12
mov r12, r0
@ fetch two groups of 8 elements: v0 -> q8/q11, v1 -> q9/q12, v2 -> q10/q13
1: vld1.16 {d16-d17}, [r0,:128]!
vld1.16 {d18-d19}, [r1]!
vld1.16 {d20-d21}, [r2]!
vld1.16 {d22-d23}, [r0,:128]!
vld1.16 {d24-d25}, [r1]!
vld1.16 {d26-d27}, [r2]!
@ scale v2 by mul (16-bit lanes)
vmul.s16 q10, q10, q14
vmul.s16 q13, q13, q14
@ accumulate v0 * v1 as widened 32-bit products
vmlal.s16 q0, d16, d18
vmlal.s16 q1, d17, d19
@ new v0 = old v0 + scaled v2
vadd.s16 q10, q8, q10
vadd.s16 q13, q11, q13
vmlal.s16 q2, d22, d24
vmlal.s16 q3, d23, d25
@ store the new v0 values; the counter update is interleaved with the stores
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
bne 1b
@ pairwise additions fold q0-q3 down to the two lanes of d2
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
@ final add of the two lanes; the low 32 bits are the return value
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc
......@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
return res;
}
/**
 * Calculate the scalar product of v1 and v2 and update v1 in place with
 * v1[i] += mul * v3[i].
 *
 * Each product uses the value of v1[i] from before its update. The running
 * sum is unsigned so that it wraps modulo 2^32 on large inputs rather than
 * overflowing a signed int, which would be undefined behaviour.
 *
 * @param v1    vector read for the product and modified in place
 * @param v2    second operand of the scalar product
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements to process
 * @param mul   factor applied to each element of v3
 * @return scalar product of the original v1 and v2, wrapped to 32 bits
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    /* One int16 * int16 product fits in int; only the sum needs wrapping. */
    unsigned res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return (int32_t)res;
}
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len)
{
......@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
c->vector_clip_int32 = vector_clip_int32_c;
c->vector_clipf = vector_clipf_c;
......
......@@ -255,16 +255,6 @@ typedef struct DSPContext {
*/
int32_t (*scalarproduct_int16)(const int16_t *v1,
const int16_t *v2 /* align 16 */, int len);
/* ape functions */
/**
* Calculate scalar product of v1 and v2,
* and v1[i] += v3[i] * mul
* @param len length of vectors, should be multiple of 16
*/
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
const int16_t *v2,
const int16_t *v3,
int len, int mul);
/**
* Clip each element in an array of int32_t to a given minimum and
......
......@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
......
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/apedsp.h"
#if HAVE_ALTIVEC
/**
 * AltiVec version of the APE filter kernel: returns the scalar product of
 * v1 and v2 and updates v1 in place with v1[i] += v3[i] * mul.
 *
 * Each loop iteration handles 16 elements (two vectors of eight int16_t).
 * v1 is dereferenced as whole vectors, while v2 and v3 are read with
 * vec_ld plus a vec_perm realignment step.
 *
 * NOTE(review): the permute mask is derived from v2 only and is reused for
 * v3, so v3 appears to be assumed to have the same misalignment as v2 -
 * confirm against the callers.
 *
 * @param v1    vector read for the product and updated in place
 * @param v2    second operand of the scalar product
 * @param v3    vector multiplied by mul and added to v1
 * @param order number of elements; the loop body runs before the counter is
 *              tested, so a non-zero multiple of 16 is expected
 * @param mul   factor applied to v3
 * @return the accumulated scalar product
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
const int16_t *v2,
const int16_t *v3,
int order, int mul)
{
/* presumably declares the zero vectors (zero_s32v is used below) */
LOAD_ZERO;
vec_s16 *pv1 = (vec_s16 *) v1;
/* mul replicated into all eight 16-bit lanes */
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
register vec_s16 t0, t1, i0, i1, i4;
/* prime the pipeline with the first vector of v2 and of v3 */
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
register vec_s32 res = zero_s32v;
/* permute mask used to realign the vec_ld results */
register vec_u8 align = vec_lvsl(0, v2);
int32_t ires;
/* convert the element count into a count of 16-element iterations */
order >>= 4;
do {
/* t0, t1 = next 16 realigned elements of v2; i2 is carried over */
i1 = vec_ld(16, v2);
t0 = vec_perm(i2, i1, align);
i2 = vec_ld(32, v2);
t1 = vec_perm(i1, i2, align);
/* i0, i1 = current 16 elements of v1 */
i0 = pv1[0];
i1 = pv1[1];
/* accumulate v1 * v2 into the 32-bit partial sums */
res = vec_msum(t0, i0, res);
res = vec_msum(t1, i1, res);
/* t0, t1 = next 16 elements of v3, realigned with the same mask */
i4 = vec_ld(16, v3);
t0 = vec_perm(i3, i4, align);
i3 = vec_ld(32, v3);
t1 = vec_perm(i4, i3, align);
/* v1 = v3 * mul + v1 */
pv1[0] = vec_mladd(t0, muls, i0);
pv1[1] = vec_mladd(t1, muls, i1);
pv1 += 2;
v2 += 16;
v3 += 16;
} while (--order);
/* fold the partial sums into element 3, splat it and store one element */
res = vec_splat(vec_sums(res, zero_s32v), 3);
vec_ste(res, 0, &ires);
return ires;
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC-optimised APE DSP routines.
 *
 * The AltiVec scalarproduct_and_madd_int16 is used only when the build has
 * AltiVec support (HAVE_ALTIVEC) and the running CPU reports it through
 * av_get_cpu_flags(). In every other case the context is left untouched so
 * that the previously installed implementation keeps being used.
 *
 * @param c DSP context to update
 */
av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
{
#if HAVE_ALTIVEC
    /* HAVE_ALTIVEC only describes the build; executing AltiVec code on a
     * CPU without the unit would fault, so check the runtime flags too. */
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
#endif /* HAVE_ALTIVEC */
}
......@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
return ires;
}
/**
 * AltiVec scalar product with in-place multiply-add: returns the scalar
 * product of v1 and v2 and applies v1[i] += v3[i] * mul.
 *
 * Sixteen elements (two vectors) are processed per iteration. v1 is
 * accessed through a vector pointer; v2 and v3 are loaded with vec_ld and
 * realigned with vec_perm.
 *
 * NOTE(review): one permute mask, computed from v2, is applied to both v2
 * and v3, which looks like an assumption that both share the same
 * misalignment - confirm against the callers.
 *
 * @param v1    vector read for the product and updated in place
 * @param v2    second operand of the scalar product
 * @param v3    vector multiplied by mul and added to v1
 * @param order number of elements; the do/while loop runs at least once, so
 *              a non-zero multiple of 16 is expected
 * @param mul   factor applied to v3
 * @return the accumulated scalar product
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
const int16_t *v2,
const int16_t *v3,
int order, int mul)
{
/* presumably provides zero_s32v, which is used below */
LOAD_ZERO;
vec_s16 *pv1 = (vec_s16 *) v1;
/* broadcast mul to every 16-bit lane */
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
register vec_s16 t0, t1, i0, i1, i4;
/* initial vectors of v2 and v3, carried from one iteration to the next */
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
register vec_s32 res = zero_s32v;
/* realignment mask for the vec_perm calls */
register vec_u8 align = vec_lvsl(0, v2);
int32_t ires;
/* number of 16-element iterations */
order >>= 4;
do {
/* realigned v2 data for this iteration */
i1 = vec_ld(16, v2);
t0 = vec_perm(i2, i1, align);
i2 = vec_ld(32, v2);
t1 = vec_perm(i1, i2, align);
/* current v1 data */
i0 = pv1[0];
i1 = pv1[1];
/* scalar product accumulation in 32-bit lanes */
res = vec_msum(t0, i0, res);
res = vec_msum(t1, i1, res);
/* realigned v3 data for this iteration */
i4 = vec_ld(16, v3);
t0 = vec_perm(i3, i4, align);
i3 = vec_ld(32, v3);
t1 = vec_perm(i4, i3, align);
/* write back v3 * mul + v1 */
pv1[0] = vec_mladd(t0, muls, i0);
pv1[1] = vec_mladd(t1, muls, i1);
pv1 += 2;
v2 += 16;
v3 += 16;
} while (--order);
/* sum the lanes into element 3, replicate it and extract one element */
res = vec_splat(vec_sums(res, zero_s32v), 3);
vec_ste(res, 0, &ires);
return ires;
}
/**
 * Point the integer DSP hooks of a DSPContext at their AltiVec versions.
 *
 * @param c     context whose scalarproduct_int16 and
 *              scalarproduct_and_madd_int16 pointers are overwritten
 * @param avctx not used by this function
 */
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;
}
......@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
......@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
......
;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_TEXT
; SCALARPRODUCT: emits scalarproduct_and_madd_int16 for the instruction set
; selected by the preceding INIT_* macro. The routine returns the scalar
; product of v1 and v2 in eax and updates v1 in place with
; v1[i] += v3[i] * mul.
; v1 is accessed with aligned moves (mova), v2 and v3 with unaligned moves
; (movu). Each loop iteration handles 2*mmsize bytes, i.e. mmsize int16
; elements, and the body runs before the counter is tested, so order is
; expected to be a non-zero multiple of that.
%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
; int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
; order: element count -> byte count (int16_t is 2 bytes)
shl orderq, 1
; m7 = mul broadcast to every 16-bit lane
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
; m6 = 32-bit partial sums, cleared
pxor m6, m6
; point v1/v2/v3 at the end of the data and index with a negative offset
; that counts up towards zero
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
; m0, m1 = v2; m4, m5 = v1; m2, m3 = v3
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
; v1 * v2, adjacent pairs summed into 32-bit lanes
pmaddwd m0, m4
pmaddwd m1, m5
; v3 * mul, low 16 bits of each product
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
; new v1 = v1 + v3 * mul
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
; continue while the offset is still negative
jl .loop
; horizontal reduction of the 32-bit partial sums in m6
%if mmsize == 16
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
%else
pshufw m0, m6, 0x4e
%endif
paddd m6, m0
; the low dword of m6 is the return value
movd eax, m6
RET
%endmacro
; instantiate the routine once for MMXEXT and once for SSE2
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7