Commit 67fd620c authored by Marc Hoffman's avatar Marc Hoffman Committed by Diego Biurrun
Browse files

bfin dsputils, basic pixel operations sads, diffs, motion compensation

and standard IEEE 8x8 block transforms
patch by Marc Hoffman, mmh pleasantst com

Originally committed as revision 8594 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent f8fb86e9
......@@ -394,6 +394,11 @@ endif
OBJS-$(TARGET_ARCH_BFIN) += bfin/dsputil_bfin.o \
ASM_OBJS-$(TARGET_ARCH_BFIN) += bfin/pixels_bfin.o \
bfin/idct_bfin.o \
bfin/fdct_bfin.o \
bfin/xidct.o \
EXTRALIBS := -L$(BUILD_ROOT)/libavutil -lavutil$(BUILDSUF) $(EXTRALIBS)
NAME=avcodec
......
/*
* config_bfin.h
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
low level assembler interface wrapper
DEFUN(put_pixels_clamped,mL1,
(DCTELEM *block, uint8_t *dest, int line_size)):
body
rts;
*/
/*
 * Assembler-side helper macros; the whole group is skipped when DEFUN is
 * already defined.
 *
 *   mL1  section name ".l1.text"; a definition made before this point is kept
 *   mL3  section name ".text"
 *
 *   DEFUN(fname, where, interface)
 *     switches to section `where`, exports the symbol _ff_bfin_<fname>,
 *     emits ".align 8" and ends with the bare symbol name, so the caller
 *     supplies the trailing ':' that turns it into a label (see the usage
 *     example in the comment above).  `interface` does not appear in the
 *     expansion; it only documents the C prototype at the definition site.
 */
#ifndef DEFUN
#ifndef mL1
#define mL1 .l1.text
#endif
#define mL3 .text
#define DEFUN(fname,where,interface) \
.section where; \
.global _ff_bfin_ ## fname ; \
.align 8; \
_ff_bfin_ ## fname
#endif
/*
* Copyright (c) 2006 Michael Benjamin
* BlackFin DSPUTILS
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
*
* This file is part of FFmpeg.
*
......@@ -18,38 +21,290 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <unistd.h>
#include <bits/bfin_sram.h>
#include "../avcodec.h"
#include "../dsputil.h"
/*
 * Inline-assembler sum of absolute differences between blk1 and blk2.
 *
 * What the visible instructions do:
 *   - blk1 / blk2 are copied into P0 / P1 and from there into I0 / I1;
 *   - both accumulators A0 and A1 are cleared;
 *   - h is copied into P2 and then M0;
 *   - a hardware loop with a fixed count of 32 (P3) loads from I0 and I1
 *     (DISALGNEXCPT on both load lines) and issues SAA (R1:0, R3:2);
 *   - afterwards the low and high halves of A0 and A1 are added and the two
 *     partial sums are combined with a saturating add into `sum`.
 *
 * c and line_size are not operands of the asm statement.
 *
 * NOTE(review): M0 and P3 are written by the asm but are not named in the
 * clobber list, and M0 is not referenced again in the instructions shown
 * here -- confirm whether that is intentional.
 * NOTE(review): the loop labels are plain global names, so a second copy of
 * this asm in the same object (e.g. after inlining) would presumably clash.
 */
static int sad8x8_bfin( void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h )
{
int sum;
__asm__ __volatile__ (
"P0 = %1;" // blk1
"P1 = %2;" // blk2
"P2 = %3;\n" // h
"I0 = P0;"
"I1 = P1;\n"
"A0 = 0;"
"A1 = 0;\n"
"M0 = P2;\n"
"P3 = 32;\n"
"LSETUP (sad8x8LoopBegin, sad8x8LoopEnd) LC0=P3;\n"
"sad8x8LoopBegin:\n"
" DISALGNEXCPT || R0 = [I0] || R2 = [I1];\n"
" DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];\n"
"sad8x8LoopEnd:\n"
" SAA ( R1:0 , R3:2 );\n"
"R3 = A1.L + A1.H, R2 = A0.L + A0.H;\n"
"%0 = R2 + R3 (S);\n"
: "=&d" (sum)
: "m"(blk1), "m"(blk2), "m"(h)
: "P0","P1","P2","I0","I1","A0","A1","R0","R1","R2","R3");
return sum;
/*
 * L1CODE is appended to every prototype below.  With USE_L1CODE defined (as
 * it is unconditionally here) it expands to __attribute__ ((l1_text));
 * otherwise it expands to nothing.
 */
#define USE_L1CODE
#ifdef USE_L1CODE
#define L1CODE __attribute__ ((l1_text))
#else
#define L1CODE
#endif
/* NOTE(review): `off` has external linkage and is not referenced anywhere in
 * the visible part of this file -- confirm whether it is still needed. */
int off;
/* 8x8 transforms; each takes only the coefficient block and returns void. */
extern void ff_bfin_idct (DCTELEM *block) L1CODE;
extern void ff_bfin_fdct (DCTELEM *block) L1CODE;
/* Transfers between DCTELEM blocks and 8-bit pixel rows. */
extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
extern void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride) L1CODE;
extern void ff_bfin_get_pixels (DCTELEM *restrict block, const uint8_t *pixels, int line_size) L1CODE;
extern int ff_bfin_pix_norm1 (uint8_t * pix, int line_size) L1CODE;
/* The "z" variants take two stride-like arguments (dsz / dest_size and
 * line_size) instead of one; the wrappers in this file pass either the same
 * stride twice or a temporary-buffer stride for one of them. */
extern int ff_bfin_z_sad8x8 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
extern int ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
extern void ff_bfin_z_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
extern void ff_bfin_z_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
extern void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
extern void ff_bfin_put_pixels8_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
extern int ff_bfin_pix_sum (uint8_t *p, int stride) L1CODE;
/* Two-source put routines (s0, s1); the *_nornd forms take a single stride. */
extern void ff_bfin_put_pixels8uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
extern void ff_bfin_put_pixels16uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
extern void ff_bfin_put_pixels8uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
extern void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
/* Routines installed into the sse[] table by dsputil_init_bfin. */
extern int ff_bfin_sse4 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
extern int ff_bfin_sse8 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
extern int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
/*
 * Debug helper, compiled out by the surrounding #if 0.
 * Logs the address p, then h rows of w byte values each, stepping p forward
 * by s bytes after every row, followed by an empty line.
 * NOTE(review): the "0x%08x" conversion is handed a pointer argument; this
 * would need a cast or %p if the helper is ever re-enabled.
 */
#if 0
void pblk (uint8_t *p, int w, int h, int s)
{
int i,j;
av_log (0,0,"0x%08x:\n", p);
for (i = 0;i<h;i++) {
for (j=0;j<w;j++)
av_log (0,0,"%3d ", p[j]);
p+=s;
av_log (0,0,"\n");
}
av_log (0,0,"\n");
}
#endif
/**
 * Inverse-transform a coefficient block and add the result to dest.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_bfin_add_pixels_clamped
 * @param block     coefficients; ff_bfin_idct receives only this pointer, so
 *                  the transformed data is handed on through the same buffer
 */
static void
bfin_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform; step 2: accumulate into the destination. */
    ff_bfin_idct(block);
    ff_bfin_add_pixels_clamped(block, dest, line_size);
}
/**
 * Inverse-transform a coefficient block and store the result to dest.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_bfin_put_pixels_clamped
 * @param block     coefficients; ff_bfin_idct receives only this pointer, so
 *                  the transformed data is handed on through the same buffer
 */
static void
bfin_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* Step 1: transform; step 2: write out to the destination. */
    ff_bfin_idct(block);
    ff_bfin_put_pixels_clamped(block, dest, line_size);
}
/**
 * Zero the coefficient storage starting at blocks (a hand-written memset).
 *
 * A zero-overhead hardware loop runs 192 times; every iteration stores the
 * zero held in R0 through I0 with post-increment.  With 32-bit stores that
 * is 768 bytes in total.
 * NOTE(review): 768 bytes equals 6 blocks of 64 coefficients only if DCTELEM
 * is 16 bits wide -- confirm against the DCTELEM typedef.
 *
 * The statement is spelled __asm__ __volatile__ (as the other inline asm in
 * this file is) so it also builds in strict ISO modes where plain `asm` is
 * not a keyword, and it carries a "memory" clobber: the stores go through a
 * register the compiler cannot see into, so without the clobber it would be
 * free to cache or reorder accesses to *blocks across the statement.
 * NOTE(review): the loop registers used by LSETUP (LC0 and friends) are not
 * listed as clobbers -- confirm whether the toolchain requires that.
 *
 * @param blocks start of the area to clear
 */
static void bfin_clear_blocks (DCTELEM *blocks)
{
    __asm__ __volatile__ (
        "P0=192; "
        "I0=%0; "
        "R0=0; "
        "LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
        "clear_blocks_blkfn_lab:"
        "[I0++]=R0;"
        : /* no outputs */
        : "a" (blocks)
        : "P0", "I0", "R0", "memory");
}
/**
 * 8-wide put without half-pel offset (installed as put_pixels_tab[1][0]).
 *
 * Hands the same source pointer to ff_bfin_put_pixels8uc as both s0 and s1
 * and uses line_size for its dest_size and line_size arguments alike.
 * NOTE(review): presumably the helper combines s0 and s1 so that identical
 * sources give a plain copy -- confirm against the assembler routine.
 */
static void bfin_put_pixels8 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *src = pixels;

    ff_bfin_put_pixels8uc(block, src, src, line_size, line_size, h);
}
/**
 * 8-wide put for the horizontal half-pel slot (put_pixels_tab[1][1]).
 *
 * The second source handed to ff_bfin_put_pixels8uc is the first one shifted
 * by one byte; line_size is used for both stride arguments.
 */
static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels8uc(block, pixels, right, line_size, line_size, h);
}
/**
 * 8-wide put for the vertical half-pel slot (put_pixels_tab[1][2]).
 *
 * The second source handed to ff_bfin_put_pixels8uc is the first one advanced
 * by line_size bytes; line_size is also used for both stride arguments.
 */
static void bfin_put_pixels8_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels8uc(block, pixels, below, line_size, line_size, h);
}
/**
 * 8-wide put for the diagonal half-pel slot (put_pixels_tab[1][3]).
 *
 * Adapts the four-argument table signature to ff_bfin_z_put_pixels8_xy2,
 * which takes separate dest_size and line_size arguments; both receive
 * line_size here.
 */
static void bfin_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
{
    const int dst_stride = line_size;
    const int src_stride = line_size;

    ff_bfin_z_put_pixels8_xy2(block, s0, dst_stride, src_stride, h);
}
/**
 * 16-wide put without half-pel offset (installed as put_pixels_tab[0][0]).
 *
 * Hands the same source pointer to ff_bfin_put_pixels16uc as both s0 and s1
 * and uses line_size for its dest_size and line_size arguments alike.
 * NOTE(review): presumably the helper combines s0 and s1 so that identical
 * sources give a plain copy -- confirm against the assembler routine.
 */
static void bfin_put_pixels16 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *src = pixels;

    ff_bfin_put_pixels16uc(block, src, src, line_size, line_size, h);
}
/**
 * 16-wide put for the horizontal half-pel slot (put_pixels_tab[0][1]).
 *
 * The second source handed to ff_bfin_put_pixels16uc is the first one shifted
 * by one byte; line_size is used for both stride arguments.
 */
static void bfin_put_pixels16_x2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels16uc(block, pixels, right, line_size, line_size, h);
}
/**
 * 16-wide put for the vertical half-pel slot (put_pixels_tab[0][2]).
 *
 * The second source handed to ff_bfin_put_pixels16uc is the first one
 * advanced by line_size bytes; line_size is also used for both stride
 * arguments.
 */
static void bfin_put_pixels16_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels16uc(block, pixels, below, line_size, line_size, h);
}
/**
 * 16-wide put for the diagonal half-pel slot (put_pixels_tab[0][3]).
 *
 * Adapts the four-argument table signature to ff_bfin_z_put_pixels16_xy2,
 * which takes separate dest_size and line_size arguments; both receive
 * line_size here.
 */
static void bfin_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
{
    const int dst_stride = line_size;
    const int src_stride = line_size;

    ff_bfin_z_put_pixels16_xy2(block, s0, dst_stride, src_stride, h);
}
/**
 * 8-wide "no rounding" put without half-pel offset
 * (installed as put_no_rnd_pixels_tab[1][0]).
 *
 * Passes the same source pointer twice to ff_bfin_put_pixels8uc_nornd.
 * NOTE(review): unlike the neighbouring wrappers this function is not
 * declared static -- confirm whether external linkage is intended.
 */
void bfin_put_pixels8_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *src = pixels;

    ff_bfin_put_pixels8uc_nornd(block, src, src, line_size, h);
}
/**
 * 8-wide "no rounding" put for the horizontal half-pel slot
 * (put_no_rnd_pixels_tab[1][1]).
 *
 * The second source handed to ff_bfin_put_pixels8uc_nornd is the first one
 * shifted by one byte.
 */
static void bfin_put_pixels8_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels8uc_nornd(block, pixels, right, line_size, h);
}
/**
 * 8-wide "no rounding" put for the vertical half-pel slot
 * (put_no_rnd_pixels_tab[1][2]).
 *
 * The second source handed to ff_bfin_put_pixels8uc_nornd is the first one
 * advanced by line_size bytes.
 */
static void bfin_put_pixels8_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels8uc_nornd(block, pixels, below, line_size, h);
}
/**
 * 16-wide "no rounding" put without half-pel offset
 * (installed as put_no_rnd_pixels_tab[0][0]).
 *
 * Passes the same source pointer twice to ff_bfin_put_pixels16uc_nornd.
 * NOTE(review): unlike the neighbouring wrappers this function is not
 * declared static -- confirm whether external linkage is intended.
 */
void bfin_put_pixels16_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *src = pixels;

    ff_bfin_put_pixels16uc_nornd(block, src, src, line_size, h);
}
/**
 * 16-wide "no rounding" put for the horizontal half-pel slot
 * (put_no_rnd_pixels_tab[0][1]).
 *
 * The second source handed to ff_bfin_put_pixels16uc_nornd is the first one
 * shifted by one byte.
 */
static void bfin_put_pixels16_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_bfin_put_pixels16uc_nornd(block, pixels, right, line_size, h);
}
/**
 * 16-wide "no rounding" put for the vertical half-pel slot
 * (put_no_rnd_pixels_tab[0][2]).
 *
 * The second source handed to ff_bfin_put_pixels16uc_nornd is the first one
 * advanced by line_size bytes.
 */
static void bfin_put_pixels16_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_bfin_put_pixels16uc_nornd(block, pixels, below, line_size, h);
}
/**
 * 16-wide comparison installed as sad[0] and pix_abs[0][0].
 *
 * Forwards both blocks to ff_bfin_z_sad16x16, passing line_size for its dsz
 * and its line_size argument alike.  The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad16x16 returns
 */
static int bfin_pix_abs16 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    const int stride1 = line_size;
    const int stride2 = line_size;

    return ff_bfin_z_sad16x16(blk1, blk2, stride1, stride2, h);
}
/*
 * 256-byte scratch block (attribute l1_data_B) shared by every *_x2, *_y2 and
 * *_xy2 comparison wrapper below.  With the 16-byte row stride used by the
 * 16-wide wrappers it holds at most 16 rows.
 * NOTE(review): being a single static buffer, concurrent callers would
 * presumably overwrite each other's data -- confirm the threading model.
 */
static uint8_t vtmp_blk[256] __attribute__((l1_data_B));

/**
 * 16-wide comparison for the horizontal half-pel slot (pix_abs[0][1]).
 *
 * First builds a prediction in vtmp_blk from blk2 and blk2 + 1 with a
 * 16-byte destination stride, then compares blk1 against that buffer.
 * The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad16x16 returns
 */
static int bfin_pix_abs16_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 16 };
    const uint8_t *right = blk2 + 1;

    ff_bfin_put_pixels16uc(vtmp_blk, blk2, right, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/**
 * 16-wide comparison for the vertical half-pel slot (pix_abs[0][2]).
 *
 * First builds a prediction in vtmp_blk from blk2 and blk2 + line_size with a
 * 16-byte destination stride, then compares blk1 against that buffer.
 * The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad16x16 returns
 */
static int bfin_pix_abs16_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 16 };
    const uint8_t *below = blk2 + line_size;

    ff_bfin_put_pixels16uc(vtmp_blk, blk2, below, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/**
 * 16-wide comparison for the diagonal half-pel slot (pix_abs[0][3]).
 *
 * First lets ff_bfin_z_put_pixels16_xy2 build a prediction from blk2 in
 * vtmp_blk with a 16-byte destination stride, then compares blk1 against
 * that buffer.  The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad16x16 returns
 */
static int bfin_pix_abs16_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 16 };

    ff_bfin_z_put_pixels16_xy2(vtmp_blk, blk2, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/**
 * 8-wide comparison installed as sad[1] and pix_abs[1][0].
 *
 * Forwards both blocks to ff_bfin_z_sad8x8, passing line_size for its dsz
 * and its line_size argument alike.  The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad8x8 returns
 */
static int bfin_pix_abs8 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    const int stride1 = line_size;
    const int stride2 = line_size;

    return ff_bfin_z_sad8x8(blk1, blk2, stride1, stride2, h);
}
/**
 * 8-wide comparison for the horizontal half-pel slot (pix_abs[1][1]).
 *
 * First builds a prediction in vtmp_blk from blk2 and blk2 + 1 with an
 * 8-byte destination stride, then compares blk1 against that buffer.
 * The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad8x8 returns
 */
static int bfin_pix_abs8_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 8 };
    const uint8_t *right = blk2 + 1;

    ff_bfin_put_pixels8uc(vtmp_blk, blk2, right, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/**
 * 8-wide comparison for the vertical half-pel slot (pix_abs[1][2]).
 *
 * First builds a prediction in vtmp_blk from blk2 and blk2 + line_size with
 * an 8-byte destination stride, then compares blk1 against that buffer.
 * The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad8x8 returns
 */
static int bfin_pix_abs8_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 8 };
    const uint8_t *below = blk2 + line_size;

    ff_bfin_put_pixels8uc(vtmp_blk, blk2, below, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/**
 * 8-wide comparison for the diagonal half-pel slot (pix_abs[1][3]).
 *
 * First lets ff_bfin_z_put_pixels8_xy2 build a prediction from blk2 in
 * vtmp_blk with an 8-byte destination stride, then compares blk1 against
 * that buffer.  The context pointer c is not used.
 *
 * @return whatever ff_bfin_z_sad8x8 returns
 */
static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
{
    enum { TMP_STRIDE = 8 };

    ff_bfin_z_put_pixels8_xy2(vtmp_blk, blk2, TMP_STRIDE, line_size, h);
    return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, TMP_STRIDE, h);
}
/*
decoder optimization
start on 2/11 100 frames of 352x240@25 compiled with no optimization -g debugging
9.824s ~ 2.44x off
6.360s ~ 1.58x off with -O2
5.740s ~ 1.43x off with idcts
2.64s 2/20 same sman.mp4 decode only
*/
/**
 * Install the Blackfin routines into a DSPContext.
 *
 * Every assignment is unconditional.  The former initial stores of
 * sad8x8_bfin into pix_abs[1][0] and sad[1] were dropped: both slots are
 * assigned again further down (to bfin_pix_abs8), so the earlier values were
 * never observable.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; not read by this function
 */
void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
{
    /* block <-> pixel transfer and simple statistics */
    c->get_pixels         = ff_bfin_get_pixels;
    c->diff_pixels        = ff_bfin_diff_pixels;
    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;

    c->clear_blocks       = bfin_clear_blocks;
    c->pix_sum            = ff_bfin_pix_sum;
    c->pix_norm1          = ff_bfin_pix_norm1;

    /* block comparison: index [0] is the 16-wide, [1] the 8-wide variant */
    c->sad[0]             = bfin_pix_abs16;
    c->sad[1]             = bfin_pix_abs8;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0]      = bfin_pix_abs16;
    c->pix_abs[0][1]      = bfin_pix_abs16_x2;
    c->pix_abs[0][2]      = bfin_pix_abs16_y2;
    c->pix_abs[0][3]      = bfin_pix_abs16_xy2;

    c->pix_abs[1][0]      = bfin_pix_abs8;
    c->pix_abs[1][1]      = bfin_pix_abs8_x2;
    c->pix_abs[1][2]      = bfin_pix_abs8_y2;
    c->pix_abs[1][3]      = bfin_pix_abs8_xy2;

    c->sse[0]             = ff_bfin_sse16;
    c->sse[1]             = ff_bfin_sse8;
    c->sse[2]             = ff_bfin_sse4;

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * Only the first two rows of the table are filled in here, one per
     * horizontal block size (16 and 8), each with the 4 halfpel positions:
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    c->put_pixels_tab[0][0] = bfin_put_pixels16;
    c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
    c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
    c->put_pixels_tab[0][3] = bfin_put_pixels16_xy2;

    c->put_pixels_tab[1][0] = bfin_put_pixels8;
    c->put_pixels_tab[1][1] = bfin_put_pixels8_x2;
    c->put_pixels_tab[1][2] = bfin_put_pixels8_y2;
    c->put_pixels_tab[1][3] = bfin_put_pixels8_xy2;

    /* same layout for the "no rounding" table; the xy2 entries point
     * straight at the assembler routines */
    c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_nornd;
    c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_nornd;
    c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_nornd;
    c->put_no_rnd_pixels_tab[1][3] = ff_bfin_put_pixels8_xy2_nornd;

    c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_nornd;
    c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_nornd;
    c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_nornd;
    c->put_no_rnd_pixels_tab[0][3] = ff_bfin_put_pixels16_xy2_nornd;

    /* forward / inverse 8x8 transforms */
    c->fdct     = ff_bfin_fdct;
    c->idct     = ff_bfin_idct;
    c->idct_add = bfin_idct_add;
    c->idct_put = bfin_idct_put;
}
/*
* fdct BlackFin
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
void ff_bfin_fdct (DCTELEM *buf);
This implementation works only for 8x8 input. The range of input
must be -256 to 255 i.e. 8bit input represented in a 16bit data
word. The original data must be sign extended into the 16bit data
words.
Chen factorization of
7
X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
n=0
C4
0 --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
\ / \ / X S4,S4
1 --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
\ / \ -C4 C3
2 --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
\ / / \ X S3,-S3
3 --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
/ C7 C3
4 --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
/ \ -C4 X \ /S7 C3
5 --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
/ \ X S4,S4 / X S3,-S3
6 --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
/ \ C4 X / \-S7 C3
7 --*-------------*0-7------------*-*7+6--*-----*---------------> 7
C7
Notation
Cn = cos(n*pi/16) and Sn = sin(n*pi/16) are used throughout the code.
Registers used:
R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1.
Other registers used:
I0, I1, I2, I3, B0, B2, B3, M0, M1, L3 registers and LC0.
Input - r0 - pointer to start of DCTELEM *block
Output - The DCT output coefficients in the DCTELEM *block
Register constraint:
This code is called from jpeg_encode.
R6, R5, R4 if modified should be stored and restored.
Performance: (Timer version 0.6.33)
Code Size : 240 Bytes.
Memory Required :
Input Matrix : 8 * 8 * 2 Bytes.
Coefficients : 16 Bytes
Temporary matrix: 8 * 8 * 2 Bytes.
Cycle Count :26+{18+8*(14+2S)}*2 where S -> Stalls
(7.45 c/pel)
-----------------------------------------
| Size | Forward DCT | Inverse DCT |
-----------------------------------------
| 8x8 | 284 Cycles | 311 Cycles |
-----------------------------------------
Ck = int16(cos(k/16*pi)*32767+.5)/2 (except C4, which is stored without the final /2)
#define C4 23170
#define C3 13623
#define C6 6270
#define C7 3196
Sk = int16(sin(k/16*pi)*32767+.5)/2
#define S4 11585
#define S3 9102
#define S6 15137
#define S7 16069
the coefficients are ordered as follows:
short dct_coef[]
C4,S4,
C6,S6,
C7,S7,
S3,C3,
-----------------------------------------------------------
FFMPEG conformance testing results
-----------------------------------------------------------
dct-test: modified with the following
dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
produces the following output:
root:/u/ffmpeg/bhead/libavcodec> ./dct-test
ffmpeg DCT/IDCT test
2 -131 -6 -48 -36 33 -83 24
34 52 -24 -15 5 92 57 143
-67 -43 -1 74 -16 5 -71 32
-78 106 92 -34 -38 81 20 -18
7 -62 40 2 -15 90 -62 -83
-83 1 -104 -13 43 -19 7 11
-63 31 12 -29 83 72 21 10
-17 -63 -15 73 50 -91 159 -14
DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
DCT BFINfdct: 92.1 kdct/s
root:/u/ffmpeg/bhead/libavcodec>
*/
#include "config_bfin.h"
/*
 * Coefficient table for fdct, placed in section .l1.data.B.
 * Eight 16-bit entries (16 bytes) in the order
 *   C4, S4, C6, S6, C7, S7, S3, C3
 * i.e. 23170, 11585, 6270, 15137, 3196, 16069, 9102, 13623 in decimal.
 * fdct loads the table address into B3 and sets L3 = 16 so that I3 walks
 * the table as a circular buffer.
 */
.section .l1.data.B,"aw",@progbits
.align 4;
dct_coeff:
.short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537;
/*
 * 128-byte temporary matrix (8 * 8 * 2 bytes) for fdct, placed in section
 * .l1.data.A; fdct loads its address into B2.
 */
.section .l1.data.A,"aw",@progbits
.align 4
vtmp: .space 128
DEFUN(fdct,mL1,
(DCTELEM *block)):
[--SP] = (R7:4, P5:3); // Push the registers onto the stack.
b0 = r0;
r0 = [P3+dct_coeff@GOT17M4];
b3 = r0;
r0 = [P3+vtmp@GOT17M4];
b2 = r0;
L3 = 16; // L3 is set to 16 to make the coefficient
// array Circular.
//----------------------------------------------------------------------------
/*
* I0, I1, and I2 registers are used to read the input data. I3 register is used
* to read the coefficients. P0 and P1 registers are used for writing the output
* data.
*/
M0 = 12 (X); // All these initializations are used in the
M1 = 16 (X); // modification of address offsets.
M2 = 128 (X);
P2 = 16;
P3 = 32 (X);
P4 = -110 (X);
P5 = -62 (X);
P0 = 2(X);
// Prescale the input to get the correct precision.
i0=b0;
i1=b0;
lsetup (.0, .1) LC0 = P3;
r0=[i0++];
.0: r1=r0<<3 (v) || r0=[i0++] ;
.1: [i1++]=r1;
/*
* B0 points to the "in" buffer.
* B2 points to "temp" buffer in the first iteration.
*/
lsetup (.2, .3) LC0 = P0;
.2:
I0 = B0; // I0 points to Input Element (0, 0).
I1 = B0; // Element 1 and 0 is read in R0.
I1 += M0 || R0 = [I0++]; // I1 points to Input Element (0, 6).
I2 = I1; // Element 6 is read into R3.H.
I2 -= 4 || R3.H = W[I1++]; // I2 points to Input Element (0, 4).
I3 = B3; // I3 points to Coefficients.
P0 = B2; // P0 points to temporary array Element
// (0, 0).
P1 = B2; // P1 points to temporary array.
R7 = [P1++P2] || R2 = [I2++]; // P1 points to temporary array
// Element (1, 0).
// R7 is a dummy read. X4,X5
// are read into R2.
R3.L = W[I1--]; // X7 is read into R3.L.
R1.H = W[I0++]; // X2 is read into R1.H.
/*
* X0 = (X0 + X7) / 2.
* X1 = (X1 + X6) / 2.