Initial community commit

This commit is contained in:
Jef 2024-09-24 14:54:57 +02:00
parent 537bcbc862
commit fc06254474
16440 changed files with 4239995 additions and 2 deletions

View file

@ -0,0 +1,61 @@
## Target to build
# Base name of the static library; the archive file name is ${TARGET}.a (see ARTARGET below).
TARGET =libvputil
## TOOLS
CC = ecc
LD = ecc
AR = ar
OBJDUMP = objdump
RM = rm -f
## Directories
# Every include and library path below is derived from TOPDIR.
TOPDIR =C:\DuckSoft
PRIVATEINCLUDE =${TOPDIR}\private\include
CORELIBSINCLUDE =${TOPDIR}\private\corelibs\include
CDXVINCLUDE =${TOPDIR}\private\corelibs\cdxv\include
VPPPINCLUDE =${TOPDIR}\private\corelibs\cdxv\vputil\include
CURRENTDIR =${TOPDIR}\private\corelibs\cdxv\vputil
LIBDIR =${TOPDIR}\private\corelibs\lib\mapca
## Compile Flags
ALLINCLUDES =-I${CDXVINCLUDE} -I${CORELIBSINCLUDE} -I${PRIVATEINCLUDE} -I${VPPPINCLUDE}
VP6DEFINES =-DPREDICT_2D -DVFW_COMP -DCOMPDLL -DPOSTPROCESS -DCPUISLITTLEENDIAN -DNORMALIZED
ETIDEFINES =-DMAPCA
ALLDEFINES =${VP6DEFINES} ${ETIDEFINES}
# NOTE(review): despite its name, DEBUG only carries the optimisation level.
DEBUG =-O2
CFLAGS =-msvc -align 8 -etswp -mP3OPT_nonlocal_calls_through_register=true \
-mP2OPT_suppress_library_call_conv_warnings=TRUE -maalign_branch_target \
-magen_interroutine_padding
# Complete option set handed to ${CC} for every object file.
ALLFLAGS =$(CFLAGS) ${ALLDEFINES} ${ALLINCLUDES} ${DEBUG}
## Files
OBJS =generic\fdct.o \
generic\idctpart.o \
generic\reconstruct.o \
generic\vputil.o \
bsp\bspFdct.o \
bsp\bspIDct.o \
bsp\bsprecon.o \
bsp\bspvputil.o \
bsp\uoptsystemdependant.o
# Source list is derived from OBJS by replacing the .o suffix with .c.
SRCS =$(OBJS:.o=.c)
ARTARGET =${TARGET}.a
# archive
# First rule in the file, so it is the default goal: archive the objects and
# move the archive into ${LIBDIR}.
# NOTE(review): the rule target is the literal word ARTARGET, not ${ARTARGET};
# no file of that name is ever created, so the recipe runs on every build.
ARTARGET:${OBJS}
${AR} -cr ${ARTARGET} ${OBJS}
mv ${ARTARGET} ${LIBDIR}
# Every object lists every source as a prerequisite, so touching any .c file
# rebuilds all objects. The recipe relies on $* being the target name without
# its .o suffix - TODO confirm for the make implementation in use.
${OBJS} : ${SRCS}
$(CC) $(ALLFLAGS) -c $*.c -o $*.o
# Removes the objects and any archive left in the current directory; the copy
# already moved to ${LIBDIR} is not touched.
clean:
${RM} ${OBJS} ${ARTARGET}

View file

@ -0,0 +1,312 @@
/****************************************************************************
*
* Module Title : fdct.c
*
* Description : Fast 8x8 DCT C-Implementation.
*
****************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include "dct.h"
/****************************************************************************
* Macros
****************************************************************************/
// SIGNBITDUPPED(X): -1 (all bits set) when bit 31 of X is set, otherwise 0.
// The argument is parenthesised and the mask is tested directly, so the result
// no longer depends on an implementation-defined unsigned-to-signed conversion
// or on right-shifting a negative value.
#define SIGNBITDUPPED(X) ( ((X) & 0x80000000u) ? -1 : 0 )
// DOROUND(X): adds 0xffff to X when its sign bit is set and leaves it unchanged
// otherwise, so that a following ">>= 16" rounds negative values towards zero.
// X must be an lvalue and is evaluated more than once. The trailing ';' is part
// of the macro because some call sites do not supply their own.
#define DOROUND(X) (X) = ( (SIGNBITDUPPED(X) & (0xffff)) + (X) );
/****************************************************************************
* Module statics
****************************************************************************/
// Cosine multipliers scaled by 65536: xCkS(8-k) = round(cos(k*pi/16) * 65536).
// Each product formed with one of them is scaled back by a following ">>= 16".
static INT32 xC1S7 = 64277; // cos(1*pi/16) ~ 0.98079
static INT32 xC2S6 = 60547; // cos(2*pi/16) ~ 0.92388
static INT32 xC3S5 = 54491; // cos(3*pi/16) ~ 0.83147
static INT32 xC4S4 = 46341; // cos(4*pi/16) ~ 0.70711
static INT32 xC5S3 = 36410; // cos(5*pi/16) ~ 0.55557
static INT32 xC6S2 = 25080; // cos(6*pi/16) ~ 0.38268
static INT32 xC7S1 = 12785; // cos(7*pi/16) ~ 0.19509
/****************************************************************************
*
* ROUTINE : fdct_short_C_orig
*
* INPUTS : INT16 *InputData : 16-bit input data.
*
* OUTPUTS : INT16 *OutputData : 16-bit transform coefficients.
*
* RETURNS : void
*
* FUNCTION : Performs an 8x8 2-D fast DCT.
*
* The algorithm used is derived from the flowgraph for
* the Vetterli and Ligtenberg fast 1-D dct given in the
* JPEG reference book by Pennebaker and Mitchell.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
void fdct_short_C_orig ( INT16 *InputData, INT16 *OutputData )
{
int loop;
INT32 is07, is12, is34, is56;
INT32 is0734, is1256;
INT32 id07, id12, id34, id56;
INT32 irot_input_x, irot_input_y;
INT32 icommon_product1; // Re-used product (c4s4 * (s12 - s56)).
INT32 icommon_product2; // Re-used product (c4s4 * (d12 + d56)).
INT32 temp1, temp2; // intermediate variable for computation
INT32 InterData[64];
INT32 *ip = InterData;
INT16 *op = OutputData;
for ( loop=0; loop<8; loop++ )
{
// Pre calculate some common sums and differences.
is07 = InputData[0] + InputData[7];
is12 = InputData[1] + InputData[2];
is34 = InputData[3] + InputData[4];
is56 = InputData[5] + InputData[6];
id07 = InputData[0] - InputData[7];
id12 = InputData[1] - InputData[2];
id34 = InputData[3] - InputData[4];
id56 = InputData[5] - InputData[6];
is0734 = is07 + is34;
is1256 = is12 + is56;
// Pre-Calculate some common product terms.
icommon_product1 = xC4S4*(is12 - is56);
DOROUND ( icommon_product1 )
icommon_product1 >>= 16;
icommon_product2 = xC4S4*(id12 + id56);
DOROUND ( icommon_product2 )
icommon_product2 >>= 16;
ip[0] = (xC4S4*(is0734 + is1256));
DOROUND ( ip[0] );
ip[0] >>= 16;
ip[4] = (xC4S4*(is0734 - is1256));
DOROUND ( ip[4] );
ip[4] >>= 16;
// Define inputs to rotation for outputs 2 and 6
irot_input_x = id12 - id56;
irot_input_y = is07 - is34;
// Apply rotation for outputs 2 and 6.
temp1 = xC6S2*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC2S6*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
ip[2] = temp1 + temp2;
temp1 = xC6S2*irot_input_y;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC2S6*irot_input_x;
DOROUND ( temp2 );
temp2 >>= 16;
ip[6] = temp1 -temp2;
// Define inputs to rotation for outputs 1 and 7
irot_input_x = icommon_product1 + id07;
irot_input_y = -( id34 + icommon_product2 );
// Apply rotation for outputs 1 and 7.
temp1 = xC1S7*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC7S1*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
ip[1] = temp1 - temp2;
temp1 = xC7S1*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC1S7*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
ip[7] = temp1 + temp2;
// Define inputs to rotation for outputs 3 and 5
irot_input_x = id07 - icommon_product1;
irot_input_y = id34 - icommon_product2;
// Apply rotation for outputs 3 and 5.
temp1 = xC3S5 * irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC5S3*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
ip[3] = temp1 - temp2;
temp1 = xC5S3*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC3S5*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
ip[5] = temp1 + temp2;
// Increment data pointer for next row.
InputData += 8;
ip += 8; // advance pointer to next row
}
// Performed DCT on rows, now transform the columns
ip = InterData;
for ( loop=0; loop<8; loop++ )
{
// Pre calculate some common sums and differences.
is07 = ip[0 * 8] + ip[7 * 8];
is12 = ip[1 * 8] + ip[2 * 8];
is34 = ip[3 * 8] + ip[4 * 8];
is56 = ip[5 * 8] + ip[6 * 8];
id07 = ip[0 * 8] - ip[7 * 8];
id12 = ip[1 * 8] - ip[2 * 8];
id34 = ip[3 * 8] - ip[4 * 8];
id56 = ip[5 * 8] - ip[6 * 8];
is0734 = is07 + is34;
is1256 = is12 + is56;
// Pre-Calculate some common product terms.
icommon_product1 = xC4S4*(is12 - is56);
icommon_product2 = xC4S4*(id12 + id56);
DOROUND ( icommon_product1 )
DOROUND ( icommon_product2 )
icommon_product1 >>= 16;
icommon_product2 >>= 16;
temp1 = xC4S4*(is0734 + is1256);
temp2 = xC4S4*(is0734 - is1256);
DOROUND ( temp1 );
DOROUND ( temp2 );
temp1 >>= 16;
temp2 >>= 16;
op[0*8] = (INT16)temp1;
op[4*8] = (INT16)temp2;
// Define inputs to rotation for outputs 2 and 6
irot_input_x = id12 - id56;
irot_input_y = is07 - is34;
// Apply rotation for outputs 2 and 6.
temp1 = xC6S2*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC2S6*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
op[2*8] = (INT16)(temp1 + temp2);
temp1 = xC6S2*irot_input_y;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC2S6*irot_input_x;
DOROUND ( temp2 );
temp2 >>= 16;
op[6*8] = (INT16)(temp1 -temp2);
// Define inputs to rotation for outputs 1 and 7
irot_input_x = icommon_product1 + id07;
irot_input_y = -( id34 + icommon_product2 );
// Apply rotation for outputs 1 and 7.
temp1 = xC1S7*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC7S1*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
op[1*8] = (INT16) (temp1 - temp2);
temp1 = xC7S1*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC1S7*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
op[7*8] = (INT16)(temp1 + temp2);
// Define inputs to rotation for outputs 3 and 5
irot_input_x = id07 - icommon_product1;
irot_input_y = id34 - icommon_product2;
// Apply rotation for outputs 3 and 5.
temp1 = xC3S5*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC5S3*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
op[3*8] = (INT16)(temp1 - temp2);
temp1 = xC5S3*irot_input_x;
DOROUND ( temp1 );
temp1 >>= 16;
temp2 = xC3S5*irot_input_y;
DOROUND ( temp2 );
temp2 >>= 16;
op[5*8] = (INT16) (temp1 + temp2);
// Increment data pointer for next column.
ip ++;
op ++;
}
}
/****************************************************************************
*
* ROUTINE : fdct_short_C
*
* INPUTS : INT16 *DCTDataBuffer : 16-bit input data (64 values).
*
* OUTPUTS : INT16 *DCT_codes : 16-bit transform coefficients (64 values).
*
* RETURNS : void
*
* FUNCTION : Performs an 8x8 2-D fast DCT.
*
* The function to up the precision of FDCT by number of bits
* defined by FDCT_PRECISION_BITS.
*
* SPECIAL NOTES : DCTDataBuffer is modified in place: on return it holds
* the input scaled up by FDCT_PRECISION_BITS bits.
*
****************************************************************************/
void fdct_short_C ( INT16 *DCTDataBuffer, INT16 *DCT_codes )
{
    INT32 i;

    // Increase precision on input to fdct.
    // A multiplication is used instead of "<<" because left-shifting a
    // negative value is undefined behaviour in C; the stored result is the same.
    for ( i = 0; i < 64; i++ )
    {
        DCTDataBuffer[i] = (INT16)(DCTDataBuffer[i] * (1 << FDCT_PRECISION_BITS));
    }

    // Transform the error signal using the forward DCT to get set of transform coefficients
    fdct_short_C_orig ( DCTDataBuffer, DCT_codes );

    // Strip off the extra bits from the DCT output.
    // This should ultimately be merged into the quantize process but there are also
    // implications for DC prediction that would then need to be sorted
    for ( i = 0; i < 64; i++ )
    {
        // signed shift modified so behaves like "/" (truncates towards 0 for + and -)
        if ( DCT_codes[i] >= 0 )
        {
            DCT_codes[i] = (DCT_codes[i]) >> FDCT_PRECISION_BITS;
        }
        else
        {
            // FDCT_PRECISION_NEG_ADJ is added before the shift for negative values.
            DCT_codes[i] = (DCT_codes[i] + FDCT_PRECISION_NEG_ADJ) >> FDCT_PRECISION_BITS;
        }
    }
}

View file

@ -0,0 +1,921 @@
/****************************************************************************
*
* Module Title : idctpart.c
*
* Description : IDCT with multiple versions based on # of non 0 coeffs
*
****************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include "dct.h"
#include "string.h"
/****************************************************************************
* Macros
****************************************************************************/
// Local integer type shorthands used by the IDCT routines in this file.
#define int32 int
#define int16 short
// Rounding term added before the final ">> 4" in the column passes (8 is half of 1 << 4).
#define IdctAdjustBeforeShift 8
// Cosine multipliers scaled by 65536: xCkS(8-k) = round(cos(k*pi/16) * 65536).
// Products formed with them are scaled back with ">>= 16".
#define xC1S7 64277
#define xC2S6 60547
#define xC3S5 54491
#define xC4S4 46341
#define xC5S3 36410
#define xC6S2 25080
#define xC7S1 12785
/****************************************************************************
* Module statics
****************************************************************************/
// dequant_index[i] is the position in the 8x8 raster-order block of the i-th
// coefficient of the zig-zag ordered input list; the dequantisers use it to
// undo the zig-zag ordering while they scale the coefficients.
static const UINT32 dequant_index[64] =
{
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
};
#if 0 // AWG CODE NO LONGER USED IN CODEBASE.
/* Cos and Sin constant multipliers used during DCT and IDCT */
/* The values match cos(k*pi/16) for k = 1..7. */
const double C1S7 = (double)0.9807852804032;
const double C2S6 = (double)0.9238795325113;
const double C3S5 = (double)0.8314696123025;
const double C4S4 = (double)0.7071067811865;
const double C5S3 = (double)0.5555702330196;
const double C6S2 = (double)0.3826834323651;
const double C7S1 = (double)0.1950903220161;
/****************************************************************************
* Exports
****************************************************************************/
// DCT lookup tables
// Each table holds rounded products (index * multiplier). InitDctTables points
// the matching *_TablePtr at the middle of its table so that it can be indexed
// with negative as well as positive values. The C4S4 table is twice the size
// of the others.
INT32 * C4S4_TablePtr;
INT32 C4S4_Table[(COEFF_MAX * 4) + 1];
INT32 * C6S2_TablePtr;
INT32 C6S2_Table[(COEFF_MAX * 2) + 1];
INT32 * C2S6_TablePtr;
INT32 C2S6_Table[(COEFF_MAX * 2) + 1];
INT32 * C1S7_TablePtr;
INT32 C1S7_Table[(COEFF_MAX * 2) + 1];
INT32 * C7S1_TablePtr;
INT32 C7S1_Table[(COEFF_MAX * 2) + 1];
INT32 * C3S5_TablePtr;
INT32 C3S5_Table[(COEFF_MAX * 2) + 1];
INT32 * C5S3_TablePtr;
INT32 C5S3_Table[(COEFF_MAX * 2) + 1];
/****************************************************************************
*
* ROUTINE : FillDctTable
*
* INPUTS : INT32 HalfRange : Entries are written for indices
* -HalfRange .. HalfRange-1.
* double Multiplier : Cosine multiplier for this table.
*
* OUTPUTS : INT32 *TablePtr : Pointer to the centre (index 0) entry of
* the table to be filled.
*
* RETURNS : void
*
* FUNCTION : Stores index * Multiplier, rounded to the nearest integer
* (half away from zero), for every index in the range.
*
* SPECIAL NOTES : NO LONGER USED IN CODEBASE.
* NOTE(review): the entry at index +HalfRange is not written.
*
****************************************************************************/
static void FillDctTable ( INT32 *TablePtr, INT32 HalfRange, double Multiplier )
{
    INT32 i;

    for ( i = -HalfRange; i < HalfRange; i++ )
    {
        TablePtr[i] = ( i < 0 ) ? (INT32)((i * Multiplier) - 0.5)
                                : (INT32)((i * Multiplier) + 0.5);
    }
}
/****************************************************************************
*
* ROUTINE : InitDctTables
*
* INPUTS : None.
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Initialises lookup tables used in IDCT.
*
* SPECIAL NOTES : NO LONGER USED IN CODEBASE.
* Sets each *_TablePtr to the centre of its table and fills
* the table; the C4S4 table covers twice the index range of
* the others.
*
****************************************************************************/
void InitDctTables ( void )
{
    C4S4_TablePtr = &C4S4_Table[COEFF_MAX*2];
    FillDctTable ( C4S4_TablePtr, 2 * COEFF_MAX, C4S4 );

    C6S2_TablePtr = &C6S2_Table[COEFF_MAX];
    FillDctTable ( C6S2_TablePtr, COEFF_MAX, C6S2 );

    C2S6_TablePtr = &C2S6_Table[COEFF_MAX];
    FillDctTable ( C2S6_TablePtr, COEFF_MAX, C2S6 );

    C1S7_TablePtr = &C1S7_Table[COEFF_MAX];
    FillDctTable ( C1S7_TablePtr, COEFF_MAX, C1S7 );

    C7S1_TablePtr = &C7S1_Table[COEFF_MAX];
    FillDctTable ( C7S1_TablePtr, COEFF_MAX, C7S1 );

    C3S5_TablePtr = &C3S5_Table[COEFF_MAX];
    FillDctTable ( C3S5_TablePtr, COEFF_MAX, C3S5 );

    C5S3_TablePtr = &C5S3_Table[COEFF_MAX];
    FillDctTable ( C5S3_TablePtr, COEFF_MAX, C5S3 );
}
#endif
/****************************************************************************
*
* ROUTINE : dequant_slow
*
* INPUTS : INT16 *dequant_coeffs : Pointer to the 64 dequantization step sizes.
* INT16 *quantized_list : Pointer to the 64 quantized DCT coeffs
* (in zig-zag order).
*
* OUTPUTS : INT32 *DCT_block : Pointer to 8x8 de-quantized block
* (in 2-D raster order).
*
* RETURNS : void
*
* FUNCTION : Multiplies every quantized coefficient by its step size
* and stores the product at its raster position.
*
* SPECIAL NOTES : Uses dequant_index to invert zig-zag ordering.
* All 64 entries of DCT_block are written.
*
****************************************************************************/
void dequant_slow ( INT16 *dequant_coeffs, INT16 *quantized_list, INT32 *DCT_block )
{
    int i;

    // Entry i of both input lists belongs at raster position dequant_index[i].
    for ( i = 0; i < 64; i++ )
    {
        DCT_block[dequant_index[i]] = quantized_list[i] * dequant_coeffs[i];
    }
}
/****************************************************************************
*
* ROUTINE : IDctSlow
*
* INPUTS : int16 *InputData : Pointer to 8x8 quantized DCT coefficients.
* int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
*
* OUTPUTS : int16 *OutputData : Pointer to 8x8 block to hold output.
*
* RETURNS : void
*
* FUNCTION : Inverse quantizes and inverse DCT's input 8x8 block
* to reproduce prediction error.
*
* SPECIAL NOTES : None.
*
****************************************************************************/
void IDctSlow ( int16 *InputData, int16 *QuantMatrix, int16 *OutputData )
{
int loop;
int32 t1, t2;
int32 IntermediateData[64];
int32 _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
int32 _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
int32 *ip = IntermediateData;
int16 *op = OutputData;
// dequantize the input
dequant_slow ( QuantMatrix, InputData, IntermediateData );
// Inverse DCT on the rows now
for ( loop=0; loop<8; loop++ )
{
// Check for non-zero values
if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] )
{
t1 = (int32)(xC1S7 * ip[1]);
t2 = (int32)(xC7S1 * ip[7]);
t1 >>= 16;
t2 >>= 16;
_A = t1 + t2;
t1 = (int32)(xC7S1 * ip[1]);
t2 = (int32)(xC1S7 * ip[7]);
t1 >>= 16;
t2 >>= 16;
_B = t1 - t2;
t1 = (int32)(xC3S5 * ip[3]);
t2 = (int32)(xC5S3 * ip[5]);
t1 >>= 16;
t2 >>= 16;
_C = t1 + t2;
t1 = (int32)(xC3S5 * ip[5]);
t2 = (int32)(xC5S3 * ip[3]);
t1 >>= 16;
t2 >>= 16;
_D = t1 - t2;
t1 = (int32)(xC4S4 * (_A - _C));
t1 >>= 16;
_Ad = t1;
t1 = (int32)(xC4S4 * (_B - _D));
t1 >>= 16;
_Bd = t1;
_Cd = _A + _C;
_Dd = _B + _D;
t1 = (int32)(xC4S4 * (ip[0] + ip[4]));
t1 >>= 16;
_E = t1;
t1 = (int32)(xC4S4 * (ip[0] - ip[4]));
t1 >>= 16;
_F = t1;
t1 = (int32)(xC2S6 * ip[2]);
t2 = (int32)(xC6S2 * ip[6]);
t1 >>= 16;
t2 >>= 16;
_G = t1 + t2;
t1 = (int32)(xC6S2 * ip[2]);
t2 = (int32)(xC2S6 * ip[6]);
t1 >>= 16;
t2 >>= 16;
_H = t1 - t2;
_Ed = _E - _G;
_Gd = _E + _G;
_Add = _F + _Ad;
_Bdd = _Bd - _H;
_Fd = _F - _Ad;
_Hd = _Bd + _H;
// Final sequence of operations over-write original inputs.
ip[0] = (int16)((_Gd + _Cd ) >> 0);
ip[7] = (int16)((_Gd - _Cd ) >> 0);
ip[1] = (int16)((_Add + _Hd ) >> 0);
ip[2] = (int16)((_Add - _Hd ) >> 0);
ip[3] = (int16)((_Ed + _Dd ) >> 0);
ip[4] = (int16)((_Ed - _Dd ) >> 0);
ip[5] = (int16)((_Fd + _Bdd ) >> 0);
ip[6] = (int16)((_Fd - _Bdd ) >> 0);
}
ip += 8; /* next row */
}
ip = IntermediateData;
for ( loop=0; loop<8; loop++ )
{
// Check for non-zero values (bitwise | faster than logical ||)
if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] )
{
t1 = (int32)(xC1S7 * ip[1*8]);
t2 = (int32)(xC7S1 * ip[7*8]);
t1 >>= 16;
t2 >>= 16;
_A = t1 + t2;
t1 = (int32)(xC7S1 * ip[1*8]);
t2 = (int32)(xC1S7 * ip[7*8]);
t1 >>= 16;
t2 >>= 16;
_B = t1 - t2;
t1 = (int32)(xC3S5 * ip[3*8]);
t2 = (int32)(xC5S3 * ip[5*8]);
t1 >>= 16;
t2 >>= 16;
_C = t1 + t2;
t1 = (int32)(xC3S5 * ip[5*8]);
t2 = (int32)(xC5S3 * ip[3*8]);
t1 >>= 16;
t2 >>= 16;
_D = t1 - t2;
t1 = (int32)(xC4S4 * (_A - _C));
t1 >>= 16;
_Ad = t1;
t1 = (int32)(xC4S4 * (_B - _D));
t1 >>= 16;
_Bd = t1;
_Cd = _A + _C;
_Dd = _B + _D;
t1 = (int32)(xC4S4 * (ip[0*8] + ip[4*8]));
t1 >>= 16;
_E = t1;
t1 = (int32)(xC4S4 * (ip[0*8] - ip[4*8]));
t1 >>= 16;
_F = t1;
t1 = (int32)(xC2S6 * ip[2*8]);
t2 = (int32)(xC6S2 * ip[6*8]);
t1 >>= 16;
t2 >>= 16;
_G = t1 + t2;
t1 = (int32)(xC6S2 * ip[2*8]);
t2 = (int32)(xC2S6 * ip[6*8]);
t1 >>= 16;
t2 >>= 16;
_H = t1 - t2;
_Ed = _E - _G;
_Gd = _E + _G;
_Add = _F + _Ad;
_Bdd = _Bd - _H;
_Fd = _F - _Ad;
_Hd = _Bd + _H;
_Gd += IdctAdjustBeforeShift;
_Add += IdctAdjustBeforeShift;
_Ed += IdctAdjustBeforeShift;
_Fd += IdctAdjustBeforeShift;
// Final sequence of operations over-write original inputs.
op[0*8] = (int16)((_Gd + _Cd ) >> 4);
op[7*8] = (int16)((_Gd - _Cd ) >> 4);
op[1*8] = (int16)((_Add + _Hd ) >> 4);
op[2*8] = (int16)((_Add - _Hd ) >> 4);
op[3*8] = (int16)((_Ed + _Dd ) >> 4);
op[4*8] = (int16)((_Ed - _Dd ) >> 4);
op[5*8] = (int16)((_Fd + _Bdd ) >> 4);
op[6*8] = (int16)((_Fd - _Bdd ) >> 4);
}
else
{
op[0*8] = 0;
op[7*8] = 0;
op[1*8] = 0;
op[2*8] = 0;
op[3*8] = 0;
op[4*8] = 0;
op[5*8] = 0;
op[6*8] = 0;
}
ip++; // next column
op++;
}
}
/****************************************************************************
*
* ROUTINE : dequant_slow10
*
* INPUTS : INT16 *dequant_coeffs : Pointer to dequantization step sizes.
* INT16 *quantized_list : Pointer to quantized DCT coeffs
* (in zig-zag order).
*
* OUTPUTS : INT32 *DCT_block : Pointer to 8x8 de-quantized block
* (in 2-D raster order).
*
* RETURNS : void
*
* FUNCTION : De-quantizes an 8x8 block of quantized DCT coeffs that
* only has non-zero coefficients in the first 10, i.e.
* only DC & AC1-9 are non-zero, AC10-63 __MUST_BE_ zero.
*
* SPECIAL NOTES : Uses dequant_index to invert zig-zag ordering.
* Only the first 128 bytes of DCT_block are cleared; the
* rest of the block is left as it was.
* NOTE(review): list entries 0..10 (eleven values) are
* de-quantized; entry 10 maps to raster position 32, which
* is past the first 32 INT32 entries if INT32 is 4 bytes.
*
****************************************************************************/
void dequant_slow10 ( INT16 *dequant_coeffs, INT16 *quantized_list, INT32 *DCT_block )
{
    int i;

    memset ( DCT_block, 0, 128 );

    for ( i = 0; i <= 10; i++ )
    {
        DCT_block[dequant_index[i]] = quantized_list[i] * dequant_coeffs[i];
    }
}
/****************************************************************************
*
* ROUTINE : IDctSlow10
*
* INPUTS : int16 *InputData : Pointer to 8x8 quantized DCT coefficients.
* int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
*
* OUTPUTS : int16 *OutputData : Pointer to 8x8 block to hold output.
*
* RETURNS : void
*
* FUNCTION : Inverse quantizes and inverse DCT's input 8x8 block
* with non-zero coeffs only in DC & the first 9 AC coeffs.
* i.e. non-zeros ONLY in the following 10 positions:
*
* x x x x 0 0 0 0
* x x x 0 0 0 0 0
* x x 0 0 0 0 0 0
* x 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
*
* SPECIAL NOTES : Output data is in raster, not zig-zag, order.
*
****************************************************************************/
void IDct10 ( int16 *InputData, int16 *QuantMatrix, int16 *OutputData )
{
int loop;
int32 t1, t2;
int32 IntermediateData[64];
int32 _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
int32 _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
int32 *ip = IntermediateData;
int16 *op = OutputData;
// dequantize the input
dequant_slow10 ( QuantMatrix, InputData, IntermediateData );
// Inverse DCT on the rows now
for ( loop=0; loop<4; loop++ )
{
// Check for non-zero values
if ( ip[0] | ip[1] | ip[2] | ip[3] )
{
t1 = (int32)(xC1S7 * ip[1]);
t1 >>= 16;
_A = t1;
t1 = (int32)(xC7S1 * ip[1]);
t1 >>= 16;
_B = t1 ;
t1 = (int32)(xC3S5 * ip[3]);
t1 >>= 16;
_C = t1;
t2 = (int32)(xC5S3 * ip[3]);
t2 >>= 16;
_D = -t2;
t1 = (int32)(xC4S4 * (_A - _C));
t1 >>= 16;
_Ad = t1;
t1 = (int32)(xC4S4 * (_B - _D));
t1 >>= 16;
_Bd = t1;
_Cd = _A + _C;
_Dd = _B + _D;
t1 = (int32)(xC4S4 * ip[0] );
t1 >>= 16;
_E = t1;
_F = t1;
t1 = (int32)(xC2S6 * ip[2]);
t1 >>= 16;
_G = t1;
t1 = (int32)(xC6S2 * ip[2]);
t1 >>= 16;
_H = t1 ;
_Ed = _E - _G;
_Gd = _E + _G;
_Add = _F + _Ad;
_Bdd = _Bd - _H;
_Fd = _F - _Ad;
_Hd = _Bd + _H;
// Final sequence of operations over-write original inputs.
ip[0] = (int16)((_Gd + _Cd ) >> 0);
ip[7] = (int16)((_Gd - _Cd ) >> 0);
ip[1] = (int16)((_Add + _Hd ) >> 0);
ip[2] = (int16)((_Add - _Hd ) >> 0);
ip[3] = (int16)((_Ed + _Dd ) >> 0);
ip[4] = (int16)((_Ed - _Dd ) >> 0);
ip[5] = (int16)((_Fd + _Bdd ) >> 0);
ip[6] = (int16)((_Fd - _Bdd ) >> 0);
}
ip += 8; /* next row */
}
ip = IntermediateData;
for ( loop=0; loop<8; loop++ )
{
// Check for non-zero values (bitwise or faster than ||)
if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] )
{
t1 = (int32)(xC1S7 * ip[1*8]);
t1 >>= 16;
_A = t1 ;
t1 = (int32)(xC7S1 * ip[1*8]);
t1 >>= 16;
_B = t1 ;
t1 = (int32)(xC3S5 * ip[3*8]);
t1 >>= 16;
_C = t1 ;
t2 = (int32)(xC5S3 * ip[3*8]);
t2 >>= 16;
_D = - t2;
t1 = (int32)(xC4S4 * (_A - _C));
t1 >>= 16;
_Ad = t1;
t1 = (int32)(xC4S4 * (_B - _D));
t1 >>= 16;
_Bd = t1;
_Cd = _A + _C;
_Dd = _B + _D;
t1 = (int32)(xC4S4 * ip[0*8]);
t1 >>= 16;
_E = t1;
_F = t1;
t1 = (int32)(xC2S6 * ip[2*8]);
t1 >>= 16;
_G = t1;
t1 = (int32)(xC6S2 * ip[2*8]);
t1 >>= 16;
_H = t1;
_Ed = _E - _G;
_Gd = _E + _G;
_Add = _F + _Ad;
_Bdd = _Bd - _H;
_Fd = _F - _Ad;
_Hd = _Bd + _H;
_Gd += IdctAdjustBeforeShift;
_Add += IdctAdjustBeforeShift;
_Ed += IdctAdjustBeforeShift;
_Fd += IdctAdjustBeforeShift;
// Final sequence of operations over-write original inputs.
op[0*8] = (int16)((_Gd + _Cd ) >> 4);
op[7*8] = (int16)((_Gd - _Cd ) >> 4);
op[1*8] = (int16)((_Add + _Hd ) >> 4);
op[2*8] = (int16)((_Add - _Hd ) >> 4);
op[3*8] = (int16)((_Ed + _Dd ) >> 4);
op[4*8] = (int16)((_Ed - _Dd ) >> 4);
op[5*8] = (int16)((_Fd + _Bdd ) >> 4);
op[6*8] = (int16)((_Fd - _Bdd ) >> 4);
}
else
{
op[0*8] = 0;
op[7*8] = 0;
op[1*8] = 0;
op[2*8] = 0;
op[3*8] = 0;
op[4*8] = 0;
op[5*8] = 0;
op[6*8] = 0;
}
ip++; // next column
op++;
}
}
/****************************************************************************
*
* ROUTINE : IDct1
*
* INPUTS : int16 *InputData : Pointer to 8x8 quantized DCT coefficients.
* int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
*
* OUTPUTS : int16 *OutputData : Pointer to 8x8 block to hold output.
*
* RETURNS : void
*
* FUNCTION : Inverse DCT's input 8x8 block with only one non-zero
* coeff in the DC position:
*
* x 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
*
* SPECIAL NOTES : Output data is in raster, not zig-zag, order.
* Only InputData[0] and QuantMatrix[0] are read; all 64
* outputs receive (InputData[0]*QuantMatrix[0] + 15) >> 5.
*
****************************************************************************/
void IDct1 ( int16 *InputData, int16 *QuantMatrix, INT16 *OutputData )
{
    INT32 Scaled = (INT32)(InputData[0]*QuantMatrix[0]+15);
    INT16 Fill = (INT16)(Scaled >> 5);
    INT16 *Dst = OutputData;
    INT16 *End = OutputData + 64;

    // Every output sample gets the same value.
    while ( Dst < End )
    {
        *Dst++ = Fill;
    }
}
#if 0
/****************************************************************************
*
* ROUTINE : IDct4
*
* INPUTS : int16 *InputData : Pointer to 8x8 DCT coefficients.
*
* OUTPUTS : int16 *OutputData : Pointer to 8x8 block to hold output.
*
* RETURNS : void
*
* FUNCTION : Inverse DCT's input 8x8 block with at most four non-zero
* coeffs in the following positions:
*
* x x 0 0 0 0 0 0
* x x 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
* 0 0 0 0 0 0 0 0
*
* SPECIAL NOTES : CURRENTLY NOT USED IN CODEBASE.
* InputData is used as working storage and is overwritten.
* No de-quantization is performed here.
*
****************************************************************************/
void IDct4 ( int16 *InputData, int16 *OutputData )
{
    int loop;
    int32 Add, Fd;
    int32 A, B, Ad, Bd, Cd, Dd, E;
    int16 *ip = InputData;
    int16 *op = OutputData;

    // Unzigzag the coefficients: list entries 2 and 4 belong at raster
    // positions 8 and 9.
    ip[8] = ip[2];
    ip[9] = ip[4];
    ip[2] = 0;
    // Entry 4 has just been moved to position 9, so its old slot must be
    // cleared as well. Otherwise, when ip[0] and ip[1] are both zero, row 0 is
    // skipped and the stale value is picked up by the column pass.
    ip[4] = 0;
    ip[5] = 0;

    // Inverse DCT on the rows now
    for ( loop = 0; loop < 2; loop++ )
    {
        // Check for non-zero values
        if ( ip[0] | ip[1] )
        {
            A = (int32)(xC1S7 * ip[1]) >> 16;
            B = (int32)(xC7S1 * ip[1]) >> 16;

            Ad = (int32)(xC4S4 * A) >> 16;
            Bd = (int32)(xC4S4 * B) >> 16;
            Cd = A;
            Dd = B;

            E = (int32)(xC4S4 * ip[0]) >> 16;

            Add = E + Ad;
            Fd = E - Ad;

            // Final sequence of operations over-write original inputs.
            ip[0] = (int16)((E + Cd ) >> 0);
            ip[7] = (int16)((E - Cd ) >> 0);
            ip[1] = (int16)((Add + Bd ) >> 0);
            ip[2] = (int16)((Add - Bd ) >> 0);
            ip[3] = (int16)((E + Dd ) >> 0);
            ip[4] = (int16)((E - Dd ) >> 0);
            ip[5] = (int16)((Fd + Bd ) >> 0);
            ip[6] = (int16)((Fd - Bd ) >> 0);
        }
        ip += 8; /* next row */
    }

    ip = InputData;
    for ( loop=0; loop<8; loop++ )
    {
        // Check for non-zero values (bitwise or faster than ||)
        if ( ip[0 * 8] | ip[1 * 8] )
        {
            A = (int32)(xC1S7 * ip[1*8]) >> 16;
            B = (int32)(xC7S1 * ip[1*8]) >> 16;

            Ad = (int32)(xC4S4 * A) >> 16;
            Bd = (int32)(xC4S4 * B) >> 16;
            Cd = A;
            Dd = B;

            E = (int32)(xC4S4 * ip[0*8]) >> 16;

            Add = E + Ad;
            Fd = E - Ad;

            // Rounding term for the ">> 4" below.
            Add += IdctAdjustBeforeShift;
            E += IdctAdjustBeforeShift;
            Fd += IdctAdjustBeforeShift;

            op[0*8] = (int16)((E + Cd ) >> 4);
            op[7*8] = (int16)((E - Cd ) >> 4);
            op[1*8] = (int16)((Add + Bd ) >> 4);
            op[2*8] = (int16)((Add - Bd ) >> 4);
            op[3*8] = (int16)((E + Dd ) >> 4);
            op[4*8] = (int16)((E - Dd ) >> 4);
            op[5*8] = (int16)((Fd + Bd ) >> 4);
            op[6*8] = (int16)((Fd - Bd ) >> 4);
        }
        else
        {
            op[0*8] = 0;
            op[7*8] = 0;
            op[1*8] = 0;
            op[2*8] = 0;
            op[3*8] = 0;
            op[4*8] = 0;
            op[5*8] = 0;
            op[6*8] = 0;
        }
        ip++; // next column
        op++;
    }
}
#endif

View file

@ -0,0 +1,243 @@
/****************************************************************************
*
* Module Title : Reconstruct.c
*
* Description : Block reconstruction functions.
*
****************************************************************************/
#define STRICT // Strict type checking
/****************************************************************************
* Header Files
****************************************************************************/
#include "reconstruct.h"
#include "codec_common.h"
/****************************************************************************
 *
 *  ROUTINE       : SatUnsigned8
 *
 *  INPUTS        : INT16 *DataBlock      : Pointer to the first sample of the input block.
 *                  UINT32 ResultLineStep : Distance, in UINT8 elements, between
 *                                          consecutive output rows.
 *                  UINT32 DataLineStep   : Distance, in INT16 elements, between
 *                                          consecutive input rows.
 *
 *  OUTPUTS       : UINT8 *ResultPtr      : Pointer to the first byte of the output block.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Passes every input sample through the LIMIT macro (saturation
 *                  to the unsigned 8-bit range) and stores the result in the
 *                  output buffer.
 *
 *  SPECIAL NOTES : BLOCK_HEIGHT_WIDTH rows are processed and each row is exactly
 *                  8 samples wide. The LIMIT result is cast to char before it is
 *                  stored into the UINT8 destination.
 *
 ****************************************************************************/
void SatUnsigned8 ( UINT8 *ResultPtr, INT16 *DataBlock, UINT32 ResultLineStep, UINT32 DataLineStep )
{
    const INT16 *SrcRow = DataBlock;
    UINT8       *DstRow = ResultPtr;
    INT32        Row;
    INT32        Col;

    for ( Row = 0; Row < BLOCK_HEIGHT_WIDTH; Row++ )
    {
        // Clamp and store the 8 samples of this row
        for ( Col = 0; Col < 8; Col++ )
            DstRow[Col] = (char) LIMIT(SrcRow[Col]);

        // Step both buffers to their next row
        SrcRow += DataLineStep;
        DstRow += ResultLineStep;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : ScalarReconIntra
 *
 *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use
 *                                         (overwritten; row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT16 *ChangePtr    : Pointer to the 8x8 block of intra sample data
 *                                         (row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT32 LineStep      : Stride of reconstruction block.
 *
 *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : Reconstructs an intra block: adds 128 to every sample of
 *                  ChangePtr, stores the sums in TmpDataBuffer and then
 *                  saturates them into ReconPtr via SatUnsigned8.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
void ScalarReconIntra ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT16 *ChangePtr, UINT32 LineStep )
{
    UINT32 Row;
    UINT32 Col;
    INT16 *BiasedRow = TmpDataBuffer;

    for ( Row = 0; Row < BLOCK_HEIGHT_WIDTH; Row++ )
    {
        // Add the 128 offset to the 8 samples of this row
        for ( Col = 0; Col < 8; Col++ )
            BiasedRow[Col] = (INT16) ( ChangePtr[Col] + 128 );

        // Next row of both blocks
        BiasedRow += BLOCK_HEIGHT_WIDTH;
        ChangePtr += BLOCK_HEIGHT_WIDTH;
    }

    // Saturate the output to unsigned 8 bit values in recon buffer
    SatUnsigned8 ( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
}
/****************************************************************************
 *
 *  ROUTINE       : ScalarReconInter
 *
 *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use
 *                                         (overwritten; row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT8 *RefPtr        : Pointer to 8x8 reference block.
 *                  INT16 *ChangePtr     : Pointer to 8x8 inter prediction error block
 *                                         (row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT32 LineStep      : Stride of reference and output blocks.
 *
 *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : Reconstructs an inter-coded block by adding a prediction
 *                  error to a reference block in the previous frame
 *                  reconstruction buffer. The sums are formed in TmpDataBuffer
 *                  and then saturated into ReconPtr via SatUnsigned8.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
void ScalarReconInter ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT8 *RefPtr, INT16 *ChangePtr, UINT32 LineStep )
{
    UINT32 Row;
    UINT32 Col;
    INT16 *SumRow = TmpDataBuffer;

    for ( Row = 0; Row < BLOCK_HEIGHT_WIDTH; Row++ )
    {
        // Reference pixel plus prediction error for the 8 samples of this row
        for ( Col = 0; Col < 8; Col++ )
            SumRow[Col] = (INT16)(RefPtr[Col] + ChangePtr[Col]);

        // Next row: the error and temporary blocks are packed, the reference uses LineStep
        ChangePtr += BLOCK_HEIGHT_WIDTH;
        SumRow    += BLOCK_HEIGHT_WIDTH;
        RefPtr    += LineStep;
    }

    // Saturate the output to unsigned 8 bit values in recon buffer
    SatUnsigned8 ( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
}
/****************************************************************************
 *
 *  ROUTINE       : ScalarReconInterHalfPixel2
 *
 *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use
 *                                         (overwritten; row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT8 *RefPtr1       : Pointer to first 8x8 reference block.
 *                  UINT8 *RefPtr2       : Pointer to second 8x8 reference block.
 *                  INT16 *ChangePtr     : Pointer to 8x8 inter prediction error block
 *                                         (row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT32 LineStep      : Stride of both reference blocks and of
 *                                         the output block.
 *
 *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : Reconstructs an inter-coded block by adding a prediction
 *                  error to a reference block computed by averaging the two
 *                  specified reference blocks. The two reference blocks are
 *                  those that bracket the 1/2-pixel accuracy motion vector.
 *
 *  SPECIAL NOTES : The average is (a + b) >> 1, i.e. it truncates; no rounding
 *                  term is added.
 *
 ****************************************************************************/
void ScalarReconInterHalfPixel2
(
    INT16 *TmpDataBuffer,
    UINT8 *ReconPtr,
    UINT8 *RefPtr1,
    UINT8 *RefPtr2,
    INT16 *ChangePtr,
    UINT32 LineStep
)
{
    UINT32 Row;
    UINT32 Col;
    INT16 *SumRow = TmpDataBuffer;

    for ( Row = 0; Row < BLOCK_HEIGHT_WIDTH; Row++ )
    {
        // Form each row: averaged reference plus prediction error
        for ( Col = 0; Col < 8; Col++ )
        {
            INT32 Average = ((INT32)RefPtr1[Col] + (INT32)RefPtr2[Col]) >> 1;

            SumRow[Col] = (INT16)( Average + ChangePtr[Col] );
        }

        // Next row of Block
        ChangePtr += BLOCK_HEIGHT_WIDTH;
        SumRow    += BLOCK_HEIGHT_WIDTH;
        RefPtr1   += LineStep;
        RefPtr2   += LineStep;
    }

    // Saturate the output to unsigned 8 bit values in recon buffer
    SatUnsigned8( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
}
/****************************************************************************
 *
 *  ROUTINE       : ReconBlock_C
 *
 *  INPUTS        : INT16 *SrcBlock    : Pointer to 8x8 prediction error
 *                                       (row stride BLOCK_HEIGHT_WIDTH).
 *                  INT16 *ReconRefPtr : Pointer to 8x8 block prediction
 *                                       (row stride BLOCK_HEIGHT_WIDTH).
 *                  UINT32 LineStep    : Stride of output block.
 *
 *  OUTPUTS       : UINT8 *DestBlock   : Pointer to 8x8 reconstructed block.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Reconstructs a block by adding the prediction block to the
 *                  prediction error block and clipping the values.
 *
 *  SPECIAL NOTES : The sums are written back into SrcBlock, so the caller's
 *                  prediction error data is overwritten with the unclipped
 *                  reconstruction before it is saturated into DestBlock.
 *
 ****************************************************************************/
void ReconBlock_C ( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep )
{
    UINT32 Row;
    UINT32 Col;
    INT16 *RowPtr = SrcBlock;

    // For each block row
    for ( Row = 0; Row < BLOCK_HEIGHT_WIDTH; Row++ )
    {
        // Accumulate the prediction into the 8 error samples of this row
        for ( Col = 0; Col < 8; Col++ )
            RowPtr[Col] = (INT16)(RowPtr[Col] + ReconRefPtr[Col]);

        // Next row...
        RowPtr      += BLOCK_HEIGHT_WIDTH;
        ReconRefPtr += BLOCK_HEIGHT_WIDTH;
    }

    // Saturate the output to unsigned 8 bit values in recon buffer
    SatUnsigned8( DestBlock, SrcBlock, LineStep, BLOCK_HEIGHT_WIDTH );
}

View file

@ -0,0 +1,100 @@
/****************************************************************************
*
* Module Title : SystemDependant.c
*
* Description : Miscellaneous system dependant functions.
*
****************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include "codec_common.h"
#include "vputil_if.h"
/****************************************************************************
* Exports
****************************************************************************/
// Prototypes of the plain C (scalar, no mmx) routines used by this module.
// They are declared here and defined in other translation units; all of them
// except GetProcessorFlags are installed into the function-pointer
// interface by UtilMachineSpecificConfig() below.

// System state and inverse DCT variants
extern void ClearSysState_C ( void );
extern void IDctSlow ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
extern void IDct10 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
extern void IDct1 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
// Block reconstruction
extern void ScalarReconIntra ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT16 *ChangePtr, UINT32 LineStep );
extern void ScalarReconInter ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT8 *RefPtr, INT16 *ChangePtr, UINT32 LineStep );
extern void ScalarReconInterHalfPixel2 ( INT16 *TmpDataBuffer, UINT8 *ReconPtr,UINT8 *RefPtr1, UINT8 *RefPtr2, INT16 *ChangePtr, UINT32 LineStep );
extern void ReconBlock_C(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep );
// Block subtract / unpack / average / copy helpers
extern void SubtractBlock_C ( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
extern void UnpackBlock_C ( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine );
extern void AverageBlock_C ( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine );
extern void CopyBlock_C ( unsigned char *src, unsigned char *dest, unsigned int srcstride );
extern void Copy12x12_C ( const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride );
// Forward DCT
extern void fdct_short_C ( INT16 *InputData, INT16 *OutputData );
// Block filters
extern void FilterBlockBil_8_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
extern void FilterBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
// Declared but not referenced by the code in this file
extern void GetProcessorFlags ( INT32 *MmxEnabled, INT32 *XmmEnabled, INT32 *WmtEnabled );
/****************************************************************************
 *
 *  ROUTINE       : fillidctconstants
 *
 *  INPUTS        : None
 *
 *  OUTPUTS       : None
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : STUB FUNCTION. The body is intentionally empty: this
 *                  build has nothing to initialise here.
 *
 *  SPECIAL NOTES : NOTE(review): presumably kept so that callers shared with
 *                  other platform builds still link -- confirm against callers.
 *
 ****************************************************************************/
void fillidctconstants ( void )
{
}
/****************************************************************************
 *
 *  ROUTINE       : UtilMachineSpecificConfig
 *
 *  INPUTS        : None
 *
 *  OUTPUTS       : None
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : Installs the plain C implementations into the module's
 *                  function pointers and inverse-DCT dispatch tables.
 *
 *  SPECIAL NOTES : No machine-specific feature detection is performed in this
 *                  version; the scalar routines are selected unconditionally.
 *                  Entries 0..64 of both idct[] and idctc[] are filled:
 *                  0..1 -> IDct1, 2..10 -> IDct10, 11..64 -> IDctSlow.
 *                  NOTE(review): the table index is presumably a coefficient
 *                  count / end-of-block position -- confirm against callers.
 *
 ****************************************************************************/
void UtilMachineSpecificConfig ( void )
{
    int Index = 0;

    // Entries 0..1
    for ( ; Index <= 1; Index++ )
    {
        idctc[Index] = IDct1;
        idct[Index]  = IDct1;
    }

    // Entries 2..10
    for ( ; Index <= 10; Index++ )
    {
        idctc[Index] = IDct10;
        idct[Index]  = IDct10;
    }

    // Entries 11..64
    for ( ; Index <= 64; Index++ )
    {
        idctc[Index] = IDctSlow;
        idct[Index]  = IDctSlow;
    }

    // Forward transform
    fdct_short           = fdct_short_C;

    // System state and block-level helpers
    ClearSysState        = ClearSysState_C;
    ReconIntra           = ScalarReconIntra;
    ReconInter           = ScalarReconInter;
    ReconInterHalfPixel2 = ScalarReconInterHalfPixel2;
    AverageBlock         = AverageBlock_C;
    UnpackBlock          = UnpackBlock_C;
    ReconBlock           = ReconBlock_C;
    SubtractBlock        = SubtractBlock_C;
    CopyBlock            = CopyBlock_C;
    Copy12x12            = Copy12x12_C;
    FilterBlockBil_8     = FilterBlockBil_8_C;
    FilterBlock          = FilterBlock_C;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,74 @@
/****************************************************************************
*
* Module Title : dct.h
*
* Description : DCT header file.
*
****************************************************************************/
#ifndef __INC_DCT_H
#define __INC_DCT_H
/****************************************************************************
* Header files
****************************************************************************/
#include "type_aliases.h"
/****************************************************************************
* Macros
****************************************************************************/
#define COEFF_MAX 32768 // Max magnitude of DCT coefficient
// Extra bits of precision added to the fdct that have to be stripped off during the quantize
#define FDCT_PRECISION_BITS 1
// Evaluates to (1 << FDCT_PRECISION_BITS) - 1, i.e. 1 with the current setting.
// NOTE(review): the (INT16) cast binds only to the shift, not to the whole
// subtraction, so the expression has type int after promotion; the value is
// unaffected.
#define FDCT_PRECISION_NEG_ADJ ((INT16) (1<<FDCT_PRECISION_BITS)-1)
#if 0 // AWG not required any more!!!
/* Cos and Sin constant multipliers used during DCT and IDCT */
extern const double C1S7;
extern const double C2S6;
extern const double C3S5;
extern const double C4S4;
extern const double C5S3;
extern const double C6S2;
extern const double C7S1;
// DCT lookup tables and pointers
extern INT32 * C4S4_TablePtr;
extern INT32 C4S4_Table[(COEFF_MAX * 4) + 1];
extern INT32 * C6S2_TablePtr;
extern INT32 C6S2_Table[(COEFF_MAX * 2) + 1];
extern INT32 * C2S6_TablePtr;
extern INT32 C2S6_Table[(COEFF_MAX * 2) + 1];
extern INT32 * C1S7_TablePtr;
extern INT32 C1S7_Table[(COEFF_MAX * 2) + 1];
extern INT32 * C7S1_TablePtr;
extern INT32 C7S1_Table[(COEFF_MAX * 2) + 1];
extern INT32 * C3S5_TablePtr;
extern INT32 C3S5_Table[(COEFF_MAX * 2) + 1];
extern INT32 * C5S3_TablePtr;
extern INT32 C5S3_Table[(COEFF_MAX * 2) + 1];
#endif
/****************************************************************************
* Exports
****************************************************************************/
#ifdef COMPDLL
// Forward Transform (only declared when COMPDLL is defined)
extern void fdct_slow ( INT32 *InputData, double *OutputData );
#endif
// Reverse Transform: three inverse DCT entry points sharing one signature
// (input coefficients, quantizer matrix, output block).
// NOTE(review): the names suggest variants for blocks with up to 1, up to 10
// and any number of coefficients -- confirm against the implementations.
extern void IDctSlow( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
extern void IDct10 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
extern void IDct1 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
#endif

View file

@ -0,0 +1,11 @@
#if !defined(_mac_specs_h)
#define _mac_specs_h
#if defined(__cplusplus)
extern "C" {
#endif
/* NOTE(review): presumably returns non-zero when the CPU supports AltiVec --
   confirm against the implementation, which is not part of this header. */
int vputil_hasAltivec(void);
/* NOTE(review): presumably returns the CPU clock speed in MHz -- confirm
   against the implementation. */
int vputil_cpuMhz(void);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,60 @@
/****************************************************************************
*
* Module Title : Reconstruct.h
*
* Description : Block Reconstruction module header
*
* AUTHOR : Paul Wilkins
*
*****************************************************************************
* Revision History
*
* 1.00 PGW 14/10/99 Created
*
*****************************************************************************
*/
#define STRICT /* Strict type checking. */
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
#include "type_aliases.h"
/****************************************************************************
* Constants
*****************************************************************************
*/
/****************************************************************************
* Types
*****************************************************************************
*/
/****************************************************************************
* Data structures
*****************************************************************************
*/
/****************************************************************************
* Functions
*****************************************************************************
*/
// Scalar (no mmx) reconstruction functions.
// Each takes an 8x8 INT16 scratch buffer (TmpDataBuffer), the destination
// block (ReconPtr), the source data and the line stride (LineStep).
extern void ScalarReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void ScalarReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void ScalarReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
// MMX versions: same signatures as the scalar functions above
// (note the inconsistent MMX/Mmx capitalisation in the names)
extern void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
// WMT versions: same signatures as the scalar functions above
// NOTE(review): "WMT" presumably refers to Willamette (SSE2) builds -- confirm.
extern void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
#endif

View file

@ -0,0 +1,388 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>17.0</VCProjectVersion>
<ProjectGuid>{F93716CE-8F89-4334-BE64-43705EF3FB70}</ProjectGuid>
<RootNamespace>vputil</RootNamespace>
<WindowsTargetPlatformVersion>10.0.19041.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v142</PlatformToolset>
<UseOfMfc>false</UseOfMfc>
<CharacterSet>MultiByte</CharacterSet>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v142</PlatformToolset>
<UseOfMfc>false</UseOfMfc>
<CharacterSet>MultiByte</CharacterSet>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v142</PlatformToolset>
<UseOfMfc>false</UseOfMfc>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v142</PlatformToolset>
<UseOfMfc>false</UseOfMfc>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
<Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<_ProjectFileVersion>17.0.32505.173</_ProjectFileVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
<IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<LibraryPath>$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<IncludePath>$(IncludePath)</IncludePath>
<LibraryPath>$(LibraryPath)</LibraryPath>
<IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
<OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
<IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<LibraryPath>$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<IncludePath>$(IncludePath)</IncludePath>
<LibraryPath>$(LibraryPath)</LibraryPath>
<IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
<OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Label="Vcpkg">
<VcpkgEnableManifest>false</VcpkgEnableManifest>
</PropertyGroup>
<PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<VcpkgInstalledDir>
</VcpkgInstalledDir>
<VcpkgUseStatic>false</VcpkgUseStatic>
<VcpkgConfiguration>Debug</VcpkgConfiguration>
<VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
</PropertyGroup>
<PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<VcpkgInstalledDir>
</VcpkgInstalledDir>
<VcpkgUseStatic>false</VcpkgUseStatic>
<VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
</PropertyGroup>
<PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<VcpkgInstalledDir>
</VcpkgInstalledDir>
<VcpkgUseStatic>false</VcpkgUseStatic>
<VcpkgConfiguration>Debug</VcpkgConfiguration>
<VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
</PropertyGroup>
<PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<VcpkgInstalledDir>
</VcpkgInstalledDir>
<VcpkgUseStatic>false</VcpkgUseStatic>
<VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<PrecompiledHeader />
<PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation />
<ObjectFileName>$(IntDir)</ObjectFileName>
<ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
<WarningLevel>Level3</WarningLevel>
<SuppressStartupBanner>true</SuppressStartupBanner>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<CompileAs>Default</CompileAs>
<DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<ResourceCompile>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Culture>0x0409</Culture>
</ResourceCompile>
<Lib>
<SuppressStartupBanner>true</SuppressStartupBanner>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<PrecompiledHeader>
</PrecompiledHeader>
<PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>
</AssemblerListingLocation>
<ObjectFileName>$(IntDir)</ObjectFileName>
<ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
<WarningLevel>Level3</WarningLevel>
<SuppressStartupBanner>true</SuppressStartupBanner>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<CompileAs>Default</CompileAs>
<DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<ResourceCompile>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Culture>0x0409</Culture>
</ResourceCompile>
<Lib>
<SuppressStartupBanner>true</SuppressStartupBanner>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>true</OmitFramePointers>
<AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<BufferSecurityCheck>false</BufferSecurityCheck>
<FunctionLevelLinking>true</FunctionLevelLinking>
<PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation />
<ObjectFileName>$(IntDir)</ObjectFileName>
<ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
<WarningLevel>Level3</WarningLevel>
<SuppressStartupBanner>true</SuppressStartupBanner>
<DebugInformationFormat>None</DebugInformationFormat>
<CompileAs>Default</CompileAs>
<DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<ResourceCompile>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Culture>0x0409</Culture>
</ResourceCompile>
<Lib>
<SuppressStartupBanner>true</SuppressStartupBanner>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>true</OmitFramePointers>
<AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<BufferSecurityCheck>false</BufferSecurityCheck>
<FunctionLevelLinking>true</FunctionLevelLinking>
<PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
<AssemblerListingLocation>
</AssemblerListingLocation>
<ObjectFileName>$(IntDir)</ObjectFileName>
<ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
<WarningLevel>Level3</WarningLevel>
<SuppressStartupBanner>true</SuppressStartupBanner>
<DebugInformationFormat>None</DebugInformationFormat>
<CompileAs>Default</CompileAs>
<DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<ResourceCompile>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Culture>0x0409</Culture>
</ResourceCompile>
<Lib>
<SuppressStartupBanner>true</SuppressStartupBanner>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="generic\fdct.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="generic\idctpart.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="generic\reconstruct.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="generic\uoptsystemdependant.c">
<!-- This file is excluded from the build in all four configurations
     (Debug/Release x Win32/x64); win32\uoptsystemdependant.c is listed in
     this project without any exclusion and is the one that gets compiled.
     NOTE(review): presumably the two files define the same entry points, so
     only one may be linked; confirm before lifting the exclusion.
     The Optimization and BasicRuntimeChecks values below only matter if the
     file is re-enabled. -->
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="generic\vputil.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\fdctmmx.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\fdctwmt.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\filtmmx.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\filtwmt.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\mmxidct.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\mmxrecon.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\uoptsystemdependant.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\vputilasm.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\wmtidct.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
<ClCompile Include="win32\wmtrecon.c">
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
<BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
<Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View file

@ -0,0 +1,58 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="generic">
<UniqueIdentifier>{f7966dc8-1d55-46a4-b0e6-8584774d721d}</UniqueIdentifier>
</Filter>
<Filter Include="win32">
<UniqueIdentifier>{ad0ce32e-d033-416c-813e-7a7f913ac3fa}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="generic\fdct.c">
<Filter>generic</Filter>
</ClCompile>
<ClCompile Include="generic\idctpart.c">
<Filter>generic</Filter>
</ClCompile>
<ClCompile Include="generic\reconstruct.c">
<Filter>generic</Filter>
</ClCompile>
<ClCompile Include="generic\uoptsystemdependant.c">
<Filter>generic</Filter>
</ClCompile>
<ClCompile Include="generic\vputil.c">
<Filter>generic</Filter>
</ClCompile>
<ClCompile Include="win32\fdctmmx.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\fdctwmt.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\filtmmx.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\filtwmt.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\mmxidct.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\mmxrecon.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\uoptsystemdependant.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\vputilasm.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\wmtidct.c">
<Filter>win32</Filter>
</ClCompile>
<ClCompile Include="win32\wmtrecon.c">
<Filter>win32</Filter>
</ClCompile>
</ItemGroup>
</Project>

View file

@ -0,0 +1,213 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 42;
objects = {
/* Begin PBXBuildFile section */
0CAF34950BB78E9F000FB06C /* vputil.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34940BB78E9F000FB06C /* vputil.c */; };
0CAF34AC0BB78EDF000FB06C /* idctpart.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34A80BB78EDF000FB06C /* idctpart.c */; };
0CAF34AD0BB78EDF000FB06C /* fdct.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34A90BB78EDF000FB06C /* fdct.c */; };
0CAF34AE0BB78EDF000FB06C /* uoptsystemdependant.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */; };
0CAF34AF0BB78EDF000FB06C /* reconstruct.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34AB0BB78EDF000FB06C /* reconstruct.c */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
0CAF34940BB78E9F000FB06C /* vputil.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = vputil.c; path = generic/vputil.c; sourceTree = "<group>"; };
0CAF34A80BB78EDF000FB06C /* idctpart.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = idctpart.c; path = generic/idctpart.c; sourceTree = "<group>"; };
0CAF34A90BB78EDF000FB06C /* fdct.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = fdct.c; path = generic/fdct.c; sourceTree = "<group>"; };
0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = uoptsystemdependant.c; path = generic/uoptsystemdependant.c; sourceTree = "<group>"; };
0CAF34AB0BB78EDF000FB06C /* reconstruct.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = reconstruct.c; path = generic/reconstruct.c; sourceTree = "<group>"; };
D2AAC046055464E500DB518D /* libvputil.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libvputil.a; sourceTree = BUILT_PRODUCTS_DIR; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
D289987405E68DCB004EDB86 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
08FB7794FE84155DC02AAC07 /* vputil */ = {
isa = PBXGroup;
children = (
08FB7795FE84155DC02AAC07 /* Source */,
C6A0FF2B0290797F04C91782 /* Documentation */,
1AB674ADFE9D54B511CA2CBB /* Products */,
);
name = vputil;
sourceTree = "<group>";
};
08FB7795FE84155DC02AAC07 /* Source */ = {
isa = PBXGroup;
children = (
0CAF34940BB78E9F000FB06C /* vputil.c */,
0CAF34A80BB78EDF000FB06C /* idctpart.c */,
0CAF34A90BB78EDF000FB06C /* fdct.c */,
0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */,
0CAF34AB0BB78EDF000FB06C /* reconstruct.c */,
);
name = Source;
sourceTree = "<group>";
};
1AB674ADFE9D54B511CA2CBB /* Products */ = {
isa = PBXGroup;
children = (
D2AAC046055464E500DB518D /* libvputil.a */,
);
name = Products;
sourceTree = "<group>";
};
C6A0FF2B0290797F04C91782 /* Documentation */ = {
isa = PBXGroup;
children = (
);
name = Documentation;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXHeadersBuildPhase section */
D2AAC043055464E500DB518D /* Headers */ = {
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXHeadersBuildPhase section */
/* Begin PBXNativeTarget section */
D2AAC045055464E500DB518D /* vputil */ = {
isa = PBXNativeTarget;
buildConfigurationList = 1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "vputil" */;
buildPhases = (
D2AAC043055464E500DB518D /* Headers */,
D2AAC044055464E500DB518D /* Sources */,
D289987405E68DCB004EDB86 /* Frameworks */,
);
buildRules = (
);
dependencies = (
);
name = vputil;
productName = vputil;
productReference = D2AAC046055464E500DB518D /* libvputil.a */;
productType = "com.apple.product-type.library.static";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
08FB7793FE84155DC02AAC07 /* Project object */ = {
isa = PBXProject;
buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "vputil" */;
hasScannedForEncodings = 1;
mainGroup = 08FB7794FE84155DC02AAC07 /* vputil */;
projectDirPath = "";
targets = (
D2AAC045055464E500DB518D /* vputil */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
D2AAC044055464E500DB518D /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
0CAF34950BB78E9F000FB06C /* vputil.c in Sources */,
0CAF34AC0BB78EDF000FB06C /* idctpart.c in Sources */,
0CAF34AD0BB78EDF000FB06C /* fdct.c in Sources */,
0CAF34AE0BB78EDF000FB06C /* uoptsystemdependant.c in Sources */,
0CAF34AF0BB78EDF000FB06C /* reconstruct.c in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
1DEB91EC08733DB70010E9CD /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
COPY_PHASE_STRIP = NO;
GCC_DYNAMIC_NO_PIC = NO;
GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = vputil;
ZERO_LINK = YES;
};
name = Debug;
};
1DEB91ED08733DB70010E9CD /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ARCHS = (
ppc,
i386,
);
GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
GCC_MODEL_TUNING = G5;
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = vputil;
};
name = Release;
};
1DEB91F008733DB70010E9CD /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
OBJROOT = build;
PREBINDING = NO;
SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;
SYMROOT = ../../../lib/osx;
USER_HEADER_SEARCH_PATHS = "include ../include ../../include ../../../include";
};
name = Debug;
};
1DEB91F108733DB70010E9CD /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
OBJROOT = build;
PREBINDING = NO;
SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;
SYMROOT = ../../../lib/osx;
USER_HEADER_SEARCH_PATHS = "include ../include ../../include ../../../include";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "vputil" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1DEB91EC08733DB70010E9CD /* Debug */,
1DEB91ED08733DB70010E9CD /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "vputil" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1DEB91F008733DB70010E9CD /* Debug */,
1DEB91F108733DB70010E9CD /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,810 @@
/****************************************************************************
*
* Module Title : Fdctwmt.c
*
* Description : Forward DCT optimized specifically for Intel P4
* processor
*
* AUTHOR : YaoWu Xu
*
*****************************************************************************
* Revision History
*
* 1.00 YWX 03/11/02 Configuration baseline
*
*****************************************************************************
*/
/*******************************************************************************
* Module Constants
*******************************************************************************
*/
/*
 * TIRY: 16-byte-aligned scratch row of eight 16-bit lanes.
 * fdct_WMT stores the "is07 - is34" term here with movdqa and reloads it
 * as irot_input_y when computing op[2] and op[6].
 * It is a single file-scope buffer, so every call to fdct_WMT writes the
 * same storage.
 */
__declspec(align(16)) static unsigned short TIRY[8];
/*
 * WmtIdctConst: multiplier table used by fdct_WMT (despite the "Idct" in
 * the name, the forward DCT in this file reads it).
 *
 * Layout: 8 rows of 8 identical 16-bit values. Each row is 16 bytes, so
 * row i is addressed as [edx + 16 * i] (the C(i) macro) and can be used
 * directly as a full-width pmulhw / movdqa operand.
 *
 * Values: row k (k = 1..7) is 65536 * cos(k * pi / 16) rounded to the
 * nearest integer; row 0 is all zeros.
 *
 * pmulhw treats its operands as signed words. Rows 1..5 are larger than
 * 32767, so they act as (value - 65536) and the product comes out as
 * c * x - x; the code adds x back with paddw after the multiply.
 * Rows 6 and 7 fit in a signed word and are used without that correction.
 */
__declspec(align(16)) static unsigned short WmtIdctConst[8 * 8] =
{
0, 0, 0, 0, 0, 0, 0, 0,
64277,64277,64277,64277,64277,64277,64277,64277, /* C(1): xC1S7 */
60547,60547,60547,60547,60547,60547,60547,60547, /* C(2): xC2S6 */
54491,54491,54491,54491,54491,54491,54491,54491, /* C(3): xC3S5 */
46341,46341,46341,46341,46341,46341,46341,46341, /* C(4): xC4S4 */
36410,36410,36410,36410,36410,36410,36410,36410, /* C(5): xC5S3 */
25080,25080,25080,25080,25080,25080,25080,25080, /* C(6): xC6S2 */
12785,12785,12785,12785,12785,12785,12785,12785 /* C(7): xC7S1 */
};
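/**************************************************************************************
*
* Routine: fdct_WMT
*
* Description: Forward DCT of an 8x8 block of 16-bit values, written as
*              SSE2 inline assembly. The block is transposed and a 1-D
*              forward DCT is applied to all 8 columns at once; the
*              intermediate result is then transposed and transformed
*              again in the same way.
*
* Input: InputData  - 8x8 block of shorts. It is accessed with movdqa,
*                     so it must be 16-byte aligned. The routine uses it
*                     as working storage and overwrites its contents.
*        OutputData - buffer for the 8x8 result. It is accessed with
*                     movdqa, so it must be 16-byte aligned.
*
* Output: The transformed block is written to OutputData.
*
* Return: None
*
* Special Note: Each input value is shifted left by one bit before the
*               first pass. The file-scope buffer TIRY is used as scratch
*               storage, so simultaneous calls share that buffer.
*
* Error: None
*
***************************************************************************************
*/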
void fdct_WMT(short *InputData, short *OutputData)
{
__asm
{
mov eax, InputData
mov ebx, OutputData
lea edx, WmtIdctConst
#define I(i) [eax + 16 * i ]
#define O(i) [ebx + 16 * i ]
#define C(i) [edx + 16 * i ]
/******************************************************/
/* Do 8x8 Transpose */
/******************************************************/
movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
psllw xmm4, 1
psllw xmm0, 1
movdqa xmm5, xmm4 /* make a copy */
punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
psllw xmm6, 1
psllw xmm0, 1
movdqa xmm7, xmm6 /* make a copy */
punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
movdqa xmm3, xmm4 /* make a copy */
punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
/* Free xmm6 */
movdqa xmm6, xmm5 /* make a copy */
punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
/* Free xmm7 */
movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
psllw xmm0, 1
psllw xmm1, 1
movdqa xmm7, xmm0 /* make a copy */
punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
/* Free xmm1 */
movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
psllw xmm2, 1
psllw xmm3, 1
movdqa xmm1, xmm2 /* make a copy */
punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
movdqa xmm3, xmm0 /* make a copy */
punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
/* Free xmm2 */
movdqa xmm2, xmm7 /* make a copy */
punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
movdqa xmm1, xmm0 /* make a copy */
punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
movdqa I(0), xmm0 /* save I(0) */
movdqa I(1), xmm1 /* save I(1) */
movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
movdqa xmm1, xmm3 /* make a copy */
punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
movdqa xmm4, xmm2 /* make a copy */
punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
movdqa I(2), xmm1 /* save I(2) */
movdqa I(3), xmm3 /* save I(3) */
movdqa I(4), xmm4 /* save I(4) */
movdqa I(5), xmm2 /* save I(5) */
movdqa xmm5, xmm7 /* make a copy */
punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
movdqa I(6), xmm5 /* save I(6) */
movdqa I(7), xmm7 /* save I(7) */
/******************************************************/
/* Done with transpose - Let's do the forward DCT */
/******************************************************/
movdqa xmm0, I(0) /* xmm0 = ip0 */
movdqa xmm1, I(1) /* xmm1 = ip1 */
movdqa xmm2, I(3) /* xmm2 = ip3 */
movdqa xmm3, I(5) /* xmm3 = ip5 */
movdqa xmm4, xmm0 /* xmm4 = ip0 */
movdqa xmm5, xmm1 /* xmm5 = ip1 */
movdqa xmm6, xmm2 /* xmm6 = ip3 */
movdqa xmm7, xmm3 /* xmm7 = ip5 */
paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
movdqa TIRY, xmm0 /* save is07-is34 */
paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
/*---------------------------------------------------------*/
/* op0 and op4 */
/*---------------------------------------------------------*/
psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
psrlw xmm2, 15
paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
psrlw xmm2, 15
paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
movdqa O(0), xmm3 /* save op0 */
/*---------------------------------------------------------*/
/* op2 and op6 */
/*---------------------------------------------------------*/
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
movdqa xmm2, TIRY /* xmm2 = irot_input_y */
movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
psrlw xmm2, 15
paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
paddw xmm3, xmm2 /* Truncated */
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
psrlw xmm2, 15
paddw xmm0, xmm2 /* Truncated */
paddsw xmm3, xmm0 /* op[2] */
movdqa O(2), xmm3 /* save op[2] */
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
psrlw xmm2, 15
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
paddw xmm5, xmm2 /* Truncated */
movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
psrlw xmm2, 15
paddw xmm3, xmm2 /* Truncated */
psubsw xmm3, xmm5 /* xmm3 = op[6] */
movdqa O(6), xmm3
/*-----------------------------------------------------------------------*/
/* icommon_product1, icommon_product2 */
/*-----------------------------------------------------------------------*/
movdqa xmm0, C(4) /* xmm0 = xC4s4 */
movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
psrlw xmm2, 15
paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
psrlw xmm2, 15 /* For trucation */
paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
/*---------------------------------------------------------*/
pxor xmm0, xmm0 /* Clear xmm0 */
psubsw xmm0, xmm6 /* xmm0 = - id34 */
psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
/*---------------------------------------------------------*/
/* op1 and op7 */
/*---------------------------------------------------------*/
movdqa xmm7, C(1) /* xC1S7 */
movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
movdqa xmm7, C(7) /* xC7S1 */
psrlw xmm2, 15 /* for trucation */
paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
paddw xmm1, xmm2 /* Trucated */
pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
paddw xmm3, xmm2 /* Truncated */
movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
movdqa xmm7, C(1) /* xC1S7 */
pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
movdqa xmm7, C(7) /* xC7S1 */
psrlw xmm2, 15 /* for trucation */
paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
paddw xmm0, xmm2 /* Truncated */
pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
paddw xmm5, xmm2 /* Truncated */
psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
movdqa O(1), xmm1
movdqa O(7), xmm3
/*---------------------------------------------------------*/
/* op3 and op5 */
/*---------------------------------------------------------*/
movdqa xmm0, C(3) /* xC3S5 */
movdqa xmm1, C(5) /* xC5S3 */
movdqa xmm5,xmm6 /* irot_input_x */
movdqa xmm7,xmm6 /* irot_input_x */
movdqa xmm2,xmm4 /* irot_input_y */
movdqa xmm3,xmm4 /* irot_input_y */
pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
psrlw xmm2,15 /* for trucation */
psrlw xmm5,15 /* for trucation */
paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
paddw xmm4,xmm2 /* Truncated */
paddw xmm6,xmm5 /* Truncated */
psubsw xmm4,xmm6 /* op [3] */
movdqa O(3),xmm4 /* Save Op[3] */
movdqa xmm4,xmm3 /* irot_input_y */
movdqa xmm6,xmm7 /* irot_input_x */
pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
paddw xmm4,xmm2 /* Trucated */
paddw xmm6,xmm5 /* Trucated */
paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
paddw xmm3,xmm7 /* Op[5] */
movdqa O(5),xmm3 /* Save Op[5] */
/*---------------------------------------------------------*/
/* End of 8 1-D FDCT */
/*---------------------------------------------------------*/
#undef I
#undef O
#define I(i) [ebx + 16 * i ]
#define O(i) [ebx + 16 * i ]
/******************************************************/
/* Do 8x8 Transpose */
/******************************************************/
movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
movdqa xmm0, I(5) /* xmm4=f7f6f5f4f3f2f1f0 */
movdqa xmm5, xmm4 /* make a copy */
punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
movdqa xmm7, xmm6 /* make a copy */
punpcklwd xmm6, xmm0 /* xmm6=h3g3h3g2h1g1h0g0 */
punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
movdqa xmm3, xmm4 /* make a copy */
punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
punpckhdq xmm3, xmm6 /* xmm3=h3g3g3e3h2g2f2e2 */
movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
/* Free xmm6 */
movdqa xmm6, xmm5 /* make a copy */
punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
/* Free xmm7 */
movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
movdqa xmm7, xmm0 /* make a copy */
punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
/* Free xmm1 */
movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
movdqa xmm1, xmm2 /* make a copy */
punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
movdqa xmm3, xmm0 /* make a copy */
punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
/* Free xmm2 */
movdqa xmm2, xmm7 /* make a copy */
punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
movdqa xmm1, xmm0 /* make a copy */
punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
punpckhqdq xmm1, xmm4 /* xmm1=h1g1g1e1d1c1b1a1 */
movdqa I(0), xmm0 /* save I(0) */
movdqa I(1), xmm1 /* save I(1) */
movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
movdqa xmm1, xmm3 /* make a copy */
punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
movdqa xmm4, xmm2 /* make a copy */
punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
movdqa I(2), xmm1 /* save I(2) */
movdqa I(3), xmm3 /* save I(3) */
movdqa I(4), xmm4 /* save I(4) */
movdqa I(5), xmm2 /* save I(5) */
movdqa xmm5, xmm7 /* make a copy */
punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
movdqa I(6), xmm5 /* save I(6) */
movdqa I(7), xmm7 /* save I(7) */
/******************************************************/
/* Done with transpose - Let's do the forward DCT */
/******************************************************/
movdqa xmm0, I(0) /* xmm0 = ip0 */
movdqa xmm1, I(1) /* xmm1 = ip1 */
movdqa xmm2, I(3) /* xmm2 = ip3 */
movdqa xmm3, I(5) /* xmm3 = ip5 */
movdqa xmm4, xmm0 /* xmm4 = ip0 */
movdqa xmm5, xmm1 /* xmm5 = ip1 */
movdqa xmm6, xmm2 /* xmm6 = ip3 */
movdqa xmm7, xmm3 /* xmm7 = ip5 */
paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
movdqa TIRY, xmm0 /* save is07-is34 */
paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
/*---------------------------------------------------------*/
/* op0 and op4 */
/*---------------------------------------------------------*/
#if 0
movdqa xmm0, xmm2 /* xmm0 =xmm2= is0734 */
pmulhw xmm2, C(4) /* xC4S4 * is0734 - is0734 */
paddw xmm2, xmm0 /* XC4S4 * is0734 */
movdqa xmm0, xmm3 /* xmm0 =xmm3= is1256 */
pmulhw xmm3, C(4) /* xC4S4 * is1256 - is1256 */
paddw xmm3, xmm0 /* xC4S4 * is1256 */
movdqa xmm0, xmm2
paddsw xmm2, xmm3 /* xC4S4 * ( is0734 +is1256 ) */
psubsw xmm0, xmm3 /* xC4S4 * ( is0734 -is1256 ) */
movdqa xmm3, xmm2
psrlw xmm2, 15
paddsw xmm3, xmm2
movdqa xmm2, xmm0
movdqa O(0), xmm3
psrlw xmm0, 15
paddsw xmm2, xmm0
movdqa O(4), xmm2
#else
psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
psrlw xmm2, 15
paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
movdqa xmm2, xmm0
psrlw xmm0, 15
paddw xmm0, xmm2
psraw xmm0, 1
movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
psrlw xmm2, 15
paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
movdqa xmm2, xmm3
psrlw xmm3, 15
paddw xmm3, xmm2
psraw xmm3, 1
movdqa O(0), xmm3 /* save op0 */
#endif
/*---------------------------------------------------------*/
/* op2 and op6
/*---------------------------------------------------------*/
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
movdqa xmm2, TIRY /* xmm2 = irot_input_y */
movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
psrlw xmm2, 15
paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
paddw xmm3, xmm2 /* Truncated */
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
psrlw xmm2, 15
paddw xmm0, xmm2 /* Truncated */
paddsw xmm3, xmm0 /* op[2] */
movdqa xmm0, xmm3
psrlw xmm3, 15
paddw xmm3, xmm0
psraw xmm3, 1
movdqa O(2), xmm3 /* save op[2] */
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
movdqa xmm2, xmm5 /* xmm0 = id12 - id56 */
pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
psrlw xmm2, 15
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
paddw xmm5, xmm2 /* Truncated */
movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
psrlw xmm2, 15
paddw xmm3, xmm2 /* Truncated */
psubsw xmm3, xmm5 /* xmm3 = op[6] */
movdqa xmm5, xmm3
psrlw xmm3, 15
paddw xmm3, xmm5
psraw xmm3, 1
movdqa O(6), xmm3
/*-----------------------------------------------------------------------*/
/* icommon_product1, icommon_product2 */
/*-----------------------------------------------------------------------*/
movdqa xmm0, C(4) /* xmm0 = xC4s4 */
movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
pmulhw xmm1, xmm0 /* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
psrlw xmm2, 15
paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
psrlw xmm2, 15 /* For trucation */
paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
/*---------------------------------------------------------*/
pxor xmm0, xmm0 /* Clear xmm0 */
psubsw xmm0, xmm6 /* xmm0 = - id34 */
psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
/*---------------------------------------------------------*/
/* op1 and op7
/*---------------------------------------------------------*/
movdqa xmm7, C(1) /* xC1S7 */
movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
movdqa xmm7, C(7) /* xC7S1 */
psrlw xmm2, 15 /* for trucation */
paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
paddw xmm1, xmm2 /* Trucated */
pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
paddw xmm3, xmm2 /* Truncated */
movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
movdqa xmm7, C(1) /* xC1S7 */
pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
movdqa xmm7, C(7) /* xC7S1 */
psrlw xmm2, 15 /* for trucation */
paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
paddw xmm0, xmm2 /* Truncated */
pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
paddw xmm5, xmm2 /* Truncated */
psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
movdqa xmm5, xmm1
movdqa xmm0, xmm3
psrlw xmm1, 15
psrlw xmm3, 15
paddw xmm1, xmm5
paddw xmm3, xmm0
psraw xmm1, 1
psraw xmm3, 1
movdqa O(1), xmm1
movdqa O(7), xmm3
/*---------------------------------------------------------*/
/* op3 and op5
/*---------------------------------------------------------*/
movdqa xmm0, C(3) /* xC3S5 */
movdqa xmm1, C(5) /* xC5S3 */
movdqa xmm5,xmm6 /* irot_input_x */
movdqa xmm7,xmm6 /* irot_input_x */
movdqa xmm2,xmm4 /* irot_input_y */
movdqa xmm3,xmm4 /* irot_input_y */
pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
psrlw xmm2,15 /* for trucation */
psrlw xmm5,15 /* for trucation */
paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
paddw xmm4,xmm2 /* Truncated */
paddw xmm6,xmm5 /* Truncated */
psubsw xmm4,xmm6 /* op [3] */
movdqa xmm6,xmm4
psrlw xmm4,15
paddw xmm4,xmm6
psraw xmm4,1
movdqa O(3),xmm4 /* Save Op[3] */
movdqa xmm4,xmm3 /* irot_input_y */
movdqa xmm6,xmm7 /* irot_input_x */
pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
paddw xmm4,xmm2 /* Trucated */
paddw xmm6,xmm5 /* Trucated */
paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
paddw xmm3,xmm7 /* Op[5] */
movdqa xmm7,xmm3
psrlw xmm3,15
paddw xmm3,xmm7
psraw xmm3,1
movdqa O(5),xmm3 /* Save Op[5] */
/*---------------------------------------------------------*/
/* End of 8 1-D FDCT */
/*---------------------------------------------------------*/
}/* end of _asm code section */
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,790 @@
/****************************************************************************
*
* Module Title : newLoopTest_asm.c
*
* Description : Codec specific functions
*
* AUTHOR : Yaowu Xu
*
*****************************************************************************
* Revision History
*
* 1.02 YWX 03-Nov-00 Changed confusing variable name
* 1.01 YWX 02-Nov-00 Added the set of functions
* 1.00 YWX 19-Oct-00 configuration baseline
*****************************************************************************
*/
/****************************************************************************
* Header Files
*****************************************************************************
*/
#define STRICT /* Strict type checking. */
#include "codec_common.h"
#include <math.h>
/****************************************************************************
* Module constants.
*****************************************************************************
*/
/* Smaller of two values. Function-like macro: an argument may be evaluated
 * twice, so do not pass expressions with side effects. */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
/* Filter taps are fixed point values scaled by 128 (1 << FILTER_SHIFT). */
#define FILTER_WEIGHT 128
#define FILTER_SHIFT 7
/* Rounding term: 64 in each of eight 16-bit lanes, added by the filter
 * routines immediately before the arithmetic shift right by FILTER_SHIFT.
 * 16-byte aligned because it is used directly as an SSE2 memory operand. */
__declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};
/* Two-tap (bilinear) filter table with one row per index 0..7; the callers
 * in this file index it with ModX / ModY.
 * Each row holds the first tap replicated into 8 lanes followed by the second
 * tap replicated into 8 lanes, so the routines can load them as two aligned
 * 16-byte vectors at byte offsets 0 and 16. The two taps of a row sum to 128. */
__declspec(align(16)) INT16 BilinearFilters_wmt[8][16] =
{
{ 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 }
};
/* Four-tap filter table defined in another translation unit. The callers in
 * this file index it as [BicubicAlpha][ModX or ModY]; each entry is 32 INT16
 * values (64 bytes), which the four-tap routines load as four 16-byte vectors. */
extern __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32];
/*
 * FilterBlock1d_h_wmt
 *
 * Horizontal four-tap filter producing 8 output bytes per row (SSE2).
 *
 * SrcPtr           - first pixel of the block. Reading starts one byte before
 *                    it and 16 bytes are loaded per row with movdqu.
 * OutputPtr        - destination; 8 bytes are stored per row.
 * SrcPixelsPerLine - added to the source pointer after each row.
 * PixelStep        - not used by this routine.
 * OutputHeight     - number of rows. The counter is tested after the first
 *                    row has been processed, so it must be non-zero.
 * OutputWidth      - added to the destination pointer after each row, i.e.
 *                    it acts as the destination pitch in bytes.
 * Filter           - four consecutive vectors of eight INT16 taps (64 bytes),
 *                    loaded with movdqa and therefore 16-byte aligned.
 *
 * Each output pixel is
 *     (t0*p[-1] + t1*p[0] + t2*p[1] + t3*p[2] + 64) >> FILTER_SHIFT
 * accumulated with saturating 16-bit adds and clamped to 0..255 by packuswb.
 * The result is moved through MMX register mm0; no emms is executed here.
 */
_inline
void FilterBlock1d_h_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// Load the four tap vectors: xmm1 = tap 0, xmm2 = tap 1, xmm6 = tap 2,
// xmm7 = tap 3. The trailing notes on these four lines were copied from the
// multiply steps further down; these instructions only load.
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi+ 16] ; xmm3 *= kernel 0 modifiers.
movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
mov edi,OutputPtr
mov esi,SrcPtr
// Step back one byte so that tap 0 lines up with the pixel left of SrcPtr.
dec esi
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
// kernel 0 and 3 are potentially negative taps. These negative tap filters
// must be done first or we could have problems saturating our high value
// tap filters
movdqu xmm3, [esi] ; xmm3 = p-1..p14
movdqu xmm4, xmm3 ; xmm4 = p-1..p14
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
psrldq xmm4, 3 ; xmm4 = p2..p13
movdqa xmm5, xmm4 ; xmm5 = p2..p13
punpcklbw xmm5, xmm0 ; xmm5 = p2..p7
pmullw xmm5, xmm7 ; xmm5 *= kernel 3 modifiers
paddsw xmm3, xmm5 ; xmm3 += xmm5
movdqu xmm4, [esi+1] ; xmm4 = p0..p13
movdqa xmm5, xmm4 ; xmm5 = p0..p13
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
paddsw xmm3, xmm5 ; xmm3 += xmm5
psrldq xmm4, 1 ; xmm4 = p1..p13
movdqa xmm5, xmm4 ; xmm5 = p1..p13
punpcklbw xmm5, xmm0 ; xmm5 = p1..p7
pmullw xmm5, xmm6 ; xmm5 *= kernel 2 modifiers
paddsw xmm3, xmm5 ; xmm3 += xmm5
paddsw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0 ; pack and saturate
// Only the low 8 bytes are results; they are stored through mm0.
movdq2q mm0, xmm3
movq [edi],mm0 ; store the results in the destination
add esi,SrcPixelsPerLine ; next line
add edi,eax;
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_v_wmt
 *
 * Vertical four-tap filter producing 8 output bytes per row (SSE2).
 *
 * SrcPtr        - first pixel of the block. Reading starts one line
 *                 (PixelsPerLine bytes) above it; every load is 16 bytes
 *                 (movdqu) although only the low 8 are used.
 * OutputPtr     - destination; 8 bytes are stored per row.
 * PixelsPerLine - source pitch in bytes.
 * PixelStep     - not used by this routine.
 * OutputHeight  - number of rows. The counter is tested after the first row
 *                 has been processed, so it must be non-zero.
 * OutputWidth   - added to the destination pointer after each row, i.e. it
 *                 acts as the destination pitch in bytes.
 * Filter        - four consecutive vectors of eight INT16 taps (64 bytes),
 *                 loaded with movdqa and therefore 16-byte aligned.
 *
 * For output row r the taps 0..3 are applied to source rows r-1, r, r+1 and
 * r+2. The sum uses saturating 16-bit adds, then 64 is added, the value is
 * shifted right by FILTER_SHIFT and clamped to 0..255 by packuswb. The result
 * is moved through MMX register mm0; no emms is executed here.
 */
_inline
void FilterBlock1d_v_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// Load the four tap vectors: xmm1 = tap 0, xmm2 = tap 1, xmm6 = tap 2,
// xmm7 = tap 3. The trailing notes on these four lines are copy-paste
// leftovers; these instructions only load.
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
mov edx, PixelsPerLine
mov edi, OutputPtr
mov esi, SrcPtr
// Start one line above the block so tap 0 sees the row above.
sub esi, PixelsPerLine
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
movdqu xmm3, [esi] ; xmm3 = p0..p16
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
// This add is also the per-row advance of the source pointer: there is no
// other update of esi inside the loop.
add esi, edx ; move source forward 1 line to avoid 3 * pitch
movdqu xmm4, [esi+2*edx] ; xmm4 = p0..p16
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
pmullw xmm4, xmm7 ; xmm4 *= kernel 3 modifiers.
paddsw xmm3, xmm4 ; xmm3 += xmm4
movdqu xmm4, [esi ] ; xmm4 = p0..p16
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
paddsw xmm3, xmm4 ; xmm3 += xmm4
movdqu xmm4, [esi +edx] ; xmm4 = p0..p16
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
pmullw xmm4, xmm6 ; xmm4 *= kernel 2 modifiers.
paddsw xmm3, xmm4 ; xmm3 += xmm4
paddsw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0 ; pack and unpack to saturate
// Only the low 8 bytes are results; they are stored through mm0.
movdq2q mm0, xmm3
movq [edi],mm0 ; store the results in the destination
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
// recon block should be in cache this shouldn't cost much. Its obviously
// avoidable!!!.
add edi,eax;
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_hb8_wmt
 *
 * Horizontal two-tap filter producing 8 output bytes per row (SSE2).
 *
 * SrcPtr           - first pixel; 16 bytes are loaded per row with movdqu,
 *                    of which bytes 0..8 contribute to the result.
 * OutputPtr        - destination; 8 bytes are stored per row.
 * SrcPixelsPerLine - added to the source pointer after each row.
 * PixelStep        - not used by this routine.
 * OutputHeight     - number of rows. The counter is tested after the first
 *                    row has been processed, so it must be non-zero.
 * OutputWidth      - added to the destination pointer after each row, i.e.
 *                    it acts as the destination pitch in bytes.
 * Filter           - two consecutive vectors of eight INT16 taps (32 bytes),
 *                    loaded with movdqa and therefore 16-byte aligned.
 *
 * Each output pixel is (t0*p[0] + t1*p[1] + 64) >> FILTER_SHIFT, computed
 * with wrapping 16-bit adds (paddw) and clamped to 0..255 by packuswb. The
 * result is moved through MMX register mm0; no emms is executed here.
 */
_inline
void FilterBlock1d_hb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// xmm1 = tap 0, xmm2 = tap 1 (the trailing notes on the two loads are
// copy-paste leftovers; these instructions only load).
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
mov edi,OutputPtr
mov esi,SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
// The source pointer is not moved back here, so the first byte loaded is
// SrcPtr[0] even though the notes below are numbered from p-1.
movdqu xmm3, [esi] ; xmm3 = p-1..p14
movdqu xmm5, xmm3 ; xmm4 = p-1..p14
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
psrldq xmm5, 1 ; xmm4 = p0..p13
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
paddw xmm3, xmm5 ; xmm3 += xmm5
paddw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0 ; pack and unpack to saturate
// Only the low 8 bytes are results; they are stored through mm0.
movdq2q mm0, xmm3
movq [edi],mm0 ; store the results in the destination
add esi,SrcPixelsPerLine ; next line
add edi,eax;
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/*
 * FilterBlock1d_vb8_wmt
 *
 * Vertical two-tap filter producing 8 output bytes per row (SSE2).
 *
 * SrcPtr        - first pixel of the upper source row; every load is 16 bytes
 *                 (movdqu) although only the low 8 are used.
 * OutputPtr     - destination; 8 bytes are stored per row.
 * PixelsPerLine - source pitch in bytes.
 * PixelStep     - not used by this routine.
 * OutputHeight  - number of rows. The counter is tested after the first row
 *                 has been processed, so it must be non-zero.
 * OutputWidth   - added to the destination pointer after each row, i.e. it
 *                 acts as the destination pitch in bytes.
 * Filter        - two consecutive vectors of eight INT16 taps (32 bytes),
 *                 loaded with movdqa and therefore 16-byte aligned.
 *
 * Each output pixel is (t0*row[r] + t1*row[r+1] + 64) >> FILTER_SHIFT,
 * computed with wrapping 16-bit adds (paddw) and clamped to 0..255 by
 * packuswb. The result is moved through MMX register mm0; no emms is
 * executed here.
 */
_inline
void FilterBlock1d_vb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// xmm1 = tap 0, xmm2 = tap 1 (the trailing notes on the two loads are
// copy-paste leftovers; these instructions only load).
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
mov edx, PixelsPerLine
mov edi, OutputPtr
mov esi, SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
movdqu xmm3, [esi] ; xmm3 = p0..p16
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
paddw xmm3, xmm4 ; xmm3 += xmm4
paddw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0 ; pack and unpack to saturate
// Only the low 8 bytes are results; they are stored through mm0.
movdq2q mm0, xmm3
movq [edi],mm0 ; store the results in the destination
// NOTE: the remark below was written for the four-tap version. This routine
// reads two rows per iteration, one of which is read again next iteration.
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
// recon block should be in cache this shouldn't cost much. Its obviously
// avoidable!!!.
add esi,edx
add edi,eax
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/****************************************************************************
*
* ROUTINE : FilterBlock2dBil_wmt
*
* INPUTS : SrcPtr
* First byte used by the horizontal pass. Nine rows are read,
* each with a 16 byte unaligned load.
* SrcPixelsPerLine
* Source pitch in bytes.
* HFilter, VFilter
* Two-tap filters, each two vectors of eight INT16 taps, read
* as aligned 16 byte operands.
*
* OUTPUTS : OutputPtr
* 64 bytes, written as 8 rows of 8 bytes with a pitch of 8.
*
* RETURNS : None.
*
* FUNCTION : Applies a separable bilinear filter (horizontal pass, then
* vertical pass) to the input data and writes the result as
* an 8x8 block of UINT8. Both passes add 64 and shift right by
* FILTER_SHIFT; the final value is clamped to 0..255.
*
* SPECIAL NOTES : The horizontally filtered previous row is kept in xmm5
* between iterations, so each source row is filtered once.
* The result is moved through MMX register mm0 and no emms
* is executed here.
*
* ERRORS : None.
*
****************************************************************************/
_inline
void FilterBlock2dBil_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{
__asm
{
mov eax, HFilter ;
mov edi, OutputPtr ;
mov esi, SrcPtr ;
// ecx marks the end of the 64 byte output block and ends the loop.
lea ecx, [edi+64] ;
mov edx, SrcPixelsPerLine ;
// xmm1 / xmm2 = horizontal taps 0 and 1; eax is then reused for VFilter.
movdqa xmm1, [eax] ;
movdqa xmm2, [eax+16] ;
mov eax, VFilter ;
pxor xmm0, xmm0 ;
// get the first horizontal line done ;
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
pmullw xmm3, xmm1 ;
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
pmullw xmm4, xmm2 ;
paddw xmm3, xmm4 ;
paddw xmm3, rd ;
psraw xmm3, FILTER_SHIFT ; ready for output
// xmm5 carries the horizontally filtered row into the loop.
movdqa xmm5, xmm3 ;
add esi, edx ; next line
NextRow:
// Previous row times vertical tap 0.
pmullw xmm5, [eax] ;
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
pmullw xmm3, xmm1 ;
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
movdqa xmm6, xmm5 ;
pmullw xmm4, xmm2 ;
paddw xmm3, xmm4 ;
paddw xmm3, rd ;
psraw xmm3, FILTER_SHIFT ; ready for output
movdqa xmm5, xmm3 ; make a copy for the next row
// Current row times vertical tap 1, added to the previous-row product.
pmullw xmm3, [eax+16] ;
paddw xmm6, xmm3 ;
paddw xmm6, rd ; xmm6 += round value
psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
packuswb xmm6, xmm0 ; pack and unpack to saturate
movdq2q mm0, xmm6
movq [edi], mm0 ; store the results in the destination
add esi, edx ; next line
add edi, 8 ;
cmp edi, ecx ;
jne NextRow
}
// First filter 1d Horizontal
//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
// Now filter Verticaly
//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
}
/*
 * FilterUnpackBlock2dBil_wmt
 *
 * Same separable two-tap (bilinear) filter as FilterBlock2dBil_wmt, but the
 * result is left as 16-bit values instead of being packed to bytes.
 *
 * SrcPtr           - first byte used by the horizontal pass; nine rows are
 *                    read, each with a 16 byte unaligned load.
 * OutputPtr        - receives 128 bytes: 8 rows of eight INT16 values, rows
 *                    16 bytes apart, stored with movdqu.
 * SrcPixelsPerLine - source pitch in bytes.
 * HFilter, VFilter - two-tap filters, each two vectors of eight INT16 taps,
 *                    read as aligned 16 byte operands.
 *
 * Both passes add 64 and shift right arithmetically by FILTER_SHIFT. No
 * clamping is applied because the pack step is commented out.
 */
_inline
void FilterUnpackBlock2dBil_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{
__asm
{
mov eax, HFilter ;
mov edi, OutputPtr ;
mov esi, SrcPtr ;
// ecx marks the end of the 128 byte output block and ends the loop.
lea ecx, [edi+128] ;
mov edx, SrcPixelsPerLine ;
// xmm1 / xmm2 = horizontal taps 0 and 1; eax is then reused for VFilter.
movdqa xmm1, [eax] ;
movdqa xmm2, [eax+16] ;
mov eax, VFilter ;
pxor xmm0, xmm0 ;
// get the first horizontal line done ;
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
pmullw xmm3, xmm1 ;
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
pmullw xmm4, xmm2 ;
paddw xmm3, xmm4 ;
paddw xmm3, rd ;
psraw xmm3, FILTER_SHIFT ; ready for output
// xmm5 carries the horizontally filtered row into the loop.
movdqa xmm5, xmm3 ;
add esi, edx ; next line
NextRow:
// Previous row times vertical tap 0.
pmullw xmm5, [eax] ;
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
pmullw xmm3, xmm1 ;
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
movdqa xmm6, xmm5 ;
pmullw xmm4, xmm2 ;
paddw xmm3, xmm4 ;
paddw xmm3, rd ;
psraw xmm3, FILTER_SHIFT ; ready for output
movdqa xmm5, xmm3 ; make a copy for the next row
// Current row times vertical tap 1, added to the previous-row product.
pmullw xmm3, [eax+16] ;
paddw xmm6, xmm3 ;
paddw xmm6, rd ; xmm6 += round value
psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
// Store all eight 16-bit results for this row.
movdqu [edi], xmm6;
/*
packuswb xmm6, xmm0 ; pack and unpack to saturate
movdq2q mm0, xmm6
movq [edi], mm0 ; store the results in the destination
*/
add esi, edx ; next line
add edi, 16 ;
cmp edi, ecx ;
jne NextRow
}
// First filter 1d Horizontal
//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
// Now filter Verticaly
//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
}
/*
 * FilterUnpackBlock1d_hb8_wmt
 *
 * Horizontal two-tap filter that writes eight 16-bit results per row (SSE2).
 *
 * SrcPtr           - first pixel; 16 bytes are loaded per row with movdqu,
 *                    of which bytes 0..8 contribute to the result.
 * OutputPtr        - destination; 16 bytes (eight INT16) are stored per row
 *                    with movdqu.
 * SrcPixelsPerLine - added to the source pointer after each row.
 * PixelStep        - not used by this routine.
 * OutputHeight     - number of rows. The counter is tested after the first
 *                    row has been processed, so it must be non-zero.
 * OutputWidth      - added to the destination pointer (a byte address) after
 *                    each row; the caller in this file passes 16.
 * Filter           - two consecutive vectors of eight INT16 taps (32 bytes),
 *                    loaded with movdqa and therefore 16-byte aligned.
 *
 * Each result is (t0*p[0] + t1*p[1] + 64) >> FILTER_SHIFT using wrapping
 * 16-bit adds. No clamping is applied because the pack step is commented out.
 */
_inline
void FilterUnpackBlock1d_hb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// xmm1 = tap 0, xmm2 = tap 1 (the trailing notes on the two loads are
// copy-paste leftovers; these instructions only load).
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
mov edi,OutputPtr
mov esi,SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
// The source pointer is not moved back here, so the first byte loaded is
// SrcPtr[0] even though the notes below are numbered from p-1.
movdqu xmm3, [esi] ; xmm3 = p-1..p14
movdqu xmm5, xmm3 ; xmm4 = p-1..p14
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
psrldq xmm5, 1 ; xmm4 = p0..p13
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
paddw xmm3, xmm5 ; xmm3 += xmm5
paddw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
/*
packuswb xmm3, xmm0 ; pack and unpack to saturate
movdq2q mm0, xmm3
*/
movdqu [edi],xmm3 ; store the results in the destination
add esi,SrcPixelsPerLine ; next line
add edi,eax;
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/*
 * FilterUnpackBlock1d_vb8_wmt
 *
 * Vertical two-tap filter that writes eight 16-bit results per row (SSE2).
 *
 * SrcPtr        - first pixel of the upper source row; every load is 16 bytes
 *                 (movdqu) although only the low 8 are used.
 * OutputPtr     - destination; 16 bytes (eight INT16) are stored per row
 *                 with movdqu.
 * PixelsPerLine - source pitch in bytes.
 * PixelStep     - not used by this routine.
 * OutputHeight  - number of rows. The counter is tested after the first row
 *                 has been processed, so it must be non-zero.
 * OutputWidth   - added to the destination pointer (a byte address) after
 *                 each row; the caller in this file passes 16.
 * Filter        - two consecutive vectors of eight INT16 taps (32 bytes),
 *                 loaded with movdqa and therefore 16-byte aligned.
 *
 * Each result is (t0*row[r] + t1*row[r+1] + 64) >> FILTER_SHIFT using
 * wrapping 16-bit adds. No clamping is applied because the pack step is
 * commented out.
 */
_inline
void FilterUnpackBlock1d_vb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
__asm
{
mov edi, Filter
// xmm1 = tap 0, xmm2 = tap 1 (the trailing notes on the two loads are
// copy-paste leftovers; these instructions only load).
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
mov edx, PixelsPerLine
mov edi, OutputPtr
mov esi, SrcPtr
mov ecx, DWORD PTR OutputHeight
mov eax, OutputWidth ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
movdqu xmm3, [esi] ; xmm3 = p0..p16
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
paddw xmm3, xmm4 ; xmm3 += xmm4
paddw xmm3, rd ; xmm3 += round value
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
/*packuswb xmm3, xmm0 ; pack and unpack to saturate
movdq2q mm0, xmm3
*/
movdqu [edi],xmm3 ; store the results in the destination
// NOTE: the remark below was written for the four-tap version. This routine
// reads two rows per iteration, one of which is read again next iteration.
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
// recon block should be in cache this shouldn't cost much. Its obviously
// avoidable!!!.
add esi,edx
add edi,eax
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/****************************************************************************
*
* ROUTINE : FilterBlockBil_8_wmt
*
* INPUTS : ReconPtr1, ReconPtr2
* Two pointers into the block of data to be filtered.
* These pointers bound the fractional pel position; they are
* ordered internally so the lower address is used as the base.
* PixelsPerLine
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
* ModX, ModY
* The fractional pel bits used to select a row of
* BilinearFilters_wmt.
*
* OUTPUTS : ReconRefPtr
* A pointer to an 8x8 buffer into which UINT8 filtered data is written.
*
* RETURNS : None.
*
* FUNCTION : Produces a bilinear filtered fractional pel prediction block
* with UINT8 output. The distance between the two pointers
* selects the filter: 1 -> horizontal only, PixelsPerLine ->
* vertical only, PixelsPerLine - 1 or PixelsPerLine + 1 -> 2D.
*
* SPECIAL NOTES : Any other distance (including 0) leaves ReconRefPtr
* untouched.
*
* ERRORS : None.
*
****************************************************************************/
void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
{
    int delta = (int)(ReconPtr2 - ReconPtr1);

    // ModX and ModY are the bottom three bits of the signed motion vector
    // components (1/8th pel precision), so they remain the right filter
    // indices even after the pointers are put in ascending order below:
    //   +ve X component: ModX = X%8
    //   -ve X component: ModX = 8+(X%8), with X%8 in the range -7 to -1.
    if( delta < 0 )
    {
        // Make ReconPtr1 the lower address (above, left, above-right or above-left).
        UINT8 *lower = ReconPtr2;
        ReconPtr2 = ReconPtr1;
        ReconPtr1 = lower;
        delta = (int)(ReconPtr2 - ReconPtr1);
    }

    if( delta == 1 )
    {
        // Fractional pixel in horizontal only
        FilterBlock1d_hb8_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
        return;
    }

    if( delta == (int)(PixelsPerLine) )
    {
        // Fractional pixel in vertical only
        FilterBlock1d_vb8_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY] );
        return;
    }

    if( delta == (int)(PixelsPerLine - 1) )
    {
        // ReconPtr1 is top right: start the 2D filter one pixel to its left
        FilterBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
        return;
    }

    if( delta == (int)(PixelsPerLine + 1) )
    {
        // ReconPtr1 is top left
        FilterBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
    }
}
/*
 * UnpackBlock_wmt
 *
 * Widens an 8x8 block of bytes to 16-bit values (SSE2).
 *
 * SrcPtr           - source bytes; 16 bytes are loaded per row with movdqu
 *                    and the low 8 are zero-extended.
 * OutputPtr        - receives 8 rows of eight 16-bit values, rows 16 bytes
 *                    apart (128 bytes in total), stored with movdqu.
 * SrcPixelsPerLine - added to the source pointer after each row.
 */
_inline void UnpackBlock_wmt( UINT8 *SrcPtr, UINT16 *OutputPtr, UINT32 SrcPixelsPerLine )
{
__asm
{
mov edi,OutputPtr
mov esi,SrcPtr
// Fixed geometry: 8 rows, 16 bytes of output per row.
mov ecx, 8
mov eax, 16 ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 00000000
nextrow:
// The source pointer is not moved back here, so the first byte loaded is
// SrcPtr[0] even though the notes below are numbered from p-1.
movdqu xmm3, [esi] ; xmm3 = p-1..p14
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
movdqu [edi],xmm3 ; store the results in the destination
add esi,SrcPixelsPerLine ; next line
add edi,eax;
dec ecx ; decrement count
jnz nextrow ; next row
}
}
/****************************************************************************
*
* ROUTINE : FilterBlock2d_wmt
*
* INPUTS : SrcPtr
* Pointer to source data
* SrcPixelsPerLine
* Source pitch in bytes
* HFilter, VFilter
* Four-tap filters for the horizontal and vertical pass
*
* OUTPUTS : OutputPtr
* Filtered data: 8 rows of 8 bytes with a pitch of 8
*
* RETURNS : None.
*
* FUNCTION : Applies a 2d 4 tap filter on the input data to produce
* a predictor block (UINT8). The horizontal pass filters 11
* rows, starting one row above SrcPtr, into a scratch buffer
* with a pitch of 8; the vertical pass then produces the 8
* output rows from that buffer.
*
* SPECIAL NOTES : The vertical pass is started BLOCK_HEIGHT_WIDTH bytes into
* the scratch buffer, which is one scratch row only if
* BLOCK_HEIGHT_WIDTH is 8 -- defined outside this file,
* TODO confirm.
*
* ERRORS : None.
*
****************************************************************************/
void FilterBlock2d_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{
    UINT8 Intermediate[256];
    UINT8 *RowAboveBlock = SrcPtr-SrcPixelsPerLine;
    UINT8 *FirstBlockRow = Intermediate+BLOCK_HEIGHT_WIDTH;

    // Horizontal pass: 11 rows (one above the block, 8 in it, two below).
    FilterBlock1d_h_wmt( RowAboveBlock, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );

    // Vertical pass over the scratch rows.
    FilterBlock1d_v_wmt( FirstBlockRow, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter );
}
/****************************************************************************
*
* ROUTINE : FilterBlock_wmt
*
* INPUTS : ReconPtr1, ReconPtr2
* Two pointers into the block of data to be filtered.
* These pointers bound the fractional pel position; they are
* ordered internally so the lower address is used as the base.
* PixelsPerLine
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
* ModX, ModY
* The fractional pel bits used to select a filter.
* UseBicubic
* Whether to use the bicubic filter set or the bilinear set
* BicubicAlpha
* First index into BicubicFilters_mmx (bicubic path only)
*
* OUTPUTS : ReconRefPtr
* A pointer to an 8x8 buffer of 16-bit values into which the
* filtered data is written.
*
* RETURNS : None.
*
* FUNCTION : Produces a filtered fractional pel prediction block
* using bilinear or bicubic filters. The distance between the
* two pointers selects the filter: 1 -> horizontal only,
* PixelsPerLine -> vertical only, PixelsPerLine - 1 or
* PixelsPerLine + 1 -> 2D.
*
* SPECIAL NOTES : For any other distance (including 0) nothing is written
* to ReconRefPtr, in both the bilinear and the bicubic path.
*
* ERRORS : None.
*
****************************************************************************/
void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
{
    int diff;
    UINT8 Intermediate[256];    // 8-bit scratch block for the bicubic path

    diff = (int)(ReconPtr2-ReconPtr1);

    // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
    // This works out to be what we want... despite the pointer swapping that goes on below.
    // For example... if the X component of the vector is a +ve ModX = X%8.
    // if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
    if(diff<0)
    {
        // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
        UINT8 *temp=ReconPtr1;
        ReconPtr1=ReconPtr2;
        ReconPtr2=temp;
        diff= (int)(ReconPtr2-ReconPtr1);
    }

    if(!diff)
    {
        return;
    }

    if(UseBicubic)
    {
        // The four-tap filters produce bytes in Intermediate (pitch 8), which
        // are widened to 16 bits afterwards.
        if( diff==1 )
        {
            // Fractional pixel in horizontal only
            FilterBlock1d_h_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
        }
        else if (diff == (int)(PixelsPerLine) )
        {
            // Fractional pixel in vertical only
            FilterBlock1d_v_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
        }
        else if(diff == (int)(PixelsPerLine - 1))
        {
            // ReconPtr1 is Top right
            FilterBlock2d_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
        }
        else if(diff == (int)(PixelsPerLine + 1) )
        {
            // ReconPtr1 is Top left
            FilterBlock2d_wmt( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
        }
        else
        {
            // Unsupported pointer distance: no filter ran, so Intermediate is
            // still uninitialised. Do not unpack it into the caller's buffer;
            // leave ReconRefPtr untouched as the bilinear path does.
            return;
        }

        UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
    }
    else
    {
        // The bilinear routines write 16-bit results straight into
        // ReconRefPtr (row pitch 16 bytes), so no scratch buffer is needed.
        if( diff==1 )
        {
            // Fractional pixel in horizontal only
            FilterUnpackBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 16, BilinearFilters_wmt[ModX] );
        }
        else if (diff == (int)(PixelsPerLine) )
        {
            // Fractional pixel in vertical only
            FilterUnpackBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 16, BilinearFilters_wmt[ModY]);
        }
        else if(diff == (int)(PixelsPerLine - 1))
        {
            // ReconPtr1 is Top right
            FilterUnpackBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
        }
        else if(diff == (int)(PixelsPerLine + 1) )
        {
            // ReconPtr1 is Top left
            FilterUnpackBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
        }
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,856 @@
/****************************************************************************
*
* Module Title : OptFunctions.c
*
* Description : MMX or otherwise processor specific
* optimised versions of functions
*
* AUTHOR : Paul Wilkins
*
*****************************************************************************
* Revision History
*
* 1.07 JBB 26/01/01 Removed unused function
* 1.06 YWX 23/05/00 Remove the clamping in MmxReconPostProcess()
* 1.05 YWX 15/05/00 Added MmxReconPostProcess()
* 1.04 SJL 03/14/00 Added in Tim's versions of MmxReconInter and MmxReconInterHalfPixel2.
* 1.03 PGW 12/10/99 Changes to reduce unnecessary dependencies.
* 1.02 PGW 30/08/99 Minor changes to MmxReconInterHalfPixel2().
* 1.01 PGW 13/07/99 Changes to keep reconstruction data to 16 bit
* 1.00 PGW 14/06/99 Configuration baseline
*
*****************************************************************************
*/
/*
Use Tim's optimized version.
*/
#define USING_TIMS 1
/****************************************************************************
* Header Files
*****************************************************************************
*/
#define STRICT // Strict type checking.
#include "codec_common.h"
#include "reconstruct.h"
/****************************************************************************
* Module constants.
*****************************************************************************
*/
/****************************************************************************
* Imports.
*****************************************************************************
*/
extern INT32 * XX_LUT;
/****************************************************************************
* Exported Global Variables
*****************************************************************************
*/
/****************************************************************************
* Exported Functions
*****************************************************************************
*/
/****************************************************************************
* Module Statics
*****************************************************************************
*/
/* Packed constants, each 8 bytes wide (the size of one MMX register). */
/* Four 16-bit lanes holding 1. */
INT16 Ones[4] = {1,1,1,1};
/* Four 16-bit lanes holding 128. */
INT16 OneTwoEight[4] = {128,128,128,128};
/* Eight bytes of 0x80; MMXReconIntra XORs its packed signed bytes with this
 * to add a bias of 128. */
UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
/* C4799 is the "function has no EMMS instruction" warning; it is silenced
 * for the MMX routines in this file. */
#pragma warning( disable : 4799 ) // Disable no emms instruction warning!
/****************************************************************************
* Forward References
*****************************************************************************
*/
/****************************************************************************
*
* ROUTINE : MMXReconIntra
*
* INPUTS : INT16 * TmpDataBuffer
* Not used by this routine.
*
* UINT16 * idct
* Pointer to the output from the idct for this block:
* 64 16-bit values (128 bytes), processed 8 at a time
* as signed words.
*
* UINT32 stride
* Added to the dest pointer after each row of 8 bytes.
*
* OUTPUTS : UINT8 * dest
* The reconstruction buffer; 8 rows of 8 bytes are written.
*
* RETURNS : None
*
* FUNCTION : Reconstructs an intra block - MMX version.
* Each value is saturated to a signed byte and then biased
* by 128 (XOR with 0x80) to give an unsigned byte.
*
* SPECIAL NOTES : Tim Murphy's optimized version.
* Uses MMX registers and does not execute emms.
*
* ERRORS : None.
*
****************************************************************************/
void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
{
(void) TmpDataBuffer;
__asm
{
// u pipe
// v pipe
mov eax,[idct] ; Signed 16 bit inputs
mov edx,[dest] ; Signed 8 bit outputs
movq mm0,[Eight128s] ; Set mm0 to 0x8080808080808080
;
mov ebx,[stride] ; Line stride in output buffer
lea ecx,[eax+128] ; Endpoint in input buffer
loop_label: ;
movq mm2,[eax] ; First four input values
;
// Saturate the 8 words of this row to signed bytes.
packsswb mm2,[eax+8] ; pack with next(high) four values
por mm0,mm0 ; stall
pxor mm2,mm0 ; Convert result to unsigned (same as add 128)
lea eax,[eax + 16] ; Step source buffer
// The flags set by this cmp are consumed by the jc below; the movq and lea
// in between do not modify flags.
cmp eax,ecx ; are we done
;
movq [edx],mm2 ; store results
;
lea edx,[edx+ebx] ; Step output buffer
// Carry is set while eax is still below the endpoint (unsigned compare).
jc loop_label ; Loop back if we are not done
}
// 6c/8 elts = 9c/8 = 1.125 c/pix
}
/****************************************************************************
*
* ROUTINE : MmxReconInter
*
* INPUTS : UINT8 * RefPtr
* The last frame reference
*
* INT16 * ChangePtr
* Pointer to the change data
*
* UINT32 LineStep
* Line Length in pixels in recon and ref images
*
* OUTPUTS : UINT8 * ReconPtr
* The reconstruction
*
* RETURNS : None
*
* FUNCTION : Reconstructs data from last data and change
*
* SPECIAL NOTES :
*
*
* ERRORS : None.
*
****************************************************************************/
#if USING_TIMS
// MmxReconInter, USING_TIMS variant: looped MMX implementation.
//
// For each of 8 rows: reads 8 reference bytes from RefPtr, widens them to
// 16 bit words, adds the 8 corresponding 16 bit changes from ChangePtr with
// signed saturation, packs the sums to bytes with unsigned saturation and
// writes the 8 bytes to ReconPtr. RefPtr and ReconPtr advance by LineStep
// per row; ChangePtr advances by 16 bytes per row (128 bytes in total).
// TmpDataBuffer is not used. No emms is executed before returning.
void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
{
(void) TmpDataBuffer;
_asm {
push edi
;; mov ebx, [ref]
;; mov ecx, [diff]
;; mov eax, [dest]
;; mov edx, [stride]
mov ebx, [RefPtr]
mov ecx, [ChangePtr]
mov eax, [ReconPtr]
mov edx, [LineStep]
pxor mm0, mm0
// edi marks the end of the change data: 8 rows * 16 bytes.
lea edi, [ecx + 128]
;
L:
movq mm2, [ebx] ; (+3 misaligned) 8 reference pixels
;
movq mm4, [ecx] ; first 4 changes
movq mm3, mm2
movq mm5, [ecx + 8] ; last 4 changes
punpcklbw mm2, mm0 ; turn first 4 refs into positive 16-bit #s
paddsw mm2, mm4 ; add in first 4 changes
punpckhbw mm3, mm0 ; turn last 4 refs into positive 16-bit #s
paddsw mm3, mm5 ; add in last 4 changes
add ebx, edx ; next row of reference pixels
packuswb mm2, mm3 ; pack result to unsigned 8-bit values
lea ecx, [ecx + 16] ; next row of changes
cmp ecx, edi ; are we done?
;
// movq and lea do not modify the flags, so the jc below still tests the
// result of the cmp above (carry set while ecx is below edi).
movq [eax], mm2 ; store result
;
lea eax, [eax+edx] ; next row of output
jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
pop edi
}
}
#else
// MmxReconInter, non USING_TIMS variant: the same per-row operation written
// out eight times with no loop.
//
// For each of 8 rows: reads 8 reference bytes from RefPtr, widens them to
// 16 bit words, adds the 8 corresponding 16 bit changes from ChangePtr with
// signed saturation, packs the sums to bytes with unsigned saturation and
// writes the 8 bytes to ReconPtr. RefPtr and ReconPtr advance by LineStep
// per row; ChangePtr advances by 16 bytes per row.
// TmpDataBuffer is not referenced. The emms at the end is commented out, so
// the MMX state is not cleared on return.
void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
{
// Note that the line step for the change data is assumed to be 8 * 32 bits.
// NOTE(review): the code below steps the change pointer by 16 bytes per row
// (8 x 16 bit values), so the "8 * 32 bits" figure above looks stale.
__asm
{
// Set up data pointers
mov eax,dword ptr [ReconPtr]
mov ebx,dword ptr [RefPtr]
mov ecx,dword ptr [ChangePtr]
mov edx,dword ptr [LineStep]
pxor mm6, mm6 ; Blank mmx6
// Row 1
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 2
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 3
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 4
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 5
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 6
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 7
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
add ebx,edx ; Step the reference pointer.
add ecx,16 ; Step the change pointer.
add eax,edx ; Step the reconstruction pointer
// Row 8
// Load the data values. The change data needs to be unpacked to words
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
movq mm1, mm0 ; Copy data
punpcklbw mm0, mm6 ; Low bytes to words
punpckhbw mm1, mm6 ; High bytes to words
// Load 8 elements of 16 bit change data
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
// Sum the data
paddsw mm0, mm2 ; First 4 values
paddsw mm1, mm4 ; Second 4 values
// Pack and store
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
//emms ; Clear the MMX state.
}
}
#endif
/****************************************************************************
*
* ROUTINE : MmxReconInterHalfPixel2
*
* INPUTS : UINT8 * RefPtr1, RefPtr2
* The last frame reference
*
* INT16 * ChangePtr
* Pointer to the change data
*
* UINT32 LineStep
* Line Length in pixels in recon and ref images
*
*
* OUTPUTS : UINT8 * ReconPtr
* The reconstruction
*
* RETURNS : None
*
* FUNCTION : Reconstructs data from half pixel reference data and change.
* Half pixel data interpolated from 2 references.
*
* SPECIAL NOTES :
*
*
* ERRORS : None.
*
****************************************************************************/
#if USING_TIMS
#define A 0
// MmxReconInterHalfPixel2, USING_TIMS variant: looped MMX implementation.
//
// For each of 8 rows: reads 8 bytes from RefPtr1 and 8 bytes from RefPtr2,
// widens both to 16 bit words, sums them and shifts right by one to form the
// average, adds the 8 corresponding 16 bit changes from ChangePtr, packs the
// result to bytes with unsigned saturation and writes 8 bytes to ReconPtr.
// RefPtr1, RefPtr2 and ReconPtr advance by LineStep per row; ChangePtr
// advances by 16 bytes per row (128 bytes in total).
//
// The macro A selects whether 1 is added to each sum before the shift
// (rounding the average up). With A defined as 0 the rounding code is
// compiled out and the average truncates.
// TmpDataBuffer is not used. No emms is executed before returning.
void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
UINT8 * RefPtr1, UINT8 * RefPtr2,
INT16 * ChangePtr, UINT32 LineStep )
{
# if A
// 65537 is 0x00010001, so the two entries together hold four 16 bit ones.
static culong FourOnes[2] = { 65537, 65537}; // only read once
# endif
(void) TmpDataBuffer;
_asm {
push esi
push edi
;; mov ecx, [diff]
;; mov esi, [ref1]
;; mov edi, [ref2]
;; mov ebx, [dest]
;; mov edx, [stride]
mov ecx, [ChangePtr]
mov esi, [RefPtr1]
mov edi, [RefPtr2]
mov ebx, [ReconPtr]
mov edx, [LineStep]
// eax marks the end of the change data: 8 rows * 16 bytes.
lea eax, [ecx+128]
# if A
movq mm1, [FourOnes]
# endif
pxor mm0, mm0
L:
movq mm2, [esi] ; (+3 misaligned) mm2 = row from ref1
;
movq mm4, [edi] ; (+3 misaligned) mm4 = row from ref2
movq mm3, mm2
punpcklbw mm2, mm0 ; mm2 = start ref1 as positive 16-bit #s
movq mm5, mm4
movq mm6, [ecx] ; mm6 = first 4 changes
punpckhbw mm3, mm0 ; mm3 = end ref1 as positive 16-bit #s
movq mm7, [ecx+8] ; mm7 = last 4 changes
punpcklbw mm4, mm0 ; mm4 = start ref2 as positive 16-bit #s
punpckhbw mm5, mm0 ; mm5 = end ref2 as positive 16-bit #s
paddw mm2, mm4 ; mm2 = start (ref1 + ref2)
paddw mm3, mm5 ; mm3 = end (ref1 + ref2)
# if A
paddw mm2, mm1 ; rounding adjustment
paddw mm3, mm1
# endif
psrlw mm2, 1 ; mm2 = start (ref1 + ref2)/2
psrlw mm3, 1 ; mm3 = end (ref1 + ref2)/2
// paddw is a wrapping add; clamping to 0..255 happens in packuswb below.
paddw mm2, mm6 ; add changes to start
paddw mm3, mm7 ; add changes to end
lea ecx, [ecx+16] ; next row idct
packuswb mm2, mm3 ; pack start|end to unsigned 8-bit
add esi, edx ; next row ref1
add edi, edx ; next row ref2
cmp ecx, eax
// movq and lea do not modify the flags, so the jc below still tests the
// result of the cmp above (carry set while ecx is below eax).
movq [ebx], mm2 ; store result
;
lea ebx, [ebx+edx]
jc L ; 22c / 8 elts = 33c / 8 pixels = 4.125 c/pix
pop edi
pop esi
}
}
#undef A
#else
// MmxReconInterHalfPixel2, non USING_TIMS variant: the per-row operation is
// written out eight times with no loop, and results are staged in a
// temporary buffer before being copied to the reconstruction buffer.
//
// Parameters:
//   TmpDataBuffer  Scratch memory used to stage the 8 result rows; 64 bytes
//                  are written to it and read back. The caller must supply
//                  at least that much.
//   ReconPtr       Destination; 8 rows of 8 bytes, LineStep apart.
//   RefPtr1/2      The two reference rows to average; 8 rows of 8 bytes each,
//                  LineStep apart.
//   ChangePtr      8 rows of 8 signed 16 bit changes, 16 bytes apart.
//   LineStep       Step between rows in the reference and recon buffers.
//
// For each row: both references are widened to 16 bit words, summed and
// shifted right by one (truncating average), the changes are added with a
// wrapping add, and the result is packed to bytes with unsigned saturation.
//
// Nothing is written to ReconPtr until all eight rows have been computed.
// NOTE(review): presumably this staging exists so that ReconPtr may overlap
// a reference block - confirm against the callers.
//
// The emms at the end is commented out, so the MMX state is not cleared on
// return.
void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
                              UINT8 * RefPtr1, UINT8 * RefPtr2,
                              INT16 * ChangePtr, UINT32 LineStep )
{
    // TmpDataBuffer is a plain INT16 pointer, not a structure, so it is used
    // directly as the byte-addressed staging buffer. (The previous
    // "TmpDataBuffer->TmpReconBuffer" could not compile for an INT16 *.)
    UINT8 * TmpDataPtr = (UINT8 *)TmpDataBuffer;

    // The change data is read at 16 byte row intervals (8 x 16 bit values).
    __asm
    {
        pxor mm6, mm6 ; Blank mmx6
        // Set up data pointers
        mov eax,dword ptr [RefPtr1]
        mov ebx,dword ptr [RefPtr2]
        mov edx,dword ptr [LineStep]
        // Row 1
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        // Load the data values (Ref1 and Ref2) and unpack to signed 16 bit values
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 2
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,16
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm3, mm2 ; Copy data
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+8],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 3
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,32
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+16],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 4
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,48
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+24],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 5
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,64
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+32],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 6
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,80
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+40],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 7
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,96
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+48],mm0 ; Write the data out to the temporary results buffer
        add eax,edx ; Step the reference pointers
        add ebx,edx
        // Row 8
        // Load the change pointer
        mov ecx,dword ptr [ChangePtr]
        add ecx,112
        // Load the data values (Ref1 and Ref2).
        movq mm0,dword ptr [eax] ; Load 8 elements of source data
        movq mm2,dword ptr [ebx] ; Load 8 elements of source data
        movq mm1, mm0 ; Copy data
        movq mm3, mm2 ; Copy data
        punpcklbw mm0, mm6 ; Low bytes to words
        punpckhbw mm1, mm6 ; High bytes to words
        punpcklbw mm2, mm6 ; Low bytes to words
        punpckhbw mm3, mm6 ; High bytes to words
        // Average Ref1 and Ref2
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm3 ; Second 4 values
        psrlw mm0, 1
        psrlw mm1, 1
        // Load 8 elements of 16 bit change data
        movq mm2,dword ptr [ecx] ; Load 4 elements of change data
        movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
        // Sum the data reference and difference data
        paddw mm0, mm2 ; First 4 values
        paddw mm1, mm4 ; Second 4 values
        // Pack and store
        mov ecx,dword ptr [TmpDataPtr]
        packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
        movq dword ptr [ecx+56],mm0 ; Write the data out to the temporary results buffer
        // Now copy the results back to the reconstruction buffer.
        mov eax,dword ptr [ReconPtr] ; Load the reconstruction Pointer
        mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
        // Row 1
        movq mm0,dword ptr [ecx] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 2
        movq mm0,dword ptr [ecx+8] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 3
        movq mm0,dword ptr [ecx+16] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 4
        movq mm0,dword ptr [ecx+24] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 5
        movq mm0,dword ptr [ecx+32] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 6
        movq mm0,dword ptr [ecx+40] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 7
        movq mm0,dword ptr [ecx+48] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        // Row 8
        movq mm0,dword ptr [ecx+56] ; Load 8 elements of results data
        movq dword ptr [eax],mm0 ; Write the data to the reconstruction buffer.
        add eax,edx ; Step the reconstruction pointer
        //emms
    }
}
#endif

View file

@ -0,0 +1,351 @@
/****************************************************************************
*
* Module Title : SystemDependant.c
*
* Description : Miscellaneous system dependant functions
*
* AUTHOR : Paul Wilkins
*
*****************************************************************************
* Revision History
*
* 1.20 YWX 06-Nov-02 Added forward DCT function optimized for Pentium 4
* 1.19 YWX 15-Jun-01 added function pointer setups for new deblocking filter
* 1.18 YWX 26-Apr-01 Fixed the cpu frequency detection bug caused by Sleep()
* 1.17 JBX 22-Mar-01 Merged with new vp4-mapca bitstream
* 1.16 JBB 26-Jan-01 Cleaned out unused function
* 1.15 YWX 08-dec-00 Added WMT PostProcessor and
* moved function declarations into _head files
* 1.14 JBB 30 NOV 00 Version number changes
* 1.13 YWX 03-Nov-00 Optimized postprocessor filters
* 1.12 YWX 02-Nov-00 Added new loopfilter function pointers
* 1.11 YWX 19-Oct-00 Added 1-2 Scaling functions pointers
* 1.10 jbb 16 oct 00 added ifdefs to insure version code
* 1.09 YWX 04-Oct-00 Added function pointers for scaling
* 1.08 YWX 06 Sep 00 Added function pointers for new deringing filter
* using frag based Q Value.
* 1.07 JBB 21 Aug 00 New More Blurry in high variance area deringer
* 1.06 YWX 2 Aug 00 Added function pointers for postprocess
* 1.05 YWX 15/05/00 Added functions to check processor frequency
* and more function pointers for postprocessor
* 1.04 YWX 08/05/00 Added function pointers setup for postprocess
* 1.03 SJL 20/04/00 Added ability to enable the new dequant code.
* 1.02 SJL 22/03/00 Function pointers for the loop filter.
* 1.01 JBB 21/03/00 More Function Pointers for optimized playback
* 1.00 PGW 12/10/99 Configuration baseline
*
*****************************************************************************
*/
/****************************************************************************
* Header Files
*****************************************************************************
*/
#include "codec_common.h"
#include "vputil_if.h"
#include "cpuidlib.h"
// Global debugging aids.
// When non-zero, UtilMachineSpecificConfig fills every idct/idctc table slot
// with the full-size routine instead of choosing the idct1/idct10 shortcuts.
int fastIDCTDisabled = 0;
// When non-zero, GetProcessorFlags uses CPUID (below) in place of the value
// returned by findCPUId().
int forceCPUID = 0;
// Processor type substituted for the detected one while forceCPUID is set.
int CPUID = 0;
extern void GetProcessorFlags(INT32 *MmxEnabled, INT32 *XmmEnabled, INT32 *WmtEnabled);
// Scalar (no mmx) reconstruction functions
extern void ClearSysState_C(void);
extern void IDctSlow( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void IDct10( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void IDct1( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void ScalarReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void ScalarReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void ScalarReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
extern void ReconBlock_C(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
extern void SubtractBlock_C( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
extern void UnpackBlock_C( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
extern void AverageBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
extern void CopyBlock_C(unsigned char *src, unsigned char *dest, unsigned int srcstride);
extern void Copy12x12_C(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
extern void fdct_short_C ( INT16 * InputData, INT16 * OutputData );
extern void FilterBlockBil_8_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
extern void FilterBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
// MMx versions
extern void fdct_MMX ( INT16 * InputData, INT16 * OutputData );
extern void ClearMmx(void);
extern void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
extern void MMX_idct( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void MMX_idct10( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void MMX_idct1( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void MMX_idct_DX( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void MMX_idct10_DX( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void ReconBlock_MMX(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
extern void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
extern void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
extern void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride);
extern void Copy12x12_MMX(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
extern void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
extern void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
// WMT versions
extern void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
extern void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
extern void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
extern void Wmt_idct1( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void Wmt_IDct_Dx( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void Wmt_IDct10_Dx( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
extern void fdct_WMT(short *InputData, short *OutputData);
extern void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
extern void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
#define IdctAdjustBeforeShift 8
extern UINT16 idctconstants[(4+7+1) * 4];
extern UINT16 idctcosTbl[ 7];
void fillidctconstants(void)
{
int j = 16;
UINT16 * p;
do
{
idctconstants[ --j] = 0;
}
while( j);
idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
j = 1;
do
{
p = idctconstants + ( (j+3) << 2);
p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
}
while( ++j <= 7);
idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
}
/****************************************************************************
*
* ROUTINE : GetProcessorFlags
*
* INPUTS : None
*
* OUTPUTS : INT32 *MmxEnabled
* TRUE for the PII, AMDK63D, AMDK6, PMMX, XMM and WMT
* processor types, otherwise FALSE.
*
* INT32 *XmmEnabled
* TRUE for the XMM and WMT processor types, otherwise FALSE.
*
* INT32 *WmtEnabled
* TRUE for the WMT processor type, otherwise FALSE.
*
* RETURNS : None
*
* FUNCTION : Classifies the processor type reported by findCPUId() (or
* the global CPUID when forceCPUID is non-zero) into three
* capability flags.
*
* SPECIAL NOTES : All three outputs are always written. A processor type
* that is not listed below is reported with every flag FALSE.
*
*
* ERRORS : None.
*
****************************************************************************/
void GetProcessorFlags
(
    INT32 *MmxEnabled,
    INT32 *XmmEnabled,
    INT32 *WmtEnabled
)
{
    PROCTYPE CPUType = findCPUId();

    // Debugging aid: replace the detected type with the forced one.
    if(forceCPUID)
        CPUType = CPUID;

    // Start from "no SIMD support" so that an unrecognised processor type
    // (for example an arbitrary forced CPUID) cannot leave the outputs
    // unwritten for the caller to read.
    *MmxEnabled = FALSE;
    *XmmEnabled = FALSE;
    *WmtEnabled = FALSE;

    switch(CPUType)
    {
    case PII :
    case AMDK63D:
    case AMDK6 :
    case PMMX :
        *MmxEnabled = TRUE;
        break;

    case XMM :
        *MmxEnabled = TRUE;
        *XmmEnabled = TRUE;
        break;

    case WMT :
        *MmxEnabled = TRUE;
        *XmmEnabled = TRUE;
        *WmtEnabled = TRUE;
        break;

    // Processor types explicitly treated as having none of the extensions.
    case X86 :
    case PPRO :
    case C6X86 :
    case C6X86MX:
    case AMDK5 :
    case MACG3 :
    case MAC68K :
    default :
        break;
    }
}
/****************************************************************************
*
* ROUTINE : MachineSpecificConfig
*
* INPUTS : None
*
* OUTPUTS : None
*
* RETURNS : None
*
* FUNCTION : Checks for machine specifc features such as MMX support
* sets approipriate flags and function pointers.
*
* SPECIAL NOTES : None.
*
*
* ERRORS : None.
*
****************************************************************************/
void UtilMachineSpecificConfig
(
    void
)
{
    INT32  MmxEnabled;
    INT32  XmmEnabled;      /* filled in by GetProcessorFlags; not consulted below */
    INT32  WmtEnabled;
    UINT32 slot;            /* index into the idct[] / idctc[] dispatch tables (0..64) */

    GetProcessorFlags( &MmxEnabled, &XmmEnabled, &WmtEnabled );

    /*
     * Exactly one of three implementation sets is installed: Willamette,
     * MMX, or plain C.  For the two inverse-DCT tables, slots 0..1 and
     * 2..10 receive reduced transforms unless fastIDCTDisabled is set, in
     * which case every slot gets the full transform.
     * NOTE(review): presumably the slot index is the number of coded
     * coefficients in the block -- confirm against the callers.
     */
    if ( WmtEnabled )
    {
        /* Willamette */
        for ( slot = 0; slot <= 64; slot++ )
        {
            if ( fastIDCTDisabled )
            {
                idct[slot]  = Wmt_IDct_Dx;
                idctc[slot] = MMX_idct;
            }
            else if ( slot <= 1 )
            {
                idct[slot]  = Wmt_idct1;
                idctc[slot] = Wmt_idct1;
            }
            else if ( slot <= 10 )
            {
                idct[slot]  = Wmt_IDct10_Dx;
                idctc[slot] = MMX_idct10;
            }
            else
            {
                idct[slot]  = Wmt_IDct_Dx;
                idctc[slot] = MMX_idct;
            }
        }

        fdct_short           = fdct_WMT;

        ReconIntra           = WmtReconIntra;
        ReconInter           = WmtReconInter;
        ReconInterHalfPixel2 = WmtReconInterHalfPixel2;

        ClearSysState        = ClearMmx;

        AverageBlock         = AverageBlock_MMX;
        UnpackBlock          = UnpackBlock_MMX;
        ReconBlock           = ReconBlock_MMX;
        SubtractBlock        = SubtractBlock_MMX;
        CopyBlock            = CopyBlockMMX;
        Copy12x12            = Copy12x12_MMX;

        FilterBlockBil_8     = FilterBlockBil_8_wmt;
        FilterBlock          = FilterBlock_wmt;
    }
    else if ( MmxEnabled )
    {
        for ( slot = 0; slot <= 64; slot++ )
        {
            if ( fastIDCTDisabled )
            {
                idctc[slot] = MMX_idct_DX;
                idct[slot]  = MMX_idct_DX;
            }
            else if ( slot <= 1 )
            {
                idctc[slot] = MMX_idct1;
                idct[slot]  = MMX_idct1;
            }
            else if ( slot <= 10 )
            {
                idctc[slot] = MMX_idct10;
                idct[slot]  = MMX_idct10_DX;
            }
            else
            {
                idctc[slot] = MMX_idct;
                idct[slot]  = MMX_idct_DX;
            }
        }

        fdct_short           = fdct_MMX;

        ReconIntra           = MMXReconIntra;
        ReconInter           = MmxReconInter;
        ReconInterHalfPixel2 = MmxReconInterHalfPixel2;

        ClearSysState        = ClearMmx;

        AverageBlock         = AverageBlock_MMX;
        UnpackBlock          = UnpackBlock_MMX;
        ReconBlock           = ReconBlock_MMX;
        SubtractBlock        = SubtractBlock_MMX;
        CopyBlock            = CopyBlockMMX;
        Copy12x12            = Copy12x12_MMX;

        FilterBlockBil_8     = FilterBlockBil_8_mmx;
        FilterBlock          = FilterBlock_mmx;
    }
    else
    {
        /* No MMX or Willamette support reported: install the C routines. */
        for ( slot = 0; slot <= 64; slot++ )
        {
            if ( fastIDCTDisabled )
            {
                idctc[slot] = IDctSlow;
                idct[slot]  = IDctSlow;
            }
            else if ( slot <= 1 )
            {
                idctc[slot] = IDct1;
                idct[slot]  = IDct1;
            }
            else if ( slot <= 10 )
            {
                idctc[slot] = IDct10;
                idct[slot]  = IDct10;
            }
            else
            {
                idctc[slot] = IDctSlow;
                idct[slot]  = IDctSlow;
            }
        }

        fdct_short           = fdct_short_C;

        ClearSysState        = ClearSysState_C;

        ReconIntra           = ScalarReconIntra;
        ReconInter           = ScalarReconInter;
        ReconInterHalfPixel2 = ScalarReconInterHalfPixel2;

        AverageBlock         = AverageBlock_C;
        UnpackBlock          = UnpackBlock_C;
        ReconBlock           = ReconBlock_C;
        SubtractBlock        = SubtractBlock_C;
        CopyBlock            = CopyBlock_C;
        /* The same Copy12x12_MMX routine is installed on all three paths. */
        Copy12x12            = Copy12x12_MMX;

        FilterBlockBil_8     = FilterBlockBil_8_C;
        FilterBlock          = FilterBlock_C;
    }
}

View file

@ -0,0 +1,507 @@
/****************************************************************************
*
* Module Title : newLoopTest_asm.c
*
* Description : Codec specific functions
*
* AUTHOR : Yaowu Xu
*
*****************************************************************************
* Revision History
*
* 1.02 YWX 03-Nov-00 Changed confusing variable name
* 1.01 YWX 02-Nov-00 Added the set of functions
* 1.00 YWX 19-Oct-00 configuration baseline
*****************************************************************************
*/
/****************************************************************************
* Header Files
*****************************************************************************
*/
#define STRICT /* Strict type checking. */
#include "codec_common.h"
#include <math.h>
#include <string.h>
/****************************************************************************
* Module constants.
*****************************************************************************
*/
/* Yields the smaller of a and b.  This is a function-like macro, so the
 * selected argument is evaluated twice: do not pass expressions with side
 * effects (e.g. MIN(i++, j)). */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
/****************************************************************************
* Explicit Imports
*****************************************************************************
*/
/* Implemented outside this file.
 * NOTE(review): judging by the name and prototype this presumably saturates
 * a block of INT16 values into UINT8 output using the two line steps --
 * confirm against its definition. */
extern void SatUnsigned8( UINT8 * ResultPtr, INT16 * DataBlock,
UINT32 ResultLineStep, UINT32 DataLineStep );
/****************************************************************************
* Exported Global Variables
*****************************************************************************
*/
/****************************************************************************
* Exported Functions
*****************************************************************************
*/
/****************************************************************************
* Module Statics
*****************************************************************************
*/
/****************************************************************************
* Forward References
*****************************************************************************
*/
/****************************************************************************
*
* ROUTINE : ClearMmx
*
* INPUTS : None.
*
* OUTPUTS : None.
*
* RETURNS : None.
*
* FUNCTION : Executes a single EMMS instruction, which empties the MMX
* state so the x87 floating-point registers are usable again
* after MMX code has run.
*
* SPECIAL NOTES : Written with MSVC-style inline assembly (__asm block).
*
* ERRORS : None.
*
****************************************************************************/
void ClearMmx(void)
{
__asm
{
emms ; Clear the MMX state.
}
}
/****************************************************************************
*
* ROUTINE : CopyBlockMMX
*
* INPUTS : unsigned char *src
* First byte of the source block.
*
* unsigned int srcstride
* Byte distance between consecutive rows. The same
* value is applied to BOTH the source and the
* destination pointers.
*
* OUTPUTS : unsigned char *dest
* First byte of the destination block.
*
* RETURNS : None.
*
* FUNCTION : Copies 8 rows of 8 bytes from src to dest using 64-bit MMX
* loads and stores, four rows at a time.
*
* SPECIAL NOTES : No EMMS is executed here; the MMX state is left for the
* caller to clear. Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride)
{
// Local copies of the parameters; the assembly below refers to these.
unsigned char *s = src;
unsigned char *d = dest;
unsigned int stride = srcstride;
// recon copy
_asm
{
// ecx = row stride, eax = source pointer, ebx = destination pointer
mov ecx, [stride]
mov eax, [s]
mov ebx, [d]
// edx = 3 * stride, i.e. the offset of the fourth row of a group
lea edx, [ecx + ecx * 2]
// Rows 0-3: load 8 bytes from each source row
movq mm0, [eax]
movq mm1, [eax + ecx]
movq mm2, [eax + ecx*2]
movq mm3, [eax + edx]
// Advance the source pointer to row 4
lea eax, [eax + ecx*4]
// Rows 0-3: store to the destination
movq [ebx], mm0
movq [ebx + ecx], mm1
movq [ebx + ecx*2], mm2
movq [ebx + edx], mm3
// Advance the destination pointer to row 4
lea ebx, [ebx + ecx * 4]
// Rows 4-7: load
movq mm0, [eax]
movq mm1, [eax + ecx]
movq mm2, [eax + ecx*2]
movq mm3, [eax + edx]
// Rows 4-7: store
movq [ebx], mm0
movq [ebx + ecx], mm1
movq [ebx + ecx*2], mm2
movq [ebx + edx], mm3
}
}
/****************************************************************************
*
* ROUTINE : Copy12x12_MMX
*
* INPUTS : const unsigned char *src
* First byte of the 12x12 source region.
*
* unsigned int srcstride
* Byte distance between consecutive source rows.
*
* unsigned int deststride
* Byte distance between consecutive destination rows.
*
* OUTPUTS : unsigned char *dest
* First byte of the 12x12 destination region.
*
* RETURNS : None.
*
* FUNCTION : Copies 12 rows of 12 bytes from src to dest.
*
* SPECIAL NOTES : Despite the name this routine contains no MMX code. Each
* row is copied with memcpy rather than through casted 32-bit
* pointers, so neither pointer needs any particular alignment
* and the const qualifier on src is respected. A source row
* must not overlap the destination row it is copied to.
*
* ERRORS : None.
*
****************************************************************************/
void Copy12x12_MMX(
    const unsigned char *src,
    unsigned char *dest,
    unsigned int srcstride,
    unsigned int deststride)
{
    enum { COPY12_DIM = 12 };   /* block is COPY12_DIM rows of COPY12_DIM bytes */
    unsigned int row;

    for (row = 0; row < COPY12_DIM; row++)
    {
        memcpy(dest, src, COPY12_DIM);
        src  += srcstride;
        dest += deststride;
    }
}
/****************************************************************************
*
* ROUTINE : AverageBlock_MMX
*
* INPUTS : UINT8 *ReconPtr1, UINT8 *ReconPtr2
* The two 8-bit blocks to be averaged.
*
* UINT32 ReconPixelsPerLine
* Byte step between consecutive rows, applied to both
* ReconPtr1 and ReconPtr2.
*
* OUTPUTS : UINT16 *ReconRefPtr
* Receives the averages as 16-bit values, 8 per row
* (16 bytes), rows stored back to back.
*
* RETURNS : None.
*
* FUNCTION : For BLOCK_HEIGHT_WIDTH rows of 8 pixels, writes
* (ReconPtr1[i] + ReconPtr2[i]) >> 1 to the output. The shift
* truncates; no rounding term is added.
*
* SPECIAL NOTES : This is the MMX implementation. No EMMS is executed here.
* Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
{
__asm
{
// esi/eax = the two sources, edi = output, ecx = row counter, edx = row step
mov esi, ReconPtr1
mov eax, ReconPtr2
mov edi, ReconRefPtr
mov ecx, BLOCK_HEIGHT_WIDTH
mov edx, ReconPixelsPerLine
// mm7 = 0, used to zero-extend bytes to words when unpacking
pxor mm7, mm7
AverageBlock_Loop:
// Load 8 pixels from each source
movq mm0, [esi]
movq mm1, [eax]
// Low 4 pixels of each source -> words in mm0/mm1; copies kept in mm2/mm3
movq mm2, mm0
punpcklbw mm0, mm7
movq mm3, mm1
punpcklbw mm1, mm7
// Low half: sum, then halve
paddw mm0, mm1
punpckhbw mm2, mm7
psraw mm0, 1
punpckhbw mm3, mm7
// High half: sum, then halve
paddw mm2, mm3
movq [edi], mm0
psraw mm2, 1
// Step both sources one row and the output by 16 bytes
add esi, edx
add eax, edx
add edi, 16
// edi has already been advanced, so [edi-8] is the second half of this row
movq [edi-8], mm2
dec ecx
jnz AverageBlock_Loop
}
/*
Reference C version of the loop above (kept for documentation only):
UINT32 i;
// For each block row
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
{
ReconRefPtr[0] = (INT16)((INT32)(ReconPtr1[0])+ ((INT32)ReconPtr2[0]))>>1;
ReconRefPtr[1] = (INT16)((INT32)(ReconPtr1[1])+ ((INT32)ReconPtr2[1]))>>1;
ReconRefPtr[2] = (INT16)((INT32)(ReconPtr1[2])+ ((INT32)ReconPtr2[2]))>>1;
ReconRefPtr[3] = (INT16)((INT32)(ReconPtr1[3])+ ((INT32)ReconPtr2[3]))>>1;
ReconRefPtr[4] = (INT16)((INT32)(ReconPtr1[4])+ ((INT32)ReconPtr2[4]))>>1;
ReconRefPtr[5] = (INT16)((INT32)(ReconPtr1[5])+ ((INT32)ReconPtr2[5]))>>1;
ReconRefPtr[6] = (INT16)((INT32)(ReconPtr1[6])+ ((INT32)ReconPtr2[6]))>>1;
ReconRefPtr[7] = (INT16)((INT32)(ReconPtr1[7])+ ((INT32)ReconPtr2[7]))>>1;
// Start next row
ReconPtr1 += ReconPixelsPerLine;
ReconPtr2 += ReconPixelsPerLine;
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
}
*/
}
/****************************************************************************
*
* ROUTINE : UnpackBlock_MMX
*
* INPUTS : UINT8 *ReconPtr
* Block of 8-bit data to be widened.
*
* UINT32 ReconPixelsPerLine
* Byte step between consecutive rows of ReconPtr.
*
* OUTPUTS : INT16 *ReconRefPtr
* Receives the zero-extended 16-bit values, 8 per row
* (16 bytes), rows stored back to back.
*
* RETURNS : None.
*
* FUNCTION : Converts BLOCK_HEIGHT_WIDTH rows of 8 unsigned bytes to
* 16-bit values.
*
* SPECIAL NOTES : This is the MMX implementation. No EMMS is executed here.
* Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
{
__asm
{
// esi = source, edi = output, ecx = row counter, edx = source row step
mov esi, ReconPtr
mov edi, ReconRefPtr
mov ecx, BLOCK_HEIGHT_WIDTH
mov edx, ReconPixelsPerLine
// mm7 = 0, interleaved with the bytes to zero-extend them to words
pxor mm7, mm7
UnpackBlock_Loop:
// Load 8 source bytes
movq mm0, [esi]
movq mm2, mm0
// Low 4 bytes -> 4 words, stored first
punpcklbw mm0, mm7
movq [edi], mm0
// High 4 bytes -> 4 words, stored second
punpckhbw mm2, mm7
add esi, edx
movq [edi+8], mm2
add edi, 16
dec ecx
jnz UnpackBlock_Loop
}
/*
Reference C version of the loop above (kept for documentation only):
UINT32 i;
// For each block row
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
{
ReconRefPtr[0] = (INT16)(ReconPtr[0]);
ReconRefPtr[1] = (INT16)(ReconPtr[1]);
ReconRefPtr[2] = (INT16)(ReconPtr[2]);
ReconRefPtr[3] = (INT16)(ReconPtr[3]);
ReconRefPtr[4] = (INT16)(ReconPtr[4]);
ReconRefPtr[5] = (INT16)(ReconPtr[5]);
ReconRefPtr[6] = (INT16)(ReconPtr[6]);
ReconRefPtr[7] = (INT16)(ReconPtr[7]);
// Start next row
ReconPtr += ReconPixelsPerLine;
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
}
*/
}
/****************************************************************************
*
* ROUTINE : SubtractBlock_MMX
*
* INPUTS : UINT8 *SrcBlock
* 8-bit source block.
*
* INT16 *DestPtr
* On entry holds the 16-bit values to subtract, 8 per
* row (16 bytes), rows stored back to back.
*
* UINT32 LineStep
* Byte step between consecutive rows of SrcBlock.
*
* OUTPUTS : INT16 *DestPtr
* Overwritten in place with SrcBlock[i] - DestPtr[i].
*
* RETURNS : None.
*
* FUNCTION : Subtracts the 16-bit values already in DestPtr from the
* zero-extended source pixels, for 8 rows of 8 values.
*
* SPECIAL NOTES : This is the MMX implementation. The row count is the
* literal 8 rather than BLOCK_HEIGHT_WIDTH. No EMMS is
* executed here. Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep )
{
__asm
{
// esi = source pixels, edi = in/out 16-bit block, edx = source row step
mov esi, SrcBlock
mov edi, DestPtr
mov edx, LineStep
mov ecx, 8
// mm7 = 0, used to zero-extend source bytes to words
pxor mm7, mm7
SubtractBlock_Loop:
// mm0 = 8 source bytes, mm1 = first 4 words currently in DestPtr
movq mm0, [esi]
movq mm1, [edi]
movq mm2, mm0
punpcklbw mm0, mm7
// mm3 = second 4 words currently in DestPtr
movq mm3, [edi+8]
// Low half: source - dest, written back over dest
psubw mm0, mm1
punpckhbw mm2, mm7
movq [edi], mm0
// High half: source - dest, written back over dest
psubw mm2, mm3
add esi, edx
movq [edi+8], mm2
add edi, 16
dec ecx
jnz SubtractBlock_Loop
}
/*
Reference C version of the loop above (kept for documentation only):
UINT32 i;
// For each block row
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
{
DestPtr[0] = (INT16)((INT32)SrcBlock[0] - (INT32)DestPtr[0]);
DestPtr[1] = (INT16)((INT32)SrcBlock[1] - (INT32)DestPtr[1]);
DestPtr[2] = (INT16)((INT32)SrcBlock[2] - (INT32)DestPtr[2]);
DestPtr[3] = (INT16)((INT32)SrcBlock[3] - (INT32)DestPtr[3]);
DestPtr[4] = (INT16)((INT32)SrcBlock[4] - (INT32)DestPtr[4]);
DestPtr[5] = (INT16)((INT32)SrcBlock[5] - (INT32)DestPtr[5]);
DestPtr[6] = (INT16)((INT32)SrcBlock[6] - (INT32)DestPtr[6]);
DestPtr[7] = (INT16)((INT32)SrcBlock[7] - (INT32)DestPtr[7]);
// Start next row
SrcBlock += LineStep;
DestPtr += BLOCK_HEIGHT_WIDTH;
}
*/
}
/****************************************************************************
*
* ROUTINE : ReconBlock_MMX
*
* INPUTS : INT16 *SrcBlock
* 16-bit change data, 8 values per row (16 bytes), rows
* stored back to back.
*
* INT16 *ReconRefPtr
* 16-bit reference data in the same layout.
*
* UINT32 LineStep
* Byte step between consecutive rows of DestBlock.
*
* OUTPUTS : UINT8 *DestBlock
* Receives 8 bytes per row for 8 rows.
*
* RETURNS : None.
*
* FUNCTION : Reconstructs a block: adds the reference data to the change
* data and stores the sums saturated to the unsigned 8-bit
* range (PACKUSWB).
*
* SPECIAL NOTES : This is the MMX implementation. The assembly only reads
* SrcBlock, whereas the commented-out C reference below
* accumulates into SrcBlock in place. No EMMS is executed
* here. Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void ReconBlock_MMX( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep)
{
__asm
{
// esi = change data, eax = reference data, edi = output, edx = output row step
mov esi, SrcBlock
mov eax, ReconRefPtr
mov edi, DestBlock
mov ecx, 8
mov edx, LineStep
// mm7 is cleared here but not referenced again in this routine
pxor mm7, mm7
ReconBlock_Loop:
// Load one row (8 words) from each input
movq mm0, [esi]
movq mm1, [eax]
movq mm2, [esi+8]
movq mm3, [eax+8]
// 16-bit adds (non-saturating)
paddw mm0, mm1
paddw mm2, mm3
// Pack the 8 sums to bytes with unsigned saturation and store the row
packuswb mm0, mm2
movq [edi], mm0
add esi, 16
add eax, 16
add edi, edx
dec ecx
jnz ReconBlock_Loop
}
/*
Reference C version (kept for documentation only):
UINT32 i;
INT16 *SrcBlockPtr = SrcBlock;
// For each block row
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
{
SrcBlock[0] += ReconRefPtr[0];
SrcBlock[1] += ReconRefPtr[1];
SrcBlock[2] += ReconRefPtr[2];
SrcBlock[3] += ReconRefPtr[3];
SrcBlock[4] += ReconRefPtr[4];
SrcBlock[5] += ReconRefPtr[5];
SrcBlock[6] += ReconRefPtr[6];
SrcBlock[7] += ReconRefPtr[7];
// Start next row
SrcBlock += BLOCK_HEIGHT_WIDTH;
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
}
// Saturated the block and write to the output
SatUnsigned8( DestBlock, SrcBlockPtr, LineStep, BLOCK_HEIGHT_WIDTH );
*/
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,281 @@
/****************************************************************************
*
* Module Title : WmtOptFunctions.c
*
* Description : willamette processor specific
* optimised versions of functions
*
* AUTHOR : Yaowu Xu
*
* Special Note:
*
*****************************************************************************
* Revision History
*
*
* 1.03 YWX 07-Dec-00 Removed constants and functions that are not in use
* Added push and pop ebx in WmtReconIntra
* 1.02 YWX 30 Aug 00 changed to be compatible with Microsoft compiler
* 1.01 YWX 13 JUL 00 New Willamette Optimized Functions
* 1.00 YWX 14/06/00 Configuration baseline from OptFunctions.c
*
*****************************************************************************
*/
/*
Use Tim's optimized version.
*/
/****************************************************************************
* Header Files
*****************************************************************************
*/
#define STRICT // Strict type checking.
#include "reconstruct.h"
/****************************************************************************
* Module constants.
*****************************************************************************
*/
/****************************************************************************
* Imports.
*****************************************************************************
*/
/****************************************************************************
* Exported Global Variables
*****************************************************************************
*/
/****************************************************************************
* Exported Functions
*****************************************************************************
*/
/****************************************************************************
* Module Statics
*****************************************************************************
*/
/* Eight bytes of 128 (0x80), declared 16-byte aligned.  WmtReconIntra loads
 * them into an XMM register and XORs them with its packed signed results to
 * convert those results to unsigned. */
_declspec(align(16)) static UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
#pragma warning( disable : 4799 ) // Disable no emms instruction warning!
/****************************************************************************
* Forward References
*****************************************************************************
*/
/****************************************************************************
*
* ROUTINE : WmtReconIntra
*
* INPUTS : INT16 *TmpDataBuffer
* Not used by this implementation.
*
* UINT16 *idct
* Output of the idct for this block: 8 rows of eight
* 16-bit values (128 bytes), treated as signed. It is
* read with MOVDQA, so it must be 16-byte aligned.
*
* UINT32 stride
* Byte step between consecutive rows of dest.
*
* OUTPUTS : UINT8 *dest
* The reconstruction buffer; 8 bytes are written per row.
*
* RETURNS : None
*
* FUNCTION : Reconstructs an intra block - wmt version. Each 16-bit input
* is saturated to the signed 8-bit range and then XORed with
* 0x80 (equivalent to adding 128) to give an unsigned pixel.
*
* SPECIAL NOTES : Uses SSE2 instructions on the XMM registers. Written with
* MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
{
(void)TmpDataBuffer;
__asm
{
push ebx
mov eax,[idct] ; Signed 16 bit inputs
mov edx,[dest] ; Unsigned 8 bit outputs
// Low 8 bytes of xmm0 = 0x80 each; a 64-bit MOVQ load clears the upper half
movq xmm0,QWORD PTR [Eight128s] ; Set xmm0 to 0x000000000000008080808080808080
pxor xmm3, xmm3 ; set xmm3 to 0
;
mov ebx,[stride] ; Line stride in output buffer
// 128 bytes = 8 rows of 16 bytes
lea ecx,[eax+128] ; Endpoint in input buffer
loop_label:
movdqa xmm2,XMMWORD PTR [eax] ; Read the eight inputs
// Pack the eight words to signed-saturated bytes in the low half of xmm2
packsswb xmm2,xmm3 ;
pxor xmm2,xmm0 ; Convert result to unsigned (same as add 128)
lea eax,[eax + 16] ; Step source buffer
// The flags from this compare are consumed by the JC below; the MOVQ and
// LEA in between do not modify flags
cmp eax,ecx ; are we done
movq QWORD PTR [edx],xmm2 ; store results
lea edx,[edx+ebx] ; Step output buffer
jc loop_label ; Loop back if we are not done
pop ebx
}
}
/****************************************************************************
*
* ROUTINE : WmtReconInter
*
* INPUTS : INT16 *TmpDataBuffer
* Not used by this implementation.
*
* UINT8 * RefPtr
* The last frame reference; 8 bytes are read per row.
*
* INT16 * ChangePtr
* Pointer to the change data: 8 rows of eight 16-bit
* values (128 bytes). It is read with MOVDQA, so it must
* be 16-byte aligned.
*
* UINT32 LineStep
* Byte step between consecutive rows, applied to both
* RefPtr and ReconPtr.
*
* OUTPUTS : UINT8 * ReconPtr
* The reconstruction; 8 bytes are written per row.
*
* RETURNS : None
*
* FUNCTION : Reconstructs data from last data and change: each reference
* pixel is widened to 16 bits, the change is added with signed
* saturation, and the result is packed back to 8 bits with
* unsigned saturation.
*
* SPECIAL NOTES : Uses SSE2 instructions on the XMM registers. All eight
* changes of a row are processed by each instruction; the
* "first 4 changes" wording in the per-line comments is
* stale. Written with MSVC-style inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
{
(void) TmpDataBuffer;
_asm {
push edi
// ebx = reference, ecx = changes, eax = output, edx = row step
mov ebx, [RefPtr]
mov ecx, [ChangePtr]
mov eax, [ReconPtr]
mov edx, [LineStep]
// xmm0 = 0, used both for zero-extension and as the upper half when packing
pxor xmm0, xmm0
// edi = end of the change data (128 bytes = 8 rows of 16 bytes)
lea edi, [ecx + 128]
L:
movq xmm2, QWORD ptr [ebx] ; (+3 misaligned) 8 reference pixels
movdqa xmm4, XMMWORD ptr [ecx] ; 8 changes
// Zero-extend the 8 reference bytes to 8 words
punpcklbw xmm2, xmm0 ;
add ebx, edx ; next row of reference pixels
// Saturating signed add of all 8 changes
paddsw xmm2, xmm4 ; add in first 4 changes
lea ecx, [ecx + 16] ; next row of changes
packuswb xmm2, xmm0 ; pack result to unsigned 8-bit values
// The flags from this compare are consumed by the JC below; the MOVQ and
// LEA in between do not modify flags
cmp ecx, edi ; are we done?
movq QWORD PTR [eax], xmm2 ; store result
lea eax, [eax+edx] ; next row of output
jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
pop edi
}
}
/****************************************************************************
*
* ROUTINE : WmtReconInterHalfPixel2
*
* INPUTS : INT16 *TmpDataBuffer
* Not used by this implementation.
*
* UINT8 * RefPtr1, RefPtr2
* The two references that are averaged; 8 bytes are
* read per row from each.
*
* INT16 * ChangePtr
* Pointer to the change data: 8 rows of eight 16-bit
* values (128 bytes). It is read with MOVDQA, so it must
* be 16-byte aligned.
*
* UINT32 LineStep
* Byte step between consecutive rows, applied to
* RefPtr1, RefPtr2 and ReconPtr.
*
* OUTPUTS : UINT8 * ReconPtr
* The reconstruction; 8 bytes are written per row.
*
* RETURNS : None
*
* FUNCTION : Reconstructs data from half pixel reference data and change.
* Half pixel data is interpolated from the 2 references as
* (ref1 + ref2) >> 1 (truncating, no rounding term); the change
* is then added with a non-saturating 16-bit add and the
* result is packed to 8 bits with unsigned saturation.
*
* SPECIAL NOTES : Uses SSE2 instructions on the XMM registers. The per-line
* comments that mention "mm" registers and "first 4 changes"
* are stale: the code uses XMM registers and handles all
* eight values of a row at once. Written with MSVC-style
* inline assembly.
*
* ERRORS : None.
*
****************************************************************************/
void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
UINT8 * RefPtr1, UINT8 * RefPtr2,
INT16 * ChangePtr, UINT32 LineStep )
{
(void)TmpDataBuffer;
_asm {
push esi
push edi
// ecx = changes, esi/edi = the two references, ebx = output, edx = row step
mov ecx, [ChangePtr]
mov esi, [RefPtr1]
mov edi, [RefPtr2]
mov ebx, [ReconPtr]
mov edx, [LineStep]
// eax = end of the change data (128 bytes = 8 rows of 16 bytes)
lea eax, [ecx+128]
// xmm0 = 0, used both for zero-extension and as the upper half when packing
pxor xmm0, xmm0
L:
movq xmm2, QWORD PTR [esi] ; (+3 misaligned) mm2 = row from ref1
movq xmm4, QWORD PTR [edi] ; (+3 misaligned) mm4 = row from ref2
// Zero-extend both rows of 8 bytes to 8 words each
punpcklbw xmm2, xmm0 ;
punpcklbw xmm4, xmm0 ;
movdqa xmm6, [ecx] ; mm6 = first 4 changes
paddw xmm2, xmm4 ; mm2 = start (ref1 + ref2)
psrlw xmm2, 1 ; mm2 = start (ref1 + ref2)/2
paddw xmm2, xmm6 ; add changes to start
lea ecx, [ecx+16] ; next row idct
packuswb xmm2, xmm0 ; pack start|end to unsigned 8-bit
add esi, edx ; next row ref1
add edi, edx ; next row ref2
// The flags from this compare are consumed by the JC below; the MOVQ and
// LEA in between do not modify flags
cmp ecx, eax
movq QWORD PTR [ebx], xmm2 ; store result
;
lea ebx, [ebx+edx]
jc L
pop edi
pop esi
}
}