Initial community commit

2024-09-24 14:54:57 +02:00 · 2024-09-24 14:54:57 +02:00 · fc06254474
commit fc06254474
parent 537bcbc862
16440 changed files with 4239995 additions and 2 deletions
--- a/Src/libvpShared/corelibs/cdxv/vputil/Makefile
+++ b/Src/libvpShared/corelibs/cdxv/vputil/Makefile
@ -0,0 +1,61 @@
+## Target to build: base name of the static library (the archive is ${TARGET}.a)
+
+TARGET 			=libvputil
+
+## Tools: ecc is used both to compile and to link
+CC      		= ecc
+LD      		= ecc
+AR      		= ar
+OBJDUMP 		= objdump
+RM      		= rm -f
+
+## Directories: Windows-style paths rooted at TOPDIR
+TOPDIR  		=C:\DuckSoft
+PRIVATEINCLUDE  =${TOPDIR}\private\include
+CORELIBSINCLUDE =${TOPDIR}\private\corelibs\include
+CDXVINCLUDE     =${TOPDIR}\private\corelibs\cdxv\include 
+VPPPINCLUDE     =${TOPDIR}\private\corelibs\cdxv\vputil\include 
+CURRENTDIR 		=${TOPDIR}\private\corelibs\cdxv\vputil
+LIBDIR			=${TOPDIR}\private\corelibs\lib\mapca 
+
+## Compile flags: include paths, preprocessor defines, optimisation level and ecc options
+ALLINCLUDES     =-I${CDXVINCLUDE} -I${CORELIBSINCLUDE} -I${PRIVATEINCLUDE} -I${VPPPINCLUDE}
+VP6DEFINES		=-DPREDICT_2D -DVFW_COMP -DCOMPDLL -DPOSTPROCESS -DCPUISLITTLEENDIAN -DNORMALIZED
+ETIDEFINES      =-DMAPCA
+ALLDEFINES      =${VP6DEFINES} ${ETIDEFINES}
+DEBUG			=-O2
+CFLAGS 			=-msvc -align 8 -etswp -mP3OPT_nonlocal_calls_through_register=true \
+				-mP2OPT_suppress_library_call_conv_warnings=TRUE -maalign_branch_target \
+				-magen_interroutine_padding
+ALLFLAGS 		=$(CFLAGS) ${ALLDEFINES} ${ALLINCLUDES} ${DEBUG}
+
+
+## Files: object list; SRCS is derived from it by replacing the .o suffix with .c
+OBJS			=generic\fdct.o				\
+				generic\idctpart.o			\
+				generic\reconstruct.o		\
+				generic\vputil.o			\
+				bsp\bspFdct.o				\
+				bsp\bspIDct.o				\
+				bsp\bsprecon.o				\
+				bsp\bspvputil.o				\
+				bsp\uoptsystemdependant.o
+
+
+SRCS			=$(OBJS:.o=.c)
+
+ARTARGET		=${TARGET}.a
+
+# archive: bundle all objects into ${ARTARGET}, then move the archive into LIBDIR
+# NOTE(review): the rule is named by the literal word ARTARGET (not ${ARTARGET}), and every object depends on all of SRCS - confirm both are intended
+ARTARGET:${OBJS}
+	${AR} -cr ${ARTARGET} ${OBJS}
+	mv ${ARTARGET} ${LIBDIR}
+
+${OBJS} : ${SRCS}
+	$(CC) $(ALLFLAGS) -c $*.c -o $*.o
+
+clean:
+	${RM} ${OBJS} ${ARTARGET}
+
+
--- a/Src/libvpShared/corelibs/cdxv/vputil/generic/fdct.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/generic/fdct.c
@ -0,0 +1,312 @@
+/****************************************************************************
+*
+*   Module Title :     fdct.c
+*
+*   Description  :     Fast 8x8 DCT C-Implementation.
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "dct.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+// SIGNBITDUPPED(X): isolates the sign bit of a 32-bit value and shifts it down by 31. With an arithmetic right shift of signed values (implementation-defined in C) the result is -1 for negative X and 0 otherwise.
+#define SIGNBITDUPPED(X) ( (signed )(((X) & 0x80000000)) >> 31 )
+// DOROUND(X): adds 0xffff to X when X is negative so that a following ">> 16" truncates toward zero. X must be a modifiable lvalue. The expansion is a complete statement including its ';' because some call sites omit the semicolon.
+#define DOROUND(X) (X) = ( (SIGNBITDUPPED(X) & (0xffff)) + (X) ); 
+
+/****************************************************************************
+*  Module statics
+****************************************************************************/
+// Cosine multipliers cos(k*pi/16) for k = 1..7, scaled by 65536 and rounded.
+// Products formed with them are brought back to scale with ">> 16".
+// They are read-only lookup values, hence const.
+static const INT32 xC1S7 = 64277;
+static const INT32 xC2S6 = 60547;
+static const INT32 xC3S5 = 54491;
+static const INT32 xC4S4 = 46341;
+static const INT32 xC5S3 = 36410;
+static const INT32 xC6S2 = 25080;
+static const INT32 xC7S1 = 12785;
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : fdct_short_C_orig
+ *
+ *  INPUTS        : INT16 *InputData  : 64 16-bit samples, 8 rows of 8.
+ *
+ *  OUTPUTS       : INT16 *OutputData : 64 16-bit transform coefficients,
+ *                                      8 rows of 8.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs an 8x8 2-D fast DCT as two 1-D passes: every
+ *                  row is transformed into a 32-bit intermediate block,
+ *                  then every column of that block is transformed and
+ *                  narrowed to 16 bits.
+ *
+ *                  The algorithm used is derived from the flowgraph for
+ *                  the Vetterli and Ligtenberg fast 1-D dct given in the
+ *                  JPEG reference book by Pennebaker and Mitchell.
+ *
+ *  SPECIAL NOTES : All of InputData is read before anything is written
+ *                  to OutputData. 
+ *
+ ****************************************************************************/
+
+// Multiplies value by a 16-bit-scaled cosine constant and scales the product
+// back down by 16 bits, using DOROUND so the shift truncates toward zero.
+static INT32 fdct_mul_round ( INT32 coeff, INT32 value )
+{
+	INT32 product = coeff * value;
+
+	DOROUND ( product )
+	return product >> 16;
+}
+
+// One 8-point 1-D forward DCT: reads in[0..7], writes out[0..7].
+static void fdct_pass_1d ( const INT32 *in, INT32 *out )
+{
+	// Sums and differences of the paired samples.
+	INT32 sum07 = in[0] + in[7];
+	INT32 sum12 = in[1] + in[2];
+	INT32 sum34 = in[3] + in[4];
+	INT32 sum56 = in[5] + in[6];
+
+	INT32 dif07 = in[0] - in[7];
+	INT32 dif12 = in[1] - in[2];
+	INT32 dif34 = in[3] - in[4];
+	INT32 dif56 = in[5] - in[6];
+
+	INT32 sum0734 = sum07 + sum34;
+	INT32 sum1256 = sum12 + sum56;
+
+	// Products shared by the odd outputs: c4s4 * (s12 - s56) and c4s4 * (d12 + d56).
+	INT32 shared_sum = fdct_mul_round ( xC4S4, sum12 - sum56 );
+	INT32 shared_dif = fdct_mul_round ( xC4S4, dif12 + dif56 );
+
+	INT32 rot_x;
+	INT32 rot_y;
+
+	// Outputs 0 and 4.
+	out[0] = fdct_mul_round ( xC4S4, sum0734 + sum1256 );
+	out[4] = fdct_mul_round ( xC4S4, sum0734 - sum1256 );
+
+	// Rotation for outputs 2 and 6.
+	rot_x = dif12 - dif56;
+	rot_y = sum07 - sum34;
+	out[2] = fdct_mul_round ( xC6S2, rot_x ) + fdct_mul_round ( xC2S6, rot_y );
+	out[6] = fdct_mul_round ( xC6S2, rot_y ) - fdct_mul_round ( xC2S6, rot_x );
+
+	// Rotation for outputs 1 and 7.
+	rot_x = shared_sum + dif07;
+	rot_y = -( dif34 + shared_dif );
+	out[1] = fdct_mul_round ( xC1S7, rot_x ) - fdct_mul_round ( xC7S1, rot_y );
+	out[7] = fdct_mul_round ( xC7S1, rot_x ) + fdct_mul_round ( xC1S7, rot_y );
+
+	// Rotation for outputs 3 and 5.
+	rot_x = dif07 - shared_sum;
+	rot_y = dif34 - shared_dif;
+	out[3] = fdct_mul_round ( xC3S5, rot_x ) - fdct_mul_round ( xC5S3, rot_y );
+	out[5] = fdct_mul_round ( xC5S3, rot_x ) + fdct_mul_round ( xC3S5, rot_y );
+}
+
+void fdct_short_C_orig ( INT16 *InputData, INT16 *OutputData )
+{
+	INT32 RowResult[64];		// output of the row pass, input of the column pass
+	INT32 in[8];
+	INT32 out[8];
+	int   row, col, k;
+
+	// Pass 1: transform each row of the input into RowResult.
+	for ( row=0; row<8; row++ )
+	{
+		for ( k=0; k<8; k++ )
+			in[k] = InputData[row*8 + k];
+
+		fdct_pass_1d ( in, &RowResult[row*8] );
+	}
+
+	// Pass 2: transform each column of RowResult and narrow to 16 bits.
+	for ( col=0; col<8; col++ )
+	{
+		for ( k=0; k<8; k++ )
+			in[k] = RowResult[k*8 + col];
+
+		fdct_pass_1d ( in, out );
+
+		for ( k=0; k<8; k++ )
+			OutputData[k*8 + col] = (INT16)out[k];
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : fdct_short_C
+ *
+ *  INPUTS        : INT16 *DCTDataBuffer : 64 16-bit input values.
+ *                                         MODIFIED IN PLACE: every entry is
+ *                                         scaled up by FDCT_PRECISION_BITS
+ *                                         bits before the transform.
+ *
+ *  OUTPUTS       : INT16 *DCT_codes     : 64 16-bit transform coefficients.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs an 8x8 2-D fast DCT.
+ *
+ *                  The function to up the precision of FDCT by number of bits 
+ *                  defined by FDCT_PRECISION_BITS: the input is scaled up,
+ *                  fdct_short_C_orig is run, and the extra bits are then
+ *                  stripped from the output again.
+ *
+ *  SPECIAL NOTES : None. 
+ *
+ ****************************************************************************/
+void fdct_short_C ( INT16 *DCTDataBuffer, INT16 *DCT_codes )
+{
+
+    INT32 i;
+
+	// Increase precision on input to fdct.
+	// A multiply is used instead of "<<" because left-shifting a negative
+	// value is undefined behaviour in C; the result is the same.
+	for ( i = 0; i < 64; i++ )
+		DCTDataBuffer[i] = (INT16)( DCTDataBuffer[i] * (1 << FDCT_PRECISION_BITS) );
+
+	// Transform the error signal using the forward DCT to get set of transform coefficients
+	fdct_short_C_orig ( DCTDataBuffer, DCT_codes );
+
+	// Strip off the extra bits from the DCT output.
+	// This should ultimately be merged into the quantize process but there are also
+	// implications for DC prediction that would then need to be sorted
+	for ( i = 0; i < 64; i++ )
+	{	
+		// signed shift modified so behaves like "/" (truncates towards 0 for + and -)
+		if ( DCT_codes[i]  >= 0 )
+			DCT_codes[i] = (DCT_codes[i]) >> FDCT_PRECISION_BITS;
+		else
+			DCT_codes[i] = (DCT_codes[i] + FDCT_PRECISION_NEG_ADJ) >> FDCT_PRECISION_BITS;
+	}
+
+}
--- a/Src/libvpShared/corelibs/cdxv/vputil/generic/idctpart.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/generic/idctpart.c
@ -0,0 +1,921 @@
+/****************************************************************************
+*
+*   Module Title :     idctpart.c
+*
+*   Description  :     IDCT with multiple versions based on # of non 0 coeffs
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+
+#include "dct.h"
+#include "string.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define int32 int	// local shorthand used for the working values of the IDCT routines
+#define int16 short	// local shorthand used for coefficient, quantizer and output pointers
+#define IdctAdjustBeforeShift 8	// rounding term added before the final ">> 4" of the column passes
+
+#define xC1S7 64277	// cos(1*pi/16) * 65536, rounded
+#define xC2S6 60547	// cos(2*pi/16) * 65536, rounded
+#define xC3S5 54491	// cos(3*pi/16) * 65536, rounded
+#define xC4S4 46341	// cos(4*pi/16) * 65536, rounded
+#define xC5S3 36410	// cos(5*pi/16) * 65536, rounded
+#define xC6S2 25080	// cos(6*pi/16) * 65536, rounded
+#define xC7S1 12785	// cos(7*pi/16) * 65536, rounded
+
+/****************************************************************************
+*  Module statics
+*
+*  dequant_index : entry i is the raster position (row * 8 + column) in the
+*                  8x8 block at which the i-th coefficient of a zig-zag
+*                  ordered list is stored by the dequant routines.
+****************************************************************************/
+static const UINT32 dequant_index[64] = 
+{	
+    0,  1,  8,  16,  9,  2,  3, 10,
+	17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+#if 0   // AWG CODE NO LONGER USED IN CODEBASE.
+/*	Cos and Sin constant multipliers used during DCT and IDCT: CnSm = cos(n*pi/16) = sin(m*pi/16) */
+const double C1S7 = (double)0.9807852804032;
+const double C2S6 = (double)0.9238795325113;
+const double C3S5 = (double)0.8314696123025;
+const double C4S4 = (double)0.7071067811865;
+const double C5S3 = (double)0.5555702330196;
+const double C6S2 = (double)0.3826834323651;
+const double C7S1 = (double)0.1950903220161;
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+// DCT lookup tables. InitDctTables() points each *_TablePtr at the middle of its table so that it can be indexed with negative values.
+INT32 * C4S4_TablePtr;
+INT32 C4S4_Table[(COEFF_MAX * 4) + 1];
+
+INT32 * C6S2_TablePtr;
+INT32 C6S2_Table[(COEFF_MAX * 2) + 1];
+
+INT32 * C2S6_TablePtr;
+INT32 C2S6_Table[(COEFF_MAX * 2) + 1];
+
+INT32 * C1S7_TablePtr;
+INT32 C1S7_Table[(COEFF_MAX * 2) + 1];
+
+INT32 * C7S1_TablePtr;
+INT32 C7S1_Table[(COEFF_MAX * 2) + 1];
+
+INT32 * C3S5_TablePtr;
+INT32 C3S5_Table[(COEFF_MAX * 2) + 1];
+
+INT32 * C5S3_TablePtr;
+INT32 C5S3_Table[(COEFF_MAX * 2) + 1];
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     InitDctTables
+ *
+ *  INPUTS        :     None.
+ *
+ *  OUTPUTS       :     None.
+ *
+ *  RETURNS       :     void
+ *
+ *  FUNCTION      :     Initialises lookup tables used in IDCT.
+ *
+ *                      For each constant CnSm the matching table pointer is
+ *                      set to the middle of its table and entry i is filled
+ *                      with i * CnSm rounded to the nearest integer (0.5 is
+ *                      subtracted for negative i and added otherwise before
+ *                      the conversion to INT32).
+ *
+ *                      The C4S4 table covers -(2 * COEFF_MAX) up to but not
+ *                      including 2 * COEFF_MAX; all other tables cover
+ *                      -COEFF_MAX up to but not including COEFF_MAX.
+ *
+ *  SPECIAL NOTES :     NO LONGER USED IN CODEBASE. 
+ *
+ *                      NOTE(review): every table is declared one element
+ *                      larger than the range the loops fill, so the last
+ *                      element is never written here.
+ *
+ ****************************************************************************/
+void InitDctTables ( void )
+{
+    INT32 i;
+
+    C4S4_TablePtr = &C4S4_Table[COEFF_MAX*2];
+    for( i = -(2 * COEFF_MAX); i < (2 * COEFF_MAX); i++ )
+    {
+        if ( i < 0 )
+            C4S4_TablePtr[i] = (INT32)((i * C4S4) - 0.5);
+        else
+            C4S4_TablePtr[i] = (INT32)((i * C4S4) + 0.5);
+    }
+
+    C6S2_TablePtr = &C6S2_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C6S2_TablePtr[i] = (INT32)((i * C6S2) - 0.5);
+        else
+            C6S2_TablePtr[i] = (INT32)((i * C6S2) + 0.5);
+    }
+
+    C2S6_TablePtr = &C2S6_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C2S6_TablePtr[i] = (INT32)((i * C2S6) - 0.5);
+        else
+            C2S6_TablePtr[i] = (INT32)((i * C2S6) + 0.5);
+    }
+
+    C1S7_TablePtr = &C1S7_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C1S7_TablePtr[i] = (INT32)((i * C1S7) - 0.5);
+        else
+            C1S7_TablePtr[i] = (INT32)((i * C1S7) + 0.5);
+    }
+
+    C7S1_TablePtr = &C7S1_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C7S1_TablePtr[i] = (INT32)((i * C7S1) - 0.5);
+        else
+            C7S1_TablePtr[i] = (INT32)((i * C7S1) + 0.5);
+    }
+
+    C3S5_TablePtr = &C3S5_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C3S5_TablePtr[i] = (INT32)((i * C3S5) - 0.5);
+        else
+            C3S5_TablePtr[i] = (INT32)((i * C3S5) + 0.5);
+    }
+
+    C5S3_TablePtr = &C5S3_Table[COEFF_MAX];
+    for( i = -COEFF_MAX ; i < COEFF_MAX; i++ )
+    {
+        if ( i < 0 )
+            C5S3_TablePtr[i] = (INT32)((i * C5S3) - 0.5);
+        else
+            C5S3_TablePtr[i] = (INT32)((i * C5S3) + 0.5);
+    }
+}
+#endif
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : dequant_slow
+ *
+ *  INPUTS        : INT16 *dequant_coeffs : 64 dequantization step sizes.
+ *                  INT16 *quantized_list : 64 quantized DCT coeffs
+ *                                          (in zig-zag order).
+ *
+ *  OUTPUTS       : INT32 *DCT_block      : 8x8 de-quantized block
+ *                                          (in 2-D raster order).
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Multiplies each quantized coefficient by the step size
+ *                  at the same list position and stores the product at the
+ *                  raster position given by dequant_index.
+ *
+ *  SPECIAL NOTES : Uses dequant_index to invert zig-zag ordering. 
+ *
+ ****************************************************************************/
+void dequant_slow ( INT16 *dequant_coeffs, INT16 *quantized_list, INT32 *DCT_block )
+{
+	int pos;
+
+	// Walk the list in zig-zag order; dequant_index gives the raster slot.
+	for ( pos = 0; pos < 64; pos++ )
+		DCT_block[dequant_index[pos]] = quantized_list[pos] * dequant_coeffs[pos];
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : IDctSlow
+ *
+ *  INPUTS        : int16 *InputData   : Pointer to 8x8 quantized DCT coefficients.
+ *                  int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
+ *
+ *  OUTPUTS       : int16 *OutputData  : Pointer to 8x8 block to hold output.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : De-quantizes the input with dequant_slow, runs a 1-D
+ *                  inverse DCT over the eight rows of the de-quantized
+ *                  block and then over its eight columns, and writes the
+ *                  rounded, 4-bit down-shifted result to OutputData.
+ *
+ *  SPECIAL NOTES : Rows that are entirely zero are left untouched by the
+ *                  row pass; columns that are entirely zero produce a
+ *                  zero output column.
+ *
+ ****************************************************************************/
+void IDctSlow ( int16 *InputData, int16 *QuantMatrix, int16 *OutputData )
+{
+	int32 Work[64];					// de-quantized block, transformed in place by the row pass
+	int32 *src;
+	int16 *dst;
+	int32 a, b, c, d, e, f, g, h;	// first stage terms
+	int32 ad, bd, cd, dd;			// second stage, odd part
+	int32 ed, gd, fd, hd;			// second stage, even part
+	int32 add, bdd;
+	int   n, k;
+
+	// dequantize the input
+	dequant_slow ( QuantMatrix, InputData, Work );
+
+	// Row pass: results over-write the row they were computed from.
+	for ( n=0, src=Work; n<8; n++, src+=8 )
+	{
+		// Nothing to do for an all-zero row.
+		if ( (src[0] | src[1] | src[2] | src[3] | src[4] | src[5] | src[6] | src[7]) == 0 )
+			continue;
+
+		a = ((xC1S7 * src[1]) >> 16) + ((xC7S1 * src[7]) >> 16);
+		b = ((xC7S1 * src[1]) >> 16) - ((xC1S7 * src[7]) >> 16);
+		c = ((xC3S5 * src[3]) >> 16) + ((xC5S3 * src[5]) >> 16);
+		d = ((xC3S5 * src[5]) >> 16) - ((xC5S3 * src[3]) >> 16);
+
+		ad = (xC4S4 * (a - c)) >> 16;
+		bd = (xC4S4 * (b - d)) >> 16;
+		cd = a + c;
+		dd = b + d;
+
+		e = (xC4S4 * (src[0] + src[4])) >> 16;
+		f = (xC4S4 * (src[0] - src[4])) >> 16;
+		g = ((xC2S6 * src[2]) >> 16) + ((xC6S2 * src[6]) >> 16);
+		h = ((xC6S2 * src[2]) >> 16) - ((xC2S6 * src[6]) >> 16);
+
+		ed  = e - g;
+		gd  = e + g;
+		add = f + ad;
+		bdd = bd - h;
+		fd  = f - ad;
+		hd  = bd + h;
+
+		// Each result is narrowed to 16 bits before being stored back.
+		src[0] = (int16)(gd + cd);
+		src[7] = (int16)(gd - cd);
+		src[1] = (int16)(add + hd);
+		src[2] = (int16)(add - hd);
+		src[3] = (int16)(ed + dd);
+		src[4] = (int16)(ed - dd);
+		src[5] = (int16)(fd + bdd);
+		src[6] = (int16)(fd - bdd);
+	}
+
+	// Column pass: reads Work, writes OutputData.
+	for ( n=0, src=Work, dst=OutputData; n<8; n++, src++, dst++ )
+	{
+		// An all-zero column gives an all-zero output column.
+		if ( (src[0*8] | src[1*8] | src[2*8] | src[3*8] |
+			  src[4*8] | src[5*8] | src[6*8] | src[7*8]) == 0 )
+		{
+			for ( k=0; k<8; k++ )
+				dst[k*8] = 0;
+			continue;
+		}
+
+		a = ((xC1S7 * src[1*8]) >> 16) + ((xC7S1 * src[7*8]) >> 16);
+		b = ((xC7S1 * src[1*8]) >> 16) - ((xC1S7 * src[7*8]) >> 16);
+		c = ((xC3S5 * src[3*8]) >> 16) + ((xC5S3 * src[5*8]) >> 16);
+		d = ((xC3S5 * src[5*8]) >> 16) - ((xC5S3 * src[3*8]) >> 16);
+
+		ad = (xC4S4 * (a - c)) >> 16;
+		bd = (xC4S4 * (b - d)) >> 16;
+		cd = a + c;
+		dd = b + d;
+
+		e = (xC4S4 * (src[0*8] + src[4*8])) >> 16;
+		f = (xC4S4 * (src[0*8] - src[4*8])) >> 16;
+		g = ((xC2S6 * src[2*8]) >> 16) + ((xC6S2 * src[6*8]) >> 16);
+		h = ((xC6S2 * src[2*8]) >> 16) - ((xC2S6 * src[6*8]) >> 16);
+
+		// The rounding term is folded into the four values that feed every output.
+		ed  = (e - g) + IdctAdjustBeforeShift;
+		gd  = (e + g) + IdctAdjustBeforeShift;
+		add = (f + ad) + IdctAdjustBeforeShift;
+		fd  = (f - ad) + IdctAdjustBeforeShift;
+		bdd = bd - h;
+		hd  = bd + h;
+
+		dst[0*8] = (int16)((gd + cd) >> 4);
+		dst[7*8] = (int16)((gd - cd) >> 4);
+		dst[1*8] = (int16)((add + hd) >> 4);
+		dst[2*8] = (int16)((add - hd) >> 4);
+		dst[3*8] = (int16)((ed + dd) >> 4);
+		dst[4*8] = (int16)((ed - dd) >> 4);
+		dst[5*8] = (int16)((fd + bdd) >> 4);
+		dst[6*8] = (int16)((fd - bdd) >> 4);
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : dequant_slow10
+ *
+ *  INPUTS        : INT16 *dequant_coeffs : Pointer to dequantization step sizes.
+ *                  INT16 *quantized_list : Pointer to quantized DCT coeffs
+ *                                          (in zig-zag order).
+ *
+ *  OUTPUTS       : INT32 *DCT_block      : Pointer to 8x8 de-quantized block
+ *                                          (in 2-D raster order).
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : De-quantizes an 8x8 block of quantized DCT coeffs that
+ *                  only has non-zero coefficients in the first 10, i.e.
+ *                  only DC & AC1-9 are non-zero, AC10-63 __MUST_BE_ zero.
+ *
+ *  SPECIAL NOTES : Uses dequant_index to invert zig-zag ordering. 
+ *
+ *                  Only the first 32 entries of DCT_block (rows 0-3) are
+ *                  cleared; the remaining entries are left as they were,
+ *                  apart from raster entry 32 which receives list entry 10.
+ *                  IDct10 reads rows 0-3 only.
+ *
+ ****************************************************************************/
+void dequant_slow10 ( INT16 *dequant_coeffs, INT16 *quantized_list, INT32 *DCT_block )
+{
+	// Clear rows 0-3. The size is derived from the element type rather than
+	// hard-coding a byte count that assumes 4-byte elements.
+	memset ( DCT_block, 0, 32 * sizeof(DCT_block[0]) );
+
+	// Loop fully expanded for maximum speed.
+	// List entries 0..10 are written (one more than the ten named above).
+    DCT_block[dequant_index[0]]  = quantized_list[0]  * dequant_coeffs[0];
+    DCT_block[dequant_index[1]]  = quantized_list[1]  * dequant_coeffs[1];
+    DCT_block[dequant_index[2]]  = quantized_list[2]  * dequant_coeffs[2];
+    DCT_block[dequant_index[3]]  = quantized_list[3]  * dequant_coeffs[3];
+    DCT_block[dequant_index[4]]  = quantized_list[4]  * dequant_coeffs[4];
+    DCT_block[dequant_index[5]]  = quantized_list[5]  * dequant_coeffs[5];
+    DCT_block[dequant_index[6]]  = quantized_list[6]  * dequant_coeffs[6];
+    DCT_block[dequant_index[7]]  = quantized_list[7]  * dequant_coeffs[7];
+    DCT_block[dequant_index[8]]  = quantized_list[8]  * dequant_coeffs[8];
+    DCT_block[dequant_index[9]]  = quantized_list[9]  * dequant_coeffs[9];
+    DCT_block[dequant_index[10]] = quantized_list[10] * dequant_coeffs[10];
+
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : IDct10
+ *
+ *  INPUTS        : int16 *InputData   : Pointer to 8x8 quantized DCT coefficients.
+ *                  int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
+ *
+ *  OUTPUTS       : int16 *OutputData  : Pointer to 8x8 block to hold output.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Inverse quantizes and inverse DCT's input 8x8 block
+ *                  with non-zero coeffs only in DC & the first 9 AC coeffs.
+ *                  i.e. non-zeros ONLY in the following 10 positions:
+ *                  
+ *                          x  x  x  x  0  0  0  0
+ *                          x  x  x  0  0  0  0  0
+ *                          x  x  0  0  0  0  0  0
+ *                          x  0  0  0  0  0  0  0
+ *                          0  0  0  0  0  0  0  0
+ *                          0  0  0  0  0  0  0  0
+ *                          0  0  0  0  0  0  0  0
+ *                          0  0  0  0  0  0  0  0
+ *
+ *                  Only rows 0-3 are processed by the row pass, and each
+ *                  1-D transform reads only its first four inputs.
+ *
+ *  SPECIAL NOTES : Output data is in raster, not zig-zag, order.
+ *
+ ****************************************************************************/
+void IDct10 ( int16 *InputData, int16 *QuantMatrix, int16 *OutputData )
+{
+	int32 Work[64];					// de-quantized block, transformed in place by the row pass
+	int32 *src;
+	int16 *dst;
+	int32 a, b, c, d, e, f, g, h;	// first stage terms
+	int32 ad, bd, cd, dd;			// second stage, odd part
+	int32 ed, gd, fd, hd;			// second stage, even part
+	int32 add, bdd;
+	int   n, k;
+
+	// dequantize the input
+	dequant_slow10 ( QuantMatrix, InputData, Work );
+
+	// Row pass over the first four rows; results over-write the row.
+	for ( n=0, src=Work; n<4; n++, src+=8 )
+	{
+		// Nothing to do when the four inputs of this row are all zero.
+		if ( (src[0] | src[1] | src[2] | src[3]) == 0 )
+			continue;
+
+		a = (xC1S7 * src[1]) >> 16;
+		b = (xC7S1 * src[1]) >> 16;
+		c = (xC3S5 * src[3]) >> 16;
+		d = -((xC5S3 * src[3]) >> 16);
+
+		ad = (xC4S4 * (a - c)) >> 16;
+		bd = (xC4S4 * (b - d)) >> 16;
+		cd = a + c;
+		dd = b + d;
+
+		// Input 4 is not read, so both even-part terms are the same value.
+		e = (xC4S4 * src[0]) >> 16;
+		f = e;
+		g = (xC2S6 * src[2]) >> 16;
+		h = (xC6S2 * src[2]) >> 16;
+
+		ed  = e - g;
+		gd  = e + g;
+		add = f + ad;
+		bdd = bd - h;
+		fd  = f - ad;
+		hd  = bd + h;
+
+		// Each result is narrowed to 16 bits before being stored back.
+		src[0] = (int16)(gd + cd);
+		src[7] = (int16)(gd - cd);
+		src[1] = (int16)(add + hd);
+		src[2] = (int16)(add - hd);
+		src[3] = (int16)(ed + dd);
+		src[4] = (int16)(ed - dd);
+		src[5] = (int16)(fd + bdd);
+		src[6] = (int16)(fd - bdd);
+	}
+
+	// Column pass over all eight columns: reads Work, writes OutputData.
+	for ( n=0, src=Work, dst=OutputData; n<8; n++, src++, dst++ )
+	{
+		// A column whose first four entries are zero gives a zero output column.
+		if ( (src[0*8] | src[1*8] | src[2*8] | src[3*8]) == 0 )
+		{
+			for ( k=0; k<8; k++ )
+				dst[k*8] = 0;
+			continue;
+		}
+
+		a = (xC1S7 * src[1*8]) >> 16;
+		b = (xC7S1 * src[1*8]) >> 16;
+		c = (xC3S5 * src[3*8]) >> 16;
+		d = -((xC5S3 * src[3*8]) >> 16);
+
+		ad = (xC4S4 * (a - c)) >> 16;
+		bd = (xC4S4 * (b - d)) >> 16;
+		cd = a + c;
+		dd = b + d;
+
+		e = (xC4S4 * src[0*8]) >> 16;
+		f = e;
+		g = (xC2S6 * src[2*8]) >> 16;
+		h = (xC6S2 * src[2*8]) >> 16;
+
+		// The rounding term is folded into the four values that feed every output.
+		ed  = (e - g) + IdctAdjustBeforeShift;
+		gd  = (e + g) + IdctAdjustBeforeShift;
+		add = (f + ad) + IdctAdjustBeforeShift;
+		fd  = (f - ad) + IdctAdjustBeforeShift;
+		bdd = bd - h;
+		hd  = bd + h;
+
+		dst[0*8] = (int16)((gd + cd) >> 4);
+		dst[7*8] = (int16)((gd - cd) >> 4);
+		dst[1*8] = (int16)((add + hd) >> 4);
+		dst[2*8] = (int16)((add - hd) >> 4);
+		dst[3*8] = (int16)((ed + dd) >> 4);
+		dst[4*8] = (int16)((ed - dd) >> 4);
+		dst[5*8] = (int16)((fd + bdd) >> 4);
+		dst[6*8] = (int16)((fd - bdd) >> 4);
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : IDct1
+ *
+ *  INPUTS        : int16 *InputData   : Pointer to 8x8 quantized DCT coefficients.
+ *                  int16 *QuantMatrix : Pointer to 8x8 quantization matrix.
+ *
+ *  OUTPUTS       : int16 *OutputData  : Pointer to 8x8 block to hold output.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Inverse DCT's input 8x8 block with only one non-zero
+ *                  coeff in the DC position:
+ *                  
+ *                          x   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *
+ *  SPECIAL NOTES : Output data is in raster, not zig-zag, order.
+ *
+ ****************************************************************************/
+void IDct1 ( int16 *InputData, int16 *QuantMatrix, INT16 *OutputData )
+{
+	// Dequantize the DC term, add the bias of 15 and scale down by 5 bits.
+	const INT32 ScaledDc  = (INT32)(InputData[0] * QuantMatrix[0] + 15) >> 5;
+	const INT16 FillValue = (INT16)ScaledDc;
+
+	INT16 *Dest          = OutputData;
+	INT16 *const DestEnd = OutputData + 64;
+
+	// A DC-only block reconstructs to a flat block: all 64 outputs are equal.
+	while ( Dest != DestEnd )
+		*Dest++ = FillValue;
+}
+
+
+#if 0
+/****************************************************************************
+ * 
+ *  ROUTINE       : IDct4
+ *
+ *  INPUTS        : int16 *InputData   : Pointer to 8x8 DCT coefficients.
+ *
+ *  OUTPUTS       : int16 *OutputData  : Pointer to 8x8 block to hold output.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Inverse DCT's input 8x8 block with at most four non-zero
+ *                  coeffs in the following positions:
+ *                  
+ *                          x   x   0  0  0  0  0  0
+ *                          x   x   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *                          0   0   0  0  0  0  0  0
+ *
+ *  SPECIAL NOTES : CURRENTLY NOT USED IN CODEBASE.
+ *
+ ****************************************************************************/
+void IDct4 ( int16 *InputData, int16 *OutputData )
+{
+	// Disabled (#if 0) two-pass inverse DCT that reads only entries 0..1 of
+	// rows 0..1. Unlike IDct10 it has no dequantization step and works in
+	// place: InputData is modified by both the un-zigzag and the row pass.
+	int32 t1;
+	int loop;
+	int32 _Add, _Fd;
+	int32 _A, _B, _Ad, _Bd, _Cd, _Dd, _E;
+
+    int16 *ip = InputData;
+	int16 *op = OutputData;
+
+	// Unzigzag the coefficients
+	// NOTE(review): ip[4] is copied to ip[9] but ip[5], not ip[4], is cleared
+	// afterwards. ip[4] is only overwritten when row 0 takes the non-zero
+	// branch below - looks inconsistent; confirm before re-enabling this code.
+	ip[8] = ip[2];
+	ip[9] = ip[4];
+	ip[2] = 0;
+	ip[5] = 0;
+
+	// Inverse DCT on the rows now
+	// Only rows 0 and 1 are visited, using ip[0] and ip[1] of each as inputs.
+	for ( loop = 0; loop < 2; loop++)
+	{
+		// Check for non-zero values
+		if ( ip[0] | ip[1] )
+		{
+			// Every product with an xC* constant is scaled back down by 16 bits.
+			t1 = (int32)(xC1S7 * ip[1]);
+            t1 >>= 16;
+			_A = t1; 
+
+			t1 = (int32)(xC7S1 * ip[1]);
+            t1 >>= 16;
+			_B = t1 ;
+
+			t1 = (int32)(xC4S4 * _A );
+            t1 >>= 16;
+			_Ad = t1;
+
+			t1 = (int32)(xC4S4 * _B );
+            t1 >>= 16;
+			_Bd = t1;
+
+			_Cd = _A ;
+			_Dd = _B ;
+
+			t1 = (int32)(xC4S4 * ip[0] );
+            t1 >>= 16;
+			_E = t1;
+
+			_Add = _E + _Ad;
+			
+			_Fd = _E - _Ad;
+	
+			// Final sequence of operations over-write original inputs.
+			// No rounding or down-shift in this pass (>> 0).
+			ip[0] = (int16)((_E + _Cd )   >> 0);
+			ip[7] = (int16)((_E - _Cd )   >> 0);
+
+			ip[1] = (int16)((_Add + _Bd ) >> 0);
+			ip[2] = (int16)((_Add - _Bd ) >> 0);
+
+			ip[3] = (int16)((_E + _Dd )   >> 0);
+			ip[4] = (int16)((_E - _Dd )   >> 0);
+
+			ip[5] = (int16)((_Fd + _Bd )  >> 0);
+			ip[6] = (int16)((_Fd - _Bd )  >> 0);
+		}
+
+		ip += 8;			/* next row */
+	}
+
+	// Rewind to the top of the block for the column pass.
+	ip = InputData;
+
+	// Column pass: one iteration per column, reading rows 0..1 with a stride
+	// of 8 and writing all eight rows of the matching output column.
+	for ( loop=0; loop<8; loop++ )
+	{	
+		// Check for non-zero values (bitwise or faster than ||)
+		if ( ip[0 * 8] | ip[1 * 8] )
+		{
+
+			t1 = (int32)(xC1S7 * ip[1*8]);
+            t1 >>= 16;
+			_A = t1 ;
+
+			t1 = (int32)(xC7S1 * ip[1*8]);
+            t1 >>= 16;
+			_B = t1 ;
+
+			t1 = (int32)(xC4S4 * _A );
+            t1 >>= 16;
+			_Ad = t1;
+
+			t1 = (int32)(xC4S4 * _B );
+            t1 >>= 16;
+			_Bd = t1;
+			
+			_Cd = _A ;
+			_Dd = _B ;
+
+			t1 = (int32)(xC4S4 * ip[0*8]);
+            t1 >>= 16;
+			_E = t1;
+
+			_Add = _E + _Ad;
+			
+			_Fd = _E - _Ad;
+	
+			// Each output below contains exactly one of these three terms, so
+			// every sample receives the bias once before the >> 4.
+			// (IdctAdjustBeforeShift is defined elsewhere.)
+			_Add += IdctAdjustBeforeShift;
+			_E   += IdctAdjustBeforeShift;
+			_Fd  += IdctAdjustBeforeShift;
+
+			// Final sequence of operations over-write original inputs.
+			op[0*8] = (int16)((_E + _Cd )   >> 4);
+			op[7*8] = (int16)((_E - _Cd )   >> 4);
+
+			op[1*8] = (int16)((_Add + _Bd ) >> 4);
+			op[2*8] = (int16)((_Add - _Bd ) >> 4);
+
+			op[3*8] = (int16)((_E + _Dd )   >> 4);
+			op[4*8] = (int16)((_E - _Dd )   >> 4);
+
+			op[5*8] = (int16)((_Fd + _Bd )   >> 4);
+			op[6*8] = (int16)((_Fd - _Bd )   >> 4);
+		}
+		else
+		{
+			// Both inputs of this column are zero: emit a zero column.
+			op[0*8] = 0;
+			op[7*8] = 0;
+			op[1*8] = 0;
+			op[2*8] = 0;
+			op[3*8] = 0;
+			op[4*8] = 0;
+			op[5*8] = 0;
+			op[6*8] = 0;
+		}
+
+		ip++;	// next column
+        op++;
+	}
+}
+#endif
--- a/Src/libvpShared/corelibs/cdxv/vputil/generic/reconstruct.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/generic/reconstruct.c
@ -0,0 +1,243 @@
+/****************************************************************************
+*
+*   Module Title :     Reconstruct.c
+*
+*   Description  :     Block reconstruction functions.
+*
+****************************************************************************/
+#define STRICT              // Strict type checking 
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "reconstruct.h"
+#include "codec_common.h"
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : SatUnsigned8
+ *
+ *  INPUTS        : INT16 *DataBlock      : Pointer to 8x8 input block.
+ *                  UINT32 ResultLineStep : Stride of output block.
+ *                  UINT32 DataLineStep   : Stride of input block.
+ *
+ *  OUTPUTS       : UINT8 *ResultPtr      : Pointer to 8x8 output block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Saturates the input data to 8 bits unsigned and stores
+ *                  in the output buffer.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void SatUnsigned8 ( UINT8 *ResultPtr, INT16 *DataBlock, UINT32 ResultLineStep, UINT32 DataLineStep )
+{
+    INT32 i;
+
+    // One iteration per block row; the eight columns of a row are written
+    // out explicitly (partly expanded loop).
+    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
+    {
+        // LIMIT is defined elsewhere; per the routine header it saturates to
+        // the unsigned 8 bit range. The result is cast straight to the
+        // destination type: casting through plain char would make values
+        // above 127 depend on whether char is signed on the target compiler.
+        ResultPtr[0] = (UINT8) LIMIT(DataBlock[0]);
+        ResultPtr[1] = (UINT8) LIMIT(DataBlock[1]);
+        ResultPtr[2] = (UINT8) LIMIT(DataBlock[2]);
+        ResultPtr[3] = (UINT8) LIMIT(DataBlock[3]);
+        ResultPtr[4] = (UINT8) LIMIT(DataBlock[4]);
+        ResultPtr[5] = (UINT8) LIMIT(DataBlock[5]);
+        ResultPtr[6] = (UINT8) LIMIT(DataBlock[6]);
+        ResultPtr[7] = (UINT8) LIMIT(DataBlock[7]);
+
+        // Step both pointers to the next row using their own strides.
+        DataBlock += DataLineStep;
+        ResultPtr += ResultLineStep;
+    }
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : ScalarReconIntra
+ *
+ *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use.
+ *                  UINT16 *ChangePtr    : Pointer to 8x8 intra prediction block.
+ *                  UINT32 LineStep      : Stride of reconstruction block.
+ *
+ *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : Reconstructs an intra block.
+ *
+ *  SPECIAL NOTES : None. 
+ *
+ ****************************************************************************/
+void ScalarReconIntra ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT16 *ChangePtr, UINT32 LineStep )
+{
+    UINT32 Row;
+    UINT32 Col;
+    INT16 *RowOut = TmpDataBuffer;
+
+    // Add the 128 offset to every sample, building the block row by row in
+    // the temporary buffer. Eight columns are processed per row.
+    for ( Row=0; Row<BLOCK_HEIGHT_WIDTH; Row++ )
+    {
+        for ( Col=0; Col<8; Col++ )
+            RowOut[Col] = (INT16) ( ChangePtr[Col] + 128 );
+
+        // Source and temporary buffer are both packed block-wide rows
+        RowOut    += BLOCK_HEIGHT_WIDTH;
+        ChangePtr += BLOCK_HEIGHT_WIDTH;
+    }
+
+    // Clamp the temporary block into the reconstruction buffer, which is
+    // addressed with the caller's stride.
+    SatUnsigned8 ( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : ScalarReconInter
+ *
+ *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use.
+ *                  UINT8 *RefPtr        : Pointer to 8x8 reference block.
+ *                  INT16 *ChangePtr     : Pointer to 8x8 inter prediction error block.
+ *                  UINT32 LineStep      : Stride of reference and output blocks.
+ *
+ *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : Reconstructs an inter-coded block by adding a prediction
+ *                  error to a reference block in the previous frame 
+ *                  reconstruction buffer.
+ *
+ *  SPECIAL NOTES : None. 
+ *
+ ****************************************************************************/
+void ScalarReconInter ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT8 *RefPtr, INT16 *ChangePtr, UINT32 LineStep )
+{
+    UINT32 Row;
+    UINT32 Col;
+    INT16 *RowOut = TmpDataBuffer;
+
+    // Sum reference pixel and prediction error for each of the eight columns
+    // of every row, collecting the unclamped result in the temporary buffer.
+    for ( Row=0; Row<BLOCK_HEIGHT_WIDTH; Row++ )
+    {
+        for ( Col=0; Col<8; Col++ )
+            RowOut[Col] = (INT16)(RefPtr[Col] + ChangePtr[Col]);
+
+        // The error block and temporary buffer are packed; the reference
+        // block is stepped with the caller's stride.
+        ChangePtr += BLOCK_HEIGHT_WIDTH;
+        RowOut    += BLOCK_HEIGHT_WIDTH;
+        RefPtr    += LineStep;
+    }
+
+    // Clamp the temporary block into the reconstruction buffer
+    SatUnsigned8 ( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : ScalarReconInterHalfPixel2
+ *
+ *  INPUTS        : INT16 *TmpDataBuffer : Pointer to 8x8 temporary buffer for internal use.
+ *                  UINT8 *RefPtr1       : Pointer to first 8x8 reference block.
+ *                  UINT8 *RefPtr2       : Pointer to second 8x8 reference block.
+ *                  INT16 *ChangePtr     : Pointer to 8x8 inter prediction error block.
+ *                  UINT32 LineStep      : Stride of reference blocks.
+ *
+ *  OUTPUTS       : UINT8 *ReconPtr      : Pointer to 8x8 block to hold reconstructed block.
+ *
+ *  RETURNS       : None
+ *
+ *  FUNCTION      : Reconstructs an inter-coded block by adding a prediction
+ *                  error to a reference block computed by averaging the two
+ *                  specified reference blocks. The two reference blocks are
+ *                  those that bracket the 1/2-pixel accuracy motion vector.
+ *
+ *  SPECIAL NOTES : None. 
+ *
+ ****************************************************************************/
+void ScalarReconInterHalfPixel2
+(
+    INT16 *TmpDataBuffer,
+    UINT8 *ReconPtr,
+    UINT8 *RefPtr1,
+    UINT8 *RefPtr2,
+    INT16 *ChangePtr,
+    UINT32 LineStep
+)
+{
+    UINT32 Row;
+    UINT32 Col;
+    INT32  Average;
+    INT16 *RowOut = TmpDataBuffer;
+
+    for ( Row=0; Row<BLOCK_HEIGHT_WIDTH; Row++ )
+    {
+        // Eight columns per row: average the two reference pixels (rounding
+        // down), then add the prediction error on top.
+        for ( Col=0; Col<8; Col++ )
+        {
+            Average     = ((INT32)RefPtr1[Col] + (INT32)RefPtr2[Col]) >> 1;
+            RowOut[Col] = (INT16)( Average + ChangePtr[Col] );
+        }
+
+        // The error block and temporary buffer are packed; both reference
+        // blocks are stepped with the caller's stride.
+        ChangePtr += BLOCK_HEIGHT_WIDTH;
+        RowOut    += BLOCK_HEIGHT_WIDTH;
+        RefPtr1   += LineStep;
+        RefPtr2   += LineStep;
+    }
+
+    // Clamp the temporary block into the reconstruction buffer
+    SatUnsigned8( ReconPtr, TmpDataBuffer, LineStep, BLOCK_HEIGHT_WIDTH );
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       : ReconBlock_C
+ *  
+ *  INPUTS        : INT16 *SrcBlock    : Pointer to 8x8 prediction error.
+ *					INT16 *ReconRefPtr : Pointer to 8x8 block prediction.
+ *                  UINT32 LineStep    : Stride of output block.
+ *
+ *  OUTPUTS       : UINT8 *DestBlock   : Pointer to 8x8 reconstructed block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Reconstruct a block by adding the prediction error
+ *                  block to the source block and clipping values.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void ReconBlock_C ( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep )
+{
+    UINT32 Row;
+    UINT32 Col;
+    INT16 *RowPtr = SrcBlock;
+
+    // Accumulate the second block into SrcBlock in place, eight columns per
+    // row. Both blocks are stored as packed block-wide rows.
+    for ( Row=0; Row<BLOCK_HEIGHT_WIDTH; Row++ )
+    {
+        for ( Col=0; Col<8; Col++ )
+            RowPtr[Col] = (INT16)(RowPtr[Col] + ReconRefPtr[Col]);
+
+        RowPtr      += BLOCK_HEIGHT_WIDTH;
+        ReconRefPtr += BLOCK_HEIGHT_WIDTH;
+    }
+
+    // SrcBlock still addresses the first row; clamp the summed block into
+    // the destination using the caller's stride.
+    SatUnsigned8( DestBlock, SrcBlock, LineStep, BLOCK_HEIGHT_WIDTH );
+}
--- a/Src/libvpShared/corelibs/cdxv/vputil/generic/uoptsystemdependant.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/generic/uoptsystemdependant.c
@ -0,0 +1,100 @@
+/****************************************************************************
+*
+*   Module Title :     SystemDependant.c
+*
+*   Description  :     Miscellaneous system dependant functions.
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "codec_common.h"
+#include "vputil_if.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+// Scalar (no mmx) reconstruction functions
+extern void ClearSysState_C ( void );
+extern void IDctSlow ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+extern void IDct10 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+extern void IDct1 ( INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+extern void ScalarReconIntra ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT16 *ChangePtr, UINT32 LineStep );
+extern void ScalarReconInter ( INT16 *TmpDataBuffer, UINT8 *ReconPtr, UINT8 *RefPtr, INT16 *ChangePtr, UINT32 LineStep );
+extern void ScalarReconInterHalfPixel2 ( INT16 *TmpDataBuffer, UINT8 *ReconPtr,UINT8 *RefPtr1, UINT8 *RefPtr2, INT16 *ChangePtr, UINT32 LineStep );
+extern void ReconBlock_C(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep );
+extern void SubtractBlock_C ( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
+extern void UnpackBlock_C ( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine );
+extern void AverageBlock_C ( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine );
+extern void CopyBlock_C ( unsigned char *src, unsigned char *dest, unsigned int srcstride );
+extern void Copy12x12_C ( const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride );
+extern void fdct_short_C ( INT16 *InputData, INT16 *OutputData );
+extern void FilterBlockBil_8_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
+extern void FilterBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
+extern void GetProcessorFlags ( INT32 *MmxEnabled, INT32 *XmmEnabled, INT32 *WmtEnabled );
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     fillidctconstants
+ *
+ *  INPUTS        :     None
+ *
+ *  OUTPUTS       :     None
+ *
+ *  RETURNS       :     void
+ *
+ *  FUNCTION      :     STUB FUNCTION.
+ *
+ *  SPECIAL NOTES :     None. 
+ *
+ ****************************************************************************/
+void fillidctconstants ( void )
+{
+	// Intentionally empty: this stub performs no work.
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     UtilMachineSpecificConfig
+ *
+ *  INPUTS        :     None
+ *
+ *  OUTPUTS       :     None
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Checks for machine specific features such as MMX support 
+ *                      sets appropriate flags and function pointers.
+ *
+ *  SPECIAL NOTES :     None. 
+ *
+ ****************************************************************************/
+void UtilMachineSpecificConfig ( void )
+{
+	int Slot = 0;
+
+	// The two inverse DCT tables (idctc and idct) get identical contents for
+	// slots 0..64, so they are filled together, one range at a time.
+
+	// Slots 0..1: DC-only transform
+	for ( ; Slot<=1; Slot++ )
+	{
+		idctc[Slot] = IDct1;
+		idct[Slot]  = IDct1;
+	}
+
+	// Slots 2..10: reduced transform
+	for ( ; Slot<=10; Slot++ )
+	{
+		idctc[Slot] = IDct10;
+		idct[Slot]  = IDct10;
+	}
+
+	// Slots 11..64: full transform
+	for ( ; Slot<=64; Slot++ )
+	{
+		idctc[Slot] = IDctSlow;
+		idct[Slot]  = IDctSlow;
+	}
+
+	// Forward transform and system state
+	fdct_short    = fdct_short_C;
+	ClearSysState = ClearSysState_C;
+
+	// Reconstruction entry points: always the scalar C versions
+	ReconIntra           = ScalarReconIntra;
+	ReconInter           = ScalarReconInter;
+	ReconInterHalfPixel2 = ScalarReconInterHalfPixel2;
+	ReconBlock           = ReconBlock_C;
+
+	// Block helpers
+	AverageBlock  = AverageBlock_C;
+	UnpackBlock   = UnpackBlock_C;
+	SubtractBlock = SubtractBlock_C;
+	CopyBlock     = CopyBlock_C;
+	Copy12x12     = Copy12x12_C;
+
+	// Filters
+	FilterBlockBil_8 = FilterBlockBil_8_C;
+	FilterBlock      = FilterBlock_C;
+}
--- a/Src/libvpShared/corelibs/cdxv/vputil/generic/vputil.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/generic/vputil.c
--- a/Src/libvpShared/corelibs/cdxv/vputil/include/dct.h
+++ b/Src/libvpShared/corelibs/cdxv/vputil/include/dct.h
@ -0,0 +1,74 @@
+/****************************************************************************
+*
+*   Module Title :     dct.h
+*
+*   Description  :     DCT header file.
+*
+****************************************************************************/						
+
+#ifndef __INC_DCT_H
+#define __INC_DCT_H
+
+/****************************************************************************
+*  Header files
+****************************************************************************/
+#include "type_aliases.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define COEFF_MAX   32768   // Max magnitude of DCT coefficient
+// Extra bits of precision added to the fdct that have to be stripped off during the quantize
+#define FDCT_PRECISION_BITS			1
+// NOTE(review): the (INT16) cast applies only to (1<<FDCT_PRECISION_BITS); the "-1" is
+// applied after the cast, so the value is (1<<FDCT_PRECISION_BITS)-1, i.e. 1 for the setting above.
+#define FDCT_PRECISION_NEG_ADJ      ((INT16) (1<<FDCT_PRECISION_BITS)-1)
+
+
+
+
+#if 0   // AWG not required any more!!!
+/*	Cos and Sin constant multipliers used during DCT and IDCT */
+extern const double C1S7;
+extern const double C2S6;
+extern const double C3S5;
+extern const double C4S4;
+extern const double C5S3;
+extern const double C6S2;
+extern const double C7S1;
+
+// DCT lookup tables and pointers
+extern INT32 * C4S4_TablePtr;
+extern INT32 C4S4_Table[(COEFF_MAX * 4) + 1];
+
+extern INT32 * C6S2_TablePtr;
+extern INT32 C6S2_Table[(COEFF_MAX * 2) + 1];
+
+extern INT32 * C2S6_TablePtr;
+extern INT32 C2S6_Table[(COEFF_MAX * 2) + 1];
+
+extern INT32 * C1S7_TablePtr;
+extern INT32 C1S7_Table[(COEFF_MAX * 2) + 1];
+
+extern INT32 * C7S1_TablePtr;
+extern INT32 C7S1_Table[(COEFF_MAX * 2) + 1];
+
+extern INT32 * C3S5_TablePtr;
+extern INT32 C3S5_Table[(COEFF_MAX * 2) + 1];
+
+extern INT32 * C5S3_TablePtr;
+extern INT32 C5S3_Table[(COEFF_MAX * 2) + 1];
+#endif
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+#ifdef COMPDLL
+// Forward Transform
+extern void fdct_slow ( INT32 *InputData, double *OutputData );
+#endif
+
+// Reverse Transform
+extern void IDctSlow(  INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+extern void IDct10  (  INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+extern void IDct1   (  INT16 *InputData, INT16 *QuantMatrix, INT16 *OutputData );
+
+#endif
--- a/Src/libvpShared/corelibs/cdxv/vputil/include/mac_specs.h
+++ b/Src/libvpShared/corelibs/cdxv/vputil/include/mac_specs.h
@ -0,0 +1,11 @@
+#if !defined(_mac_specs_h)
+#define _mac_specs_h
+#if defined(__cplusplus)
+extern "C" {
+#endif
+int vputil_hasAltivec(void);
+int vputil_cpuMhz(void);
+#if defined(__cplusplus)
+}
+#endif
+#endif
--- a/Src/libvpShared/corelibs/cdxv/vputil/include/reconstruct.h
+++ b/Src/libvpShared/corelibs/cdxv/vputil/include/reconstruct.h
@ -0,0 +1,60 @@
+/****************************************************************************
+*
+*   Module Title :     Reconstruct.h
+*
+*   Description  :     Block Reconstruction module header
+*
+*   AUTHOR       :     Paul Wilkins
+*
+*****************************************************************************
+*   Revision History
+* 
+*   1.00 PGW 14/10/99  Created
+*
+*****************************************************************************
+*/
+
+#define STRICT              /* Strict type checking. */
+
+#ifndef RECONSTRUCT_H
+#define RECONSTRUCT_H
+
+#include "type_aliases.h"
+
+/****************************************************************************
+*  Constants
+*****************************************************************************
+*/
+
+/****************************************************************************
+*  Types
+*****************************************************************************
+*/        
+
+/****************************************************************************
+*   Data structures
+*****************************************************************************
+*/
+
+/****************************************************************************
+*  Functions
+*****************************************************************************
+*/
+
+// Scalar (no mmx) reconstruction functions
+extern void ScalarReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void ScalarReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void ScalarReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+
+// MMx versions
+extern void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+
+// WMT versions
+extern void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+
+
+#endif
--- a/Src/libvpShared/corelibs/cdxv/vputil/vputil.vcxproj
+++ b/Src/libvpShared/corelibs/cdxv/vputil/vputil.vcxproj
@ -0,0 +1,388 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>17.0</VCProjectVersion>
+    <ProjectGuid>{F93716CE-8F89-4334-BE64-43705EF3FB70}</ProjectGuid>
+    <RootNamespace>vputil</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0.19041.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>17.0.32505.173</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
+    <IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+    <IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
+    <OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
+    <IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+    <IntDir>..\..\..\obj\vputil\$(PlatformShortName)_$(Configuration)\</IntDir>
+    <OutDir>..\..\..\lib\$(PlatformShortName)_$(Configuration)\</OutDir>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg">
+    <VcpkgEnableManifest>false</VcpkgEnableManifest>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <VcpkgInstalledDir>
+    </VcpkgInstalledDir>
+    <VcpkgUseStatic>false</VcpkgUseStatic>
+    <VcpkgConfiguration>Debug</VcpkgConfiguration>
+    <VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <VcpkgInstalledDir>
+    </VcpkgInstalledDir>
+    <VcpkgUseStatic>false</VcpkgUseStatic>
+    <VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <VcpkgInstalledDir>
+    </VcpkgInstalledDir>
+    <VcpkgUseStatic>false</VcpkgUseStatic>
+    <VcpkgConfiguration>Debug</VcpkgConfiguration>
+    <VcpkgTriplet>x64-windows-static-md</VcpkgTriplet> <!-- triplet architecture must match the x64 platform of this configuration -->
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <VcpkgInstalledDir>
+    </VcpkgInstalledDir>
+    <VcpkgUseStatic>false</VcpkgUseStatic>
+    <VcpkgTriplet>x64-windows-static-md</VcpkgTriplet> <!-- triplet architecture must match the x64 platform of this configuration -->
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeader />
+      <PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation />
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation />
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>None</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <AdditionalIncludeDirectories>.\include;..\include;..\..\..\..\libvp6\include;..\vp60\include;..\..\include;..\..\..\..\include;..\..\..\..\include\vp60;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>$(IntDir)vputil.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>None</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <DisableSpecificWarnings>4799;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="generic\fdct.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="generic\idctpart.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="generic\reconstruct.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="generic\uoptsystemdependant.c">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="generic\vputil.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\fdctmmx.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\fdctwmt.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\filtmmx.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\filtwmt.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\mmxidct.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\mmxrecon.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\uoptsystemdependant.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\vputilasm.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\wmtidct.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+    <ClCompile Include="win32\wmtrecon.c">
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Disabled</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Disabled</Optimization>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">EnableFastChecks</BasicRuntimeChecks>
+      <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">EnableFastChecks</BasicRuntimeChecks>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">MaxSpeed</Optimization>
+      <Optimization Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MaxSpeed</Optimization>
+    </ClCompile>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/Src/libvpShared/corelibs/cdxv/vputil/vputil.vcxproj.filters
+++ b/Src/libvpShared/corelibs/cdxv/vputil/vputil.vcxproj.filters
@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="generic">
+      <UniqueIdentifier>{f7966dc8-1d55-46a4-b0e6-8584774d721d}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="win32">
+      <UniqueIdentifier>{ad0ce32e-d033-416c-813e-7a7f913ac3fa}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="generic\fdct.c">
+      <Filter>generic</Filter>
+    </ClCompile>
+    <ClCompile Include="generic\idctpart.c">
+      <Filter>generic</Filter>
+    </ClCompile>
+    <ClCompile Include="generic\reconstruct.c">
+      <Filter>generic</Filter>
+    </ClCompile>
+    <ClCompile Include="generic\uoptsystemdependant.c">
+      <Filter>generic</Filter>
+    </ClCompile>
+    <ClCompile Include="generic\vputil.c">
+      <Filter>generic</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\fdctmmx.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\fdctwmt.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\filtmmx.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\filtwmt.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\mmxidct.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\mmxrecon.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\uoptsystemdependant.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\vputilasm.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\wmtidct.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+    <ClCompile Include="win32\wmtrecon.c">
+      <Filter>win32</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
--- a/Src/libvpShared/corelibs/cdxv/vputil/vputil.xcodeproj/project.pbxproj
+++ b/Src/libvpShared/corelibs/cdxv/vputil/vputil.xcodeproj/project.pbxproj
@ -0,0 +1,213 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 42;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		0CAF34950BB78E9F000FB06C /* vputil.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34940BB78E9F000FB06C /* vputil.c */; };
+		0CAF34AC0BB78EDF000FB06C /* idctpart.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34A80BB78EDF000FB06C /* idctpart.c */; };
+		0CAF34AD0BB78EDF000FB06C /* fdct.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34A90BB78EDF000FB06C /* fdct.c */; };
+		0CAF34AE0BB78EDF000FB06C /* uoptsystemdependant.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */; };
+		0CAF34AF0BB78EDF000FB06C /* reconstruct.c in Sources */ = {isa = PBXBuildFile; fileRef = 0CAF34AB0BB78EDF000FB06C /* reconstruct.c */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		0CAF34940BB78E9F000FB06C /* vputil.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = vputil.c; path = generic/vputil.c; sourceTree = "<group>"; };
+		0CAF34A80BB78EDF000FB06C /* idctpart.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = idctpart.c; path = generic/idctpart.c; sourceTree = "<group>"; };
+		0CAF34A90BB78EDF000FB06C /* fdct.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = fdct.c; path = generic/fdct.c; sourceTree = "<group>"; };
+		0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = uoptsystemdependant.c; path = generic/uoptsystemdependant.c; sourceTree = "<group>"; };
+		0CAF34AB0BB78EDF000FB06C /* reconstruct.c */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.c; name = reconstruct.c; path = generic/reconstruct.c; sourceTree = "<group>"; };
+		D2AAC046055464E500DB518D /* libvputil.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libvputil.a; sourceTree = BUILT_PRODUCTS_DIR; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		D289987405E68DCB004EDB86 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		08FB7794FE84155DC02AAC07 /* vputil */ = {
+			isa = PBXGroup;
+			children = (
+				08FB7795FE84155DC02AAC07 /* Source */,
+				C6A0FF2B0290797F04C91782 /* Documentation */,
+				1AB674ADFE9D54B511CA2CBB /* Products */,
+			);
+			name = vputil;
+			sourceTree = "<group>";
+		};
+		08FB7795FE84155DC02AAC07 /* Source */ = {
+			isa = PBXGroup;
+			children = (
+				0CAF34940BB78E9F000FB06C /* vputil.c */,
+				0CAF34A80BB78EDF000FB06C /* idctpart.c */,
+				0CAF34A90BB78EDF000FB06C /* fdct.c */,
+				0CAF34AA0BB78EDF000FB06C /* uoptsystemdependant.c */,
+				0CAF34AB0BB78EDF000FB06C /* reconstruct.c */,
+			);
+			name = Source;
+			sourceTree = "<group>";
+		};
+		1AB674ADFE9D54B511CA2CBB /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				D2AAC046055464E500DB518D /* libvputil.a */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		C6A0FF2B0290797F04C91782 /* Documentation */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Documentation;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXHeadersBuildPhase section */
+		D2AAC043055464E500DB518D /* Headers */ = {
+			isa = PBXHeadersBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXHeadersBuildPhase section */
+
+/* Begin PBXNativeTarget section */
+		D2AAC045055464E500DB518D /* vputil */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "vputil" */;
+			buildPhases = (
+				D2AAC043055464E500DB518D /* Headers */,
+				D2AAC044055464E500DB518D /* Sources */,
+				D289987405E68DCB004EDB86 /* Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = vputil;
+			productName = vputil;
+			productReference = D2AAC046055464E500DB518D /* libvputil.a */;
+			productType = "com.apple.product-type.library.static";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		08FB7793FE84155DC02AAC07 /* Project object */ = {
+			isa = PBXProject;
+			buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "vputil" */;
+			hasScannedForEncodings = 1;
+			mainGroup = 08FB7794FE84155DC02AAC07 /* vputil */;
+			projectDirPath = "";
+			targets = (
+				D2AAC045055464E500DB518D /* vputil */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		D2AAC044055464E500DB518D /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (	/* every entry resolves to a file under generic/ (see the PBXFileReference paths); no other source directories are compiled by this target */
+				0CAF34950BB78E9F000FB06C /* vputil.c in Sources */,
+				0CAF34AC0BB78EDF000FB06C /* idctpart.c in Sources */,
+				0CAF34AD0BB78EDF000FB06C /* fdct.c in Sources */,
+				0CAF34AE0BB78EDF000FB06C /* uoptsystemdependant.c in Sources */,
+				0CAF34AF0BB78EDF000FB06C /* reconstruct.c in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		1DEB91EC08733DB70010E9CD /* Debug */ = {	/* Debug settings of the "vputil" target (listed in configuration list 1DEB91EB08733DB70010E9CD) */
+			isa = XCBuildConfiguration;
+			buildSettings = {	/* no ARCHS entry here, unlike the target's Release configuration */
+				COPY_PHASE_STRIP = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_ENABLE_FIX_AND_CONTINUE = YES;
+				GCC_MODEL_TUNING = G5;
+				GCC_OPTIMIZATION_LEVEL = 0;	/* optimization off for debugging */
+				INSTALL_PATH = /usr/local/lib;
+				PRODUCT_NAME = vputil;
+				ZERO_LINK = YES;
+			};
+			name = Debug;
+		};
+		1DEB91ED08733DB70010E9CD /* Release */ = {	/* Release settings of the "vputil" target (listed in configuration list 1DEB91EB08733DB70010E9CD) */
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ARCHS = (	/* builds for both PowerPC and 32-bit Intel */
+					ppc,
+					i386,
+				);
+				GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
+				GCC_MODEL_TUNING = G5;
+				INSTALL_PATH = /usr/local/lib;
+				PRODUCT_NAME = vputil;
+			};
+			name = Release;
+		};
+		1DEB91F008733DB70010E9CD /* Debug */ = {	/* Project-level Debug settings (listed in configuration list 1DEB91EF08733DB70010E9CD) */
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				OBJROOT = build;	/* intermediate files directory */
+				PREBINDING = NO;
+				SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;	/* absolute path to the Mac OS X 10.4 SDK; must exist on the build machine */
+				SYMROOT = ../../../lib/osx;	/* build products root: output goes under lib/osx three directory levels up */
+				USER_HEADER_SEARCH_PATHS = "include ../include ../../include ../../../include";	/* relative paths - presumably resolved against the project directory; confirm */
+			};
+			name = Debug;
+		};
+		1DEB91F108733DB70010E9CD /* Release */ = {	/* Project-level Release settings (listed in configuration list 1DEB91EF08733DB70010E9CD); same values as the project-level Debug configuration */
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_WARN_ABOUT_RETURN_TYPE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				OBJROOT = build;	/* intermediate files directory */
+				PREBINDING = NO;
+				SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk;	/* absolute path to the Mac OS X 10.4 SDK; must exist on the build machine */
+				SYMROOT = ../../../lib/osx;	/* build products root: output goes under lib/osx three directory levels up */
+				USER_HEADER_SEARCH_PATHS = "include ../include ../../include ../../../include";	/* relative paths - presumably resolved against the project directory; confirm */
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "vputil" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB91EC08733DB70010E9CD /* Debug */,
+				1DEB91ED08733DB70010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "vputil" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1DEB91F008733DB70010E9CD /* Debug */,
+				1DEB91F108733DB70010E9CD /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
+}
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/fdct_m.asm
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/fdct_m.asm
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/fdctmmx.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/fdctmmx.c
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/fdctwmt.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/fdctwmt.c
@ -0,0 +1,810 @@
+/****************************************************************************
+ *
+ *   Module Title :     Fdctwmt.c
+ *
+ *   Description  :     Forward DCT optimized specifically for Intel  P4
+ *						processor
+ *
+ *   AUTHOR       :     YaoWu Xu
+ *
+ ***************************************************************************** 
+ *   Revision History
+ *	
+ *   1.00 YWX  03/11/02  Configuration baseline
+ *
+ *****************************************************************************
+ */
+
+
+/*******************************************************************************
+ * Module Constants
+ *******************************************************************************
+ */
+	
+
+__declspec(align(16)) static unsigned short TIRY[8];	/* Scratch row for fdct_WMT: the asm stores (is07 - is34), i.e.
+	 * irot_input_y, here with movdqa and reloads it for the op2/op6 stage;
+	 * movdqa is why the buffer is 16-byte aligned. The buffer is file-scope,
+	 * so concurrent calls to fdct_WMT would share it.
+	 */
+
+__declspec(align(16)) static unsigned short WmtIdctConst[8 * 8] =	/* Cosine multipliers used by fdct_WMT.
+	 * One 16-byte row per constant, with the same value in all 8 word lanes.
+	 * The asm reads row k as C(k) = [edx + 16 * k]; row k holds
+	 * round(65536 * cos(k * pi / 16)).
+	 * Rows 1-5 are above 32767, so the signed pmulhw sees them as
+	 * (value - 65536); the asm adds the multiplicand back after those
+	 * multiplies. Rows 6 and 7 fit in a signed word and need no correction.
+	 * The rows are used as movdqa/pmulhw memory operands, hence the
+	 * 16-byte alignment.
+	 */
+{
+    0,    0,    0,    0,    0,    0,    0,    0, 	/* row 0: zeros, so that the row index equals k */
+	64277,64277,64277,64277,64277,64277,64277,64277, 	/* row 1: xC1S7 = cos(1*pi/16) */
+	60547,60547,60547,60547,60547,60547,60547,60547, 	/* row 2: xC2S6 = cos(2*pi/16) */
+	54491,54491,54491,54491,54491,54491,54491,54491, 	/* row 3: xC3S5 = cos(3*pi/16) */
+	46341,46341,46341,46341,46341,46341,46341,46341, 	/* row 4: xC4S4 = cos(4*pi/16) */
+	36410,36410,36410,36410,36410,36410,36410,36410, 	/* row 5: xC5S3 = cos(5*pi/16) */
+	25080,25080,25080,25080,25080,25080,25080,25080, 	/* row 6: xC6S2 = cos(6*pi/16) */
+	12785,12785,12785,12785,12785,12785,12785,12785	/* row 7: xC7S1 = cos(7*pi/16) */
+};
+
+ 
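+/**************************************************************************************
+ *
+ *		Function:		fdct_WMT
+ *		
+ *		Description:	Forward DCT of an 8x8 block of 16-bit values using SSE2.
+ *						Each pass transposes the block and then runs a 1-D
+ *						forward DCT on all 8 columns at once. The first pass
+ *						reads InputData and writes OutputData; the second pass
+ *						works in place on OutputData.
+ *
+ *		Input:			InputData  - 8 rows of 8 shorts, accessed with movdqa, so
+ *						             it must be 16-byte aligned. It is NOT
+ *						             preserved: the first pass writes the doubled,
+ *						             transposed block back into it.
+ *
+ *		Output:			OutputData - 8 rows of 8 shorts, accessed with movdqa, so
+ *						             it must be 16-byte aligned.
+ *		
+ *		Return:			None			
+ *
+ *		Special Note:	Uses the file-scope scratch buffer TIRY, so concurrent
+ *						calls would share that buffer.
+ *
+ *		Error:			None
+ *
+ ***************************************************************************************
+ */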
+void  fdct_WMT(short *InputData, short *OutputData)
+{
+
+	__asm 
+	{
+		mov		eax, InputData
+		mov		ebx, OutputData
+		lea		edx, WmtIdctConst
+
+#define I(i) [eax + 16 * i ]
+#define O(i) [ebx + 16 * i ]
+#define C(i) [edx + 16 * i ]
+
+/******************************************************/
+/* Do 8x8 Transpose                                   */
+/******************************************************/
+
+    	movdqa		xmm4, I(4)		/* xmm4=e7e6e5e4e3e2e1e0 */	
+        movdqa		xmm0, I(5)		/* xmm4=f7f6f5f4f3f2f1f0 */	
+        
+        psllw       xmm4, 1
+        psllw       xmm0, 1
+
+        movdqa		xmm5, xmm4		/* make a copy */			
+        punpcklwd	xmm4, xmm0		/* xmm4=f3e3f2e2f1e1f0e0 */	
+        
+        punpckhwd	xmm5, xmm0		/* xmm5=f7e7f6e6f5e5f4e4 */	
+        movdqa		xmm6, I(6)		/* xmm6=g7g6g5g4g3g2g1g0 */ 
+        
+        movdqa		xmm0, I(7)		/* xmm0=h7h6h5h4h3h2h1h0 */ 
+
+        psllw       xmm6, 1
+        psllw       xmm0, 1
+
+        movdqa		xmm7, xmm6		/* make a copy */			
+        
+        punpcklwd	xmm6, xmm0		/* xmm6=h3g3h3g2h1g1h0g0 */ 
+        punpckhwd	xmm7, xmm0		/* xmm7=h7g7h6g6h5g5h4g4 */ 
+        
+        movdqa		xmm3, xmm4		/* make a copy */			
+        punpckldq	xmm4, xmm6		/* xmm4=h1g1f1e1h0g0f0e0 */	
+        
+        punpckhdq	xmm3, xmm6		/* xmm3=h3g3g3e3h2g2f2e2 */	
+        movdqa		I(6), xmm3		/* save h3g3g3e3h2g2f2e2 */	
+        /* Free xmm6 */ 
+        movdqa		xmm6, xmm5		/* make a copy */			
+        punpckldq	xmm5, xmm7		/* xmm5=h5g5f5e5h4g4f4e4 */ 
+        
+        punpckhdq	xmm6, xmm7		/* xmm6=h7g7f7e7h6g6f6e6 */ 
+        movdqa		xmm0, I(0)		/* xmm0=a7a6a5a4a3a2a1a0 */	
+        /* Free xmm7 */ 
+        movdqa		xmm1, I(1)		/* xmm1=b7b6b5b4b3b2b1b0 */	
+
+        psllw       xmm0, 1
+        psllw       xmm1, 1
+        
+        movdqa		xmm7, xmm0		/* make a copy */			
+        
+        punpcklwd	xmm0, xmm1		/* xmm0=b3a3b2a2b1a1b0a0 */	
+        punpckhwd	xmm7, xmm1		/* xmm7=b7a7b6a6b5a5b4a4 */ 
+        /* Free xmm1 */ 
+        movdqa		xmm2, I(2)		/* xmm2=c7c6c5c4c3c2c1c0 */ 
+        movdqa		xmm3, I(3)	    /* xmm3=d7d6d5d4d3d2d1d0 */ 
+        
+        psllw       xmm2, 1
+        psllw       xmm3, 1
+
+        movdqa		xmm1, xmm2		/* make a copy */			
+        punpcklwd	xmm2, xmm3		/* xmm2=d3c3d2c2d1c1d0c0 */ 
+        
+        punpckhwd	xmm1, xmm3		/* xmm1=d7c7d6c6d5c5d4c4 */ 
+        movdqa		xmm3, xmm0		/* make a copy	*/			
+        
+        punpckldq	xmm0, xmm2		/* xmm0=d1c1b1a1d0c0b0a0 */ 
+        punpckhdq	xmm3, xmm2		/* xmm3=d3c3b3a3d2c2b2a2 */ 
+        /* Free xmm2 */ 
+        movdqa		xmm2, xmm7		/* make a copy */			
+        punpckldq	xmm2, xmm1		/* xmm2=d5c5b5a5d4c4b4a4 */	
+        
+        punpckhdq	xmm7, xmm1		/* xmm7=d7c7b7a7d6c6b6a6 */ 
+        movdqa		xmm1, xmm0		/* make a copy */			
+        
+        punpcklqdq	xmm0, xmm4		/* xmm0=h0g0f0e0d0c0b0a0 */	
+        punpckhqdq	xmm1, xmm4		/* xmm1=h1g1g1e1d1c1b1a1 */ 
+        
+        movdqa		I(0), xmm0		/* save I(0) */				
+        movdqa		I(1), xmm1		/* save I(1) */				
+        
+        movdqa		xmm0, I(6)		/* load h3g3g3e3h2g2f2e2 */ 
+        movdqa		xmm1, xmm3		/* make a copy */			
+        
+        punpcklqdq	xmm1, xmm0		/* xmm1=h2g2f2e2d2c2b2a2 */ 
+        punpckhqdq	xmm3, xmm0		/* xmm3=h3g3f3e3d3c3b3a3 */	
+        
+        movdqa		xmm4, xmm2		/* make a copy */			
+        punpcklqdq	xmm4, xmm5		/* xmm4=h4g4f4e4d4c4b4a4 */	
+        
+        punpckhqdq	xmm2, xmm5		/* xmm2=h5g5f5e5d5c5b5a5 */	
+        movdqa		I(2), xmm1		/* save I(2) */				
+        
+        movdqa		I(3), xmm3		/* save I(3) */				
+        movdqa		I(4), xmm4		/* save I(4) */				
+        
+        movdqa		I(5), xmm2		/* save I(5) */				
+        movdqa		xmm5, xmm7		/* make a copy */			
+        
+        punpcklqdq	xmm5, xmm6		/* xmm5=h6g6f6e6d6c6b6a6 */	
+        punpckhqdq	xmm7, xmm6		/* xmm7=h7g7f7e7d7c7b7a7 */	
+        
+        movdqa		I(6), xmm5		/* save I(6) */				
+        movdqa		I(7), xmm7		/* save I(7) */				
+
+/******************************************************/
+/* Done with transpose - Let's do the forward DCT     */
+/******************************************************/
+
+        movdqa		xmm0, I(0)      /* xmm0 = ip0 */
+        movdqa      xmm1, I(1)      /* xmm1 = ip1 */
+
+        movdqa      xmm2, I(3)      /* xmm2 = ip3 */
+        movdqa      xmm3, I(5)      /* xmm3 = ip5 */
+
+        movdqa      xmm4, xmm0      /* xmm4 = ip0 */
+        movdqa      xmm5, xmm1      /* xmm5 = ip1 */      
+        
+        movdqa      xmm6, xmm2      /* xmm6 = ip3 */      
+        movdqa      xmm7, xmm3      /* xmm7 = ip5 */      	
+
+        paddsw      xmm0, I(7)      /* xmm0 = ip0 + ip7 */
+        paddsw      xmm1, I(2)      /* xmm1 = ip1 + ip2 */
+
+        paddsw      xmm2, I(4)      /* xmm2 = ip3 + ip4 */
+        paddsw      xmm3, I(6)      /* xmm3 = ip5 + ip6 */
+
+        psubsw      xmm4, I(7)      /* xmm4 = ip0 - ip7 */
+        psubsw      xmm5, I(2)      /* xmm5 = ip1 - ip2 */       
+
+        psubsw		xmm0, xmm2      /* xmm0 = is07 - is34 */			
+        paddsw		xmm2, xmm2		/* xmm2 = is34 * 2    */	
+        
+        psubsw      xmm6, I(4)      /* xmm6 = ip3 - ip4 */               
+        paddsw		xmm2, xmm0		/* xmm2 = is07 + is34 */	
+
+        psubsw		xmm1, xmm3		/* xmm1 = is12 - is56 */	
+        movdqa		TIRY, xmm0		/* save is07-is34 */	
+
+        paddsw		xmm3, xmm3		/* xmm3 = is56 * 2 */	
+        paddsw		xmm3, xmm1	    /* xmm3 = is12 + is56 */
+        
+        psubsw      xmm7, I(6)      /* xmm7 = ip5 -ip6 */
+        psubsw		xmm5, xmm7		/* xmm5 = id12 - id56 */
+	    
+        paddsw		xmm7, xmm7		/* xmm7 = id56 * 2 */		
+	    paddsw		xmm7, xmm5	    /* xmm7 = id12 + id56 */
+/*---------------------------------------------------------*/
+/* op0 and op4                                             */
+/*---------------------------------------------------------*/
+        psubsw		xmm2, xmm3		/* xmm2 = is0734 - is1256 */
+        paddsw		xmm3, xmm3		/* xmm3 = is1256 * 2 */		
+
+        movdqa		xmm0, xmm2	    /* xmm0 = is0734 - is1256 */
+        paddsw		xmm3, xmm2		/* xmm3 = is0734 + is1256 */
+
+        pmulhw		xmm0, C(4)	    /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
+        paddw		xmm0, xmm2		/* xmm0 = xC4S4 * ( is0734 - is1256 ) */
+
+        psrlw		xmm2, 15			
+        paddw		xmm0, xmm2		/* Truncate xmm0, now it is op[4] */
+            
+        movdqa		xmm2, xmm3		/* xmm2 = is0734 + is1256 */
+        movdqa		O(4), xmm0		/*	op4, now xmm0,xmm2 are free */
+            
+        movdqa		xmm0, xmm3		/* xmm0 = is0734 + is1256 */
+        pmulhw		xmm3, C(4)		/* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */            
+        
+        psrlw		xmm2, 15			
+        paddw		xmm3, xmm0		/* xmm3 = xC4S4 * ( is0734 +is1256 ) */
+        
+        paddw		xmm3, xmm2		/* Truncate xmm3, now it is op[0] */     
+        movdqa		O(0), xmm3		/* save op0 */
+/*---------------------------------------------------------*/
+/* op2 and op6                                             */
+/*---------------------------------------------------------*/
+ 	    movdqa		xmm3, TIRY		/* xmm3 = irot_input_y */
+        pmulhw		xmm3, C(2)		/* xmm3 = xC2S6 * irot_input_y - irot_input_y */
+        
+        movdqa		xmm2, TIRY		/* xmm2 = irot_input_y */
+        movdqa		xmm0, xmm2		/* xmm0 = irot_input_y */
+        
+        psrlw		xmm2, 15		
+        paddw		xmm3, xmm0      /* xmm3 = xC2S6 * irot_input_y */
+            
+        paddw       xmm3, xmm2		/* Truncated */
+        movdqa		xmm0, xmm5		/* xmm0 = id12 - id56 */
+        
+        
+        movdqa		xmm2, xmm5      /* xmm2 = id12 - id56 */
+        pmulhw		xmm0, C(6)		/* xmm0 = xC6S2 * irot_input_x */
+            
+        psrlw		xmm2, 15			
+        paddw		xmm0, xmm2		/* Truncated */
+        
+        paddsw		xmm3, xmm0		/* op[2] */
+        movdqa		O(2), xmm3		/* save op[2] */
+        
+        
+        movdqa		xmm0, xmm5		/* xmm0 = id12 - id56 */
+        movdqa		xmm2, xmm5		/* xmm0 = id12 - id56 */
+        
+        pmulhw		xmm5, C(2)		/* xmm5 = xC2S6 * irot_input_x - irot_input_x */
+        psrlw		xmm2, 15		
+        
+        movdqa		xmm3, TIRY		/* xmm3 = irot_input_y */
+        paddw		xmm5, xmm0		/* xmm5 = xC2S6 * irot_input_x */
+            
+        paddw		xmm5, xmm2		/* Truncated */
+        movdqa		xmm2, xmm3		/* xmm2 = irot_input_y */	
+        
+        pmulhw		xmm3, C(6)	    /* mm3 = xC6S2 * irot_input_y */
+        psrlw		xmm2, 15        
+        
+        paddw		xmm3, xmm2		/* Truncated */
+        psubsw		xmm3, xmm5		/* xmm3 = op[6] */
+        
+        movdqa		O(6), xmm3		
+/*-----------------------------------------------------------------------*/
+/* icommon_product1, icommon_product2                                    */
+/*-----------------------------------------------------------------------*/
+	    movdqa		xmm0, C(4)      /* xmm0 = xC4s4 */
+	    movdqa		xmm2, xmm1      /* xmm2 = is12 - is56 */	
+	
+        movdqa		xmm3, xmm1      /* xmm3 = is12 - is56 */	
+	    pmulhw		xmm1, xmm0		/* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
+	
+        psrlw		xmm2, 15				
+	    paddw		xmm1, xmm3	    /* xmm1 = xC4S4 * ( is12 - is56 ) */
+	    
+        paddw		xmm1, xmm2      /* Truncate xmm1, now it is icommon_product1 */
+	    movdqa		xmm2, xmm7      /* xmm2 = id12 + id56 */
+	    
+        movdqa		xmm3, xmm7		/* xmm3 = id12 + id56 */
+        pmulhw		xmm7, xmm0		/* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
+	
+        psrlw		xmm2, 15		/* For trucation */	
+	    paddw		xmm7, xmm3		/* xmm7 = xC4S4 * ( id12 + id56 ) */
+
+	    paddw		xmm7, xmm2		/* Truncate xmm7, now it is icommon_product2 */
+/*---------------------------------------------------------*/
+	    pxor		xmm0, xmm0		/* Clear xmm0 */
+	    psubsw		xmm0, xmm6		/* xmm0 = - id34 */
+
+	    psubsw		xmm0, xmm7	    /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
+	    paddsw		xmm6, xmm6	    /* xmm6 = id34 * 2 */
+
+	    paddsw		xmm6, xmm0		/* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
+	    psubsw		xmm4, xmm1		/* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
+
+	    paddsw		xmm1, xmm1		/* xmm1 = icommon_product1 * 2 */	    
+        paddsw		xmm1, xmm4		/* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
+
+/*---------------------------------------------------------*/
+/* op1 and op7                                             */
+/*---------------------------------------------------------*/
+
+	    movdqa		xmm7, C(1)     /* xC1S7 */
+        movdqa		xmm2, xmm1      /* xmm2 = irot_input_x */
+        
+        movdqa		xmm3, xmm1;     /* xmm3 = irot_input_x */
+        pmulhw		xmm1, xmm7		/* xmm1 = xC1S7 * irot_input_x - irot_input_x */
+            
+        movdqa		xmm7, C(7)		/* xC7S1 */
+        psrlw		xmm2, 15		/* for trucation */		
+            
+        paddw		xmm1, xmm3		/* xmm1 = xC1S7 * irot_input_x */
+        paddw		xmm1, xmm2		/* Trucated */
+            
+        pmulhw		xmm3, xmm7		/* xmm3 = xC7S1 * irot_input_x */
+        paddw		xmm3, xmm2		/* Truncated */
+            
+        movdqa		xmm5, xmm0		/* xmm5 = irot_input_y */	
+        movdqa	    xmm2, xmm0      /* xmm2 = irot_input_y */	
+            
+        movdqa		xmm7, C(1)      /* xC1S7 */			
+        pmulhw		xmm0, xmm7	    /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
+        
+        movdqa		xmm7, C(7)		/* xC7S1 */	
+        psrlw		xmm2, 15		/* for trucation */	
+        
+        paddw		xmm0, xmm5		/* xmm0 = xC1S7 * irot_input_y */
+        paddw		xmm0, xmm2		/* Truncated */
+        
+        pmulhw		xmm5, xmm7		/* xmm5 = xC7S1 * irot_input_y */
+        paddw		xmm5, xmm2		/* Truncated */
+        
+        psubsw		xmm1, xmm5		/* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
+        paddsw		xmm3, xmm0		/* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
+        
+        movdqa		O(1), xmm1
+        movdqa		O(7), xmm3
+/*---------------------------------------------------------*/
+/* op3 and op5                                             */
+/*---------------------------------------------------------*/
+	    movdqa		xmm0, C(3)      /* xC3S5 */
+	    movdqa		xmm1, C(5)      /* xC5S3 */
+
+	    movdqa		xmm5,xmm6       /* irot_input_x */
+	    movdqa		xmm7,xmm6       /* irot_input_x */
+
+	    movdqa		xmm2,xmm4       /* irot_input_y */
+	    movdqa		xmm3,xmm4       /* irot_input_y */
+
+	    pmulhw		xmm4,xmm0       /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
+	    pmulhw		xmm6,xmm1		/* xmm6 = xC5S3 * irot_input_y - irot_input_y */
+
+	    psrlw		xmm2,15         /* for trucation */
+	    psrlw		xmm5,15         /* for trucation */
+
+	    paddw		xmm4,xmm3		/* xmm4 = xC3S5 * irot_input_x */
+	    paddw		xmm6,xmm7		/* xmm6 = xC5S3 * irot_input_y */
+
+	    paddw		xmm4,xmm2		/* Truncated */
+	    paddw		xmm6,xmm5		/* Truncated */
+
+	    psubsw		xmm4,xmm6		/* op [3] */
+	    movdqa		O(3),xmm4		/* Save Op[3] */
+
+	    movdqa		xmm4,xmm3		/* irot_input_y */
+	    movdqa		xmm6,xmm7		/* irot_input_x */
+
+	    pmulhw		xmm3,xmm1		/* mm3 = xC5S3 * irot_input_x - irot_input_x */
+	    pmulhw		xmm7,xmm0		/* mm7 = xC3S5 * irot_input_y - irot_input_y */
+
+	    paddw		xmm4,xmm2       /* Trucated */
+	    paddw		xmm6,xmm5       /* Trucated */
+
+	    paddw		xmm3,xmm4		/* xmm3 = xC5S3 * irot_input_x */
+	    paddw		xmm7,xmm6		/*  mm7 = xC3S5 * irot_input_y */
+
+	    paddw		xmm3,xmm7		/* Op[5] */
+	    movdqa		O(5),xmm3		/* Save Op[5] */
+/*---------------------------------------------------------*/
+/* End of 8 1-D FDCT                                       */        
+/*---------------------------------------------------------*/
+#undef I
+#undef O
+#define I(i) [ebx + 16 * i ]
+#define O(i) [ebx + 16 * i ]
+
+/******************************************************/
+/* Do 8x8 Transpose                                   */
+/******************************************************/
+
+    	movdqa		xmm4, I(4)		/* xmm4=e7e6e5e4e3e2e1e0 */	
+        movdqa		xmm0, I(5)		/* xmm4=f7f6f5f4f3f2f1f0 */	
+        
+        movdqa		xmm5, xmm4		/* make a copy */			
+        punpcklwd	xmm4, xmm0		/* xmm4=f3e3f2e2f1e1f0e0 */	
+        
+        punpckhwd	xmm5, xmm0		/* xmm5=f7e7f6e6f5e5f4e4 */	
+        movdqa		xmm6, I(6)		/* xmm6=g7g6g5g4g3g2g1g0 */ 
+        
+        movdqa		xmm0, I(7)		/* xmm0=h7h6h5h4h3h2h1h0 */ 
+        movdqa		xmm7, xmm6		/* make a copy */			
+        
+        punpcklwd	xmm6, xmm0		/* xmm6=h3g3h3g2h1g1h0g0 */ 
+        punpckhwd	xmm7, xmm0		/* xmm7=h7g7h6g6h5g5h4g4 */ 
+        
+        movdqa		xmm3, xmm4		/* make a copy */			
+        punpckldq	xmm4, xmm6		/* xmm4=h1g1f1e1h0g0f0e0 */	
+        
+        punpckhdq	xmm3, xmm6		/* xmm3=h3g3g3e3h2g2f2e2 */	
+        movdqa		I(6), xmm3		/* save h3g3g3e3h2g2f2e2 */	
+        /* Free xmm6 */ 
+        movdqa		xmm6, xmm5		/* make a copy */			
+        punpckldq	xmm5, xmm7		/* xmm5=h5g5f5e5h4g4f4e4 */ 
+        
+        punpckhdq	xmm6, xmm7		/* xmm6=h7g7f7e7h6g6f6e6 */ 
+        movdqa		xmm0, I(0)		/* xmm0=a7a6a5a4a3a2a1a0 */	
+        /* Free xmm7 */ 
+        movdqa		xmm1, I(1)		/* xmm1=b7b6b5b4b3b2b1b0 */	
+        movdqa		xmm7, xmm0		/* make a copy */			
+        
+        punpcklwd	xmm0, xmm1		/* xmm0=b3a3b2a2b1a1b0a0 */	
+        punpckhwd	xmm7, xmm1		/* xmm7=b7a7b6a6b5a5b4a4 */ 
+        /* Free xmm1 */ 
+        movdqa		xmm2, I(2)		/* xmm2=c7c6c5c4c3c2c1c0 */ 
+        movdqa		xmm3, I(3)	    /* xmm3=d7d6d5d4d3d2d1d0 */ 
+        
+        movdqa		xmm1, xmm2		/* make a copy */			
+        punpcklwd	xmm2, xmm3		/* xmm2=d3c3d2c2d1c1d0c0 */ 
+        
+        punpckhwd	xmm1, xmm3		/* xmm1=d7c7d6c6d5c5d4c4 */ 
+        movdqa		xmm3, xmm0		/* make a copy	*/			
+        
+        punpckldq	xmm0, xmm2		/* xmm0=d1c1b1a1d0c0b0a0 */ 
+        punpckhdq	xmm3, xmm2		/* xmm3=d3c3b3a3d2c2b2a2 */ 
+        /* Free xmm2 */ 
+        movdqa		xmm2, xmm7		/* make a copy */			
+        punpckldq	xmm2, xmm1		/* xmm2=d5c5b5a5d4c4b4a4 */	
+        
+        punpckhdq	xmm7, xmm1		/* xmm7=d7c7b7a7d6c6b6a6 */ 
+        movdqa		xmm1, xmm0		/* make a copy */			
+        
+        punpcklqdq	xmm0, xmm4		/* xmm0=h0g0f0e0d0c0b0a0 */	
+        punpckhqdq	xmm1, xmm4		/* xmm1=h1g1g1e1d1c1b1a1 */ 
+        
+        movdqa		I(0), xmm0		/* save I(0) */				
+        movdqa		I(1), xmm1		/* save I(1) */				
+        
+        movdqa		xmm0, I(6)		/* load h3g3g3e3h2g2f2e2 */ 
+        movdqa		xmm1, xmm3		/* make a copy */			
+        
+        punpcklqdq	xmm1, xmm0		/* xmm1=h2g2f2e2d2c2b2a2 */ 
+        punpckhqdq	xmm3, xmm0		/* xmm3=h3g3f3e3d3c3b3a3 */	
+        
+        movdqa		xmm4, xmm2		/* make a copy */			
+        punpcklqdq	xmm4, xmm5		/* xmm4=h4g4f4e4d4c4b4a4 */	
+        
+        punpckhqdq	xmm2, xmm5		/* xmm2=h5g5f5e5d5c5b5a5 */	
+        movdqa		I(2), xmm1		/* save I(2) */				
+        
+        movdqa		I(3), xmm3		/* save I(3) */				
+        movdqa		I(4), xmm4		/* save I(4) */				
+        
+        movdqa		I(5), xmm2		/* save I(5) */				
+        movdqa		xmm5, xmm7		/* make a copy */			
+        
+        punpcklqdq	xmm5, xmm6		/* xmm5=h6g6f6e6d6c6b6a6 */	
+        punpckhqdq	xmm7, xmm6		/* xmm7=h7g7f7e7d7c7b7a7 */	
+        
+        movdqa		I(6), xmm5		/* save I(6) */				
+        movdqa		I(7), xmm7		/* save I(7) */				
+
+/******************************************************/
+/* Done with transpose - Let's do the forward DCT     */
+/******************************************************/
+
+        movdqa		xmm0, I(0)      /* xmm0 = ip0 */
+        movdqa      xmm1, I(1)      /* xmm1 = ip1 */
+
+        movdqa      xmm2, I(3)      /* xmm2 = ip3 */
+        movdqa      xmm3, I(5)      /* xmm3 = ip5 */
+
+        movdqa      xmm4, xmm0      /* xmm4 = ip0 */
+        movdqa      xmm5, xmm1      /* xmm5 = ip1 */      
+        
+        movdqa      xmm6, xmm2      /* xmm6 = ip3 */      
+        movdqa      xmm7, xmm3      /* xmm7 = ip5 */      	
+
+        paddsw      xmm0, I(7)      /* xmm0 = ip0 + ip7 */
+        paddsw      xmm1, I(2)      /* xmm1 = ip1 + ip2 */
+
+        paddsw      xmm2, I(4)      /* xmm2 = ip3 + ip4 */
+        paddsw      xmm3, I(6)      /* xmm3 = ip5 + ip6 */
+
+        psubsw      xmm4, I(7)      /* xmm4 = ip0 - ip7 */
+        psubsw      xmm5, I(2)      /* xmm5 = ip1 - ip2 */       
+
+        psubsw		xmm0, xmm2      /* xmm0 = is07 - is34 */			
+        paddsw		xmm2, xmm2		/* xmm2 = is34 * 2    */	
+        
+        psubsw      xmm6, I(4)      /* xmm6 = ip3 - ip4 */               
+        paddsw		xmm2, xmm0		/* xmm2 = is07 + is34 */	
+
+        psubsw		xmm1, xmm3		/* xmm1 = is12 - is56 */	
+        movdqa		TIRY, xmm0		/* save is07-is34 */	
+
+        paddsw		xmm3, xmm3		/* xmm3 = is56 * 2 */	
+        paddsw		xmm3, xmm1	    /* xmm3 = is12 + is56 */
+        
+        psubsw      xmm7, I(6)      /* xmm7 = ip5 -ip6 */
+        psubsw		xmm5, xmm7		/* xmm5 = id12 - id56 */
+	    
+        paddsw		xmm7, xmm7		/* xmm7 = id56 * 2 */		
+	    paddsw		xmm7, xmm5	    /* xmm7 = id12 + id56 */
+/*---------------------------------------------------------*/
+/* op0 and op4                                             */
+/*---------------------------------------------------------*/
+#if 0        
+        movdqa      xmm0, xmm2      /* xmm0 =xmm2= is0734  */
+        pmulhw      xmm2, C(4)      /* xC4S4 * is0734 - is0734 */
+    
+        paddw       xmm2, xmm0      /* XC4S4 * is0734  */
+        movdqa      xmm0, xmm3      /* xmm0 =xmm3= is1256 */
+
+        pmulhw      xmm3, C(4)      /* xC4S4 * is1256 - is1256 */
+        paddw       xmm3, xmm0      /* xC4S4 * is1256 */
+
+
+        movdqa      xmm0, xmm2      
+        paddsw      xmm2, xmm3      /* xC4S4 * ( is0734 +is1256 ) */
+
+        psubsw      xmm0, xmm3      /* xC4S4 * ( is0734 -is1256 ) */
+        movdqa      xmm3, xmm2      
+        
+        psrlw       xmm2, 15        
+        paddsw      xmm3, xmm2      
+
+        movdqa      xmm2, xmm0
+        movdqa      O(0), xmm3
+        
+        psrlw       xmm0, 15
+        paddsw      xmm2, xmm0
+
+        movdqa      O(4), xmm2
+
+
+#else
+
+
+        psubsw		xmm2, xmm3		/* xmm2 = is0734 - is1256 */
+        paddsw		xmm3, xmm3		/* xmm3 = is1256 * 2 */		
+
+        movdqa		xmm0, xmm2	    /* xmm0 = is0734 - is1256 */
+        paddsw		xmm3, xmm2		/* xmm3 = is0734 + is1256 */
+
+        pmulhw		xmm0, C(4)	    /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
+        paddw		xmm0, xmm2		/* xmm0 = xC4S4 * ( is0734 - is1256 ) */
+
+        psrlw		xmm2, 15			
+        paddw		xmm0, xmm2		/* Truncate xmm0, now it is op[4] */
+        
+        movdqa      xmm2, xmm0      
+        psrlw       xmm0, 15
+        
+        paddw       xmm0, xmm2
+        psraw       xmm0, 1        
+        
+        movdqa		O(4), xmm0		/*	op4, now xmm0,xmm2 are free */        
+        movdqa		xmm2, xmm3		/* xmm2 = is0734 + is1256 */
+        
+            
+        movdqa		xmm0, xmm3		/* xmm0 = is0734 + is1256 */
+        pmulhw		xmm3, C(4)		/* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */            
+        
+        psrlw		xmm2, 15			
+        paddw		xmm3, xmm0		/* xmm3 = xC4S4 * ( is0734 +is1256 ) */
+        
+        paddw		xmm3, xmm2		/* Truncate xmm3, now it is op[0] */     
+        movdqa      xmm2, xmm3
+
+        psrlw       xmm3, 15
+        paddw       xmm3, xmm2
+        
+        psraw       xmm3, 1
+        movdqa		O(0), xmm3		/* save op0 */
+#endif
+/*---------------------------------------------------------*/
+/* op2 and op6                                             */
+/*---------------------------------------------------------*/
+ 	    movdqa		xmm3, TIRY		/* xmm3 = irot_input_y */
+        pmulhw		xmm3, C(2)		/* xmm3 = xC2S6 * irot_input_y - irot_input_y */
+        
+        movdqa		xmm2, TIRY		/* xmm2 = irot_input_y */
+        movdqa		xmm0, xmm2		/* xmm0 = irot_input_y */
+        
+        psrlw		xmm2, 15		
+        paddw		xmm3, xmm0      /* xmm3 = xC2S6 * irot_input_y */
+            
+        paddw       xmm3, xmm2		/* Truncated */
+        movdqa		xmm0, xmm5		/* xmm0 = id12 - id56 */
+        
+        
+        movdqa		xmm2, xmm5      /* xmm2 = id12 - id56 */
+        pmulhw		xmm0, C(6)		/* xmm0 = xC6S2 * irot_input_x */
+            
+        psrlw		xmm2, 15			
+        paddw		xmm0, xmm2		/* Truncated */
+        
+        paddsw		xmm3, xmm0		/* op[2] */
+        movdqa      xmm0, xmm3
+
+        psrlw       xmm3, 15
+        paddw       xmm3, xmm0
+
+        psraw       xmm3, 1
+        movdqa		O(2), xmm3		/* save op[2] */
+        
+        
+        movdqa		xmm0, xmm5		/* xmm0 = id12 - id56 */
+        movdqa		xmm2, xmm5		/* xmm0 = id12 - id56 */
+        
+        pmulhw		xmm5, C(2)		/* xmm5 = xC2S6 * irot_input_x - irot_input_x */
+        psrlw		xmm2, 15		
+        
+        movdqa		xmm3, TIRY		/* xmm3 = irot_input_y */
+        paddw		xmm5, xmm0		/* xmm5 = xC2S6 * irot_input_x */
+            
+        paddw		xmm5, xmm2		/* Truncated */
+        movdqa		xmm2, xmm3		/* xmm2 = irot_input_y */	
+        
+        pmulhw		xmm3, C(6)	    /* mm3 = xC6S2 * irot_input_y */
+        psrlw		xmm2, 15        
+        
+        paddw		xmm3, xmm2		/* Truncated */
+        psubsw		xmm3, xmm5		/* xmm3 = op[6] */
+        
+        movdqa      xmm5, xmm3
+        psrlw       xmm3,  15
+        
+        paddw       xmm3, xmm5
+        psraw       xmm3, 1
+        
+        movdqa		O(6), xmm3		
+/*-----------------------------------------------------------------------*/
+/* icommon_product1, icommon_product2                                    */
+/*-----------------------------------------------------------------------*/
+	    movdqa		xmm0, C(4)      /* xmm0 = xC4s4 */
+	    movdqa		xmm2, xmm1      /* xmm2 = is12 - is56 */	
+	
+        movdqa		xmm3, xmm1      /* xmm3 = is12 - is56 */	
+	    pmulhw		xmm1, xmm0		/* xmm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
+	
+        psrlw		xmm2, 15				
+	    paddw		xmm1, xmm3	    /* xmm1 = xC4S4 * ( is12 - is56 ) */
+	    
+        paddw		xmm1, xmm2      /* Truncate xmm1, now it is icommon_product1 */
+	    movdqa		xmm2, xmm7      /* xmm2 = id12 + id56 */
+	    
+        movdqa		xmm3, xmm7		/* xmm3 = id12 + id56 */
+        pmulhw		xmm7, xmm0		/* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
+	
+        psrlw		xmm2, 15		/* For trucation */	
+	    paddw		xmm7, xmm3		/* xmm7 = xC4S4 * ( id12 + id56 ) */
+
+	    paddw		xmm7, xmm2		/* Truncate xmm7, now it is icommon_product2 */
+/*---------------------------------------------------------*/
+	    pxor		xmm0, xmm0		/* Clear xmm0 */
+	    psubsw		xmm0, xmm6		/* xmm0 = - id34 */
+
+	    psubsw		xmm0, xmm7	    /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
+	    paddsw		xmm6, xmm6	    /* xmm6 = id34 * 2 */
+
+	    paddsw		xmm6, xmm0		/* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
+	    psubsw		xmm4, xmm1		/* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
+
+	    paddsw		xmm1, xmm1		/* xmm1 = icommon_product1 * 2 */	    
+        paddsw		xmm1, xmm4		/* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
+
+/*---------------------------------------------------------*/
+/* op1 and op7              
+/*---------------------------------------------------------*/
+
+	    movdqa		xmm7, C(1)     /* xC1S7 */
+        movdqa		xmm2, xmm1      /* xmm2 = irot_input_x */
+        
+        movdqa		xmm3, xmm1;     /* xmm3 = irot_input_x */
+        pmulhw		xmm1, xmm7		/* xmm1 = xC1S7 * irot_input_x - irot_input_x */
+            
+        movdqa		xmm7, C(7)		/* xC7S1 */
+        psrlw		xmm2, 15		/* for trucation */		
+            
+        paddw		xmm1, xmm3		/* xmm1 = xC1S7 * irot_input_x */
+        paddw		xmm1, xmm2		/* Trucated */
+            
+        pmulhw		xmm3, xmm7		/* xmm3 = xC7S1 * irot_input_x */
+        paddw		xmm3, xmm2		/* Truncated */
+            
+        movdqa		xmm5, xmm0		/* xmm5 = irot_input_y */	
+        movdqa	    xmm2, xmm0      /* xmm2 = irot_input_y */	
+            
+        movdqa		xmm7, C(1)      /* xC1S7 */			
+        pmulhw		xmm0, xmm7	    /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
+        
+        movdqa		xmm7, C(7)		/* xC7S1 */	
+        psrlw		xmm2, 15		/* for trucation */	
+        
+        paddw		xmm0, xmm5		/* xmm0 = xC1S7 * irot_input_y */
+        paddw		xmm0, xmm2		/* Truncated */
+        
+        pmulhw		xmm5, xmm7		/* xmm5 = xC7S1 * irot_input_y */
+        paddw		xmm5, xmm2		/* Truncated */
+        
+        psubsw		xmm1, xmm5		/* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
+        paddsw		xmm3, xmm0		/* xmm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = op[7] */
+
+        movdqa      xmm5, xmm1
+        movdqa      xmm0, xmm3
+
+        psrlw       xmm1, 15
+        psrlw       xmm3, 15
+
+        paddw       xmm1, xmm5
+        paddw       xmm3, xmm0
+
+        psraw       xmm1, 1
+        psraw       xmm3, 1
+
+        
+        movdqa		O(1), xmm1
+        movdqa		O(7), xmm3
+/*---------------------------------------------------------*/
+/* op3 and op5 
+/*---------------------------------------------------------*/
+	    movdqa		xmm0, C(3)      /* xC3S5 */
+	    movdqa		xmm1, C(5)      /* xC5S3 */
+
+	    movdqa		xmm5,xmm6       /* irot_input_x */
+	    movdqa		xmm7,xmm6       /* irot_input_x */
+
+	    movdqa		xmm2,xmm4       /* irot_input_y */
+	    movdqa		xmm3,xmm4       /* irot_input_y */
+
+	    pmulhw		xmm4,xmm0       /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
+	    pmulhw		xmm6,xmm1		/* xmm6 = xC5S3 * irot_input_y - irot_input_y */
+
+	    psrlw		xmm2,15         /* for trucation */
+	    psrlw		xmm5,15         /* for trucation */
+
+	    paddw		xmm4,xmm3		/* xmm4 = xC3S5 * irot_input_x */
+	    paddw		xmm6,xmm7		/* xmm6 = xC5S3 * irot_input_y */
+
+	    paddw		xmm4,xmm2		/* Truncated */
+	    paddw		xmm6,xmm5		/* Truncated */
+
+	    psubsw		xmm4,xmm6		/* op [3] */
+        movdqa      xmm6,xmm4
+
+        psrlw       xmm4,15        
+        paddw       xmm4,xmm6
+
+        psraw       xmm4,1
+	    movdqa		O(3),xmm4		/* Save Op[3] */
+
+	    movdqa		xmm4,xmm3		/* irot_input_y */
+	    movdqa		xmm6,xmm7		/* irot_input_x */
+
+	    pmulhw		xmm3,xmm1		/* mm3 = xC5S3 * irot_input_x - irot_input_x */
+	    pmulhw		xmm7,xmm0		/* mm7 = xC3S5 * irot_input_y - irot_input_y */
+
+	    paddw		xmm4,xmm2       /* Trucated */
+	    paddw		xmm6,xmm5       /* Trucated */
+
+	    paddw		xmm3,xmm4		/* xmm3 = xC5S3 * irot_input_x */
+	    paddw		xmm7,xmm6		/*  mm7 = xC3S5 * irot_input_y */
+
+	    paddw		xmm3,xmm7		/* Op[5] */        
+        movdqa      xmm7,xmm3
+
+        psrlw       xmm3,15        
+        paddw       xmm3,xmm7
+
+        psraw       xmm3,1
+	    movdqa		O(5),xmm3		/* Save Op[5] */
+/*---------------------------------------------------------*/
+/* End of 8 1-D FDCT                                       */        
+/*---------------------------------------------------------*/
+
+    }/* end of _asm code section */
+}
+
+
+
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/filtmmx.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/filtmmx.c
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/filtwmt.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/filtwmt.c
@ -0,0 +1,790 @@
+/****************************************************************************
+ *
+ *   Module Title :     newLoopTest_asm.c 
+ *
+ *   Description  :     Codec specific functions
+ *
+ *   AUTHOR       :     Yaowu Xu
+ *
+ *****************************************************************************
+ *   Revision History
+ *
+ *   1.02 YWX 03-Nov-00 Changed confusing variable name
+ *   1.01 YWX 02-Nov-00 Added the set of functions
+ *   1.00 YWX 19-Oct-00 configuration baseline
+ *****************************************************************************
+ */ 
+
+/****************************************************************************
+ *  Header Files
+ *****************************************************************************
+ */
+
+
+#define STRICT              /* Strict type checking. */
+#include "codec_common.h"
+#include <math.h>
+
+ /****************************************************************************
+ *  Module constants.
+ *****************************************************************************
+ */        
+
+#define MIN(a, b)  (((a) < (b)) ? (a) : (b))
+#define FILTER_WEIGHT 128   /* 1 << FILTER_SHIFT; the two taps of every BilinearFilters_wmt row sum to this. */
+#define FILTER_SHIFT  7     /* Right shift applied by the filter routines after the taps have been accumulated. */
+__declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};   /* Rounding term (half of FILTER_WEIGHT) in all 8 word lanes; added before the FILTER_SHIFT shift. */
+/*
+ * Bilinear tap table, one row per fractional position (the callers in this file index
+ * it with ModX / ModY).  Each row holds tap 0 replicated into 8 words followed by tap 1
+ * replicated into 8 words, so each half can be used directly as a 128-bit multiplier
+ * for 8 pixels at a time.  Row i is { 128 - 16*i, 16*i }.
+ * A row is 32 bytes, so with the 16-byte alignment of the array both halves of every
+ * row are 16-byte aligned, which the movdqa loads and pmullw memory operands that read
+ * them rely on.
+ */
+
+__declspec(align(16)) INT16  BilinearFilters_wmt[8][16] = 
+{
+{ 128,128,128,128,128,128,128,128,    0,  0, 0,   0,  0,  0,  0,  0 },
+{ 112,112,112,112,112,112,112,112,   16, 16, 16, 16, 16, 16, 16, 16 },
+{  96, 96, 96, 96, 96, 96, 96, 96,   32, 32, 32, 32, 32, 32, 32, 32 },
+{  80, 80, 80, 80, 80, 80, 80, 80,   48, 48, 48, 48, 48, 48, 48, 48 },
+{  64, 64, 64, 64, 64, 64, 64, 64,   64, 64, 64, 64, 64, 64, 64, 64 },
+{  48, 48, 48, 48, 48, 48, 48, 48,   80, 80, 80, 80, 80, 80, 80, 80 },
+{  32, 32, 32, 32, 32, 32, 32, 32,   96, 96, 96, 96, 96, 96, 96, 96 },
+{  16, 16, 16, 16, 16, 16, 16, 16,  112,112,112,112,112,112,112,112 }
+};
+
+extern __declspec(align(16)) INT16  BicubicFilters_mmx[17][8][32];   /* Defined in another file.  NOTE(review): the last dimension (32) matches the 4 taps x 8 lanes that the 4-tap routines below load from Filter - confirm the meaning of the first two indices against the definition. */
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlock1d_h_wmt
+ *  
+ *  INPUTS        :     SrcPtr
+ *                          First source row; every row is read starting one pixel
+ *                          to the left of this pointer (SrcPtr - 1)
+ *                      SrcPixelsPerLine
+ *                          Byte distance between consecutive source rows
+ *                      PixelStep
+ *                          Not used by this routine
+ *                      OutputHeight
+ *                          Number of rows to produce.  The loop body runs before the
+ *                          count is tested, so 0 is not a valid value
+ *                      OutputWidth
+ *                          Byte distance between consecutive output rows
+ *                      Filter
+ *                          Four taps, each replicated into 8 INT16 lanes
+ *                          (4 x 16 bytes, 16-byte aligned because of movdqa)
+ *
+ *  OUTPUTS       :     OutputPtr
+ *                          8 filtered UINT8 pixels per row
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Horizontal 4-tap filter over 8 pixels per row:
+ *                      out[x] = clamp( ( t0*p[x-1] + t1*p[x] + t2*p[x+1] + t3*p[x+2] + 64 ) >> 7 )
+ *
+ *  SPECIAL NOTES :     Two unaligned 16-byte loads are issued per row (at SrcPtr-1
+ *                      and at SrcPtr) although only 11 pixels contribute.
+ *                      The result is stored straight from the XMM register, so no
+ *                      MMX register is touched and no emms is needed afterwards.
+ *
+ ****************************************************************************/
+_inline 
+void FilterBlock1d_h_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+
+        mov         edi, Filter
+        movdqa      xmm1, [edi]             ; xmm1 = tap 0 (applied to p-1)
+        movdqa      xmm2, [edi + 16]        ; xmm2 = tap 1 (applied to p0)
+        movdqa      xmm6, [edi + 32]        ; xmm6 = tap 2 (applied to p1)
+        movdqa      xmm7, [edi + 48]        ; xmm7 = tap 3 (applied to p2)
+
+        mov         edi, OutputPtr
+        mov         esi, SrcPtr
+        dec         esi                     ; start one pixel to the left (p-1)
+        mov         ecx, DWORD PTR OutputHeight ; row counter
+        mov         eax, OutputWidth        ; destination pitch in bytes
+        pxor        xmm0, xmm0              ; xmm0 = 0, used to widen bytes to words
+
+nextrow:
+
+        // Taps 0 and 3 are potentially negative.  They are accumulated first so that
+        // the saturating adds below do not clip the sum of the large positive taps.
+        movdqu      xmm3, [esi]             ; xmm3 = bytes p-1..p14
+        movdqu      xmm4, xmm3              ; xmm4 = bytes p-1..p14
+        punpcklbw   xmm3, xmm0              ; xmm3 = words p-1..p6
+        pmullw      xmm3, xmm1              ; xmm3 *= tap 0
+
+        psrldq      xmm4, 3                 ; xmm4 = bytes p2..p14
+        movdqa      xmm5, xmm4
+        punpcklbw   xmm5, xmm0              ; xmm5 = words p2..p9
+        pmullw      xmm5, xmm7              ; xmm5 *= tap 3
+        paddsw      xmm3, xmm5              ; accumulate
+
+        movdqu      xmm4, [esi+1]           ; xmm4 = bytes p0..p15
+        movdqa      xmm5, xmm4
+        punpcklbw   xmm5, xmm0              ; xmm5 = words p0..p7
+        pmullw      xmm5, xmm2              ; xmm5 *= tap 1
+        paddsw      xmm3, xmm5              ; accumulate
+
+        psrldq      xmm4, 1                 ; xmm4 = bytes p1..p15
+        movdqa      xmm5, xmm4
+        punpcklbw   xmm5, xmm0              ; xmm5 = words p1..p8
+        pmullw      xmm5, xmm6              ; xmm5 *= tap 2
+        paddsw      xmm3, xmm5              ; accumulate
+
+        paddsw      xmm3, rd                ; add rounding term
+        psraw       xmm3, FILTER_SHIFT      ; divide by 128
+        packuswb    xmm3, xmm0              ; clamp to 0..255 and pack to bytes
+
+        movq        QWORD PTR [edi], xmm3   ; store 8 output pixels without going through MMX
+
+        add         esi, SrcPixelsPerLine   ; next source row
+        add         edi, eax                ; next output row
+
+        dec         ecx
+        jnz         nextrow
+    }
+}
+
+_inline 
+void FilterBlock1d_v_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+
+        mov         edi, Filter
+        movdqa      xmm1, [edi]          ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm2, [edi + 16]     ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm6, [edi + 32]     ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm7, [edi + 48]     ; xmm3 *= kernel 0 modifiers.
+
+        mov         edx, PixelsPerLine
+        mov         edi, OutputPtr
+		mov			esi, SrcPtr
+        sub         esi, PixelsPerLine
+        mov         ecx, DWORD PTR OutputHeight
+        mov         eax, OutputWidth        ; destination pitch?
+		pxor		xmm0, xmm0              ; xmm0 = 00000000
+
+
+nextrow:
+        movdqu		xmm3, [esi]             ; xmm3 = p0..p16
+        punpcklbw   xmm3, xmm0              ; xmm3 = p0..p8
+        pmullw      xmm3, xmm1              ; xmm3 *= kernel 0 modifiers.
+
+        add         esi, edx                ; move source forward 1 line to avoid 3 * pitch
+
+        movdqu		xmm4, [esi+2*edx]       ; xmm4 = p0..p16
+        punpcklbw   xmm4, xmm0              ; xmm4 = p0..p8
+        pmullw      xmm4, xmm7              ; xmm4 *= kernel 3 modifiers.
+        paddsw      xmm3, xmm4              ; xmm3 += xmm4
+
+        movdqu		xmm4, [esi ]            ; xmm4 = p0..p16
+        punpcklbw   xmm4, xmm0              ; xmm4 = p0..p8
+        pmullw      xmm4, xmm2              ; xmm4 *= kernel 1 modifiers.
+        paddsw      xmm3, xmm4              ; xmm3 += xmm4
+
+        movdqu		xmm4, [esi +edx]        ; xmm4 = p0..p16
+        punpcklbw   xmm4, xmm0              ; xmm4 = p0..p8
+        pmullw      xmm4, xmm6              ; xmm4 *= kernel 2 modifiers.
+        paddsw      xmm3, xmm4              ; xmm3 += xmm4
+
+
+
+        paddsw      xmm3, rd                ; xmm3 += round value
+        psraw       xmm3, FILTER_SHIFT      ; xmm3 /= 128
+        packuswb    xmm3, xmm0              ; pack and unpack to saturate
+
+        movdq2q     mm0, xmm3
+        movq        [edi],mm0               ; store the results in the destination
+
+        // the subsequent iterations repeat 3 out of 4 of these reads.  Since the 
+        // recon block should be in cache this shouldn't cost much.  Its obviously 
+        // avoidable!!!. 
+        add         edi,eax; 
+
+        dec         ecx                     ; decrement count
+        jnz         nextrow                 ; next row
+
+    }
+}
+
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlock1d_hb8_wmt
+ *  
+ *  INPUTS        :     SrcPtr
+ *                          First source pixel of the first row
+ *                      SrcPixelsPerLine
+ *                          Byte distance between consecutive source rows
+ *                      PixelStep
+ *                          Not used by this routine
+ *                      OutputHeight
+ *                          Number of rows to produce.  The loop body runs before the
+ *                          count is tested, so 0 is not a valid value
+ *                      OutputWidth
+ *                          Byte distance between consecutive output rows
+ *                      Filter
+ *                          Two taps, each replicated into 8 INT16 lanes
+ *                          (2 x 16 bytes, 16-byte aligned because of movdqa)
+ *
+ *  OUTPUTS       :     OutputPtr
+ *                          8 filtered UINT8 pixels per row
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Horizontal 2-tap (bilinear) filter over 8 pixels per row:
+ *                      out[x] = clamp( ( t0*p[x] + t1*p[x+1] + 64 ) >> 7 )
+ *
+ *  SPECIAL NOTES :     The adds are not saturating (paddw).  With a row of
+ *                      BilinearFilters_wmt as Filter the sum is at most
+ *                      255*128 + 64 and fits a signed word.
+ *                      One unaligned 16-byte load is issued per row although only
+ *                      9 pixels contribute.
+ *                      The result is stored straight from the XMM register, so no
+ *                      MMX register is touched and no emms is needed afterwards.
+ *
+ ****************************************************************************/
+_inline 
+void FilterBlock1d_hb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+
+        mov         edi, Filter
+        movdqa      xmm1, [edi]             ; xmm1 = tap 0 (applied to p0)
+        movdqa      xmm2, [edi + 16]        ; xmm2 = tap 1 (applied to p1)
+
+        mov         edi, OutputPtr
+        mov         esi, SrcPtr
+        mov         ecx, DWORD PTR OutputHeight ; row counter
+        mov         eax, OutputWidth        ; destination pitch in bytes
+        pxor        xmm0, xmm0              ; xmm0 = 0, used to widen bytes to words
+
+nextrow:
+        movdqu      xmm3, [esi]             ; xmm3 = bytes p0..p15
+        movdqu      xmm5, xmm3              ; xmm5 = bytes p0..p15
+        punpcklbw   xmm3, xmm0              ; xmm3 = words p0..p7
+        pmullw      xmm3, xmm1              ; xmm3 *= tap 0
+
+        psrldq      xmm5, 1                 ; xmm5 = bytes p1..p15
+        punpcklbw   xmm5, xmm0              ; xmm5 = words p1..p8
+        pmullw      xmm5, xmm2              ; xmm5 *= tap 1
+        paddw       xmm3, xmm5              ; accumulate
+
+        paddw       xmm3, rd                ; add rounding term
+        psraw       xmm3, FILTER_SHIFT      ; divide by 128
+        packuswb    xmm3, xmm0              ; clamp to 0..255 and pack to bytes
+
+        movq        QWORD PTR [edi], xmm3   ; store 8 output pixels without going through MMX
+
+        add         esi, SrcPixelsPerLine   ; next source row
+        add         edi, eax                ; next output row
+
+        dec         ecx
+        jnz         nextrow
+    }
+}
+
+_inline 
+void FilterBlock1d_vb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+
+        mov         edi, Filter
+        movdqa      xmm1, [edi]          ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm2, [edi + 16]     ; xmm3 *= kernel 0 modifiers.
+        mov         edx, PixelsPerLine
+        mov         edi, OutputPtr
+		mov			esi, SrcPtr
+        mov         ecx, DWORD PTR OutputHeight
+        mov         eax, OutputWidth        ; destination pitch?
+		pxor		xmm0, xmm0              ; xmm0 = 00000000
+
+
+nextrow:
+        movdqu		xmm3, [esi]             ; xmm3 = p0..p16
+        punpcklbw   xmm3, xmm0              ; xmm3 = p0..p8
+        pmullw      xmm3, xmm1              ; xmm3 *= kernel 0 modifiers.
+
+        movdqu		xmm4, [esi +edx ]       ; xmm4 = p0..p16
+        punpcklbw   xmm4, xmm0              ; xmm4 = p0..p8
+        pmullw      xmm4, xmm2              ; xmm4 *= kernel 1 modifiers.
+        paddw       xmm3, xmm4              ; xmm3 += xmm4
+
+        paddw       xmm3, rd                ; xmm3 += round value
+        psraw       xmm3, FILTER_SHIFT      ; xmm3 /= 128
+        packuswb    xmm3, xmm0              ; pack and unpack to saturate
+
+        movdq2q     mm0, xmm3
+        movq        [edi],mm0               ; store the results in the destination
+
+        // the subsequent iterations repeat 3 out of 4 of these reads.  Since the 
+        // recon block should be in cache this shouldn't cost much.  Its obviously 
+        // avoidable!!!. 
+        add         esi,edx
+        add         edi,eax 
+
+        dec         ecx                     ; decrement count
+        jnz         nextrow                 ; next row
+
+    }
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlock2dBil
+ *  
+ *  INPUTS        :     Pointer to source data
+ *						
+ *  OUTPUTS       :     Filtered data
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Applies a 2-D bilinear filter to the input data to produce
+ *						an 8x8 predictor block (UINT8)
+ *
+ *  SPECIAL NOTES :     
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+_inline 
+void FilterBlock2dBil_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
+{
+
+    __asm
+    {
+        mov         eax,        HFilter             ; 
+        mov         edi,        OutputPtr           ; 
+        mov         esi,        SrcPtr              ;
+        lea         ecx,        [edi+64]            ;
+        mov         edx,        SrcPixelsPerLine     ;
+               
+        movdqa      xmm1,       [eax]               ;
+        movdqa      xmm2,       [eax+16]            ;
+        
+        mov         eax,        VFilter             ;       
+        pxor        xmm0,       xmm0                ;
+
+        // get the first horizontal line done       ;
+        movdqu      xmm3,       [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                ; make a copy of current line
+        
+        punpcklbw   xmm3,       xmm0                ; xx 00 01 02 03 04 05 06
+        psrldq      xmm4,       1                   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx        
+        
+        pmullw      xmm3,       xmm1                ;        
+        punpcklbw   xmm4,       xmm0                ; 00 01 02 03 04 05 06 07
+
+        pmullw      xmm4,       xmm2                ;
+        paddw       xmm3,       xmm4                ;   
+
+        paddw       xmm3,       rd                  ; 
+        psraw       xmm3,       FILTER_SHIFT        ; ready for output
+        
+        movdqa      xmm5,       xmm3                ;
+
+        add         esi,        edx                 ; next line
+NextRow:
+        pmullw      xmm5,       [eax]               ; 
+        movdqu      xmm3,       [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm4,       xmm3                ; make a copy of current line        
+        punpcklbw   xmm3,       xmm0                ; xx 00 01 02 03 04 05 06
+
+        psrldq      xmm4,       1                   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx                
+        pmullw      xmm3,       xmm1                ;        
+        punpcklbw   xmm4,       xmm0                ; 00 01 02 03 04 05 06 07
+
+        movdqa      xmm6,       xmm5                ; 
+        pmullw      xmm4,       xmm2                ;
+
+        paddw       xmm3,       xmm4                ;   
+        paddw       xmm3,       rd                  ; 
+
+        psraw       xmm3,       FILTER_SHIFT        ; ready for output
+        movdqa      xmm5,       xmm3                ; make a copy for the next row
+        
+        pmullw      xmm3,       [eax+16]            ; 
+        paddw       xmm6,       xmm3                ;
+        
+
+        paddw       xmm6,       rd                  ; xmm6 += round value
+        psraw       xmm6,       FILTER_SHIFT        ; xmm6 /= 128
+
+        packuswb    xmm6,       xmm0                ; pack and unpack to saturate
+        movdq2q     mm0,        xmm6
+
+        movq        [edi],      mm0                 ; store the results in the destination
+        add         esi,        edx                 ; next line
+        add         edi,        8                   ; 
+
+        cmp         edi,        ecx                 ;
+        jne         NextRow                         
+
+    }
+
+    // First filter 1d Horizontal
+	//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
+	// Now filter Verticaly
+	//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
+
+
+}
+/*
+ * FilterUnpackBlock2dBil_wmt
+ *
+ * 2-D bilinear filter that writes its result as 16-bit words instead of packed bytes:
+ * 8 rows of 8 words, stored back to back (128 bytes starting at OutputPtr).
+ *
+ *   SrcPtr            top-left source pixel; 9 rows are read, 16 bytes per row
+ *   SrcPixelsPerLine  byte distance between consecutive source rows
+ *   HFilter, VFilter  two taps each, every tap replicated into 8 INT16 lanes; both must
+ *                     be 16-byte aligned (movdqa loads, pmullw memory operands)
+ *
+ * The horizontally filtered previous row is carried in xmm5 from one iteration to the
+ * next, so each source row is filtered horizontally once.  Both passes round with rd
+ * and shift by FILTER_SHIFT.  The stores are unaligned (movdqu), so OutputPtr needs no
+ * particular alignment.
+ */
+_inline 
+void FilterUnpackBlock2dBil_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
+{
+
+    __asm
+    {
+        mov         eax,        HFilter             ; 
+        mov         edi,        OutputPtr           ; 
+        mov         esi,        SrcPtr              ;
+        lea         ecx,        [edi+128]            ; // loop end marker: 8 rows x 16 bytes past OutputPtr
+        mov         edx,        SrcPixelsPerLine     ; // source pitch
+        // xmm1 / xmm2 = horizontal tap 0 / tap 1, 8 words each
+        movdqa      xmm1,       [eax]               ;
+        movdqa      xmm2,       [eax+16]            ;
+        // eax is reused for the vertical taps: [eax] = tap 0, [eax+16] = tap 1
+        mov         eax,        VFilter             ;       
+        pxor        xmm0,       xmm0                ; // zero, used to widen bytes to words
+
+        // get the first horizontal line done       ;
+        movdqu      xmm3,       [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                ; make a copy of current line
+        
+        punpcklbw   xmm3,       xmm0                ; xx 00 01 02 03 04 05 06
+        psrldq      xmm4,       1                   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx        
+        
+        pmullw      xmm3,       xmm1                ;        
+        punpcklbw   xmm4,       xmm0                ; 00 01 02 03 04 05 06 07
+
+        pmullw      xmm4,       xmm2                ;
+        paddw       xmm3,       xmm4                ;   
+
+        paddw       xmm3,       rd                  ; 
+        psraw       xmm3,       FILTER_SHIFT        ; ready for output
+        // xmm5 = horizontally filtered previous row, consumed at the top of the loop
+        movdqa      xmm5,       xmm3                ;
+
+        add         esi,        edx                 ; next line
+NextRow:
+        pmullw      xmm5,       [eax]               ; // previous row *= vertical tap 0
+        movdqu      xmm3,       [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm4,       xmm3                ; make a copy of current line        
+        punpcklbw   xmm3,       xmm0                ; xx 00 01 02 03 04 05 06
+
+        psrldq      xmm4,       1                   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx                
+        pmullw      xmm3,       xmm1                ;        
+        punpcklbw   xmm4,       xmm0                ; 00 01 02 03 04 05 06 07
+
+        movdqa      xmm6,       xmm5                ; // xmm6 = vertical accumulator
+        pmullw      xmm4,       xmm2                ;
+
+        paddw       xmm3,       xmm4                ;   
+        paddw       xmm3,       rd                  ; 
+
+        psraw       xmm3,       FILTER_SHIFT        ; ready for output
+        movdqa      xmm5,       xmm3                ; make a copy for the next row
+        
+        pmullw      xmm3,       [eax+16]            ; // current row *= vertical tap 1
+        paddw       xmm6,       xmm3                ;
+        
+
+        paddw       xmm6,       rd                  ; xmm6 += round value
+        psraw       xmm6,       FILTER_SHIFT        ; xmm6 /= 128
+
+        movdqu      [edi],      xmm6;
+        
+        /*
+         * The packed-byte store used by FilterBlock2dBil_wmt (packuswb / movdq2q / movq)
+         * is not done here: the eight 16-bit results are written unpacked by the movdqu
+         * above, without clamping to 0..255.
+         */
+        add         esi,        edx                 ; next line
+        add         edi,        16                   ; 
+
+        cmp         edi,        ecx                 ;
+        jne         NextRow                         
+
+    }
+
+    // First filter 1d Horizontal
+	//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
+	// Now filter Verticaly
+	//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
+
+
+}
+_inline /* FilterUnpackBlock1d_hb8_wmt
+         *
+         * Horizontal 2-tap filter over 8 pixels per row with 16-bit output:
+         *   out[x] = ( t0*p[x] + t1*p[x+1] + 64 ) >> 7, written as 8 words (16 bytes) per row.
+         *
+         *   SrcPtr            first source pixel of the first row; 16 bytes are loaded per row
+         *   SrcPixelsPerLine  byte distance between consecutive source rows
+         *   PixelStep         not used
+         *   OutputHeight      number of rows; the body runs before the count is tested, so 0 is not valid
+         *   OutputWidth       added to the destination pointer after each row, i.e. a pitch in BYTES
+         *   Filter            two taps, each replicated into 8 INT16 lanes, 16-byte aligned (movdqa)
+         */
+void FilterUnpackBlock1d_hb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+        // xmm1 = tap 0, xmm2 = tap 1 (the trailing remarks on the two loads below are stale)
+        mov         edi, Filter
+        movdqa      xmm1, [edi]          ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm2, [edi + 16]     ; xmm3 *= kernel 0 modifiers.
+        // edi = destination, esi = source, ecx = row counter, eax = destination pitch in bytes
+        mov         edi,OutputPtr
+		mov			esi,SrcPtr
+        mov         ecx, DWORD PTR OutputHeight
+        mov         eax, OutputWidth        ; destination pitch?
+		pxor		xmm0, xmm0              ; xmm0 = 00000000
+        // NOTE: esi is not decremented here, so the load below starts at p0, not at p-1 as its remark says
+nextrow:
+        movdqu		xmm3, [esi]             ; xmm3 = p-1..p14    
+        movdqu      xmm5, xmm3              ; xmm4 = p-1..p14
+        punpcklbw   xmm3, xmm0              ; xmm3 = p-1..p6
+        pmullw      xmm3, xmm1              ; xmm3 *= kernel 0 modifiers.
+        // second tap: the same 16 bytes shifted down by one pixel
+        psrldq      xmm5, 1                 ; xmm4 = p0..p13
+        punpcklbw   xmm5, xmm0              ; xmm5 = p0..p7
+        pmullw      xmm5, xmm2              ; xmm5 *= kernel 1 modifiers
+        paddw       xmm3, xmm5              ; xmm3 += xmm5
+
+        paddw       xmm3, rd                ; xmm3 += round value
+        psraw       xmm3, FILTER_SHIFT      ; xmm3 /= 128
+        
+        /*
+         * The byte pack (packuswb / movdq2q) of FilterBlock1d_hb8_wmt is not done here:
+         * the eight 16-bit results are stored unpacked, without clamping to 0..255.
+         */
+
+        movdqu      [edi],xmm3               ; store the results in the destination
+
+        add         esi,SrcPixelsPerLine    ; next line
+        add         edi,eax; 
+
+        dec         ecx                     ; decrement count
+        jnz         nextrow                 ; next row
+    }
+}
+/*
+ * FilterUnpackBlock1d_vb8_wmt
+ *
+ * Vertical 2-tap filter over 8 pixels per row with 16-bit output:
+ *   out[y] = ( t0*r[y] + t1*r[y+1] + 64 ) >> 7, written as 8 words (16 bytes) per row.
+ *
+ *   SrcPtr         first source row; the row below it is read as well (16 bytes per load)
+ *   PixelsPerLine  byte distance between consecutive source rows
+ *   PixelStep      not used
+ *   OutputHeight   number of rows; the body runs before the count is tested, so 0 is not valid
+ *   OutputWidth    added to the destination pointer after each row, i.e. a pitch in BYTES
+ *   Filter         two taps, each replicated into 8 INT16 lanes, 16-byte aligned (movdqa)
+ */
+_inline 
+void FilterUnpackBlock1d_vb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
+{
+    __asm
+    {
+        // xmm1 = tap 0, xmm2 = tap 1 (the trailing remarks on the two loads below are stale)
+        mov         edi, Filter
+        movdqa      xmm1, [edi]          ; xmm3 *= kernel 0 modifiers.
+        movdqa      xmm2, [edi + 16]     ; xmm3 *= kernel 0 modifiers.
+        mov         edx, PixelsPerLine
+        mov         edi, OutputPtr
+		mov			esi, SrcPtr
+        mov         ecx, DWORD PTR OutputHeight
+        mov         eax, OutputWidth        ; destination pitch?
+		pxor		xmm0, xmm0              ; xmm0 = 00000000
+        // edx = source pitch, eax = destination pitch in bytes, ecx = row counter
+
+nextrow:
+        movdqu		xmm3, [esi]             ; xmm3 = p0..p16
+        punpcklbw   xmm3, xmm0              ; xmm3 = p0..p8
+        pmullw      xmm3, xmm1              ; xmm3 *= kernel 0 modifiers.
+        // second tap: same columns, one row further down
+        movdqu		xmm4, [esi +edx ]       ; xmm4 = p0..p16
+        punpcklbw   xmm4, xmm0              ; xmm4 = p0..p8
+        pmullw      xmm4, xmm2              ; xmm4 *= kernel 1 modifiers.
+        paddw       xmm3, xmm4              ; xmm3 += xmm4
+
+        paddw       xmm3, rd                ; xmm3 += round value
+        psraw       xmm3, FILTER_SHIFT      ; xmm3 /= 128
+       
+        /* The byte pack (packuswb / movdq2q) of FilterBlock1d_vb8_wmt is not done here:
+         * the eight 16-bit results are stored unpacked, without clamping to 0..255.
+         */
+        movdqu      [edi],xmm3               ; store the results in the destination
+
+        // Each source row is read twice: as the lower row of one output row and as
+        // the upper row of the next.  Since the recon block should be in cache this
+        // shouldn't cost much.
+        add         esi,edx
+        add         edi,eax 
+
+        dec         ecx                     ; decrement count
+        jnz         nextrow                 ; next row
+
+    }
+}
+ 
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlockBil_8
+ *  
+ *  INPUTS        :     ReconPtr1, ReconPtr2
+ *							Two pointers into the block of data to be filtered
+ *							These pointers bound the fractional pel position
+ *						PixelsPerLine
+ *							Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
+ *						ModX, ModY
+ *							The fractional pel bits used to select a filter.
+ *
+ *				
+ *  OUTPUTS       :     ReconRefPtr
+ *							A pointer to an 8x8 buffer into which UINT8 filtered data is written.
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Produces a bilinear filtered fractional pel prediction block
+ *						with UINT8 output
+ *
+ *  SPECIAL NOTES :      
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
+{
+	// Signed byte distance from ReconPtr1 to ReconPtr2.  Once the pair is ordered it
+	// tells which neighbour (right, below, or diagonal) bounds the fractional position.
+	int Offset = ReconPtr2-ReconPtr1;
+
+	// ModX and ModY are the bottom three bits of the signed motion vector components
+	// (1/8th pel precision).  They select the right filter row even though the pointers
+	// may be exchanged below:
+	//   +ve component X: ModX = X%8
+	//   -ve component X: ModX = 8+(X%8), with X%8 in the range -7 to -1
+
+	// Order the pair so that ReconPtr1 is the lower address (above, left, above-right
+	// or above-left) and Offset is non-negative.
+	if( Offset<0 )
+	{
+		UINT8 *Lower=ReconPtr2;
+		ReconPtr2=ReconPtr1;
+		ReconPtr1=Lower;
+		Offset=(int)(ReconPtr2-ReconPtr1);
+	}
+
+	// Fractional pixel in horizontal only
+	if( Offset==1 )
+	{
+		FilterBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
+		return;
+	}
+
+	// Fractional pixel in vertical only
+	if( Offset==(int)(PixelsPerLine) )
+	{
+		FilterBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY]);
+		return;
+	}
+
+	// ReconPtr1 is the top-right corner: the 2-D filter starts one pixel to its left
+	if( Offset==(int)(PixelsPerLine - 1) )
+	{
+		FilterBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
+		return;
+	}
+
+	// ReconPtr1 is the top-left corner
+	if( Offset==(int)(PixelsPerLine + 1) )
+	{
+		FilterBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
+	}
+
+	// Any other distance (including 0) leaves ReconRefPtr untouched.
+}
+/*
+ * UnpackBlock_wmt
+ *
+ * Widens an 8x8 block of bytes to 16-bit words.  For each of 8 rows the first 8 bytes
+ * of the source row are zero-extended and written as 8 words (16 bytes).  Source rows
+ * are SrcPixelsPerLine bytes apart; destination rows are stored back to back, 16 bytes
+ * apart.  Each iteration loads 16 source bytes although only the low 8 are used, and
+ * the unaligned store (movdqu) means OutputPtr needs no particular alignment.
+ */
+_inline void UnpackBlock_wmt( UINT8 *SrcPtr, UINT16 *OutputPtr, UINT32 SrcPixelsPerLine )
+{
+    __asm
+    {
+        mov         edi,OutputPtr
+		mov			esi,SrcPtr
+        // ecx counts the 8 rows; eax is the destination advance per row in bytes (8 words)
+        mov         ecx, 8
+        mov         eax, 16                 ; destination pitch?
+		pxor		xmm0, xmm0              ; xmm0 = 00000000
+        // NOTE: the load starts at SrcPtr itself (p0); the p-1 numbering in the remarks below is stale
+nextrow:
+        movdqu		xmm3, [esi]             ; xmm3 = p-1..p14    
+        punpcklbw   xmm3, xmm0              ; xmm3 = p-1..p6
+        movdqu     [edi],xmm3                ; store the results in the destination
+
+        add         esi,SrcPixelsPerLine    ; next line
+        add         edi,eax; 
+
+        dec         ecx                     ; decrement count
+        jnz         nextrow                 ; next row
+    }
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlock2d
+ *  
+ *  INPUTS        :     Pointer to source data
+ *						
+ *  OUTPUTS       :     Filtered data
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Applies a 2d 4 tap filter on the input data to produce
+ *						a predictor block (UINT8)
+ *
+ *  SPECIAL NOTES :     
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void FilterBlock2d_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
+{
+	// Two pass filter: a horizontal pass into local scratch, then a vertical pass
+	// from the scratch into OutputPtr.
+	UINT8  HorizPass[256];
+
+	// The horizontal pass starts one source line above SrcPtr and is asked for 11
+	// lines (presumably 11 rows of 8 pixels - confirm against FilterBlock1d_h_wmt).
+	UINT8 *FirstSrcLine = SrcPtr - SrcPixelsPerLine;
+
+	// The vertical pass starts BLOCK_HEIGHT_WIDTH bytes into the scratch, i.e. it skips
+	// the extra leading line, and walks the scratch with a pitch of BLOCK_HEIGHT_WIDTH.
+	UINT8 *VertStart = HorizPass + BLOCK_HEIGHT_WIDTH;
+
+	FilterBlock1d_h_wmt( FirstSrcLine, HorizPass, SrcPixelsPerLine, 1, 11, 8, HFilter );
+	FilterBlock1d_v_wmt( VertStart, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter );
+}
+ 
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     FilterBlock
+ *  
+ *  INPUTS        :     ReconPtr1, ReconPtr2
+ *							Two pointers into the block of data to be filtered
+ *							These pointers bound the fractional pel position
+ *						PixelsPerLine
+ *							Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
+ *						ModX, ModY
+ *							The fractional pel bits used to select a filter.
+ *						UseBicubic
+ *							Whether to use the bicubic filter set or the bilinear set
+ *						BicubicAlpha
+ *							Selects which bicubic filter table is used (only read when UseBicubic is set)
+ *
+ *				
+ *  OUTPUTS       :     ReconRefPtr
+ *							A pointer to an 8x8 buffer into which the filtered data is written.
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Produces a filtered fractional pel prediction block
+ *						using bilinear or bicubic filters
+ *
+ *  SPECIAL NOTES :     
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
+{
+	// Produces a filtered fractional pel prediction block of 16 bit values in ReconRefPtr.
+	//
+	// ReconPtr1 and ReconPtr2 bound the fractional position; the byte distance between
+	// them selects a horizontal, vertical or 2d filter.  ModX and ModY select the filter
+	// taps; BicubicAlpha selects the bicubic table and is only read when UseBicubic is set.
+	//
+	// ReconRefPtr is left untouched when the two pointers are equal or when their
+	// distance is not one of 1, PixelsPerLine, PixelsPerLine-1 or PixelsPerLine+1.
+	int diff;
+	UINT8 Intermediate[256];		// 8 bit scratch block, written only on the bicubic path
+
+	diff = (int)(ReconPtr2-ReconPtr1);
+
+	// The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
+	// This works out to be what we want... despite the pointer swapping that goes on below.
+	// For example... if the X component of the vector is a +ve ModX = X%8.
+	//                if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
+
+	if(diff<0)
+	{											// swap pointers so ReconPtr1 smaller (above, left, above-right or above-left)
+		UINT8 *temp=ReconPtr1;
+		ReconPtr1=ReconPtr2;
+		ReconPtr2=temp;
+		diff= (int)(ReconPtr2-ReconPtr1);
+	}
+
+	// Identical pointers: there is no fractional position to filter.
+	if(!diff)
+	{
+		return;
+	}
+
+	if(UseBicubic)
+	{
+		// Bicubic: filter into the 8 bit scratch block, then widen it to 16 bit output.
+		if( diff==1 )
+		{											// Fractional pixel in horizontal only
+			FilterBlock1d_h_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
+		}
+		else if (diff == (int)(PixelsPerLine) )		// Fractional pixel in vertical only
+		{
+			FilterBlock1d_v_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
+		}
+		else if(diff == (int)(PixelsPerLine - 1))	// ReconPtr1 is Top right
+		{
+			FilterBlock2d_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
+		}
+		else if(diff == (int)(PixelsPerLine + 1) )	// ReconPtr1 is Top left
+		{
+			FilterBlock2d_wmt( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
+		}
+		else
+		{
+			// Unrecognised offset: no filter ran, so Intermediate was never written.
+			// Unpacking it would copy indeterminate stack bytes into the predictor;
+			// leave the output untouched instead, as the bilinear path does.
+			return;
+		}
+
+		UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
+	}
+	else
+	{
+		// Bilinear: the combined filter and unpack routines write the 16 bit output directly.
+		if( diff==1 )
+		{											// Fractional pixel in horizontal only
+			FilterUnpackBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 16, BilinearFilters_wmt[ModX] );
+		}
+		else if (diff == (int)(PixelsPerLine) )		// Fractional pixel in vertical only
+		{
+			FilterUnpackBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 16, BilinearFilters_wmt[ModY]);
+		}
+		else if(diff == (int)(PixelsPerLine - 1))	// ReconPtr1 is Top right
+		{
+			FilterUnpackBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
+		}
+		else if(diff == (int)(PixelsPerLine + 1) )	// ReconPtr1 is Top left
+		{
+			FilterUnpackBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
+		}
+	}
+}
+
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/mmxidct.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/mmxidct.c
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/mmxrecon.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/mmxrecon.c
@ -0,0 +1,856 @@
+/****************************************************************************
+*
+*   Module Title :     OptFunctions.c
+*
+*   Description  :     MMX or otherwise processor specific 
+*                      optimised versions of functions
+*
+*    AUTHOR      :     Paul Wilkins
+*
+*****************************************************************************
+*   Revision History
+*
+*   1.07 JBB 26/01/01  Removed unused function
+*	1.06 YWX 23/05/00  Remove the clamping in MmxReconPostProcess()
+*	1.05 YWX 15/05/00  Added MmxReconPostProcess()
+*   1.04 SJL 03/14/00  Added in Tim's versions of MmxReconInter and MmxReconInterHalfPixel2. 
+*   1.03 PGW 12/10/99  Changes to reduce unnecessary dependencies. 
+*   1.02 PGW 30/08/99  Minor changes to MmxReconInterHalfPixel2().
+*   1.01 PGW 13/07/99  Changes to keep reconstruction data to 16 bit
+*   1.00 PGW 14/06/99  Configuration baseline
+*
+*****************************************************************************
+*/
+
+/* 
+    Use Tim's optimized version.
+*/
+#define USING_TIMS 1
+
+/****************************************************************************
+*  Header Files
+*****************************************************************************
+*/
+
+#define STRICT              // Strict type checking. 
+
+#include "codec_common.h"
+
+#include "reconstruct.h"
+
+/****************************************************************************
+*  Module constants.
+*****************************************************************************
+*/        
+
+/****************************************************************************
+*  Imports.
+*****************************************************************************
+*/   
+
+extern INT32 * XX_LUT;
+
+/****************************************************************************
+*  Exported Global Variables
+*****************************************************************************
+*/
+
+/****************************************************************************
+*  Exported Functions 
+*****************************************************************************
+*/              
+
+/****************************************************************************
+*  Module Statics
+*****************************************************************************
+*/  
+
+INT16 Ones[4]               = {1,1,1,1};				// Four 16 bit ones; not referenced in the visible part of this file
+INT16 OneTwoEight[4]        = {128,128,128,128};		// Four 16 bit 128s; not referenced in the visible part of this file
+UINT8 Eight128s[8]          = {128,128,128,128,128,128,128,128};	// 0x80 in every byte; MMXReconIntra loads it into mm0 and XORs it in as a +128 bias
+
+#pragma warning( disable : 4799 )  // Disable no emms instruction warning!
+/****************************************************************************
+*  Forward References
+*****************************************************************************
+*/  
+/****************************************************************************
+ * 
+ *  ROUTINE       :     MMXReconIntra
+ *
+ *  INPUTS        :     INT16 *  idct
+ *                               Pointer to the output from the idct for this block
+ *
+ *                      UINT32   stride
+ *                               Line Length in pixels in recon and reference images
+ *                               
+ *
+ *                     
+ *
+ *  OUTPUTS       :     UINT8 *  dest
+ *                               The reconstruction buffer
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs an intra block - MMX version
+ *
+ *  SPECIAL NOTES :     Tim Murphy's optimized version 
+ *
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
+{	/*
+	 * Converts 64 16 bit idct values into eight rows of eight bytes in dest.
+	 *
+	 *   TmpDataBuffer  not used (cast to void below)
+	 *   dest           first byte of the output; advanced by stride bytes per row
+	 *   idct           128 bytes of input, read 16 bytes (8 values) per row and
+	 *                  treated as signed 16 bit values by packsswb
+	 *   stride         byte distance between output rows
+	 *
+	 * Per row: packsswb saturates the eight input words to signed bytes, and the
+	 * XOR with 0x80 in every byte (Eight128s) is the same as adding 128, so the
+	 * bytes stored to dest are unsigned.
+	 * The loop ends when the input cursor reaches idct + 128.  The jc at the bottom
+	 * tests the carry produced by the cmp a few lines earlier; the movq and lea in
+	 * between do not change the flags.
+	 * No emms is executed here, so the MMX state is left active on return.
+	 */
+	(void) TmpDataBuffer;
+    __asm
+    {
+        // u    pipe
+        //   v  pipe
+        mov         eax,[idct]              ; Signed 16 bit inputs
+          mov         edx,[dest]            ; Signed 8 bit outputs
+        movq        mm0,[Eight128s]         ; Set mm0 to 0x8080808080808080
+          ;
+        mov         ebx,[stride]            ; Line stride in output buffer
+          lea         ecx,[eax+128]         ; Endpoint in input buffer
+loop_label:                                 ;
+        movq        mm2,[eax]               ; First four input values
+          ;
+        packsswb    mm2,[eax+8]             ; pack with next(high) four values
+          por         mm0,mm0               ; stall
+        pxor        mm2,mm0                 ; Convert result to unsigned (same as add 128)
+          lea         eax,[eax + 16]        ; Step source buffer
+        cmp         eax,ecx                 ; are we done
+          ;
+        movq        [edx],mm2               ; store results
+          ;
+        lea         edx,[edx+ebx]           ; Step output buffer
+          jc          loop_label            ; Loop back if we are not done
+    }
+    // 6c/8 elts = 9c/8 = 1.125 c/pix
+
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     MmxReconInter
+ *
+ *  INPUTS        :     UINT8 *  RefPtr
+ *                               The last frame reference
+ *
+ *                      INT16 *  ChangePtr
+ *                               Pointer to the change data
+ *
+ *                      UINT32   LineStep
+ *                               Line Length in pixels in recon and ref images
+ *
+ *  OUTPUTS       :     UINT8 *  ReconPtr
+ *                               The reconstruction
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs data from last data and change
+ *
+ *  SPECIAL NOTES :     
+ *
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+#if USING_TIMS
+void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
+{	/*
+	 * Reconstructs eight rows of eight pixels as reference + change (looped version).
+	 *
+	 *   TmpDataBuffer  not used (cast to void below)
+	 *   ReconPtr       output; 8 bytes stored per row, advanced by LineStep per row
+	 *   RefPtr         reference pixels; 8 bytes read per row, advanced by LineStep per row
+	 *   ChangePtr      128 bytes of 16 bit change data, read 16 bytes per row
+	 *   LineStep       byte distance between rows in both ReconPtr and RefPtr
+	 *
+	 * Per row: the 8 reference bytes are zero extended to words (unpack against the
+	 * zeroed mm0), the changes are added with signed saturation (paddsw), and
+	 * packuswb saturates the sums to unsigned bytes before the store.
+	 * The loop ends when the change cursor reaches ChangePtr + 128; the final jc
+	 * tests the carry from the earlier cmp (movq and lea leave the flags alone).
+	 * edi is saved and restored explicitly.  No emms is executed here.
+	 */
+    (void) TmpDataBuffer;
+
+ _asm {
+	push	edi					// edi holds the end of the change data below
+;;	 mov	ebx, [ref]
+;;	mov		ecx, [diff]
+;;	 mov	eax, [dest]
+;;	mov		edx, [stride]
+	 mov	ebx, [RefPtr]		// ebx = reference cursor
+	mov		ecx, [ChangePtr]	// ecx = change data cursor
+	 mov	eax, [ReconPtr]		// eax = output cursor
+	mov		edx, [LineStep]		// edx = row pitch for ebx and eax
+	 pxor	mm0, mm0			// mm0 = 0, used to zero extend bytes to words
+	lea		edi, [ecx + 128]	// edi = end of the 8 rows of change data
+	 ;
+  L:
+	movq	mm2, [ebx]			; (+3 misaligned) 8 reference pixels
+	 ;
+	movq	mm4, [ecx]			; first 4 changes
+	 movq	mm3, mm2
+	movq	mm5, [ecx + 8]		; last 4 changes
+	 punpcklbw mm2, mm0			; turn first 4 refs into positive 16-bit #s
+	paddsw	mm2, mm4			; add in first 4 changes
+	 punpckhbw mm3, mm0			; turn last 4 refs into positive 16-bit #s
+	paddsw	mm3, mm5			; add in last 4 changes
+	 add	ebx, edx			; next row of reference pixels
+	packuswb mm2, mm3			; pack result to unsigned 8-bit values
+	 lea	ecx, [ecx + 16]		; next row of changes
+	cmp		ecx, edi			; are we done?
+	 ;
+	movq	[eax], mm2			; store result
+	 ;
+	lea		eax, [eax+edx]		; next row of output
+	 jc		L					; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
+
+	pop		edi
+ }
+}
+#else
+void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
+{	/*
+	 * Fully unrolled variant of MmxReconInter, compiled only when USING_TIMS is 0.
+	 * Reconstructs eight rows of eight pixels as reference + change.
+	 *
+	 *   TmpDataBuffer  not referenced in this variant
+	 *   ReconPtr       output; 8 bytes stored per row, advanced by LineStep per row
+	 *   RefPtr         reference pixels; 8 bytes read per row, advanced by LineStep per row
+	 *   ChangePtr      16 bit change data, read 16 bytes per row
+	 *   LineStep       byte distance between rows in both ReconPtr and RefPtr
+	 *
+	 * Every row repeats the same sequence: zero extend the 8 reference bytes to
+	 * words (unpack against the zeroed mm6), add the changes with signed
+	 * saturation (paddsw), saturate to unsigned bytes (packuswb) and store.
+	 * The pointers are stepped after rows 1 to 7 only.  The emms at the end is
+	 * commented out, so the MMX state is left active on return.
+	 */
+
+    // Note that the line step for the change data is 16 bytes (8 * 16 bit values).
+__asm
+    {
+        // Set up data pointers
+        mov         eax,dword ptr [ReconPtr]  
+        mov         ebx,dword ptr [RefPtr]      
+        mov         ecx,dword ptr [ChangePtr]   
+		mov         edx,dword ptr [LineStep]
+		pxor        mm6, mm6					; Blank mmx6
+
+        // Row 1
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 2
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 3
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 4
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 5
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 6
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 7
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+
+		add         ebx,edx						; Step the reference pointer.
+        add         ecx,16                      ; Step the change pointer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        // Row 8
+        // Load the reference pixels and unpack them to 16 bit words
+        movq        mm0,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data
+        paddsw      mm0, mm2                    ; First 4 values
+        paddsw      mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [eax],mm0         ; Write the data out to the results buffer
+   
+        //emms									; Clear the MMX state.
+    }
+}
+#endif
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     MmxReconInterHalfPixel2
+ *
+ *  INPUTS        :     UINT8 *  RefPtr1, RefPtr2
+ *                               The last frame reference
+ *
+ *                      INT16 *  ChangePtr
+ *                               Pointer to the change data
+ *
+ *                      UINT32   LineStep
+ *                               Line Length in pixels in recon and ref images
+ *                               
+ *
+ *  OUTPUTS       :     UINT8 *  ReconPtr
+ *                               The reconstruction
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs data from half pixel reference data and change. 
+ *                      Half pixel data interpolated from 2 references.
+ *
+ *  SPECIAL NOTES :     
+ *
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+#if USING_TIMS
+
+#define A 0
+
+void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, 
+		    	              UINT8 * RefPtr1, UINT8 * RefPtr2, 
+						      INT16 * ChangePtr, UINT32 LineStep )
+{	/*
+	 * Reconstructs eight rows of eight pixels from the average of two references
+	 * plus change data (looped version).
+	 *
+	 *   TmpDataBuffer  not used (cast to void below)
+	 *   ReconPtr       output; 8 bytes stored per row, advanced by LineStep per row
+	 *   RefPtr1/2      the two reference rows; 8 bytes read from each per row,
+	 *                  both advanced by LineStep per row
+	 *   ChangePtr      128 bytes of 16 bit change data, read 16 bytes per row
+	 *   LineStep       byte distance between rows in ReconPtr, RefPtr1 and RefPtr2
+	 *
+	 * Per row: both references are zero extended to words, summed and shifted right
+	 * by one (the +1 rounding term sits under the A macro, which is defined as 0
+	 * just above this function, so the average truncates).  The changes are then
+	 * added with paddw, which wraps rather than saturates, and packuswb saturates
+	 * the result to unsigned bytes before the store.
+	 * The loop ends when the change cursor reaches ChangePtr + 128; the final jc
+	 * tests the carry from the earlier cmp (movq and lea leave the flags alone).
+	 * esi and edi are saved and restored explicitly.  No emms is executed here.
+	 */
+#	if A
+		static culong FourOnes[2] = { 65537, 65537};	// only read once
+#	endif
+	(void) TmpDataBuffer;
+
+ _asm {
+	push	esi
+	 push	edi
+
+;;	mov		ecx, [diff]
+;;	 mov	esi, [ref1]
+;;	mov		edi, [ref2]
+;;	 mov	ebx, [dest]
+;;	mov		edx, [stride]
+
+	mov		ecx, [ChangePtr]	// ecx = change data cursor
+	 mov	esi, [RefPtr1]		// esi = first reference cursor
+	mov		edi, [RefPtr2]		// edi = second reference cursor
+	 mov	ebx, [ReconPtr]		// ebx = output cursor
+	mov		edx, [LineStep]		// edx = row pitch for esi, edi and ebx
+
+	 lea	eax, [ecx+128]		// eax = end of the 8 rows of change data
+
+#	if A
+		movq	mm1, [FourOnes]
+#	endif
+
+	 pxor	mm0, mm0			// mm0 = 0, used to zero extend bytes to words
+  L:
+	movq	mm2, [esi]		; (+3 misaligned) mm2 = row from ref1
+	 ;
+	movq	mm4, [edi]		; (+3 misaligned) mm4 = row from ref2
+	 movq	mm3, mm2
+	punpcklbw mm2, mm0		; mm2 = start ref1 as positive 16-bit #s
+	 movq	mm5, mm4
+	movq	mm6, [ecx]		; mm6 = first 4 changes
+	 punpckhbw mm3, mm0		; mm3 = end ref1 as positive 16-bit #s
+	movq	mm7, [ecx+8]	; mm7 = last 4 changes
+	 punpcklbw mm4, mm0		; mm4 = start ref2 as positive 16-bit #s
+	punpckhbw mm5, mm0		; mm5 = end ref2 as positive 16-bit #s
+	 paddw	mm2, mm4		; mm2 = start (ref1 + ref2)
+	paddw	mm3, mm5		; mm3 = end (ref1 + ref2)
+
+#	if A
+		 paddw	mm2, mm1		; rounding adjustment
+		paddw	mm3, mm1
+#	endif
+
+	 psrlw	mm2, 1			; mm2 = start (ref1 + ref2)/2
+	psrlw	mm3, 1			; mm3 = end (ref1 + ref2)/2
+	 paddw	mm2, mm6		; add changes to start
+	paddw	mm3, mm7		; add changes to end
+	 lea	ecx, [ecx+16]	; next row idct
+	packuswb mm2, mm3		; pack start|end to unsigned 8-bit
+	 add	esi, edx		; next row ref1
+	add		edi, edx		; next row ref2
+	 cmp	ecx, eax
+	movq	[ebx], mm2		; store result
+	 ;
+	lea		ebx, [ebx+edx]
+	 jc		L				; 22c / 8 elts = 33c / 8 pixels = 4.125 c/pix
+
+	pop		edi
+	 pop	esi
+ }
+}
+
+#undef A
+
+#else
+void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, 
+		    	              UINT8 * RefPtr1, UINT8 * RefPtr2, 
+						      INT16 * ChangePtr, UINT32 LineStep )
+{
+    UINT8 * TmpDataPtr = (UINT8 *)TmpDataBuffer->TmpReconBuffer;
+
+    // Note that the line step for the change data is assumed to be 8 * 32 bits.
+    __asm
+    {
+		pxor        mm6, mm6					; Blank mmx6
+
+        // Set up data pointers
+        mov         eax,dword ptr [RefPtr1]      
+        mov         ebx,dword ptr [RefPtr2]      
+        mov         edx,dword ptr [LineStep]
+
+        // Row 1
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+
+        // Load the data values (Ref1 and Ref2) and unpack to signed 16 bit values
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+        punpcklbw   mm0, mm6					; Low bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]  ; Load the temp results pointer 
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx],mm0         ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 2
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,16                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm3, mm2                    ; Copy data
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]  ; Load the temp results pointer 
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+8],mm0       ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 3
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,32                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+16],mm0         ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 4
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,48                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+24],mm0      ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 5
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,64                 
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+32],mm0      ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 6
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,80                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+40],mm0      ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 7
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,96                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+48],mm0      ; Write the data out to the temporary results buffer
+        add         eax,edx                     ; Step the reference pointers
+        add         ebx,edx                    
+
+        // Row 8
+        // Load the change pointer
+        mov         ecx,dword ptr [ChangePtr]   
+        add         ecx,112                    
+
+        // Load the data values (Ref1 and Ref2). 
+        movq        mm0,dword ptr [eax]         ; Load 8 elements of source data
+        movq        mm2,dword ptr [ebx]         ; Load 8 elements of source data
+        movq        mm1, mm0                    ; Copy data
+        movq        mm3, mm2                    ; Copy data
+
+		punpcklbw   mm0, mm6					; Low bytes to words
+		punpckhbw   mm1, mm6					; High bytes to words
+		punpcklbw   mm2, mm6					; Low bytes to words
+		punpckhbw   mm3, mm6					; High bytes to words
+
+        // Average Ref1 and Ref2
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm3                    ; Second 4 values
+        psrlw       mm0, 1
+        psrlw       mm1, 1
+
+        // Load 8 elements of 16 bit change data
+        movq        mm2,dword ptr [ecx]         ; Load 4 elements of change data
+        movq        mm4,dword ptr [ecx+8]       ; Load next 4 elements of change data
+
+        // Sum the data reference and difference data
+        paddw       mm0, mm2                    ; First 4 values
+        paddw       mm1, mm4                    ; Second 4 values
+
+        // Pack and store
+        mov         ecx,dword ptr [TmpDataPtr]   
+        packuswb    mm0, mm1                    ; Then pack and saturate to unsigned bytes
+        movq        dword ptr [ecx+56],mm0      ; Write the data out to the temporary results buffer
+
+
+        // Now copy the results back to the reconstruction buffer.
+        mov         eax,dword ptr [ReconPtr]    ; Load the reconstruction Pointer  
+        mov         ecx,dword ptr [TmpDataPtr]  ; Load the temp results pointer 
+        // Row 1
+        movq        mm0,dword ptr [ecx]         ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 2
+        movq        mm0,dword ptr [ecx+8]       ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 3
+        movq        mm0,dword ptr [ecx+16]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 4
+        movq        mm0,dword ptr [ecx+24]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 5
+        movq        mm0,dword ptr [ecx+32]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 6
+        movq        mm0,dword ptr [ecx+40]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 7
+        movq        mm0,dword ptr [ecx+48]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+        // Row 8
+        movq        mm0,dword ptr [ecx+56]      ; Load 8 elements of results data
+        movq        dword ptr [eax],mm0         ; Write the data tot he reconstruction buffer.
+        add         eax,edx                     ; Step the reconstruction pointer
+
+        //emms
+    }
+}
+#endif
+
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/uoptsystemdependant.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/uoptsystemdependant.c
@ -0,0 +1,351 @@
+/****************************************************************************
+*
+*   Module Title :     SystemDependant.c
+*
+*   Description  :     Miscellaneous system dependant functions
+*
+*    AUTHOR      :     Paul Wilkins
+*
+*****************************************************************************
+*   Revision History
+* 
+*   1.20 YWX 06-Nov-02 Added forward DCT function optimized for Pentium 4
+*   1.19 YWX 15-Jun-01 added function pointer setups for new deblocking filter
+*   1.18 YWX 26-Apr-01 Fixed the cpu frequency detection bug caused by Sleep()
+*   1.17 JBX 22-Mar-01 Merged with new vp4-mapca bitstream
+*   1.16 JBB 26-Jan-01 Cleaned out unused function
+*   1.15 YWX 08-dec-00 Added WMT PostProcessor and 
+*                        moved function declarations into _head files
+*   1.14 JBB 30 NOV 00 Version number changes 
+*   1.13 YWX 03-Nov-00 Optimized postprocessor filters
+*   1.12 YWX 02-Nov-00 Added new loopfilter function pointers
+*   1.11 YWX 19-Oct-00 Added 1-2 Scaling functions pointers
+*   1.10 jbb 16 oct 00 added ifdefs to insure version code
+*   1.09 YWX 04-Oct-00 Added function pointers for scaling 
+*   1.08 YWX 06 Sep 00 Added function pointers for new deringing filter 
+*                      using frag based Q Value.
+*   1.07 JBB 21 Aug 00 New More Blurry in high variance area deringer
+*	1.06 YWX 2  Aug 00 Added function pointers for postprocess  
+*	1.05 YWX 15/05/00  Added functions to check processor frequency
+*					   and more function pointers for postprocessor
+*	1.04 YWX 08/05/00  Added function pointers setup for postprocess
+*   1.03 SJL 20/04/00  Added ability to enable the new dequant code.
+*   1.02 SJL 22/03/00  Function pointers for the loop filter.
+*   1.01 JBB 21/03/00  More Function Pointers for optimized playback
+*   1.00 PGW 12/10/99  Configuration baseline
+*
+*****************************************************************************
+*/
+
+/****************************************************************************
+*  Header Files
+*****************************************************************************
+*/
+#include "codec_common.h"
+#include "vputil_if.h"
+#include "cpuidlib.h"
+
+// Global debugging aids.
+//
+// fastIDCTDisabled : when non-zero, UtilMachineSpecificConfig() installs the full
+//                    IDCT routine in every entry of the idct[]/idctc[] tables
+//                    instead of the *_idct1 / *_idct10 variants it otherwise uses
+//                    for entries 0..10.
+int fastIDCTDisabled = 0;
+// forceCPUID       : when non-zero, GetProcessorFlags() replaces the result of
+//                    findCPUId() with the value of CPUID (below).
+int forceCPUID = 0;
+// CPUID            : CPU type substituted when forceCPUID is set.
+int CPUID = 0;
+
+
+extern void GetProcessorFlags(INT32 *MmxEnabled, INT32 *XmmEnabled, INT32 *WmtEnabled);
+
+// Scalar (no mmx) reconstruction functions
+extern void ClearSysState_C(void);
+extern void IDctSlow(  INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void IDct10(  INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void IDct1(  INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void ScalarReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void ScalarReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void ScalarReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+extern void ReconBlock_C(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
+extern void SubtractBlock_C( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
+extern void UnpackBlock_C( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
+extern void AverageBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
+extern void CopyBlock_C(unsigned char *src, unsigned char *dest, unsigned int srcstride);
+extern void Copy12x12_C(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
+extern void fdct_short_C ( INT16 * InputData, INT16 * OutputData );
+extern void FilterBlockBil_8_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
+extern void FilterBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
+
+// MMx versions
+extern void fdct_MMX ( INT16 * InputData, INT16 * OutputData );
+extern void ClearMmx(void);
+extern void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+extern void MMX_idct(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void MMX_idct10(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void MMX_idct1(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void MMX_idct_DX(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void MMX_idct10_DX(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void ReconBlock_MMX(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
+extern void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
+extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
+extern void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
+extern void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride);
+extern void Copy12x12_MMX(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
+extern void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
+extern void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
+
+// WMT versions
+extern void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
+extern void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
+extern void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
+extern void Wmt_idct1(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void Wmt_IDct_Dx( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void Wmt_IDct10_Dx(  Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
+extern void fdct_WMT(short *InputData, short *OutputData);
+extern void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
+extern void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
+
+
+// Value fillidctconstants() writes to the last four words (44..47) of idctconstants.
+#define IdctAdjustBeforeShift 8
+// Constant table populated by fillidctconstants(): (4+7+1) rows of 4 words each --
+// 4 rows holding one 65535 word apiece (words 0, 5, 10, 15), 7 rows holding a
+// replicated idctcosTbl entry, and 1 row of IdctAdjustBeforeShift.
+// The storage itself is defined outside this file.
+extern UINT16 idctconstants[(4+7+1) * 4];
+// Seven source values replicated into idctconstants by fillidctconstants()
+// (presumably the IDCT cosine coefficients, going by the name -- defined elsewhere).
+extern UINT16 idctcosTbl[ 7];
+
+/*
+ * Populate the idctconstants table, viewed as 12 rows of 4 UINT16 words:
+ *
+ *   rows 0..3   (words  0..15) : all zero except words 0, 5, 10 and 15,
+ *                                which are set to 65535
+ *   rows 4..10  (words 16..43) : idctcosTbl[0..6], each entry replicated
+ *                                across the four words of its row
+ *   row  11     (words 44..47) : IdctAdjustBeforeShift
+ *
+ * Takes no arguments and returns nothing; only idctconstants is written.
+ */
+void fillidctconstants(void)
+{
+	int word;
+	int tap;
+
+	// Rows 0..3: clear the first 16 words ...
+	for (word = 0; word < 16; word++)
+	{
+		idctconstants[word] = 0;
+	}
+
+	// ... then set every fifth word (0, 5, 10, 15), i.e. one word per row.
+	for (word = 0; word < 16; word += 5)
+	{
+		idctconstants[word] = 65535;
+	}
+
+	// Rows 4..10: one row per idctcosTbl entry, value replicated four times.
+	for (tap = 0; tap < 7; tap++)
+	{
+		UINT16 *row = &idctconstants[(tap + 4) * 4];
+		const UINT16 value = idctcosTbl[tap];
+
+		row[3] = value;
+		row[2] = value;
+		row[1] = value;
+		row[0] = value;
+	}
+
+	// Row 11: the adjust constant, replicated four times.
+	for (word = 44; word < 48; word++)
+	{
+		idctconstants[word] = IdctAdjustBeforeShift;
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     GetProcessorFlags
+ *
+ *  INPUTS        :     None (reads the forceCPUID / CPUID debugging globals)
+ *
+ *  OUTPUTS       :     INT32 *MmxEnabled : TRUE for PII, AMDK63D, AMDK6, PMMX,
+ *                                          XMM and WMT CPU types
+ *                      INT32 *XmmEnabled : TRUE for XMM and WMT CPU types
+ *                      INT32 *WmtEnabled : TRUE for the WMT CPU type only
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Maps the CPU type reported by findCPUId() (or the
+ *                      forced CPUID value when forceCPUID is non-zero) onto
+ *                      three capability flags.
+ *
+ *  SPECIAL NOTES :     All three outputs are always written. Any CPU type
+ *                      that is not explicitly listed as MMX capable or
+ *                      better reports FALSE for every flag, so the caller
+ *                      never reads an indeterminate value.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void GetProcessorFlags
+( 
+ INT32 *MmxEnabled,
+ INT32 *XmmEnabled,
+ INT32 *WmtEnabled
+)
+{
+	PROCTYPE CPUType = findCPUId();
+
+	// Debugging override: use the CPU type stored in the CPUID global instead.
+	if(forceCPUID)
+		CPUType = (PROCTYPE)CPUID;
+
+	// Start from "no SIMD support" so that every path, including CPU types
+	// this switch does not know about, leaves the outputs defined.
+	*MmxEnabled = FALSE;
+	*XmmEnabled = FALSE;
+	*WmtEnabled = FALSE;
+
+	switch(CPUType)
+	{
+	case PII	:   
+	case AMDK63D:
+	case AMDK6  :
+	case PMMX	:   
+		*MmxEnabled = TRUE;
+		break;
+	case XMM    :
+		*MmxEnabled = TRUE;
+		*XmmEnabled = TRUE;
+		break;
+	case WMT	:
+		*MmxEnabled = TRUE;
+		*XmmEnabled = TRUE;
+		*WmtEnabled = TRUE;
+		break;
+	case X86    :
+	case PPRO   :
+	case C6X86  :
+	case C6X86MX:
+	case AMDK5  :
+	case MACG3	:
+	case MAC68K	:
+	default     :
+		// No SIMD extensions: keep the FALSE defaults set above.
+		break;
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     UtilMachineSpecificConfig
+ *
+ *  INPUTS        :     None (reads the fastIDCTDisabled debugging global)
+ *
+ *  OUTPUTS       :     None
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Queries the processor capability flags and installs
+ *                      the matching set of routines in the idct[] / idctc[]
+ *                      tables and in the fdct_short, Recon*, ClearSysState,
+ *                      AverageBlock, UnpackBlock, ReconBlock, SubtractBlock,
+ *                      CopyBlock, Copy12x12 and FilterBlock* function
+ *                      pointers.
+ *
+ *  SPECIAL NOTES :     Entries 0..64 of idct[] and idctc[] are written, so
+ *                      both tables must hold at least 65 entries (they are
+ *                      declared outside this file -- confirm).
+ *                      Entries 0..1 get the "1" variant, 2..10 the "10"
+ *                      variant and 11..64 the full transform; presumably the
+ *                      index is a coefficient count -- confirm with callers.
+ *                      When fastIDCTDisabled is non-zero every entry gets
+ *                      the full transform.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void UtilMachineSpecificConfig
+(
+  void
+)
+{
+	UINT32 i;
+	// Defaulted so the branch selection below is well defined even if
+	// GetProcessorFlags() were to leave a flag unwritten.
+	INT32 MmxEnabled = FALSE;
+	INT32 XmmEnabled = FALSE;
+	INT32 WmtEnabled = FALSE;
+
+	GetProcessorFlags( &MmxEnabled,&XmmEnabled,&WmtEnabled);
+
+	if(WmtEnabled)		//Willamette
+	{
+		for(i=0;i<=64;i++)
+		{
+			if(fastIDCTDisabled || i>10)
+			{
+				idct[i]=Wmt_IDct_Dx;
+				idctc[i]=MMX_idct;
+			}
+			else if(i<=1)
+			{
+				idct[i]=Wmt_idct1;
+				idctc[i]=Wmt_idct1;
+			}
+			else
+			{
+				idct[i]=Wmt_IDct10_Dx;
+				idctc[i]=MMX_idct10;
+			}
+		}
+		fdct_short=fdct_WMT;
+
+		ReconIntra = WmtReconIntra;
+		ReconInter = WmtReconInter;
+		ReconInterHalfPixel2 = WmtReconInterHalfPixel2;
+		ClearSysState = ClearMmx;
+		AverageBlock = AverageBlock_MMX;
+		UnpackBlock = UnpackBlock_MMX;
+		ReconBlock = ReconBlock_MMX;
+		SubtractBlock = SubtractBlock_MMX;
+		CopyBlock = CopyBlockMMX;
+		Copy12x12 = Copy12x12_MMX;
+		FilterBlockBil_8 = FilterBlockBil_8_wmt;
+		FilterBlock=FilterBlock_wmt;
+	}
+	else if ( MmxEnabled )
+	{
+		for(i=0;i<=64;i++)
+		{
+			if(fastIDCTDisabled || i>10)
+			{
+				idct[i]=MMX_idct_DX;
+				// idctc[] takes the non-DX full transform: this is the routine
+				// its regular 11..64 entries use and the one the Willamette
+				// path installs for the same override.
+				idctc[i]=MMX_idct;
+			}
+			else if(i<=1)
+			{
+				idct[i]=MMX_idct1;
+				idctc[i]=MMX_idct1;
+			}
+			else
+			{
+				idct[i]=MMX_idct10_DX;
+				idctc[i]=MMX_idct10;
+			}
+		}
+		fdct_short=fdct_MMX;
+
+		ReconIntra = MMXReconIntra;
+		ReconInter = MmxReconInter;
+		ReconInterHalfPixel2 = MmxReconInterHalfPixel2;
+		ClearSysState = ClearMmx;
+		AverageBlock = AverageBlock_MMX;
+		UnpackBlock = UnpackBlock_MMX;
+		ReconBlock = ReconBlock_MMX;
+		SubtractBlock = SubtractBlock_MMX;
+		CopyBlock = CopyBlockMMX;
+		Copy12x12 = Copy12x12_MMX;
+		FilterBlockBil_8 = FilterBlockBil_8_mmx;
+		FilterBlock=FilterBlock_mmx;
+	}
+	else
+	{
+		for(i=0;i<=64;i++)
+		{
+			if(fastIDCTDisabled || i>10)
+			{
+				idct[i]=IDctSlow;
+				idctc[i]=IDctSlow;
+			}
+			else if(i<=1)
+			{
+				idct[i]=IDct1;
+				idctc[i]=IDct1;
+			}
+			else
+			{
+				idct[i]=IDct10;
+				idctc[i]=IDct10;
+			}
+		}
+		fdct_short=fdct_short_C ;
+
+		ClearSysState = ClearSysState_C;
+		ReconIntra = ScalarReconIntra;
+		ReconInter = ScalarReconInter;
+		ReconInterHalfPixel2 = ScalarReconInterHalfPixel2;
+		AverageBlock = AverageBlock_C;
+		UnpackBlock = UnpackBlock_C;
+		ReconBlock = ReconBlock_C;
+		SubtractBlock = SubtractBlock_C;
+		CopyBlock = CopyBlock_C;
+		// NOTE(review): the scalar path also installs Copy12x12_MMX. Its
+		// implementation in win32/vputilasm.c is plain C with no MMX
+		// instructions, so this is usable without MMX; Copy12x12_C is
+		// declared above but not used here -- confirm which is intended.
+		Copy12x12 = Copy12x12_MMX;
+		FilterBlockBil_8 = FilterBlockBil_8_C;
+		FilterBlock=FilterBlock_C;
+	}
+}
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/vputilasm.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/vputilasm.c
@ -0,0 +1,507 @@
+/****************************************************************************
+ *
+ *   Module Title :     newLoopTest_asm.c 
+ *
+ *   Description  :     Codec specific functions
+ *
+ *   AUTHOR       :     Yaowu Xu
+ *
+ *****************************************************************************
+ *   Revision History
+ *
+ *   1.02 YWX 03-Nov-00 Changed confusing variable name
+ *   1.01 YWX 02-Nov-00 Added the set of functions
+ *   1.00 YWX 19-Oct-00 configuration baseline
+ *****************************************************************************
+ */ 
+
+/****************************************************************************
+ *  Header Files
+ *****************************************************************************
+ */
+
+
+#define STRICT              /* Strict type checking. */
+#include "codec_common.h"
+#include <math.h>
+
+ /****************************************************************************
+ *  Module constants.
+ *****************************************************************************
+ */        
+
+#define MIN(a, b)  (((a) < (b)) ? (a) : (b))
+
+
+/****************************************************************************
+ *  Explicit Imports
+ *****************************************************************************
+ */ 
+extern void SatUnsigned8( UINT8 * ResultPtr, INT16 * DataBlock, 
+                         UINT32 ResultLineStep, UINT32 DataLineStep );
+
+/****************************************************************************
+ *  Exported Global Variables
+ *****************************************************************************
+ */
+
+/****************************************************************************
+ *  Exported Functions
+ *****************************************************************************
+ */              
+
+/****************************************************************************
+ *  Module Statics
+ *****************************************************************************
+ */              
+
+/****************************************************************************
+ *  Forward References
+ *****************************************************************************
+ */       
+
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     ClearMmx()
+ *
+ *
+ *  INPUTS        :     None
+ *
+ *  OUTPUTS       :     None
+ *
+ *  RETURNS       :     None
+ * 
+ *
+ *  FUNCTION      :     Clears down the MMX state by executing a single EMMS
+ *                      instruction.
+ *
+ *  SPECIAL NOTES :     The MMX routines in this file (CopyBlockMMX,
+ *                      AverageBlock_MMX, UnpackBlock_MMX) do not execute
+ *                      EMMS themselves.
+ *
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void ClearMmx(void)
+{
+	__asm
+	{
+		emms									; Clear the MMX state.
+	}
+}
+       
+/****************************************************************************
+ * 
+ *  ROUTINE       :     CopyBlockMMX
+ *
+ *  INPUTS        :     unsigned char *src      : first row of the source block
+ *                      unsigned int srcstride  : byte step between rows; applied
+ *                                                to BOTH src and dest
+ *
+ *  OUTPUTS       :     unsigned char *dest     : first row of the destination block
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Copies 8 rows of 8 bytes from source to destination
+ *                      using 64-bit MMX moves.
+ *
+ *  SPECIAL NOTES :     The rows are copied in two groups of four; each group
+ *                      is fully loaded before it is stored.
+ *                      EMMS is not executed here. 
+ *
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride)
+{
+	// Local copies of the arguments, referenced by name from the asm block.
+	unsigned char *s = src;
+	unsigned char *d = dest;
+	unsigned int stride = srcstride;
+	// recon copy: ecx = stride, eax = source row, ebx = destination row
+	_asm
+	{
+			mov		ecx, [stride]
+			mov		eax, [s]
+			mov		ebx, [d]
+			// edx = 3 * stride, so rows 0..3 are reachable without moving the base
+			lea		edx, [ecx + ecx * 2]
+
+			// Load rows 0..3
+			movq	mm0, [eax]
+			movq	mm1, [eax + ecx]
+			movq	mm2, [eax + ecx*2]
+			movq	mm3, [eax + edx]
+
+			// Advance the source pointer to row 4
+			lea		eax, [eax + ecx*4]
+
+			// Store rows 0..3
+			movq	[ebx], mm0
+			movq	[ebx + ecx], mm1
+			movq	[ebx + ecx*2], mm2
+			movq	[ebx + edx], mm3
+
+			// Advance the destination pointer to row 4
+			lea		ebx, [ebx + ecx * 4]
+
+			// Load rows 4..7
+			movq	mm0, [eax]
+			movq	mm1, [eax + ecx]
+			movq	mm2, [eax + ecx*2]
+			movq	mm3, [eax + edx]
+
+			// Store rows 4..7
+			movq	[ebx], mm0
+			movq	[ebx + ecx], mm1
+			movq	[ebx + ecx*2], mm2
+			movq	[ebx + edx], mm3
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     Copy12x12_MMX
+ *
+ *  INPUTS        :     const unsigned char *src : first row of the source block
+ *                      unsigned int srcstride   : byte step between source rows
+ *                      unsigned int deststride  : byte step between destination rows
+ *
+ *  OUTPUTS       :     unsigned char *dest      : first row of the destination block
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Copies a block of 12 rows by 12 bytes from source to
+ *                      destination, three 32-bit words per row.
+ *
+ *  SPECIAL NOTES :     Despite the name this routine is plain C and uses no
+ *                      MMX instructions. Each row is accessed through UINT32
+ *                      pointers, so src and dest are read and written as
+ *                      32-bit words at whatever alignment the caller supplies.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void Copy12x12_MMX(
+    const unsigned char *src, 
+    unsigned char *dest, 
+    unsigned int srcstride,
+    unsigned int deststride)
+{
+	int row;
+
+	for (row = 0; row < 12; row++)
+	{
+		// View the current row of each buffer as three 32-bit words.
+		const UINT32 *in = (const UINT32 *)src;
+		UINT32 *out = (UINT32 *)dest;
+
+		out[0] = in[0];
+		out[1] = in[1];
+		out[2] = in[2];
+
+		// Step both pointers to the next row, each with its own stride.
+		src += srcstride;
+		dest += deststride;
+	}
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     AverageBlock_MMX
+ *  
+ *  INPUTS        :     UINT8 *ReconPtr1          : first block of 8-bit pixels
+ *                      UINT8 *ReconPtr2          : second block of 8-bit pixels
+ *                      UINT32 ReconPixelsPerLine : byte step between rows, applied
+ *                                                  to both input blocks
+ *						
+ *  OUTPUTS       :     UINT16 *ReconRefPtr       : block with the average values,
+ *                                                  one 16-bit word per pixel,
+ *                                                  rows stored 16 bytes apart
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Do pixel averages on two reference blocks:
+ *                      out = (p1 + p2) >> 1 for 8 pixels in each of
+ *                      BLOCK_HEIGHT_WIDTH rows.
+ *
+ *  SPECIAL NOTES :     Each row handles exactly 8 pixels, so the routine assumes
+ *                      BLOCK_HEIGHT_WIDTH is 8 (defined outside this file --
+ *                      confirm). EMMS is not executed here.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
+{
+    
+    __asm 
+    {
+        // esi / eax : current row of the two inputs, edi : current output row
+        mov         esi,    ReconPtr1 
+        mov         eax,    ReconPtr2
+
+        mov         edi,    ReconRefPtr
+        mov         ecx,    BLOCK_HEIGHT_WIDTH
+
+        // edx : input row step, mm7 : zero, used to widen bytes to words
+        mov         edx,    ReconPixelsPerLine
+        pxor        mm7,    mm7
+
+AverageBlock_Loop:
+
+        // Load 8 pixels from each input row
+        movq        mm0,    [esi]
+        movq        mm1,    [eax]
+
+        // mm0 / mm1 : low 4 pixels as words, mm2 / mm3 : high 4 pixels as words
+        movq        mm2,    mm0
+        punpcklbw   mm0,    mm7
+
+        movq        mm3,    mm1
+        punpcklbw   mm1,    mm7
+
+        paddw       mm0,    mm1
+        punpckhbw   mm2,    mm7
+
+        // Halve the sum; it is at most 510, so the arithmetic shift cannot
+        // bring in a sign bit
+        psraw       mm0,    1
+        punpckhbw   mm3,    mm7
+
+        paddw       mm2,    mm3
+        movq        [edi],  mm0
+
+        psraw       mm2,    1
+        add         esi,    edx
+
+        add         eax,    edx
+        add         edi,    16
+
+        // edi has already been advanced, so [edi-8] is the second half of
+        // the row that was just written
+        movq        [edi-8], mm2
+        dec         ecx
+
+        jnz         AverageBlock_Loop
+    }
+    /*    
+    Reference C implementation of the loop above (kept for documentation only):
+
+    UINT32 i;
+
+    // For each block row
+    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
+    {
+        ReconRefPtr[0] = (INT16)((INT32)(ReconPtr1[0])+ ((INT32)ReconPtr2[0]))>>1;
+        ReconRefPtr[1] = (INT16)((INT32)(ReconPtr1[1])+ ((INT32)ReconPtr2[1]))>>1;
+        ReconRefPtr[2] = (INT16)((INT32)(ReconPtr1[2])+ ((INT32)ReconPtr2[2]))>>1;
+        ReconRefPtr[3] = (INT16)((INT32)(ReconPtr1[3])+ ((INT32)ReconPtr2[3]))>>1;
+        ReconRefPtr[4] = (INT16)((INT32)(ReconPtr1[4])+ ((INT32)ReconPtr2[4]))>>1;
+        ReconRefPtr[5] = (INT16)((INT32)(ReconPtr1[5])+ ((INT32)ReconPtr2[5]))>>1;
+        ReconRefPtr[6] = (INT16)((INT32)(ReconPtr1[6])+ ((INT32)ReconPtr2[6]))>>1;
+        ReconRefPtr[7] = (INT16)((INT32)(ReconPtr1[7])+ ((INT32)ReconPtr2[7]))>>1;
+        
+        // Start next row
+        ReconPtr1 += ReconPixelsPerLine;
+        ReconPtr2 += ReconPixelsPerLine;
+
+        ReconRefPtr += BLOCK_HEIGHT_WIDTH;
+    }
+    */
+}
+
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     UnpackBlock_MMX
+ *  
+ *  INPUTS        :     UINT8 *ReconPtr           : block of 8-bit data to be
+ *                                                  converted to 16-bit
+ *                      UINT32 ReconPixelsPerLine : byte step between input rows
+ *						
+ *  OUTPUTS       :     INT16 *ReconRefPtr        : converted output, one 16-bit
+ *                                                  word per pixel, rows stored
+ *                                                  16 bytes apart
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Zero-extends 8 bytes to 8 words in each of
+ *                      BLOCK_HEIGHT_WIDTH rows.
+ *
+ *  SPECIAL NOTES :     Each row handles exactly 8 pixels, so the routine assumes
+ *                      BLOCK_HEIGHT_WIDTH is 8 (defined outside this file --
+ *                      confirm). EMMS is not executed here.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
+{
+    
+    __asm 
+    {
+        // esi : current input row, edi : current output row
+        mov         esi,    ReconPtr 
+        mov         edi,    ReconRefPtr
+
+        // ecx : rows remaining, edx : input row step
+        mov         ecx,    BLOCK_HEIGHT_WIDTH
+        mov         edx,    ReconPixelsPerLine
+
+        // mm7 : zero, interleaved with the pixel bytes to widen them to words
+        pxor        mm7,    mm7
+
+UnpackBlock_Loop:
+
+        // Load 8 pixels and keep a copy for the high half
+        movq        mm0,    [esi] 
+        movq        mm2,    mm0
+
+        // Low 4 pixels -> words 0..3 of the output row
+        punpcklbw   mm0,    mm7
+        movq        [edi],  mm0
+
+        // High 4 pixels -> words 4..7 of the output row
+        punpckhbw   mm2,    mm7
+        add         esi,    edx
+
+        movq        [edi+8], mm2
+        add         edi,    16
+
+        dec         ecx
+        jnz         UnpackBlock_Loop
+    }
+    
+    /*
+    Reference C implementation of the loop above (kept for documentation only):
+
+    UINT32 i;
+
+    // For each block row
+    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
+    {
+
+        ReconRefPtr[0] = (INT16)(ReconPtr[0]);
+        ReconRefPtr[1] = (INT16)(ReconPtr[1]);
+        ReconRefPtr[2] = (INT16)(ReconPtr[2]);
+        ReconRefPtr[3] = (INT16)(ReconPtr[3]);
+        ReconRefPtr[4] = (INT16)(ReconPtr[4]);
+        ReconRefPtr[5] = (INT16)(ReconPtr[5]);
+        ReconRefPtr[6] = (INT16)(ReconPtr[6]);
+        ReconRefPtr[7] = (INT16)(ReconPtr[7]);
+        
+        // Start next row
+        ReconPtr += ReconPixelsPerLine;
+        ReconRefPtr += BLOCK_HEIGHT_WIDTH;
+    }
+    */
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     SubtractBlock_MMX
+ *  
+ *  INPUTS        :     UINT8 *  SrcBlock
+ *                               First row of the 8-bit source block.
+ *
+ *                      INT16 *  DestPtr
+ *                               On entry, the 16-bit values to subtract from
+ *                               the source: 8 per row, 16 bytes per row.
+ *
+ *                      UINT32   LineStep
+ *                               Byte offset from one source row to the next.
+ *						
+ *  OUTPUTS       :     INT16 *  DestPtr
+ *                               Overwritten in place with the residue
+ *                               SrcBlock[i] - DestPtr[i].
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Does a per-pixel subtraction of the reference values
+ *                      held in DestPtr from the source block.
+ *
+ *  SPECIAL NOTES :     The row count is hard-coded to 8 and each row handles
+ *                      exactly 8 pixels. The subtraction is psubw, i.e.
+ *                      16-bit wrapping with no saturation. Uses mm0-mm3 and
+ *                      mm7 and does not execute EMMS. The commented-out C
+ *                      loop at the end of the body is the scalar equivalent.
+ *                      NOTE(review): the previous note here ("This functions
+ *                      has a mmx version in newlooptest_asm.c") looks
+ *                      inherited from the C implementation - confirm.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep )
+{
+
+    __asm 
+    {
+
+        mov         esi,    SrcBlock             // esi = current source row
+        mov         edi,    DestPtr              // edi = current reference/residue row
+
+        mov         edx,    LineStep             // edx = source row pitch in bytes
+        mov         ecx,    8                    // ecx = rows remaining
+
+        pxor        mm7,    mm7                  // mm7 = 0, supplies the high byte when widening
+
+SubtractBlock_Loop:
+
+        movq        mm0,    [esi]                // load 8 source bytes
+        movq        mm1,    [edi]                // reference values 0-3
+
+        movq        mm2,    mm0                  // copy of the source for bytes 4-7
+        punpcklbw   mm0,    mm7                  // source bytes 0-3 -> words
+
+        movq        mm3,    [edi+8]              // reference values 4-7
+        psubw       mm0,    mm1                  // source - reference (values 0-3)
+        
+        punpckhbw   mm2,    mm7                  // source bytes 4-7 -> words
+        movq        [edi],  mm0                  // overwrite reference 0-3 with the residue
+
+        psubw       mm2,    mm3                  // source - reference (values 4-7)
+        add         esi,    edx                  // advance to the next source row
+
+        movq        [edi+8], mm2                 // overwrite reference 4-7 with the residue
+        add         edi,    16                   // next residue row: 8 values * 2 bytes
+
+        dec         ecx
+        jnz         SubtractBlock_Loop           // repeat until all 8 rows are done
+    }
+
+    /* Scalar C equivalent of the assembly above (reference only):
+    UINT32 i;
+
+    // For each block row
+    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
+    {
+
+        DestPtr[0] = (INT16)((INT32)SrcBlock[0] - (INT32)DestPtr[0]);
+        DestPtr[1] = (INT16)((INT32)SrcBlock[1] - (INT32)DestPtr[1]);
+        DestPtr[2] = (INT16)((INT32)SrcBlock[2] - (INT32)DestPtr[2]);
+        DestPtr[3] = (INT16)((INT32)SrcBlock[3] - (INT32)DestPtr[3]);
+        DestPtr[4] = (INT16)((INT32)SrcBlock[4] - (INT32)DestPtr[4]);
+        DestPtr[5] = (INT16)((INT32)SrcBlock[5] - (INT32)DestPtr[5]);
+        DestPtr[6] = (INT16)((INT32)SrcBlock[6] - (INT32)DestPtr[6]);
+        DestPtr[7] = (INT16)((INT32)SrcBlock[7] - (INT32)DestPtr[7]);
+        
+        // Start next row
+        SrcBlock += LineStep;
+        DestPtr += BLOCK_HEIGHT_WIDTH;
+    }
+    */
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     ReconBlock_MMX
+ *  
+ *  INPUTS        :     INT16 *  SrcBlock
+ *                               16-bit change data: 8 values per row,
+ *                               16 bytes per row.
+ *
+ *                      INT16 *  ReconRefPtr
+ *                               16-bit reference values, same layout as
+ *                               SrcBlock.
+ *
+ *                      UINT32   LineStep
+ *                               Byte offset from one output row to the next.
+ *						
+ *  OUTPUTS       :     UINT8 *  DestBlock
+ *                               Receives 8 rows of 8 reconstructed pixels.
+ *
+ *  RETURNS       :     None.
+ *
+ *  FUNCTION      :     Reconstructs a block from a reference block and
+ *                      change data: adds the two and stores the sums
+ *                      saturated to the range 0..255.
+ *
+ *  SPECIAL NOTES :     The add is paddw (16-bit wrapping); saturation is
+ *                      applied only by the final packuswb. Unlike the
+ *                      commented-out C version at the end of the body, the
+ *                      assembly does not write the sums back to SrcBlock.
+ *                      mm7 is zeroed but not used by the loop. Uses mm0-mm3
+ *                      and mm7 and does not execute EMMS.
+ *                      NOTE(review): the previous note here ("This functions
+ *                      has a mmx version in newlooptest_asm.c") looks
+ *                      inherited from the C implementation - confirm.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void ReconBlock_MMX( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep)
+{
+
+    __asm 
+    {
+    
+        mov         esi,    SrcBlock             // esi = current row of change data
+        mov         eax,    ReconRefPtr          // eax = current row of reference data
+
+        mov         edi,    DestBlock            // edi = current output row
+        mov         ecx,    8                    // ecx = rows remaining
+
+        mov         edx,    LineStep             // edx = output row pitch in bytes
+        pxor        mm7,    mm7                  // mm7 = 0 (not referenced again below)
+
+ReconBlock_Loop:
+
+        movq        mm0,    [esi]                // changes 0-3
+        movq        mm1,    [eax]                // reference 0-3
+    
+        movq        mm2,    [esi+8]              // changes 4-7
+        movq        mm3,    [eax+8]              // reference 4-7
+
+        paddw       mm0,    mm1                  // sums 0-3 (16-bit wrapping add)
+        paddw       mm2,    mm3                  // sums 4-7 (16-bit wrapping add)
+
+        packuswb    mm0,    mm2                  // saturate the 8 sums to unsigned bytes
+        movq        [edi],  mm0                  // store 8 output pixels
+        
+        add         esi,    16                   // next row of changes: 8 values * 2 bytes
+        add         eax,    16                   // next row of reference data
+
+        add         edi,    edx                  // next output row
+        dec         ecx
+
+        jnz         ReconBlock_Loop              // repeat until all 8 rows are done
+        
+    }
+    
+    /* Scalar C version (reference only; note that it also updates SrcBlock):
+    UINT32 i;
+    INT16 *SrcBlockPtr = SrcBlock;
+
+    // For each block row
+    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
+    {
+        SrcBlock[0] += ReconRefPtr[0];
+        SrcBlock[1] += ReconRefPtr[1];
+        SrcBlock[2] += ReconRefPtr[2];
+        SrcBlock[3] += ReconRefPtr[3];
+        SrcBlock[4] += ReconRefPtr[4];
+        SrcBlock[5] += ReconRefPtr[5];
+        SrcBlock[6] += ReconRefPtr[6];
+        SrcBlock[7] += ReconRefPtr[7];
+        
+        // Start next row
+        SrcBlock += BLOCK_HEIGHT_WIDTH;
+        ReconRefPtr += BLOCK_HEIGHT_WIDTH;
+    }
+    // Saturated the block and write to the output
+    SatUnsigned8( DestBlock, SrcBlockPtr, LineStep, BLOCK_HEIGHT_WIDTH );
+    */
+
+}
+
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/wmtidct.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/wmtidct.c
--- a/Src/libvpShared/corelibs/cdxv/vputil/win32/wmtrecon.c
+++ b/Src/libvpShared/corelibs/cdxv/vputil/win32/wmtrecon.c
@ -0,0 +1,281 @@
+ /****************************************************************************
+ *
+ *   Module Title :     WmtOptFunctions.c
+ *
+ *   Description  :     Willamette (SSE2) processor-specific optimised
+ *                      versions of the block reconstruction functions
+ *
+ *   AUTHOR      :		Yaowu Xu
+ *
+ *	 Special Note:		
+ *
+ *****************************************************************************
+ *   Revision History
+ *
+ *
+ *   1.03 YWX 07-Dec-00 Removed constants and functions that are not in use
+ * 			Added push and pop ebx in WmtReconIntra
+ *   1.02 YWX 30 Aug 00 changed to be compatible with Microsoft compiler
+ *   1.01 YWX 13 JUL 00 New Willamette Optimized Functions
+ *   1.00 YWX 14/06/00  Configuration baseline from OptFunctions.c
+ *
+ *****************************************************************************
+ */
+ 
+/* 
+    Use Tim's optimized version.
+*/
+
+/****************************************************************************
+ *  Header Files
+ *****************************************************************************
+ */
+
+#define STRICT              // Strict type checking. 
+
+#include "reconstruct.h"
+
+/****************************************************************************
+ *  Module constants.
+ *****************************************************************************
+ */        
+
+/**************************************************************************** 
+ *  Imports.
+ *****************************************************************************
+ */   
+
+
+/****************************************************************************
+ *  Exported Global Variables
+ *****************************************************************************
+ */
+
+/****************************************************************************
+ *  Exported Functions 
+ *****************************************************************************
+ */              
+
+/****************************************************************************
+ *  Module Statics
+ *****************************************************************************
+ */  
+
+
+/* Eight bytes of 0x80, declared 16-byte aligned. WmtReconIntra loads them
+ * with movq and XORs them into packed signed bytes, which is the same as
+ * adding 128 to each byte (signed -> unsigned bias). */
+_declspec(align(16)) static  UINT8 Eight128s[8] =  {128,128,128,128,128,128,128,128};
+
+#pragma warning( disable : 4799 )  // Disable no emms instruction warning!
+                                      
+/****************************************************************************
+*  Forward References
+*****************************************************************************
+*/  
+
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     WmtReconIntra
+ *
+ *  INPUTS        :     INT16 *  TmpDataBuffer
+ *                               Unused (explicitly cast to void).
+ *
+ *                      UINT16 * idct
+ *                               Pointer to the output from the idct for this
+ *                               block: 128 bytes (8 rows of 8 values).
+ *                               Declared unsigned, but the values are
+ *                               treated as signed 16-bit (packsswb). Read
+ *                               with movdqa, so it must be 16-byte aligned.
+ *
+ *                      UINT32   stride
+ *                               Byte offset from one output row to the next.
+ *
+ *  OUTPUTS       :     UINT8 *  dest
+ *                               The reconstruction buffer; 8 bytes are
+ *                               written per row.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs an intra block - wmt version.
+ *                      Each input value is saturated to -128..127 and then
+ *                      biased by +128 to give an unsigned 8-bit pixel.
+ *
+ *  SPECIAL NOTES :     Uses xmm0, xmm2 and xmm3. ebx is pushed and popped
+ *                      explicitly inside the asm block.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
+{
+	(void)TmpDataBuffer;
+    __asm
+    {
+	
+		push		ebx
+		// ebx is a callee-saved register; it holds the stride below.
+        mov         eax,[idct]						; Signed 16 bit inputs
+        mov         edx,[dest]						; Unsigned 8 bit outputs
+		// movq fills only the low 8 bytes of xmm0 with 0x80; the upper 8 bytes are zero.
+        movq		xmm0,QWORD PTR [Eight128s]		; Set xmm0 to 0x000000000000008080808080808080
+		pxor		xmm3, xmm3						; set xmm3 to 0
+													;
+        mov         ebx,[stride]					; Line stride in output buffer
+        lea         ecx,[eax+128]					; Endpoint in input buffer
+		// 128 bytes = 8 rows * 8 values * 2 bytes per value.
+loop_label:                                 
+		// packsswb saturates the 8 words to signed bytes in the low half of xmm2.
+        movdqa		xmm2,XMMWORD PTR [eax]			; Read the eight inputs
+		packsswb	xmm2,xmm3						;		
+		
+		pxor        xmm2,xmm0						; Convert result to unsigned (same as add 128)
+        lea         eax,[eax + 16]					; Step source buffer
+		// cmp sets CF while eax < ecx; the movq and lea that follow leave the flags alone, so jc tests this cmp.
+        cmp         eax,ecx							; are we done
+        movq		QWORD PTR [edx],xmm2			; store results
+
+        lea         edx,[edx+ebx]					; Step output buffer
+        jc          loop_label						; Loop back if we are not done
+
+		pop			ebx
+    }
+
+}
+
+/****************************************************************************
+ * 
+ *  ROUTINE       :     WmtReconInter
+ *
+ *  INPUTS        :     INT16 *  TmpDataBuffer
+ *                               Unused (explicitly cast to void).
+ *
+ *                      UINT8 *  RefPtr
+ *                               The last frame reference; 8 bytes are read
+ *                               per row.
+ *
+ *                      INT16 *  ChangePtr
+ *                               Pointer to the change data: 128 bytes
+ *                               (8 rows of 8 values). Read with movdqa, so
+ *                               it must be 16-byte aligned.
+ *
+ *                      UINT32   LineStep
+ *                               Byte offset between rows; the same value is
+ *                               used for both the reference and the
+ *                               reconstruction buffer.
+ *
+ *  OUTPUTS       :     UINT8 *  ReconPtr
+ *                               The reconstruction; 8 bytes are written per
+ *                               row.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs data from last data and change:
+ *                      reference pixel + change, saturated to 0..255.
+ *
+ *  SPECIAL NOTES :     The add is paddsw (signed saturating). Uses xmm0,
+ *                      xmm2 and xmm4. edi is pushed and popped explicitly;
+ *                      ebx is used without an explicit save.
+ *                      NOTE(review): this relies on the compiler preserving
+ *                      ebx around the _asm block - confirm for the
+ *                      toolchain in use.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
+{
+    (void) TmpDataBuffer;
+
+ _asm {
+		push	edi
+		// ebx = reference row, ecx = change row, eax = output row, edx = row pitch.
+		mov		ebx, [RefPtr]
+		mov		ecx, [ChangePtr]
+
+		mov		eax, [ReconPtr]
+		mov		edx, [LineStep]
+		// xmm0 = 0 for unpack/pack; edi = end of the 128-byte change block (8 rows * 16 bytes).
+		pxor	xmm0, xmm0
+		lea		edi, [ecx + 128]
+  L:
+		movq	xmm2, QWORD ptr [ebx]		; (+3 misaligned) 8 reference pixels
+		movdqa	xmm4, XMMWORD ptr [ecx]		; 8 changes
+		// Widen the 8 reference bytes to 8 zero-extended words.
+		punpcklbw xmm2, xmm0				; 
+		// paddsw below adds all 8 changes of the row (xmm4), not just the "first 4" the note mentions.
+		add	ebx, edx						; next row of reference pixels
+		paddsw	xmm2, xmm4					; add in first 4 changes
+
+		lea		ecx, [ecx + 16]				; next row of changes
+		packuswb xmm2, xmm0					; pack result to unsigned 8-bit values
+		// cmp sets CF while ecx < edi; the movq and lea that follow leave the flags alone, so jc tests this cmp.
+		cmp		ecx, edi					; are we done?
+		movq	QWORD PTR [eax], xmm2		; store result
+
+		lea		eax, [eax+edx]				; next row of output
+		jc		L							; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
+
+		pop		edi
+ }
+
+}
+/****************************************************************************
+ * 
+ *  ROUTINE       :     WmtReconInterHalfPixel2
+ *
+ *  INPUTS        :     INT16 *  TmpDataBuffer
+ *                               Unused (explicitly cast to void).
+ *
+ *                      UINT8 *  RefPtr1, RefPtr2
+ *                               The two last-frame references that are
+ *                               averaged; 8 bytes are read per row from each.
+ *
+ *                      INT16 *  ChangePtr
+ *                               Pointer to the change data: 128 bytes
+ *                               (8 rows of 8 values). Read with movdqa, so
+ *                               it must be 16-byte aligned.
+ *
+ *                      UINT32   LineStep
+ *                               Byte offset between rows; the same value is
+ *                               used for both references and for the
+ *                               reconstruction buffer.
+ *
+ *  OUTPUTS       :     UINT8 *  ReconPtr
+ *                               The reconstruction; 8 bytes are written per
+ *                               row.
+ *
+ *  RETURNS       :     None
+ *
+ *  FUNCTION      :     Reconstructs data from half pixel reference data and change. 
+ *                      Half pixel data interpolated from 2 references:
+ *                      ((ref1 + ref2) >> 1) + change, saturated to 0..255.
+ *
+ *  SPECIAL NOTES :     The average truncates (no rounding term is added
+ *                      before the shift). The change is added with paddw
+ *                      (16-bit wrapping), whereas WmtReconInter uses paddsw.
+ *                      NOTE(review): confirm the wrapping add is intended.
+ *                      Uses xmm0, xmm2, xmm4 and xmm6. esi and edi are
+ *                      pushed and popped explicitly; ebx is used without an
+ *                      explicit save.
+ *                      NOTE(review): this relies on the compiler preserving
+ *                      ebx around the _asm block - confirm for the
+ *                      toolchain in use.
+ *                      The "mm2"/"mm6"/"first 4" wording in the ';' notes
+ *                      does not match the code: the registers are xmm and
+ *                      each holds all 8 values of a row.
+ *
+ *  ERRORS        :     None.
+ *
+ ****************************************************************************/
+
+void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, 
+		    	              UINT8 * RefPtr1, UINT8 * RefPtr2, 
+						      INT16 * ChangePtr, UINT32 LineStep )
+{
+	(void)TmpDataBuffer;
+
+ _asm {
+	push	esi
+	push	edi
+	// ecx = change row, esi/edi = the two reference rows, ebx = output row, edx = row pitch.
+	mov		ecx, [ChangePtr]
+	mov		esi, [RefPtr1]
+
+	mov		edi, [RefPtr2]
+	mov		ebx, [ReconPtr]
+	
+	mov		edx, [LineStep]
+	lea		eax, [ecx+128]                  // eax = end of the 128-byte change block (8 rows * 16 bytes)
+
+	pxor	xmm0, xmm0                      // xmm0 = 0 for unpack/pack
+
+  L:
+	
+	movq		xmm2, QWORD PTR [esi]		; (+3 misaligned) mm2 = row from ref1
+	movq		xmm4, QWORD PTR [edi]		; (+3 misaligned) mm4 = row from ref2
+	// Widen both rows of 8 bytes to 8 zero-extended words each.
+	punpcklbw	xmm2, xmm0					;
+	punpcklbw	xmm4, xmm0					;
+	// movdqa loads all 8 changes of the row into xmm6.
+	movdqa		xmm6, [ecx]					; mm6 = first 4 changes
+	paddw		xmm2, xmm4					; mm2 = start (ref1 + ref2)
+
+	// Logical shift right by 1: truncating average of the two references.
+	psrlw		xmm2, 1						; mm2 = start (ref1 + ref2)/2
+	paddw		xmm2, xmm6					; add changes to start
+
+	lea			ecx, [ecx+16]				; next row idct
+	packuswb	xmm2, xmm0					; pack start|end to unsigned 8-bit
+	
+	add			esi, edx					; next row ref1
+	add			edi, edx					; next row ref2
+	
+	cmp			ecx, eax                    // sets CF while ecx < eax; the movq and lea below leave the flags alone
+	movq		QWORD PTR [ebx], xmm2		; store result
+	 ;
+	lea			ebx, [ebx+edx]              // next row of output
+	jc		L				                // loop while change rows remain
+
+	pop		edi
+	pop		esi
+ }
+}
+
+