Initial community commit

2024-09-24 14:54:57 +02:00 · 2024-09-24 14:54:57 +02:00 · fc06254474
commit fc06254474
parent 537bcbc862
16440 changed files with 4239995 additions and 2 deletions
--- a/Src/h264dec/lcommon/src/img_io.c
+++ b/Src/h264dec/lcommon/src/img_io.c
@ -0,0 +1,327 @@
+
+/*!
+ *************************************************************************************
+ * \file img_io.c
+ *
+ * \brief
+ *    image I/O related functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>
+ *************************************************************************************
+ */
+#include "contributors.h"
+#include "global.h"
+#include "img_io.h"
+#include "report.h"
+
+/* Table of named resolutions (name, x size, y size) consulted by
+ * ParseSizeFromString. The list is terminated by an entry whose name is NULL.
+ * NOTE(review): "qqvga" is listed as 160x128 rather than the usual 160x120 -
+ * possibly deliberate (128 is a multiple of 16); confirm before changing. */
+static const VIDEO_SIZE VideoRes[] = {
+  { "qcif"  ,  176,  144},
+  { "qqvga" ,  160,  128},
+  { "qvga"  ,  320,  240},
+  { "sif"   ,  352,  240},
+  { "cif"   ,  352,  288},
+  { "vga"   ,  640,  480},
+  { "sd1"   ,  720,  480},
+  { "sd2"   ,  704,  576},
+  { "sd3"   ,  720,  576},
+  { "720p"  , 1280,  720},
+  { "1080p" , 1920, 1080},
+  { NULL, 0, 0}
+};
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse frame size (and, if present, frame rate) from the file name
+ *
+ *    The name is scanned for "_<x_size>x<y_size>" terminated by '_' or '.'.
+ *    When the terminator is followed by a number ending in 'i' or 'p'
+ *    (e.g. "seq_352x288_30p.yuv") that number is stored as the frame rate.
+ *    If the scan ends without finding such a pattern, the whole name is
+ *    compared (ignoring case) against the names in the VideoRes table.
+ *
+ * \param input_file
+ *    only fname is used; it is modified temporarily while parsing and
+ *    restored before returning
+ * \param x_size
+ *    set to -1 on entry, then to the parsed or table x size
+ * \param y_size
+ *    set to -1 on entry, then to the parsed or table y size
+ * \param fps
+ *    written only when an 'i'/'p' terminated field is examined; callers
+ *    should preset a default value
+ * \return
+ *    1 if both x_size and y_size were set, 0 otherwise
+ ************************************************************************
+ */
+int ParseSizeFromString (VideoDataFile *input_file, int *x_size, int *y_size, double *fps) 
+{
+  char *p1, *p2, *tail;
+  char *fn = input_file->fname;
+  char c;
+  int i = 0;
+
+  *x_size = *y_size = -1;
+  p1 = p2 = fn;
+  while (p1 != NULL && p2 != NULL) 
+  {
+    // Search for the next '_'
+    p1 = strstr( p1, "_");
+    if (p1 == NULL)
+      break;
+
+    // Search for end character of x_size (first 'x' after the '_')
+    p2 = strstr( p1, "x");
+
+    // If no 'x' is found, exit
+    if (p2 == NULL)    
+      break;
+
+    // Try conversion of number (string is cut at the 'x' for strtol)
+    *p2 = 0;
+    *x_size = strtol( p1 + 1, &tail, 10);
+
+    // If there are characters left in the string, or the string is empty, discard conversion
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = 'x';
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = 'x';
+
+    // Search for end character of y_size (first '_' or '.' after the 'x')
+    p1 = strpbrk( p2 + 1, "_.");
+    // If no '_' or '.' is found, try again from current position
+    if (p1 == NULL) 
+    {
+      p1 = p2 + 1;
+      continue;
+    }
+
+    // Try conversion of number
+    c = *p1;
+    *p1 = 0;
+    *y_size = strtol( p2 + 1, &tail, 10);
+
+    // If there are characters left in the string, or the string is empty, discard conversion
+    if (*tail != '\0' || *(p2 + 1) == '\0') 
+    {
+      *p1 = c;
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p1 = c;
+
+    // Search for end character of the frame rate (first 'i' or 'p' after the size).
+    // strpbrk matches either character; strstr would only match the literal "ip".
+    p2 = strpbrk( p1 + 1, "ip");
+
+    // If no 'i' or 'p' is found, exit
+    if (p2 == NULL)      
+      break;
+
+    // Try conversion of number
+    c = *p2;
+    *p2 = 0;
+    *fps = strtod( p1 + 1, &tail);
+
+    // If there are characters left in the string, or the string is empty, discard conversion
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = c;
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = c;
+    break;
+  }
+
+  // Now lets test some common video file formats
+  if (p1 == NULL || p2 == NULL)
+  {       
+    for (i = 0; VideoRes[i].name != NULL; i++) 
+    {
+      // strcasecmp returns 0 on a match; testing it as a boolean would select
+      // the first table entry for every name that does NOT match it.
+      if (strcasecmp (fn, VideoRes[i].name) == 0) 
+      {
+        *x_size = VideoRes[i].x_size;
+        *y_size = VideoRes[i].y_size;       
+        // Should add frame rate support as well
+        break;
+      }
+    }
+  }
+
+  return (*x_size == -1 || *y_size == -1) ? 0 : 1; 
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse a printf-style frame number field ("%<n>d") from the file name
+ *
+ *    On a match the text before the '%' is stored in fhead, the text after
+ *    the 'd' in ftail, the field width in num_digits, and zero_pad is set
+ *    when the width starts with '0'. is_concatenated is cleared for TIFF
+ *    input and otherwise set only when num_digits is still -1.
+ *
+ * \param input_file
+ *    fname and vdtype are read; fhead, ftail, zero_pad, num_digits and
+ *    is_concatenated are written. fhead/ftail are assumed to be large
+ *    enough to hold fname - TODO confirm against the structure definition.
+ ************************************************************************
+ */
+void ParseFrameNoFormatFromString (VideoDataFile *input_file)
+{
+  char *p1, *p2, *tail;  
+  char *fn         = input_file->fname;
+  char *fhead      = input_file->fhead;
+  char *ftail      = input_file->ftail;
+  int  *zero_pad   = &input_file->zero_pad;
+  int  *num_digits = &input_file->num_digits;
+
+  *zero_pad = 0;
+  *num_digits = -1;
+  p1 = p2 = fn;
+  while (p1 != NULL && p2 != NULL) 
+  {
+    size_t head_len;
+
+    // Search for the next '%'
+    p1 = strstr( p1, "%");
+    if (p1 == NULL)
+      break;
+
+    // Keep everything in front of the '%'. strncpy does not terminate the
+    // destination when the source is longer than the count, so do it here.
+    head_len = (size_t) (p1 - fn);
+    strncpy(fhead, fn, head_len);
+    fhead[head_len] = '\0';
+
+    // Search for end character of the field (first 'd' after the '%')
+    p2 = strstr( p1, "d");
+
+    // If no 'd' is found, exit
+    if (p2 == NULL)    
+      break;
+    
+    // Try conversion of number (string is cut at the 'd' for strtol)
+    *p2 = 0;
+
+    if (*(p1 + 1) == '0')
+      *zero_pad = 1;
+
+    *num_digits = strtol( p1 + 1, &tail, 10);
+
+    // If there are characters left in the string, or the string is empty, discard conversion
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = 'd';
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = 'd';
+
+    // tail points at the 'd'; the remainder of the name starts right after it.
+    // Copy it including its terminating NUL.
+    tail++;
+    memcpy(ftail, tail, strlen(tail) + 1);
+    break;
+  }
+
+  if (input_file->vdtype == VIDEO_TIFF)
+  {
+    input_file->is_concatenated = 0;
+  }
+  else
+    input_file->is_concatenated = (*num_digits == -1) ? 1 : 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Open file containing a single frame
+ *
+ *    The name is built as fhead + frame number + ftail, where the number
+ *    is printed with a field width of num_digits (zero padded when
+ *    zero_pad is set). The resulting descriptor is stored in f_num.
+ *
+ * \param input_file
+ *    supplies fhead, ftail, zero_pad and num_digits; receives f_num
+ * \param FrameNumberInFile
+ *    frame number inserted into the file name
+ ************************************************************************
+ */
+void OpenFrameFile( VideoDataFile *input_file, int FrameNumberInFile)
+{
+  char infile [FILE_NAME_SIZE], in_number[16];
+  int length;
+
+  if (input_file->zero_pad)       
+    snprintf(in_number, sizeof(in_number), "%0*d", input_file->num_digits, FrameNumberInFile);
+  else
+    snprintf(in_number, sizeof(in_number), "%*d", input_file->num_digits, FrameNumberInFile);
+
+  // Assemble the name in one bounded step. snprintf always terminates the
+  // buffer and reports the length that would have been needed.
+  length = snprintf(infile, sizeof(infile), "%s%s%s", input_file->fhead, in_number, input_file->ftail);
+  if (length < 0 || length >= (int) sizeof(infile))
+  {
+    // Do not try to open a truncated name
+    printf ("OpenFrameFile: file name too long\n");
+    input_file->f_num = -1;
+    report_stats_on_error();
+    return;
+  }
+
+  if ((input_file->f_num = open(infile, OPENFLAGS_READ)) == -1)
+  {
+    printf ("OpenFrameFile: cannot open file %s\n", infile);
+    report_stats_on_error();
+  }    
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Open the input file when the sequence is stored in one concatenated
+ *    file; does nothing otherwise
+ *
+ * \param input_file
+ *    is_concatenated and fname are read; f_num receives the descriptor
+ ************************************************************************
+ */
+void OpenFiles( VideoDataFile *input_file)
+{
+  int fd;
+
+  if (input_file->is_concatenated != 1)
+    return;
+
+  // An empty name is reported before the open is attempted
+  if (input_file->fname[0] == '\0')
+  {
+    snprintf(errortext, ET_SIZE, "No input sequence name was provided. Please check settings.");
+    error (errortext, 500);
+  }
+
+  fd = open(input_file->fname, OPENFLAGS_READ);
+  input_file->f_num = fd;
+  if (fd == -1)
+  {
+    snprintf(errortext, ET_SIZE, "Input file %s does not exist",input_file->fname);
+    error (errortext, 500);
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Close the input file if one is open and mark the descriptor as
+ *    closed (-1)
+ ************************************************************************
+ */
+void CloseFiles(VideoDataFile *input_file)
+{
+  const int fd = input_file->f_num;
+
+  input_file->f_num = -1;
+  if (fd != -1)
+    close(fd);
+}
+
+/* ==========================================================================
+ *
+ * ParseVideoType
+ *
+ * Derives the video file type from the last three characters of fname
+ * (compared ignoring case) and stores it in input_file->vdtype:
+ *   "yuv" -> VIDEO_YUV  (yuv_format = YUV420, avi = NULL)
+ *   "rgb" -> VIDEO_RGB  (yuv_format = YUV444, avi = NULL)
+ *   "tif" -> VIDEO_TIFF (avi = NULL)
+ *   "avi" -> VIDEO_AVI  (avi and yuv_format are left untouched)
+ * Anything else, including names shorter than three characters, is treated
+ * as YUV 4:2:0.
+ *
+ * Returns the value stored in vdtype.
+ *
+ * ==========================================================================
+*/
+VideoFileType ParseVideoType (VideoDataFile *input_file)
+{
+  size_t name_len = strlen(input_file->fname);
+  char *format;
+
+  // Names shorter than three characters cannot carry one of the known
+  // suffixes; compare the whole name instead of stepping before the buffer.
+  format = (name_len >= 3) ? input_file->fname + name_len - 3 : input_file->fname;
+
+  if (strcasecmp (format, "yuv") == 0)
+  {
+    input_file->vdtype = VIDEO_YUV;
+    input_file->format.yuv_format = YUV420;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "rgb") == 0)
+  {
+    input_file->vdtype = VIDEO_RGB;
+    input_file->format.yuv_format = YUV444;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "tif") == 0)
+  {
+    input_file->vdtype = VIDEO_TIFF;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "avi") == 0) 
+  {
+    input_file->vdtype = VIDEO_AVI;
+  }
+  else
+  {
+    // Unknown suffixes are not an error; fall back to YUV 4:2:0
+    input_file->vdtype = VIDEO_YUV;
+    input_file->format.yuv_format = YUV420;
+    input_file->avi = NULL;
+  }
+
+  return input_file->vdtype;
+}
--- a/Src/h264dec/lcommon/src/memalloc.c
+++ b/Src/h264dec/lcommon/src/memalloc.c
--- a/Src/h264dec/lcommon/src/memcache.c
+++ b/Src/h264dec/lcommon/src/memcache.c
@ -0,0 +1,106 @@
+#include "memcache.h"
+#include "mbuffer.h"
+#include "memalloc.h"
+
+/* Frees every image on the cache's list and resets the cached dimensions to 0. */
+void image_cache_flush(ImageCache *cache)
+{
+	VideoImage *following;
+
+	for (; cache->head; cache->head = following)
+	{
+		/* read the link before the node is released */
+		following = cache->head->next;
+		free_memImage(cache->head);
+	}
+
+	cache->size_y = 0;
+	cache->size_x = 0;
+}
+
+/* Sets the dimensions the cache is used for. When they differ from the
+ * current ones, all cached images are flushed first. */
+void image_cache_set_dimensions(ImageCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return; /* unchanged - keep the cached images */
+
+	image_cache_flush(cache);
+	cache->size_x = width;
+	cache->size_y = height;
+}
+
+/* Returns 1 when width/height equal the cache's dimensions, 0 otherwise. */
+int image_cache_dimensions_match(ImageCache *cache, int width, int height)
+{
+	return (width == cache->size_x && height == cache->size_y) ? 1 : 0;
+}
+
+/* Pushes image onto the front of the cache's list. */
+void image_cache_add(ImageCache *cache, VideoImage *image)
+{
+	VideoImage *previous_head = cache->head;
+
+	cache->head = image;
+	image->next = previous_head;
+}
+
+/* Removes and returns the image at the front of the cache's list,
+ * or returns 0 when the list is empty. The returned image's next link is cleared. */
+struct video_image *image_cache_get(ImageCache *cache)
+{
+	VideoImage *taken = cache->head;
+
+	if (!taken)
+		return 0;
+
+	cache->head = taken->next;
+	taken->next = 0;
+	return taken;
+}
+
+/* ------------- 
+
+PicMotion arrays are allocated with one extra slot in the first dimension,
+which we use as the next pointer
+------------- */
+
+
+/* Frees every PicMotion array on the cache's list and resets the cached
+ * dimensions to 0. The link to the next array is stored in row size_y. */
+void motion_cache_flush(MotionCache *cache)
+{
+	PicMotion **following;
+
+	for (; cache->head; cache->head = following)
+	{
+		/* read the link before the array is released */
+		following = (PicMotion **)cache->head[cache->size_y];
+		free_mem2DPicMotion(cache->head);
+	}
+
+	cache->size_y = 0;
+	cache->size_x = 0;
+}
+
+/* Sets the dimensions the cache is used for. When they differ from the
+ * current ones, all cached arrays are flushed first. */
+void motion_cache_set_dimensions(MotionCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return; /* unchanged - keep the cached arrays */
+
+	motion_cache_flush(cache);
+	cache->size_x = width;
+	cache->size_y = height;
+}
+
+/* Returns 1 when width/height equal the cache's dimensions, 0 otherwise. */
+int motion_cache_dimensions_match(MotionCache *cache, int width, int height)
+{
+	return (width == cache->size_x && height == cache->size_y) ? 1 : 0;
+}
+
+/* Pushes a PicMotion array onto the front of the cache's list; the previous
+ * head is stored in the array's row size_y. */
+void motion_cache_add(MotionCache *cache, PicMotion **image)
+{
+	PicMotion **previous_head = cache->head;
+
+	cache->head = image;
+	image[cache->size_y] = (PicMotion *)previous_head;
+}
+
+/* Removes and returns the PicMotion array at the front of the cache's list,
+ * or returns 0 when the list is empty. The link slot (row size_y) of the
+ * returned array is cleared. */
+struct pic_motion **motion_cache_get(MotionCache *cache)
+{
+	PicMotion **taken = cache->head;
+
+	if (!taken)
+		return 0;
+
+	cache->head = (PicMotion **)taken[cache->size_y];
+	taken[cache->size_y] = 0;
+	return taken;
+}
--- a/Src/h264dec/lcommon/src/mv_prediction.c
+++ b/Src/h264dec/lcommon/src/mv_prediction.c
@ -0,0 +1,250 @@
+/*!
+ *************************************************************************************
+ * \file mv_prediction.c
+ *
+ * \brief
+ *    Motion Vector Prediction Functions
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *      - Karsten Sühring          <suehring@hhi.de>
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "mbuffer.h"
+/*!
+ ************************************************************************
+ * \brief
+ *    Get motion vector predictor (variant installed when
+ *    mb_aff_frame_flag is set)
+ *
+ *    block[0], block[1] and block[2] are the left, upper and upper-right
+ *    neighbors. Their reference indices, and component 1 of their motion
+ *    vectors, are rescaled when the neighbor's mb_field differs from the
+ *    current macroblock's mb_field. The result is written to pmv.
+ ************************************************************************
+ */
+static void GetMotionVectorPredictorMBAFF (Macroblock *currMB, 
+                                    PixelPos *block,        // <--> block neighbors
+                                    short  pmv[2],
+                                    short  ref_frame,
+                                    PicMotion **motion, 
+                                    int    mb_x,
+                                    int    mb_y,
+                                    int    blockshape_x,
+                                    int    blockshape_y)
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  int nb_ref[3];                 // reference index per neighbor, -1 if unavailable
+  int nb_mv[3];                  // one mv component per neighbor, 0 if unavailable
+  int mvPredType = MVPRED_MEDIAN;
+  int pred_vec = 0;
+  int sameL, sameU, sameUR;
+  int n, hv;
+
+  // Reference indices of the neighbors, expressed in the current MB's mode
+  for (n = 0; n < 3; n++)
+  {
+    if (!block[n].available)
+    {
+      nb_ref[n] = -1;
+    }
+    else
+    {
+      int ref = motion[block[n].pos_y][block[n].pos_x].ref_idx;
+
+      if (p_Vid->mb_data[block[n].mb_addr].mb_field)
+        nb_ref[n] = currMB->mb_field ? ref : (ref >> 1);
+      else
+        nb_ref[n] = currMB->mb_field ? (ref * 2) : ref;
+    }
+  }
+
+  sameL  = (nb_ref[0] == ref_frame);
+  sameU  = (nb_ref[1] == ref_frame);
+  sameUR = (nb_ref[2] == ref_frame);
+
+  /* Prediction if only one of the neighbors uses the reference frame
+  *  we are checking
+  */
+  if (sameL && !sameU && !sameUR)
+    mvPredType = MVPRED_L;
+  else if (!sameL && sameU && !sameUR)
+    mvPredType = MVPRED_U;
+  else if (!sameL && !sameU && sameUR)
+    mvPredType = MVPRED_UR;
+
+  // Directional predictions
+  if (blockshape_x == 8 && blockshape_y == 16)
+  {
+    if (mb_x == 0)
+    {
+      if (sameL)
+        mvPredType = MVPRED_L;
+    }
+    else if (sameUR)
+    {
+      mvPredType = MVPRED_UR;
+    }
+  }
+  else if (blockshape_x == 16 && blockshape_y == 8)
+  {
+    if (mb_y == 0)
+    {
+      if (sameU)
+        mvPredType = MVPRED_U;
+    }
+    else if (sameL)
+    {
+      mvPredType = MVPRED_L;
+    }
+  }
+
+  for (hv = 0; hv < 2; hv++)
+  {
+    // Collect this component from each neighbor
+    for (n = 0; n < 3; n++)
+    {
+      int value = 0;
+
+      if (block[n].available)
+      {
+        value = motion[block[n].pos_y][block[n].pos_x].mv[hv];
+
+        // Only component 1 is rescaled between frame and field neighbors
+        if (hv != 0)
+        {
+          if (currMB->mb_field)
+          {
+            if (!p_Vid->mb_data[block[n].mb_addr].mb_field)
+              value = value / 2;
+          }
+          else
+          {
+            if (p_Vid->mb_data[block[n].mb_addr].mb_field)
+              value = value * 2;
+          }
+        }
+      }
+      nb_mv[n] = value;
+    }
+
+    if (mvPredType == MVPRED_MEDIAN)
+    {
+      if (!(block[1].available || block[2].available))
+        pred_vec = nb_mv[0];
+      else
+        pred_vec = nb_mv[0] + nb_mv[1] + nb_mv[2]
+                 - imin(nb_mv[0], imin(nb_mv[1], nb_mv[2]))
+                 - imax(nb_mv[0], imax(nb_mv[1], nb_mv[2]));
+    }
+    else if (mvPredType == MVPRED_L)
+    {
+      pred_vec = nb_mv[0];
+    }
+    else if (mvPredType == MVPRED_U)
+    {
+      pred_vec = nb_mv[1];
+    }
+    else if (mvPredType == MVPRED_UR)
+    {
+      pred_vec = nb_mv[2];
+    }
+
+    pmv[hv] = (short) pred_vec;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Get motion vector predictor (variant installed when
+ *    mb_aff_frame_flag is not set)
+ *
+ *    block[0], block[1] and block[2] are the left, upper and upper-right
+ *    neighbors. Either the vector of a single neighbor is copied to pmv
+ *    or, per component, the median of the three neighbor vectors is used.
+ *    Unavailable neighbors contribute reference index -1 and vector 0.
+ ************************************************************************
+ */
+// TODO: benski> make SSE3/MMX version
+static void GetMotionVectorPredictorNormal (Macroblock *currMB, 
+                                            PixelPos *block,      // <--> block neighbors
+                                            short  pmv[2],
+                                            short  ref_frame,
+                                            PicMotion **motion, 
+                                            int    mb_x,
+                                            int    mb_y,
+                                            int    blockshape_x,
+                                            int    blockshape_y)
+{
+  const int is8x16 = (blockshape_x == 8 && blockshape_y == 16);
+  const int is16x8 = (blockshape_x == 16 && blockshape_y == 8);
+  const PixelPos *single = 0;   // neighbor whose vector is copied as is, if any
+  int nb_ref[3];
+  int sameL, sameU, sameUR;
+  int n, hv;
+
+  for (n = 0; n < 3; n++)
+    nb_ref[n] = block[n].available ? motion[block[n].pos_y][block[n].pos_x].ref_idx : -1;
+
+  sameL  = (nb_ref[0] == ref_frame);
+  sameU  = (nb_ref[1] == ref_frame);
+  sameUR = (nb_ref[2] == ref_frame);
+
+  /* Prediction if only one of the neighbors uses the reference frame
+  *  we are checking, or if the block shape selects a direction
+  */
+  if (sameL && ((!sameU && !sameUR) || (is8x16 && mb_x == 0) || (is16x8 && mb_y != 0)))
+    single = &block[0];   // left
+  else if (sameU && ((!sameL && !sameUR) || (is16x8 && mb_y == 0)))
+    single = &block[1];   // up
+  else if (sameUR && ((!sameL && !sameU) || (is8x16 && mb_x != 0)))
+    single = &block[2];   // upper right
+  else if (!(block[1].available || block[2].available))
+    single = &block[0];   // median case with neither upper neighbor: use left
+
+  if (single)
+  {
+    if (single->available)
+    {
+      pmv[0] = motion[single->pos_y][single->pos_x].mv[0];
+      pmv[1] = motion[single->pos_y][single->pos_x].mv[1];
+    }
+    else
+    {
+      pmv[0] = 0;
+      pmv[1] = 0;
+    }
+    return;
+  }
+
+  // median of the three neighbors, per component
+  for (hv = 0; hv < 2; hv++)
+  {
+    int mv_a = block[0].available ? motion[block[0].pos_y][block[0].pos_x].mv[hv] : 0;
+    int mv_b = block[1].available ? motion[block[1].pos_y][block[1].pos_x].mv[hv] : 0;
+    int mv_c = block[2].available ? motion[block[2].pos_y][block[2].pos_x].mv[hv] : 0;
+
+    pmv[hv] = mv_a + mv_b + mv_c - imin(mv_a, imin(mv_b, mv_c)) - imax(mv_a, imax(mv_b ,mv_c));
+  }
+}
+
+/* Installs the motion vector predictor on currMB: the MBAFF variant when
+ * mb_aff_frame_flag is non-zero, the normal variant otherwise. */
+void init_motion_vector_prediction(Macroblock *currMB, int mb_aff_frame_flag)
+{
+  currMB->GetMVPredictor = mb_aff_frame_flag ? GetMotionVectorPredictorMBAFF
+                                             : GetMotionVectorPredictorNormal;
+}
--- a/Src/h264dec/lcommon/src/parsetcommon.c
+++ b/Src/h264dec/lcommon/src/parsetcommon.c
@ -0,0 +1,244 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    parsetcommon.c
+ * \brief
+ *    Picture and Sequence Parameter set generation and handling
+ *  \date 25 November 2002
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ *
+ **************************************************************************************
+ */
+
+#include "global.h"
+#include "parsetcommon.h"
+#include "memalloc.h"
+/*!
+ *************************************************************************************
+ * \brief
+ *    Allocates zero-initialized memory for a picture parameter set
+ *
+ * \return
+ *    pointer to a pps; no_mem_exit is called if the allocation fails
+ *************************************************************************************
+ */
+
+pic_parameter_set_rbsp_t *AllocPPS ()
+{
+  pic_parameter_set_rbsp_t *pps = calloc (1, sizeof (pic_parameter_set_rbsp_t));
+
+  if (pps == NULL)
+    no_mem_exit ("AllocPPS: PPS");
+
+  pps->slice_group_id = NULL;
+  return pps;
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Allocates zero-initialized memory for a sequence parameter set
+ *
+ * \return
+ *    pointer to a sps; no_mem_exit is called if the allocation fails
+ *************************************************************************************
+ */
+
+seq_parameter_set_rbsp_t *AllocSPS ()
+{
+  seq_parameter_set_rbsp_t *sps = calloc (1, sizeof (seq_parameter_set_rbsp_t));
+
+  if (sps == NULL)
+    no_mem_exit ("AllocSPS: SPS");
+
+  return sps;
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Frees a picture parameter set together with its slice_group_id array
+ *
+ * \param pps
+ *   Picture parameter set to be freed (must not be NULL)
+ *************************************************************************************
+ */
+
+void FreePPS (pic_parameter_set_rbsp_t *pps)
+{
+  assert (pps != NULL);
+
+  // free() accepts NULL, so no separate check is needed for the array
+  free (pps->slice_group_id);
+  free (pps);
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Frees a sequence parameter set
+ *
+ * \param sps
+ *   Sequence parameter set to be freed (must not be NULL)
+ *************************************************************************************
+ */
+
+void FreeSPS (seq_parameter_set_rbsp_t *sps)
+{
+  assert (sps != NULL);
+
+  free (sps);
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Compares the fields of two sequence parameter sets listed below
+ *
+ * \return
+ *    1 if both sets are Valid and all compared fields match, 0 otherwise.
+ *    Fields that depend on pic_order_cnt_type, frame_mbs_only_flag or
+ *    frame_cropping_flag are compared only when those values select them.
+ *************************************************************************************
+ */
+int sps_is_equal(seq_parameter_set_rbsp_t *sps1, seq_parameter_set_rbsp_t *sps2)
+{
+  unsigned i;
+
+  if (!sps1->Valid || !sps2->Valid)
+    return 0;
+
+  if (sps1->profile_idc != sps2->profile_idc ||
+      sps1->constrained_set0_flag != sps2->constrained_set0_flag ||
+      sps1->constrained_set1_flag != sps2->constrained_set1_flag ||
+      sps1->constrained_set2_flag != sps2->constrained_set2_flag ||
+      sps1->level_idc != sps2->level_idc ||
+      sps1->seq_parameter_set_id != sps2->seq_parameter_set_id ||
+      sps1->log2_max_frame_num_minus4 != sps2->log2_max_frame_num_minus4 ||
+      sps1->pic_order_cnt_type != sps2->pic_order_cnt_type)
+    return 0;
+
+  // Picture order count fields depend on the (now known equal) type
+  if (sps1->pic_order_cnt_type == 0)
+  {
+    if (sps1->log2_max_pic_order_cnt_lsb_minus4 != sps2->log2_max_pic_order_cnt_lsb_minus4)
+      return 0;
+  }
+  else if (sps1->pic_order_cnt_type == 1)
+  {
+    if (sps1->delta_pic_order_always_zero_flag != sps2->delta_pic_order_always_zero_flag ||
+        sps1->offset_for_non_ref_pic != sps2->offset_for_non_ref_pic ||
+        sps1->offset_for_top_to_bottom_field != sps2->offset_for_top_to_bottom_field ||
+        sps1->num_ref_frames_in_pic_order_cnt_cycle != sps2->num_ref_frames_in_pic_order_cnt_cycle)
+      return 0;
+
+    for (i = 0; i < sps1->num_ref_frames_in_pic_order_cnt_cycle; i++)
+    {
+      if (sps1->offset_for_ref_frame[i] != sps2->offset_for_ref_frame[i])
+        return 0;
+    }
+  }
+
+  if (sps1->num_ref_frames != sps2->num_ref_frames ||
+      sps1->gaps_in_frame_num_value_allowed_flag != sps2->gaps_in_frame_num_value_allowed_flag ||
+      sps1->pic_width_in_mbs_minus1 != sps2->pic_width_in_mbs_minus1 ||
+      sps1->pic_height_in_map_units_minus1 != sps2->pic_height_in_map_units_minus1 ||
+      sps1->frame_mbs_only_flag != sps2->frame_mbs_only_flag)
+    return 0;
+
+  if (!sps1->frame_mbs_only_flag &&
+      sps1->mb_adaptive_frame_field_flag != sps2->mb_adaptive_frame_field_flag)
+    return 0;
+
+  if (sps1->direct_8x8_inference_flag != sps2->direct_8x8_inference_flag ||
+      sps1->frame_cropping_flag != sps2->frame_cropping_flag)
+    return 0;
+
+  if (sps1->frame_cropping_flag)
+  {
+    if (sps1->frame_cropping_rect_left_offset != sps2->frame_cropping_rect_left_offset ||
+        sps1->frame_cropping_rect_right_offset != sps2->frame_cropping_rect_right_offset ||
+        sps1->frame_cropping_rect_top_offset != sps2->frame_cropping_rect_top_offset ||
+        sps1->frame_cropping_rect_bottom_offset != sps2->frame_cropping_rect_bottom_offset)
+      return 0;
+  }
+
+  if (sps1->vui_parameters_present_flag != sps2->vui_parameters_present_flag)
+    return 0;
+
+  return 1;
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Compares the fields of two picture parameter sets listed below
+ *
+ * \return
+ *    1 if both sets are Valid and all compared fields match, 0 otherwise.
+ *    Slice group fields are compared only when num_slice_groups_minus1 > 0,
+ *    scaling lists only when pic_scaling_matrix_present_flag is set.
+ *************************************************************************************
+ */
+int pps_is_equal(pic_parameter_set_rbsp_t *pps1, pic_parameter_set_rbsp_t *pps2)
+{
+  unsigned i, j;
+
+  if (!pps1->Valid || !pps2->Valid)
+    return 0;
+
+  if (pps1->pic_parameter_set_id != pps2->pic_parameter_set_id ||
+      pps1->seq_parameter_set_id != pps2->seq_parameter_set_id ||
+      pps1->entropy_coding_mode_flag != pps2->entropy_coding_mode_flag ||
+      pps1->bottom_field_pic_order_in_frame_present_flag != pps2->bottom_field_pic_order_in_frame_present_flag ||
+      pps1->num_slice_groups_minus1 != pps2->num_slice_groups_minus1)
+    return 0;
+
+  if (pps1->num_slice_groups_minus1 > 0)
+  {
+    if (pps1->slice_group_map_type != pps2->slice_group_map_type)
+      return 0;
+
+    switch (pps1->slice_group_map_type)
+    {
+    case 0:
+      for (i = 0; i <= pps1->num_slice_groups_minus1; i++)
+      {
+        if (pps1->run_length_minus1[i] != pps2->run_length_minus1[i])
+          return 0;
+      }
+      break;
+    case 2:
+      for (i = 0; i < pps1->num_slice_groups_minus1; i++)
+      {
+        if (pps1->top_left[i] != pps2->top_left[i] ||
+            pps1->bottom_right[i] != pps2->bottom_right[i])
+          return 0;
+      }
+      break;
+    case 3:
+    case 4:
+    case 5:
+      if (pps1->slice_group_change_direction_flag != pps2->slice_group_change_direction_flag ||
+          pps1->slice_group_change_rate_minus1 != pps2->slice_group_change_rate_minus1)
+        return 0;
+      break;
+    case 6:
+      if (pps1->pic_size_in_map_units_minus1 != pps2->pic_size_in_map_units_minus1)
+        return 0;
+      for (i = 0; i <= pps1->pic_size_in_map_units_minus1; i++)
+      {
+        if (pps1->slice_group_id[i] != pps2->slice_group_id[i])
+          return 0;
+      }
+      break;
+    default:
+      // other map types carry no additional fields to compare
+      break;
+    }
+  }
+
+  if (pps1->num_ref_idx_l0_active_minus1 != pps2->num_ref_idx_l0_active_minus1 ||
+      pps1->num_ref_idx_l1_active_minus1 != pps2->num_ref_idx_l1_active_minus1 ||
+      pps1->weighted_pred_flag != pps2->weighted_pred_flag ||
+      pps1->weighted_bipred_idc != pps2->weighted_bipred_idc ||
+      pps1->pic_init_qp_minus26 != pps2->pic_init_qp_minus26 ||
+      pps1->pic_init_qs_minus26 != pps2->pic_init_qs_minus26 ||
+      pps1->chroma_qp_index_offset != pps2->chroma_qp_index_offset ||
+      pps1->deblocking_filter_control_present_flag != pps2->deblocking_filter_control_present_flag ||
+      pps1->constrained_intra_pred_flag != pps2->constrained_intra_pred_flag ||
+      pps1->redundant_pic_cnt_present_flag != pps2->redundant_pic_cnt_present_flag)
+    return 0;
+
+  //Fidelity Range Extensions Stuff
+  //It is initialized to zero, so should be ok to check all the time.
+  if (pps1->transform_8x8_mode_flag != pps2->transform_8x8_mode_flag ||
+      pps1->pic_scaling_matrix_present_flag != pps2->pic_scaling_matrix_present_flag)
+    return 0;
+
+  if (pps1->pic_scaling_matrix_present_flag)
+  {
+    const unsigned list_count = 6 + ((unsigned)pps1->transform_8x8_mode_flag << 1);
+
+    for (i = 0; i < list_count; i++)
+    {
+      if (pps1->pic_scaling_list_present_flag[i] != pps2->pic_scaling_list_present_flag[i])
+        return 0;
+
+      if (!pps1->pic_scaling_list_present_flag[i])
+        continue;
+
+      if (i < 6)
+      {
+        for (j = 0; j < 16; j++)
+        {
+          if (pps1->ScalingList4x4[i][j] != pps2->ScalingList4x4[i][j])
+            return 0;
+        }
+      }
+      else
+      {
+        for (j = 0; j < 64; j++)
+        {
+          if (pps1->ScalingList8x8[i-6][j] != pps2->ScalingList8x8[i-6][j])
+            return 0;
+        }
+      }
+    }
+  }
+
+  if (pps1->second_chroma_qp_index_offset != pps2->second_chroma_qp_index_offset)
+    return 0;
+
+  return 1;
+}
--- a/Src/h264dec/lcommon/src/transform.c
+++ b/Src/h264dec/lcommon/src/transform.c
@ -0,0 +1,809 @@
+/*!
+***************************************************************************
+* \file transform.c
+*
+* \brief
+*    Transform functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Alexis Michael Tourapis
+* \date
+*    01. July 2007
+**************************************************************************
+*/
+#include "global.h"
+#include "transform.h"
+#include <emmintrin.h>
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Forward 4x4 integer transform of the block whose top-left sample is
+ *    block[pos_y][pos_x]. The result is stored at the same position of
+ *    tblock.
+ *
+ * \param block   source rows (only read)
+ * \param tblock  destination rows; all source samples are copied into a
+ *                local buffer before the first store to tblock
+ * \param pos_y   first row of the 4x4 block
+ * \param pos_x   first column of the 4x4 block
+ *
+ * \note
+ *    The factor of two is written as a multiplication. The differences
+ *    are frequently negative, and left-shifting a negative int is
+ *    undefined behaviour in C.
+ ************************************************************************
+ */
+void forward4x4(int **block, int **tblock, int pos_y, int pos_x)
+{
+	int tmp[16];
+	int *out = tmp;
+	int i;
+
+	// Horizontal: each source row becomes one row of tmp
+	for (i = pos_y; i < pos_y + BLOCK_SIZE; i++)
+	{
+		const int *src   = &block[i][pos_x];
+		const int sum03  = src[0] + src[3];
+		const int sum12  = src[1] + src[2];
+		const int diff12 = src[1] - src[2];
+		const int diff03 = src[0] - src[3];
+
+		*(out++) = sum03 + sum12;
+		*(out++) = 2 * diff03 + diff12;
+		*(out++) = sum03 - sum12;
+		*(out++) = diff03 - 2 * diff12;
+	}
+
+	// Vertical: each column of tmp becomes one column of the output
+	for (i = 0; i < BLOCK_SIZE; i++)
+	{
+		const int *col   = tmp + i;
+		const int c0     = col[0];
+		const int c1     = col[BLOCK_SIZE];
+		const int c2     = col[2 * BLOCK_SIZE];
+		const int c3     = col[3 * BLOCK_SIZE];
+		const int sum03  = c0 + c3;
+		const int sum12  = c1 + c2;
+		const int diff12 = c1 - c2;
+		const int diff03 = c0 - c3;
+		const int x      = pos_x + i;
+
+		tblock[pos_y    ][x] = sum03 + sum12;
+		tblock[pos_y + 1][x] = diff12 + 2 * diff03;
+		tblock[pos_y + 2][x] = sum03 - sum12;
+		tblock[pos_y + 3][x] = diff03 - 2 * diff12;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Inverse 4x4 integer transform.
+ *
+ *    Reads the four rows of tblock, runs the row butterflies into a local
+ *    16-entry buffer of shorts, then runs the column butterflies from that
+ *    buffer into block. Because every coefficient is read before the first
+ *    store, block may refer to the same storage as tblock.
+ *
+ * \param tblock  input coefficients, indexed [row][column]
+ * \param block   output samples, indexed [row][column]
+ * \param pos_y   not referenced by this function
+ * \param pos_x   not referenced by this function
+ *
+ * \note
+ *    The intermediate buffer is of type short, so first-pass results are
+ *    converted to short before the second pass.
+ *    NOTE(review): the row pass writes four entries per row, which matches
+ *    the column stride only if BLOCK_SIZE is 4 - confirm.
+ ************************************************************************
+ */
+static void inverse4x4(const h264_short_block_t tblock, h264_short_block_t block, int pos_y, int pos_x)
+{
+	short rows[16];
+	int k;
+
+	// Horizontal
+	for (k = 0; k < BLOCK_SIZE; k++)
+	{
+		const int c0 = tblock[k][0];
+		const int c1 = tblock[k][1];
+		const int c2 = tblock[k][2];
+		const int c3 = tblock[k][3];
+
+		const int even_sum  = c0 + c2;
+		const int even_diff = c0 - c2;
+		const int odd_diff  = (c1 >> 1) - c3;
+		const int odd_sum   = c1 + (c3 >> 1);
+
+		short *out = rows + 4 * k;
+
+		out[0] = even_sum  + odd_sum;
+		out[1] = even_diff + odd_diff;
+		out[2] = even_diff - odd_diff;
+		out[3] = even_sum  - odd_sum;
+	}
+
+	// Vertical
+	for (k = 0; k < BLOCK_SIZE; k++)
+	{
+		const int c0 = rows[k];
+		const int c1 = rows[k + BLOCK_SIZE];
+		const int c2 = rows[k + 2 * BLOCK_SIZE];
+		const int c3 = rows[k + 3 * BLOCK_SIZE];
+
+		const int even_sum  = c0 + c2;
+		const int even_diff = c0 - c2;
+		const int odd_diff  = (c1 >> 1) - c3;
+		const int odd_sum   = c1 + (c3 >> 1);
+
+		block[0][k] = even_sum  + odd_sum;
+		block[1][k] = even_diff + odd_diff;
+		block[2][k] = even_diff - odd_diff;
+		block[3][k] = even_sum  - odd_sum;
+	}
+}
+
+#ifdef _M_IX86
+// benski> this exists just for conformance testing. not used in production code
+/*
+ * Inline-assembly version of the 4x4 inverse transform for 32-bit MSVC builds.
+ *
+ * The 4x4 block of 16-bit coefficients is addressed inside a buffer whose rows
+ * are 16 shorts (32 bytes) apart: byte offset = (pos_y * 16 + pos_x) * 2.
+ * The same offset is applied to both tblock (input) and block (output).
+ *
+ * Steps: load four rows, transpose, butterfly, transpose again, butterfly,
+ * store four rows. All arithmetic is done on packed 16-bit words.
+ *
+ * Despite the _sse2 name, only the 64-bit mm0-mm6 registers are used.
+ * NOTE(review): no EMMS is executed in this function; presumably the caller
+ * is expected to issue it before any x87 floating point code - confirm.
+ */
+static void inverse4x4_sse2_x86(const h264_short_macroblock_t tblock, h264_short_macroblock_t block, int pos_y, int pos_x)
+{
+		__asm
+		{
+			mov edx, pos_y
+			shl edx, 4 // 16 step stride
+			add edx, pos_x
+			shl edx, 1 // * sizeof(short)
+
+			// eax: pointer to the start of tblock (offset by passed pos_y, pos_x)
+			mov eax, edx
+			add eax, tblock
+
+			// esi: results
+			mov esi, edx
+			add esi, block
+
+			// load 4x4 matrix
+			movq mm0, MMWORD PTR 0[eax]
+			movq mm1, MMWORD PTR 32[eax]
+			movq mm2, MMWORD PTR 64[eax]
+			movq mm3, MMWORD PTR 96[eax]
+
+			// rotate 4x4 matrix
+			movq mm4, mm0 // p0 = mm4 (copy)
+			punpcklwd mm0, mm2 // r0 = mm0
+			punpckhwd mm4, mm2 // r2 = mm4
+			movq mm5, mm1 // p1 = mm5 (copy)
+			punpcklwd mm1, mm3 // r1 = mm1
+			punpckhwd mm5, mm3 // r3 = mm5
+			movq mm6, mm0 // r0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // r2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+
+			/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+
+			// rotate 4x4 matrix to set up for vertical
+			movq mm4, mm0 // r0 = mm4 (copy)
+			punpcklwd mm0, mm2 // p0 = mm0
+			punpckhwd mm4, mm2 // p2 = mm4
+			movq mm5, mm1 // r1 = mm5 (copy)
+			punpcklwd mm1, mm3 // p1 = mm1
+			punpckhwd mm5, mm3 // p3 = mm5
+			movq mm6, mm0 // p0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // p2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+					/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+			// store the four result rows, 32 bytes apart, at the output position
+			// NOTE(review): the size override below is spelled XMMWORD although the
+			// source operand is a 64-bit mm register (the loads above use MMWORD)
+			movq XMMWORD PTR 0[esi], mm0
+			movq XMMWORD PTR 32[esi], mm1
+			movq XMMWORD PTR 64[esi], mm2
+			movq XMMWORD PTR 96[esi], mm3
+		}
+}
+#endif
+
+/*
+ * Combines a 4x4 residual block with the prediction and writes the
+ * reconstructed samples.
+ *
+ *   curImg            destination; rows joff..joff+3, columns mb_x..mb_x+3 are written
+ *   mpr               prediction; the same rows/columns are read
+ *   tblock            4x4 residual (output of the inverse transform), indexed from [0][0]
+ *   joff, mb_x        row / column offset of the block inside curImg and mpr
+ *   max_imgpel_value  upper clip bound; only the C path references it
+ *
+ * _M_IX86 path: inline assembly on mm registers. It computes (residual + 32) >> 6
+ * on packed 16-bit words, adds the zero-extended prediction bytes with signed
+ * saturation and packs back to bytes with unsigned saturation, so the result is
+ * limited to 0..255 and max_imgpel_value is not consulted. The address
+ * arithmetic uses a 16-byte row stride for curImg/mpr and an 8-byte row stride
+ * for tblock.
+ * NOTE(review): no EMMS is executed here; presumably handled by the caller - confirm.
+ *
+ * Other builds: per sample, rshift_rnd_sf(residual, DQ_BITS) plus the prediction
+ * is passed through iClip1(max_imgpel_value, ...).
+ * NOTE(review): presumably DQ_BITS is 6 so that both paths agree - confirm.
+ */
+static void sample_reconstruct(h264_imgpel_macroblock_t curImg, const h264_imgpel_macroblock_t mpr, const h264_short_block_t tblock, int joff, int mb_x, int max_imgpel_value)
+{
+	#ifdef _M_IX86
+		__asm
+		{
+			// mm0 : constant value 32
+			mov edx, 0x00200020
+			movd mm0, edx
+			punpckldq	mm0, mm0
+
+			// ecx: y offset
+			mov ecx, joff
+			shl ecx, 4 // imgpel stuff is going to be 16 byte stride
+			add ecx, mb_x
+
+			// eax: curImg
+			mov eax, curImg
+			add eax, ecx
+
+			// edx: mpr
+			mov edx, mpr
+			add edx, ecx
+
+			// ecx: tblock (which is short, not byte)
+			mov ecx, tblock
+			
+			// mm7: zero
+			pxor mm7, mm7
+
+			// load coefficients
+			movq	mm1, MMWORD PTR 0[ecx]
+			movq	mm2, MMWORD PTR 8[ecx]
+			movq	mm3, MMWORD PTR 16[ecx]
+			movq	mm4, MMWORD PTR 24[ecx]
+			paddw mm1, mm0 // rres + 32
+			paddw mm2, mm0 // rres + 32
+			paddw mm3, mm0 // rres + 32
+			paddw mm0, mm4 // rres + 32
+			psraw mm1, 6 // (rres + 32) >> 6
+			psraw mm2, 6 // (rres + 32) >> 6
+			psraw mm3, 6 // (rres + 32) >> 6
+			psraw mm0, 6 // (rres + 32) >> 6
+			// mm1-mm3: tblock[0] - tblock[2], mm0: tblock[3]
+
+			// convert mpr from unsigned char to short
+			movd mm4, DWORD PTR 0[edx]
+			movd mm5, DWORD PTR 16[edx]
+			movd mm6, DWORD PTR 32[edx]
+			punpcklbw mm4, mm7
+			punpcklbw mm5, mm7
+			punpcklbw mm6, mm7
+			paddsw mm4, mm1 // pred_row + rres_row
+			movd mm1, DWORD PTR 48[edx] // reuse mm1 for mpr[3]
+			paddsw mm5, mm2 // pred_row + rres_row
+			punpcklbw mm1, mm7
+			paddsw mm6, mm3 // pred_row + rres_row			
+			paddsw mm1, mm0 // pred_row + rres_row
+			// results in mm4, mm5, mm6, mm1
+			
+			// move back to 8 bit
+			packuswb mm4, mm7
+			packuswb mm5, mm7
+			packuswb mm6, mm7
+			packuswb mm1, mm7
+			movd DWORD PTR 0[eax], mm4
+			movd DWORD PTR 16[eax], mm5
+			movd DWORD PTR 32[eax], mm6
+			movd DWORD PTR 48[eax], mm1
+		}
+#else
+  int i, j;
+
+  // portable fallback: one sample at a time
+  for (j = 0; j < BLOCK_SIZE; j++)
+  {
+    for (i=0;i<BLOCK_SIZE;i++)
+      curImg[j+joff][mb_x+i] = (imgpel) iClip1( max_imgpel_value, rshift_rnd_sf(tblock[j][i], DQ_BITS) + mpr[j+joff][mb_x+i]);
+  }
+#endif
+}
+
+#if defined(_M_IX86) && defined(_DEBUG)
+/*
+ * Inline-assembly 4x4 inverse transform plus reconstruction, compiled only for
+ * 32-bit MSVC debug builds.
+ *
+ *   tblock    16-bit coefficients; rows are addressed 32 bytes apart
+ *   mb_pred   prediction bytes; rows are addressed 16 bytes apart
+ *   mb_rec    destination bytes; rows are addressed 16 bytes apart
+ *   pos_x     column of the 4x4 block (note: x comes before y in this signature)
+ *   pos_y     row of the 4x4 block
+ *
+ * The element offset pos_y * 16 + pos_x is applied to all three buffers
+ * (doubled for tblock). After two transpose + butterfly passes on packed
+ * 16-bit words, each result row is rounded with (x + 32) >> 6, added to the
+ * zero-extended prediction with signed saturation and packed to bytes with
+ * unsigned saturation.
+ *
+ * Despite the _sse2 name, only the 64-bit mm0-mm7 registers are used.
+ * NOTE(review): no EMMS is executed in this function; presumably the caller
+ * is expected to issue it - confirm.
+ */
+void itrans4x4_sse2(const h264_short_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	__declspec(align(32)) static const short const32[4] = {32, 32, 32, 32};
+		__asm
+		{
+			mov edx, pos_y
+			shl edx, 4 // imgpel stuff is going to be 16 byte stride
+			add edx, pos_x
+
+			// eax: tblock
+			lea eax, [edx*2]
+			add eax, tblock
+
+			// ecx: mpr
+			mov ecx, mb_pred
+			add ecx, edx
+
+			// edx: results
+			add edx, mb_rec
+
+			// load 4x4 matrix
+			movq mm0, MMWORD PTR 0[eax]
+			movq mm1, MMWORD PTR 32[eax]
+			movq mm2, MMWORD PTR 64[eax]
+			movq mm3, MMWORD PTR 96[eax]
+
+			// rotate 4x4 matrix
+			movq mm4, mm0 // p0 = mm4 (copy)
+			punpcklwd mm0, mm2 // r0 = mm0
+			punpckhwd mm4, mm2 // r2 = mm4
+			movq mm5, mm1 // p1 = mm5 (copy)
+			punpcklwd mm1, mm3 // r1 = mm1
+			punpckhwd mm5, mm3 // r3 = mm5
+			movq mm6, mm0 // r0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // r2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+
+			/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+
+			// rotate 4x4 matrix to set up for vertical
+			movq mm4, mm0 // r0 = mm4 (copy)
+			punpcklwd mm0, mm2 // p0 = mm0
+			punpckhwd mm4, mm2 // p2 = mm4
+			movq mm5, mm1 // r1 = mm5 (copy)
+			punpcklwd mm1, mm3 // p1 = mm1
+			punpckhwd mm5, mm3 // p3 = mm5
+			movq mm6, mm0 // p0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // p2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+					/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+/* --- 4x4 iDCT done, now time to combine with mpr --- */
+			// mm7 : constant value 32 (mm0-mm3 still hold the result rows)
+			movq	mm7, const32
+
+			paddw mm0, mm7 // rres + 32
+			psraw mm0, 6 // (rres + 32) >> 6
+			paddw mm1, mm7 // rres + 32
+			psraw mm1, 6 // (rres + 32) >> 6
+			paddw mm2, mm7 // rres + 32
+			psraw mm2, 6 // (rres + 32) >> 6
+			paddw mm3, mm7 // rres + 32
+			psraw mm3, 6 // (rres + 32) >> 6
+
+			// mm7: zero, used to widen bytes and as the upper half when packing
+			pxor mm7, mm7
+
+			// convert mpr from unsigned char to short
+			movd mm4, DWORD PTR 0[ecx]
+			movd mm5, DWORD PTR 16[ecx]
+			movd mm6, DWORD PTR 32[ecx]
+			punpcklbw mm4, mm7
+			punpcklbw mm5, mm7
+			punpcklbw mm6, mm7
+			paddsw mm4, mm0 // pred_row + rres_row
+			movd mm0, DWORD PTR 48[ecx] // reuse mm0 for mpr[3]
+			paddsw mm5, mm1 // pred_row + rres_row
+			punpcklbw mm0, mm7
+			paddsw mm6, mm2 // pred_row + rres_row			
+			paddsw mm0, mm3 // pred_row + rres_row
+			// results in mm4, mm5, mm6, mm0
+			
+			// move back to 8 bit
+			packuswb mm4, mm7
+			packuswb mm5, mm7
+			packuswb mm6, mm7
+			packuswb mm0, mm7
+			movd DWORD PTR 0[edx], mm4
+			movd DWORD PTR 16[edx], mm5
+			movd DWORD PTR 32[edx], mm6
+			movd DWORD PTR 48[edx], mm0
+		}
+}
+#elif defined(_M_X64)
+/*!
+ ************************************************************************
+ * \brief
+ *    SSE2 intrinsic version of the 4x4 inverse transform plus
+ *    reconstruction, compiled for x64 builds.
+ *
+ *    Four 32-bit coefficients are loaded from each of the rows
+ *    pos_y..pos_y+3 of tblock starting at column pos_x. The block is
+ *    transposed, run through the butterflies, transposed again and run
+ *    through the butterflies a second time. Each result is rounded with
+ *    (x + 32) >> 6, narrowed to 16 bits with signed saturation, added to
+ *    the zero-extended prediction bytes and packed to bytes with unsigned
+ *    saturation before being stored.
+ *
+ * \param tblock   coefficients, read at [pos_y..pos_y+3][pos_x..pos_x+3]
+ * \param mb_pred  prediction, four bytes read per row at the same position
+ * \param mb_rec   destination, four bytes written per row at the same position
+ * \param pos_x    column of the 4x4 block (x comes before y in this signature)
+ * \param pos_y    row of the 4x4 block
+ *
+ * \note
+ *    The prediction is added with a saturating 16-bit add. The preceding
+ *    pack can yield values up to 32767, and a wrapping add would turn
+ *    32767 + prediction into a negative number that is then stored as 0
+ *    instead of 255.
+ ************************************************************************
+ */
+static void itrans4x4_sse2(const h264_int_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	__declspec(align(32)) static const int const32[4] = {32, 32, 32, 32};
+	__m128i p0, p1, p2, p3;
+	__m128i t0, t1, t2, t3;
+	__m128i r0, r1, r2, r3;
+	__m128i c32, zero;
+
+	// horizontal
+	// load registers in vertical mode, we'll rotate them next
+	p0 = _mm_loadu_si128((__m128i *)&tblock[pos_y][pos_x]);   // 00 01 02 03
+	p1 = _mm_loadu_si128((__m128i *)&tblock[pos_y+1][pos_x]); // 10 11 12 13
+	p2 = _mm_loadu_si128((__m128i *)&tblock[pos_y+2][pos_x]); // 20 21 22 23
+	p3 = _mm_loadu_si128((__m128i *)&tblock[pos_y+3][pos_x]); // 30 31 32 33
+
+	// rotate 4x4 matrix
+	r0 = _mm_unpacklo_epi32(p0, p2); // 00 20 01 21
+	r1 = _mm_unpacklo_epi32(p1, p3); // 10 30 11 31
+	r2 = _mm_unpackhi_epi32(p0, p2); // 02 22 03 23
+	r3 = _mm_unpackhi_epi32(p1, p3); // 12 32 13 33
+	t0 = _mm_unpacklo_epi32(r0, r1); // 00 10 20 30
+	t1 = _mm_unpackhi_epi32(r0, r1); // 01 11 21 31
+	t2 = _mm_unpacklo_epi32(r2, r3); // 02 12 22 32
+	t3 = _mm_unpackhi_epi32(r2, r3); // 03 13 23 33
+
+	p0 = _mm_add_epi32(t0, t2);  // t0 + t2
+	p1 = _mm_sub_epi32(t0, t2);  // t0 - t2
+	p2 = _mm_srai_epi32(t1, 1);  // t1 >> 1
+	p2 = _mm_sub_epi32(p2, t3);  // (t1 >> 1) - t3
+	p3 = _mm_srai_epi32(t3, 1);  // t3 >> 1
+	p3 = _mm_add_epi32(p3, t1);  // t1 + (t3 >> 1)
+
+	t0 = _mm_add_epi32(p0, p3);  // p0 + p3
+	t1 = _mm_add_epi32(p1, p2);  // p1 + p2
+	t2 = _mm_sub_epi32(p1, p2);  // p1 - p2
+	t3 = _mm_sub_epi32(p0, p3);  // p0 - p3
+
+	// rotate 4x4 matrix to set up for vertical
+	r0 = _mm_unpacklo_epi32(t0, t2);
+	r1 = _mm_unpacklo_epi32(t1, t3);
+	r2 = _mm_unpackhi_epi32(t0, t2);
+	r3 = _mm_unpackhi_epi32(t1, t3);
+	t0 = _mm_unpacklo_epi32(r0, r1);
+	t1 = _mm_unpackhi_epi32(r0, r1);
+	t2 = _mm_unpacklo_epi32(r2, r3);
+	t3 = _mm_unpackhi_epi32(r2, r3);
+
+	// vertical
+	p0 = _mm_add_epi32(t0, t2);  // t0 + t2
+	p3 = _mm_srai_epi32(t3, 1);  // t3 >> 1
+	p3 = _mm_add_epi32(p3, t1);  // t1 + (t3 >> 1)
+	r0 = _mm_add_epi32(p0, p3);  // p0 + p3
+	r3 = _mm_sub_epi32(p0, p3);  // p0 - p3
+	p1 = _mm_sub_epi32(t0, t2);  // t0 - t2
+	p2 = _mm_srai_epi32(t1, 1);  // t1 >> 1
+	p2 = _mm_sub_epi32(p2, t3);  // (t1 >> 1) - t3
+	r1 = _mm_add_epi32(p1, p2);  // p1 + p2
+	r2 = _mm_sub_epi32(p1, p2);  // p1 - p2
+
+	c32 = _mm_load_si128((const __m128i *)const32);
+	zero = _mm_setzero_si128();
+
+	// (x + 32) >> 6
+	r0 = _mm_add_epi32(r0, c32);
+	r0 = _mm_srai_epi32(r0, 6);
+	r1 = _mm_add_epi32(r1, c32);
+	r1 = _mm_srai_epi32(r1, 6);
+	r2 = _mm_add_epi32(r2, c32);
+	r2 = _mm_srai_epi32(r2, 6);
+	r3 = _mm_add_epi32(r3, c32);
+	r3 = _mm_srai_epi32(r3, 6);
+
+	// convert to 16bit values (signed saturation): r0 = rows 0,1  r2 = rows 2,3
+	r0 = _mm_packs_epi32(r0, r1);
+	r2 = _mm_packs_epi32(r2, r3);
+
+	// convert mpr from unsigned char to short and add it to rows 0,1
+	p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y][pos_x]);
+	p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+1][pos_x]);
+	p0 = _mm_unpacklo_epi32(p0, p1);
+	p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+	r0 = _mm_adds_epi16(r0, p0);      // saturating: must not wrap before the final pack
+
+	// same for rows 2,3
+	p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+2][pos_x]);
+	p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+3][pos_x]);
+	p0 = _mm_unpacklo_epi32(p0, p1);
+	p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+	r2 = _mm_adds_epi16(r2, p0);      // saturating: must not wrap before the final pack
+
+	r0 = _mm_packus_epi16(r0, r2); // convert to unsigned char
+	*(int32_t *)&mb_rec[pos_y][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+1][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+2][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+3][pos_x] = _mm_cvtsi128_si32(r0);
+}
+#endif
+
+/*!
+ ************************************************************************
+ * \brief
+ *    4x4 inverse transform followed by reconstruction.
+ *
+ *    inverse4x4() is run with tblock as both input and output, so the
+ *    coefficients are overwritten even though the parameter is declared
+ *    const. sample_reconstruct() then combines that result with mb_pred
+ *    at row pos_y / column pos_x, writes it to mb_rec and is given 255 as
+ *    the maximum sample value.
+ *
+ * \param tblock   4x4 coefficients on entry; overwritten by inverse4x4()
+ * \param mb_pred  prediction samples
+ * \param mb_rec   reconstructed samples (output)
+ * \param pos_x    column of the block inside mb_pred / mb_rec
+ * \param pos_y    row of the block inside mb_pred / mb_rec
+ ************************************************************************
+ */
+void itrans4x4_c(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	// writable view of the same storage: the transform is done in place
+	h264_short_block_row_t *in_place = (h264_short_block_row_t *)tblock;
+
+	inverse4x4(tblock, in_place, pos_y, pos_x);
+	sample_reconstruct(mb_rec, mb_pred, tblock, pos_y, pos_x, 255);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    In-place 4x4 inverse Hadamard transform.
+ *
+ *    The rows of block are transformed into a local buffer first; the
+ *    columns of that buffer are then transformed back into block.
+ *
+ * \param block  4x4 values, indexed [row][column]; overwritten with the result
+ ************************************************************************
+ */
+void ihadamard4x4(int block[4][4])
+{
+	int tmp[16];
+	int k;
+
+	// Horizontal
+	for (k = 0; k < BLOCK_SIZE; k++)
+	{
+		const int *in = block[k];
+		int *out = tmp + 4 * k;
+
+		const int even_sum  = in[0] + in[2];
+		const int even_diff = in[0] - in[2];
+		const int odd_diff  = in[1] - in[3];
+		const int odd_sum   = in[1] + in[3];
+
+		out[0] = even_sum  + odd_sum;
+		out[1] = even_diff + odd_diff;
+		out[2] = even_diff - odd_diff;
+		out[3] = even_sum  - odd_sum;
+	}
+
+	// Vertical
+	for (k = 0; k < BLOCK_SIZE; k++)
+	{
+		const int c0 = tmp[k];
+		const int c1 = tmp[k + BLOCK_SIZE];
+		const int c2 = tmp[k + 2 * BLOCK_SIZE];
+		const int c3 = tmp[k + 3 * BLOCK_SIZE];
+
+		const int even_sum  = c0 + c2;
+		const int even_diff = c0 - c2;
+		const int odd_diff  = c1 - c3;
+		const int odd_sum   = c1 + c3;
+
+		block[0][k] = even_sum  + odd_sum;
+		block[1][k] = even_diff + odd_diff;
+		block[2][k] = even_diff - odd_diff;
+		block[3][k] = even_sum  - odd_sum;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Inverse Hadamard transform of a 2-row by 4-column input, written
+ *    transposed as 4 rows by 2 columns.
+ *
+ * \param tblock  input; tblock[0][0..3] and tblock[1][0..3] are read
+ * \param block   output; block[0..3][0..1] are written
+ ************************************************************************
+ */
+void ihadamard4x2(int **tblock, int **block)
+{
+	int tmp[8];
+	int i;
+
+	// Horizontal: column sums go to tmp[0..3], column differences to tmp[4..7]
+	for (i = 0; i < 4; i++)
+		tmp[i] = tblock[0][i] + tblock[1][i];
+	for (i = 0; i < 4; i++)
+		tmp[4 + i] = tblock[0][i] - tblock[1][i];
+
+	// Vertical: each group of four becomes one output column
+	for (i = 0; i < 2; i++)
+	{
+		const int *grp = tmp + 4 * i;
+
+		const int even_sum  = grp[0] + grp[2];
+		const int even_diff = grp[0] - grp[2];
+		const int odd_diff  = grp[1] - grp[3];
+		const int odd_sum   = grp[1] + grp[3];
+
+		// coefficients (transposed)
+		block[0][i] = even_sum  + odd_sum;
+		block[1][i] = even_diff + odd_diff;
+		block[2][i] = even_diff - odd_diff;
+		block[3][i] = even_sum  - odd_sum;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    2x2 inverse Hadamard transform on four values, done with
+ *    8 additions/subtractions and 8 assignments.
+ *
+ *    All four inputs are read before the first output is stored, so block
+ *    may be the same array as tblock.
+ *
+ * \param tblock  four input values
+ * \param block   four output values
+ ************************************************************************
+ */
+void ihadamard2x2(int tblock[4], int block[4])
+{
+	const int sum01  = tblock[0] + tblock[1];
+	const int diff01 = tblock[0] - tblock[1];
+	const int sum23  = tblock[2] + tblock[3];
+	const int diff23 = tblock[2] - tblock[3];
+
+	block[0] = sum01  + sum23;
+	block[1] = diff01 + diff23;
+	block[2] = sum01  - sum23;
+	block[3] = diff01 - diff23;
+}
+
--- a/Src/h264dec/lcommon/src/win32.c
+++ b/Src/h264dec/lcommon/src/win32.c
@ -0,0 +1,67 @@
+
+/*!
+ *************************************************************************************
+ * \file win32.c
+ *
+ * \brief
+ *    Platform dependent code
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Suehring                  <suehring@hhi.de>
+ *************************************************************************************
+ */
+
+#include "global.h"
+
+
+#ifdef _WIN32
+
+// Performance-counter frequency. Filled in by timenorm() on its first call
+// (via QueryPerformanceFrequency) and used there as the divisor.
+static LARGE_INTEGER freq;
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Store the current performance-counter value in *time.
+ *
+ * \param time  receives the counter value; passed straight to
+ *              QueryPerformanceCounter, whose return value is not checked
+ ************************************************************************
+ */
+void gettime(TIME_T* time)
+{
+  QueryPerformanceCounter(time);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Difference between two counter readings, in counter ticks.
+ *
+ * \param start  earlier reading
+ * \param end    later reading
+ * \return
+ *    end->QuadPart - start->QuadPart
+ ************************************************************************
+ */
+int64 timediff(TIME_T* start, TIME_T* end)
+{
+  const int64 ticks = (int64)(end->QuadPart - start->QuadPart);
+
+  return ticks;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Scale a tick count by 1000 / counter frequency.
+ *
+ *    The frequency is queried once, on the first call, and kept in the
+ *    file-scope variable freq.
+ *
+ * \param cur_time  tick count, e.g. a value returned by timediff()
+ * \return
+ *    cur_time * 1000 / freq.QuadPart
+ ************************************************************************
+ */
+int64 timenorm(int64  cur_time)
+{
+  static int freq_known = 0;
+
+  if (!freq_known)
+  {
+    QueryPerformanceFrequency(&freq);
+    freq_known = 1;
+  }
+
+  return (int64)((cur_time * 1000) / freq.QuadPart);
+}
+
+#else
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Store the current time of day in *time.
+ *
+ * \param time  receives the result of gettimeofday(); the return value of
+ *              gettimeofday() is not checked
+ *
+ * \note
+ *    The timezone argument is passed as NULL. POSIX leaves the behaviour
+ *    unspecified for a non-null pointer, and nothing in this file reads
+ *    the timezone.
+ ************************************************************************
+ */
+void gettime(TIME_T* time)
+{
+  gettimeofday(time, NULL);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Difference between two time stamps, in microseconds.
+ *
+ * \param start  earlier time stamp
+ * \param end    later time stamp
+ * \return
+ *    (end - start) as seconds * 1000000 + microseconds
+ *
+ * \note
+ *    The field differences are computed directly in int64, so they are
+ *    not narrowed to int before the multiplication.
+ ************************************************************************
+ */
+int64 timediff(TIME_T* start, TIME_T* end)
+{
+  const int64 sec  = (int64) end->tv_sec  - (int64) start->tv_sec;
+  const int64 usec = (int64) end->tv_usec - (int64) start->tv_usec;
+
+  return usec + sec * (int64) 1000000;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Divide a time value by 1000 (integer division).
+ *
+ * \param cur_time  value to scale, e.g. a microsecond count returned by
+ *                  timediff()
+ * \return
+ *    cur_time / 1000
+ ************************************************************************
+ */
+int64 timenorm(int64 cur_time)
+{
+  const int64 divisor = (int64) 1000;
+
+  return (int64)(cur_time / divisor);
+}
+#endif