/***************************************************************************
                             th-yuvscale.c
                             -------------
    begin                : Wed Jan 12 2005
    copyright            : (C) 2000 Red Hat, Inc. (original pixops)
    copyright            : (C) 2005 by Tim-Philipp Müller (pixops modifications)
    email                : t.i.m@orange.net
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This library is free software; you can redistribute it and/or         *
 *   modify it under the terms of the GNU Library General Public           *
 *   License as published by the Free Software Foundation; either          *
 *   version 2 of the License, or (at your option) any later version.      *
 *                                                                         *
 *   This library is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     *
 *   Library General Public License for more details.                      *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this library; if not, write to the                 *
 *   Free Software Foundation, Inc., 59 Temple Place - Suite 330,          *
 *   Boston, MA 02111-1307, USA.                                           *
 *                                                                         *
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   GStreamer plugin that scales our yuv picture with a bilinear          *
 *   scaling algorithm. The scaling algorithm is based on the              *
 *   GdkPixbuf code.                                                       *
 *                                                                         *
 *   The pixops code has been taken from gdk-pixbuf and adjusted for our   *
 *   purposes (downscaling, 1 channel, dest_x = 0, dest_y = 0, no alpha).  *
 *                                                                         *
 *   Note: does nothing with the pixel-aspect-ratio (not relevant for us   *
 *    anyway, as scale_x always equals scale_y for thoggen).               *
 *                                                                         *
 ***************************************************************************/

#if 0
 	TH_PICTURE_SIZE_XSMALL = 0,   /* 1/8, 0.125, dim.n = 9   */
	TH_PICTURE_SIZE_SMALL,        /* 1/4, 0.250, dim.n = 5   */
	TH_PICTURE_SIZE_MEDIUM_SMALL, /* 1/2, 0.500, dim.n = 3   */
	TH_PICTURE_SIZE_MEDIUM_LARGE, /* 2/3, 0.666, dim.n = 3   */
	TH_PICTURE_SIZE_LARGE,        /* 3/4, 0.750, dim.n = 3   */
	TH_PICTURE_SIZE_XLARGE,       /* 5/6, 0.833, dim.n = 3   */
	TH_PICTURE_SIZE_FULL          /* 1/1, 1.000, dim.n = N/A */
#endif

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <math.h>

#include "th-yuvscale.h"

#include <gst/video/video.h> /* for GST_VIDEO_CAPS_YUV */

/* Debug category for this element; initialised in yuvscale_base_init() */
GST_DEBUG_CATEGORY (thyuvscale_debug);

/* Make the GST_DEBUG / GST_LOG macros in this file log to our category */
#define GST_CAT_DEFAULT thyuvscale_debug

/* Element details: long name, class, description, author */
static GstElementDetails yuvscale_details =
	GST_ELEMENT_DETAILS 
	(
		"YUV video scaler",
		"Filter/Effect/Video",
		"Resizes YUV video (using GdkPixbuf)",
		"Tim-Philipp M\303\274ller <tim@centricular.net>"
	);

/* Source pad template: always present, I420 YUV video only */
static GstStaticPadTemplate ys_src_template =
	GST_STATIC_PAD_TEMPLATE 
	(
		"src",
		GST_PAD_SRC,
		GST_PAD_ALWAYS,
		GST_STATIC_CAPS (GST_VIDEO_CAPS_YUV ("I420"))
	);

/* Sink pad template: always present, I420 YUV video only */
static GstStaticPadTemplate ys_sink_template =
	GST_STATIC_PAD_TEMPLATE 
	(
		"sink",
		GST_PAD_SINK,
		GST_PAD_ALWAYS,
		GST_STATIC_CAPS (GST_VIDEO_CAPS_YUV ("I420"))
	);

	/* Interpolation modes; must match GdkInterpType */ 
typedef enum 
{
	PIXOPS_INTERP_NEAREST,
	PIXOPS_INTERP_TILES,
	PIXOPS_INTERP_BILINEAR,
	PIXOPS_INTERP_HYPER
} PixopsInterpType;

#define YUVSCALE_INTERPOLATION  PIXOPS_INTERP_BILINEAR

/* Forward declarations of the file-local functions */

/* GObject type system hooks */
static void      yuvscale_base_init (gpointer g_class);

static void      yuvscale_class_init (ThYuvScaleClass *klass);

static void      yuvscale_instance_init (ThYuvScale *ys);

/* Pad functions */
static gboolean  yuvscale_handle_src_event (GstPad *pad, GstEvent *event);

static void      yuvscale_finalize (GObject *object);

static GstCaps  *yuvscale_getcaps (GstPad *pad);

static void      yuvscale_chain (GstPad * pad, GstData * _data);

/* Filter weight computation and scaling */
static void      yuvscale_bilinear_box_make_weights (PixopsFilterDimension *dim, gdouble scale);

static void      yuvscale_tile_make_weights (PixopsFilterDimension *dim, gdouble scale);

static void      yuvscale_make_weights (PixopsFilter *filter, gdouble scale_x, gdouble scale_y);

static gint     *yuvscale_make_filter_table (PixopsFilter *filter);

static void      yuvscale_scale (ThYuvScale *ys, guint8 *src_yuv, guint8 *dest_yuv);

static void      yuvscale_setup_pixops_func (ThYuvScale *ys);


/* Parent class pointer; NULL until yuvscale_class_init() has run */
static gpointer  parent_class; /* NULL */



/* These macros are adapted from videotestsrc.c 
 *  and/or gst-plugins/gst/games/gstvideoimage.c */

/* Round x up to the next multiple of 2, 4 or 8 */
#define ROUND_UP_2(x)  (((x)+1)&~1)
#define ROUND_UP_4(x)  (((x)+3)&~3)
#define ROUND_UP_8(x)  (((x)+7)&~7)
   
/* I420 */

/* Bytes per row of each plane: the Y stride is the width rounded up to a
 * multiple of 4; the U and V strides are the width rounded up to a
 * multiple of 8 and then halved (both expressions yield the same value) */
#define I420_Y_ROWSTRIDE(width) (ROUND_UP_4(width))
#define I420_U_ROWSTRIDE(width) (ROUND_UP_8(width)/2)
#define I420_V_ROWSTRIDE(width) ((ROUND_UP_8(I420_Y_ROWSTRIDE(width)))/2)

/* Byte offset of each plane from the start of the buffer: Y first, then U,
 * then V; the height is rounded up to an even number of rows and the
 * chroma planes have half that many rows */
#define I420_Y_OFFSET(w,h) (0)
#define I420_U_OFFSET(w,h) (I420_Y_OFFSET(w,h)+(I420_Y_ROWSTRIDE(w)*ROUND_UP_2(h)))
#define I420_V_OFFSET(w,h) (I420_U_OFFSET(w,h)+(I420_U_ROWSTRIDE(w)*ROUND_UP_2(h)/2))

/* Total size in bytes of an I420 frame of w x h pixels */
#define I420_SIZE(w,h)     (I420_V_OFFSET(w,h)+(I420_V_ROWSTRIDE(w)*ROUND_UP_2(h)/2))

/***************************************************************************
 *
 *   th_yuvscale_get_type
 *
 *   Returns the GType of the ThYuvScale element. The type is registered
 *    as a static type derived from GST_TYPE_ELEMENT on the first call and
 *    the cached value is returned on every later call.
 *
 ***************************************************************************/

GType
th_yuvscale_get_type (void)
{
	static const GTypeInfo yuvscale_info = 
	{
		sizeof (ThYuvScaleClass),                   /* class_size     */
		yuvscale_base_init,                         /* base_init      */
		NULL,                                       /* base_finalize  */
		(GClassInitFunc) yuvscale_class_init,       /* class_init     */
		NULL,                                       /* class_finalize */
		NULL,                                       /* class_data     */
		sizeof (ThYuvScale),                        /* instance_size  */
		0,                                          /* n_preallocs    */
		(GInstanceInitFunc) yuvscale_instance_init, /* instance_init  */
	};
	static GType  yuvscale_type = 0;

	/* already registered: hand back the cached id */
	if (yuvscale_type != 0)
		return yuvscale_type;

	yuvscale_type = g_type_register_static (GST_TYPE_ELEMENT, "ThYuvScale",
	                                        &yuvscale_info, 0);

	return yuvscale_type;
}


/***************************************************************************
 *
 *   yuvscale_base_init
 *
 *   Sets the element details, adds the sink and source pad templates
 *    to the element class and initialises the debug category.
 *
 ***************************************************************************/

static void
yuvscale_base_init (gpointer g_class)
{
	GstElementClass *eclass = GST_ELEMENT_CLASS (g_class);

	gst_element_class_set_details (eclass, &yuvscale_details);

	/* sink template first, then source template */
	gst_element_class_add_pad_template (eclass,
	    gst_static_pad_template_get (&ys_sink_template));
	gst_element_class_add_pad_template (eclass,
	    gst_static_pad_template_get (&ys_src_template));

	GST_DEBUG_CATEGORY_INIT (thyuvscale_debug, "thyuvscale", 0, "thyuvscale element");
}

/***************************************************************************
 *
 *   yuvscale_change_state
 *
 *   State change handler. Only the READY -> NULL transition does any
 *    work here: it marks the element as not initialised and clears the
 *    processing function. The parent class handler (if any) is then
 *    chained up to and its result returned.
 *
 ***************************************************************************/

static GstElementStateReturn
yuvscale_change_state (GstElement *element)
{
	ThYuvScale      *self = TH_YUVSCALE (element);
	GstElementClass *parent = GST_ELEMENT_CLASS (parent_class);

	/* all other transitions need no work from us */
	if (GST_STATE_TRANSITION (element) == GST_STATE_READY_TO_NULL)
	{
		self->inited = FALSE;
		self->process_func = NULL;
	}

	if (parent->change_state == NULL)
		return GST_STATE_SUCCESS;

	return parent->change_state (element);
}

/***************************************************************************
 *
 *   yuvscale_class_init
 *
 *   Remembers the parent class and installs our finalize and
 *    change_state implementations.
 *
 ***************************************************************************/

static void
yuvscale_class_init (ThYuvScaleClass *klass)
{
	GObjectClass    *object_klass = (GObjectClass *) klass;
	GstElementClass *element_klass = (GstElementClass *) klass;

	parent_class = g_type_class_peek_parent (klass);

	object_klass->finalize = yuvscale_finalize;
	element_klass->change_state = yuvscale_change_state;
}

/***************************************************************************
 *
 *   yuvscale_getcaps
 *
 *   Returns the caps allowed on the opposite pad, with width and height
 *    of every structure widened to the range 16..4096 and any
 *    pixel-aspect-ratio field removed.
 *
 ***************************************************************************/

static GstCaps *
yuvscale_getcaps (GstPad *pad)
{
	ThYuvScale *self = TH_YUVSCALE (gst_pad_get_parent (pad));
	GstPad     *opposite;
	GstCaps    *result;
	gint        idx = 0;

	if (pad == self->srcpad)
		opposite = self->sinkpad;
	else
		opposite = self->srcpad;

	result = gst_pad_get_allowed_caps (opposite);

	while (idx < gst_caps_get_size (result))
	{
		GstStructure *s = gst_caps_get_structure (result, idx);

		/* we can scale, so any size within our limits is acceptable */
		gst_structure_set (s,
		                   "width", GST_TYPE_INT_RANGE, 16, 4096,
		                   "height", GST_TYPE_INT_RANGE, 16, 4096,
		                   NULL);
		gst_structure_remove_field (s, "pixel-aspect-ratio");

		idx++;
	}

	return result;
}

/***************************************************************************
 *
 *   yuvscale_link
 *
 *   Link function for both pads. First tries to set the very same caps
 *    on the opposite pad (passthru). If that fails, derives a size for
 *    the opposite pad from the caps it allows (taking the pixel aspect
 *    ratios into account when both sides have one) and tries again.
 *
 *   On success the from/to dimensions, buffer sizes, scale factors and
 *    filter tables of the element are (re)computed and the element is
 *    marked as initialised.
 *
 *   Returns GST_PAD_LINK_OK on success, GST_PAD_LINK_REFUSED if the caps
 *    carry no usable width/height or would require upscaling, or the
 *    failing result of gst_pad_try_set_caps().
 *
 ***************************************************************************/

static GstPadLinkReturn
yuvscale_link (GstPad *pad, const GstCaps *caps)
{
	GstPadLinkReturn  ret;
	GstStructure     *otherstructure, *structure, *newstructure;
	const GValue     *par = NULL, *otherpar;
	ThYuvScale       *ys;
	GstCaps          *othercaps, *newcaps;
	GstPad           *otherpad;
	gint              newwidth, newheight;
	gint              height = 0, width = 0; 

	ys = TH_YUVSCALE (gst_pad_get_parent (pad));

	otherpad = (pad == ys->srcpad) ? ys->sinkpad : ys->srcpad;

	structure = gst_caps_get_structure (caps, 0);
	
	if (!gst_structure_get_int (structure, "width", &width)
	 || !gst_structure_get_int (structure, "height", &height))
		return GST_PAD_LINK_REFUSED;

	/* the dimensions are used as divisors further down */
	if (width <= 0 || height <= 0)
		return GST_PAD_LINK_REFUSED;

	par = gst_structure_get_value (structure, "pixel-aspect-ratio");

	GST_DEBUG_OBJECT (ys, "trying to set caps %" GST_PTR_FORMAT 
	                  " on pad %s:%s for passthru",
	                  caps, GST_DEBUG_PAD_NAME (otherpad));

	ret = gst_pad_try_set_caps (otherpad, caps);
	if (GST_PAD_LINK_SUCCESSFUL (ret)) 
	{
		/* cool, we can use passthru */
		GST_DEBUG_OBJECT (ys, "passthru works");

		ys->passthru = TRUE;
		newwidth = width;
		newheight = height;

		goto beach;
	}

	/* no passthru, so try to convert */
	GST_DEBUG_OBJECT (ys, "no passthru");

	/* copy caps to find which one works for the otherpad */
	newcaps = gst_caps_copy (caps);
	newstructure = gst_caps_get_structure (newcaps, 0);

	/* iterate over other pad's caps, find a nice conversion.
	 * For calculations, we only use the first because we
	 * (falsely) assume that all caps have the same PAR and
	 * size values. */
	othercaps = gst_pad_get_allowed_caps (otherpad);
	otherstructure = gst_caps_get_structure (othercaps, 0);
	otherpar = gst_structure_get_value (otherstructure, "pixel-aspect-ratio");

	/* default: keep the size (used when at least one side has no PAR,
	 * in which case it should accept the other, or when a PAR is
	 * unusable) */
	newwidth = width;
	newheight = height;

	if (par && otherpar) 
	{
		gboolean  keep_h, w_align, h_align, w_inc;
		gint      num, den, onum, oden;
		
		num = gst_value_get_fraction_numerator (par);
		den = gst_value_get_fraction_denominator (par);
		onum = gst_value_get_fraction_numerator (otherpar);
		oden = gst_value_get_fraction_denominator (otherpar);

		/* every term ends up as a divisor below; a zero would trap */
		if (num != 0 && den != 0 && onum != 0 && oden != 0)
		{
			w_align = (width * num * oden % (den * onum) == 0);
			h_align = (height * den * onum % (num * oden) == 0);
			w_inc = (num * oden > den * onum);

			/* decide whether to change width or height */
			if (w_align && w_inc)
				keep_h = TRUE;
			else if (h_align && !w_inc)
				keep_h = FALSE;
			else if (w_align)
				keep_h = TRUE;
			else if (h_align)
				keep_h = FALSE;
			else
				keep_h = w_inc;

			/* take par into effect */
			if (keep_h) 
			{
				newwidth = width * num / den;
				newheight = height;
			} 
			else 
			{
				newwidth = width;
				newheight = height * den / num;
			}
		}
	} 

	/* size: don't check return values. We honestly don't care. */
	gst_structure_set_value (newstructure, "width",
	                         gst_structure_get_value (otherstructure, "width"));
	gst_structure_set_value (newstructure, "height",
	                         gst_structure_get_value (otherstructure, "height"));

	gst_caps_structure_fixate_field_nearest_int (newstructure, "width", newwidth);
	gst_caps_structure_fixate_field_nearest_int (newstructure, "height", newheight);
	gst_structure_get_int (newstructure, "width", &newwidth);
	gst_structure_get_int (newstructure, "height", &newheight);

	/* obviously, keep PAR if we got one */
	if (otherpar)
		gst_structure_set_value (newstructure, "pixel-aspect-ratio", otherpar);

	/* otherstructure and otherpar point into othercaps and are not
	 * used past this point, so our copy of the allowed caps can go */
	gst_caps_free (othercaps);

	GST_DEBUG_OBJECT (ys, "trying to set caps %" GST_PTR_FORMAT 
	                  " on pad %s:%s for non-passthru",
	                  newcaps, GST_DEBUG_PAD_NAME (otherpad));

	/* try - bail out if fail. gst_pad_try_set_caps() takes const caps,
	 * so newcaps stays ours and must be freed either way. */
	ret = gst_pad_try_set_caps (otherpad, newcaps);
	gst_caps_free (newcaps);
	if (GST_PAD_LINK_FAILED (ret))
		return ret;

	ys->passthru = FALSE;

beach:

	/* the fixated size is used as a divisor as well */
	if (newwidth <= 0 || newheight <= 0)
		return GST_PAD_LINK_REFUSED;

	/* whee, works. Save for use in _chain and get moving. */
	if (pad == ys->srcpad) 
	{
		ys->to_width = width;
		ys->to_height = height;
		ys->from_width = newwidth;
		ys->from_height = newheight;
	} 
	else 
	{
		ys->from_width = width;
		ys->from_height = height;
		ys->to_width = newwidth;
		ys->to_height = newheight;
	}

	ys->to_buf_size = I420_SIZE (ys->to_width, ys->to_height);
	ys->from_buf_size = I420_SIZE (ys->from_width, ys->from_height);

	ys->scale_x = (gdouble) ys->to_width / (gdouble) ys->from_width;
	ys->scale_y = (gdouble) ys->to_height / (gdouble) ys->from_height;

	/* we threw out the code for upscaling */
	if (ys->scale_x > 1.0 || ys->scale_y > 1.0)
	{
		GST_DEBUG_OBJECT (ys, "trying to set caps %" GST_PTR_FORMAT 
		                  " on pad %s:%s for upscaling, which is not implemented",
		                  caps, GST_DEBUG_PAD_NAME (otherpad));

		/* the dimensions above no longer match the filter tables of
		 * any earlier link, so do not let _chain use them */
		ys->inited = FALSE;

		return GST_PAD_LINK_REFUSED;
	}

	g_free (ys->filter.x.weights);
	g_free (ys->filter.y.weights);
	yuvscale_make_weights (&ys->filter, ys->scale_x, ys->scale_y);
	
	g_free (ys->filter_weights);
	ys->filter_weights = yuvscale_make_filter_table (&ys->filter);

	yuvscale_setup_pixops_func (ys);
	
	ys->inited = TRUE;

	return GST_PAD_LINK_OK;
}

/***************************************************************************
 *
 *   yuvscale_instance_init
 *
 *   Creates the sink and source pads from their templates, adds them
 *    to the element and installs the pad functions. The element starts
 *    out as not initialised.
 *
 ***************************************************************************/

static void
yuvscale_instance_init (ThYuvScale *ys)
{
	GstElement *element = GST_ELEMENT (ys);

	/* sink pad: link, chain and getcaps functions */
	ys->sinkpad = gst_pad_new_from_template (
	    gst_static_pad_template_get (&ys_sink_template), "sink");
	gst_element_add_pad (element, ys->sinkpad);
	gst_pad_set_link_function (ys->sinkpad, yuvscale_link);
	gst_pad_set_chain_function (ys->sinkpad, yuvscale_chain);
	gst_pad_set_getcaps_function (ys->sinkpad, yuvscale_getcaps);

	/* source pad: link, event and getcaps functions */
	ys->srcpad = gst_pad_new_from_template (
	    gst_static_pad_template_get (&ys_src_template), "src");
	gst_element_add_pad (element, ys->srcpad);
	gst_pad_set_link_function (ys->srcpad, yuvscale_link);
	gst_pad_set_event_function (ys->srcpad, yuvscale_handle_src_event);
	gst_pad_set_getcaps_function (ys->srcpad, yuvscale_getcaps);

	/* becomes TRUE once yuvscale_link() has succeeded */
	ys->inited = FALSE;
}

/***************************************************************************
 *
 *   yuvscale_handle_src_event
 *
 *   Event handler of the source pad. Navigation events are replaced by
 *    a new navigation event whose pointer_x / pointer_y coordinates are
 *    multiplied by from-size / to-size; every event (replaced or not)
 *    is then handed to gst_pad_event_default().
 *
 ***************************************************************************/

static gboolean
yuvscale_handle_src_event (GstPad *pad, GstEvent *event)
{
	GstStructure *incoming;
	GstStructure *rescaled;
	ThYuvScale   *ys;
	gdouble       pos_x, pos_y;

	/* only navigation events need rewriting */
	if (GST_EVENT_TYPE (event) != GST_EVENT_NAVIGATION)
		return gst_pad_event_default (pad, event);

	ys = TH_YUVSCALE (gst_pad_get_parent (pad));

	incoming = event->event_data.structure.structure;
	rescaled = gst_structure_copy (incoming);

	if (gst_structure_get_double (incoming, "pointer_x", &pos_x))
	{
		gst_structure_set (rescaled, "pointer_x", G_TYPE_DOUBLE,
		                   pos_x * ys->from_width / ys->to_width, NULL);
	}

	if (gst_structure_get_double (incoming, "pointer_y", &pos_y))
	{
		gst_structure_set (rescaled, "pointer_y", G_TYPE_DOUBLE,
		                   pos_y * ys->from_height / ys->to_height, NULL);
	}

	/* swap the original event for one carrying the modified copy */
	gst_event_unref (event);

	event = gst_event_new (GST_EVENT_NAVIGATION);
	event->event_data.structure.structure = rescaled;

	return gst_pad_event_default (pad, event);
}

/***************************************************************************
 *
 *   yuvscale_chain
 *
 *   Chain function of the sink pad. In passthru mode the incoming data
 *    is pushed on unchanged. Otherwise a new buffer of to_buf_size
 *    bytes is allocated, stamped from the input, filled by
 *    yuvscale_scale() and pushed; the input buffer is then released.
 *
 *   The input buffer is also released on every early exit (element not
 *    linked yet, input smaller than one source frame, allocation
 *    failure) so that it is not leaked.
 *
 ***************************************************************************/

static void
yuvscale_chain (GstPad *pad, GstData *_data)
{
	ThYuvScale *ys;
	GstBuffer  *buf;
	GstBuffer  *outbuf;
	gulong      size;

	g_return_if_fail (pad != NULL);
	g_return_if_fail (GST_IS_PAD (pad));
	g_return_if_fail (_data != NULL);

	ys = TH_YUVSCALE (gst_pad_get_parent (pad));

	if (!ys->inited)
	{
		/* we own the data: drop it, then complain like before */
		gst_buffer_unref (GST_BUFFER (_data));
		g_return_if_fail (ys->inited);
		return; /* in case the check above is compiled out */
	}

	if (ys->passthru) 
	{
		gst_pad_push (ys->srcpad, _data);
		return;
	}

	buf = GST_BUFFER (_data);
	size = GST_BUFFER_SIZE (buf);

	/* casts make the arguments match the format specifiers */
	GST_LOG_OBJECT (ys,
		"buffersize=%ld from=%dx%d to=%dx%d fromsize=%ld tosize=%ld",
		(long) size, ys->from_width, ys->from_height,
		ys->to_width, ys->to_height,
		(long) ys->from_buf_size, (long) ys->to_buf_size);

	if (size != ys->from_buf_size)
	{
		g_print ("YuvScale: size = %u [%u x %u], expected: %u\n", 
		         (guint) size, (guint) ys->from_width, 
		         (guint) ys->from_height, (guint) ys->from_buf_size);

		/* smaller => segfault almost guaranteed */
		if (size < ys->from_buf_size)
		{
			gst_buffer_unref (buf);
			g_return_if_fail (size > ys->from_buf_size);
			return; /* in case the check above is compiled out */
		}
	}

	outbuf = gst_pad_alloc_buffer (ys->srcpad, 
	                               GST_BUFFER_OFFSET_NONE, 
	                               ys->to_buf_size);
	if (outbuf == NULL)
	{
		GST_LOG_OBJECT (ys, "could not allocate output buffer, dropping frame");
		gst_buffer_unref (buf);
		return;
	}

	gst_buffer_stamp (outbuf, buf);

	yuvscale_scale (ys, GST_BUFFER_DATA (buf), GST_BUFFER_DATA (outbuf));

	gst_pad_push (ys->srcpad, GST_DATA (outbuf));

	gst_buffer_unref (buf);
}

/***************************************************************************
 *
 *   yuvscale_finalize
 *
 *   Frees the filter weights and the combined filter table, then
 *    chains up to the parent class finalize.
 *
 ***************************************************************************/

static void
yuvscale_finalize (GObject *object)
{
	ThYuvScale *self = TH_YUVSCALE (object);

	/* Maybe this cleanup belongs in a state change instead; in that
	 * case the from/to width/height should probably be cleared there
	 * as well. (from_par / to_par are never allocated, so there is
	 * nothing to free for them. FIXME) */
	g_free (self->filter_weights);
	g_free (self->filter.y.weights);
	g_free (self->filter.x.weights);

	G_OBJECT_CLASS (parent_class)->finalize (object);
}

/***************************************************************************
 *
 *   plugin_init
 *
 *   Plugin entry point: announces the registration on stdout and
 *    registers the "thyuvscale" element with rank NONE.
 *
 ***************************************************************************/

static gboolean
plugin_init (GstPlugin *plugin)
{
	gboolean registered;

	g_print ("registering static plugin thyuvscale\n");

	registered = gst_element_register (plugin, "thyuvscale",
	                                   GST_RANK_NONE, TH_TYPE_YUVSCALE);

	return registered;
}

/* Static plugin definition: GStreamer major/minor version this was built
 * against, plugin name, description, init function, plugin version,
 * license, package and origin URL */
GST_PLUGIN_DEFINE_STATIC
(
	GST_VERSION_MAJOR,
	GST_VERSION_MINOR,
	"thyuvscale",
	"Resizes YUV video", 
	plugin_init, 
	"0.2", 
	"LGPL", 
	"Thoggen", 
	"http://thoggen.net"
)

/* Source positions are fixed-point numbers with SCALE_SHIFT fractional
 * bits. The top SUBSAMPLE_BITS bits of the fraction select one of
 * SUBSAMPLE (16) precomputed weight sets per axis. */
#define SUBSAMPLE_BITS 4
#define SUBSAMPLE (1 << SUBSAMPLE_BITS)
#define SUBSAMPLE_MASK ((1 << SUBSAMPLE_BITS)-1)
#define SCALE_SHIFT 16

/***************************************************************************
 *
 *   scale_line_3_3
 *
 *   Special scale_line() for
 *     filter.x.n = 3
 *     filter.y.n = 3
 *   which is given when we're downscaling to 1/2 and 2/3
 *
 *   Writes one destination pixel per step from dest up to (excluding)
 *    dest_end. The source position starts at x_init and advances by
 *    x_step (both fixed point); its sub-pixel phase selects the 3x3
 *    weight set. No clamping is done, so all three columns starting at
 *    the integer source position must be readable in each of the three
 *    rows in src. dest_x and src_width are not used.
 *
 *   Returns the position after the last pixel written.
 *
 ***************************************************************************/

static inline guchar *
scale_line_3_3 (int *weights, guchar *dest, int dest_x, guchar *dest_end, 
                guchar **src, int x_init, int x_step, int src_width)
{
	gint pos;

	for (pos = x_init; dest < dest_end; pos += x_step, ++dest)
	{
		const gint  column = pos >> SCALE_SHIFT;
		const gint  phase = (pos >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK;
		const gint *w = weights + phase * 3 * 3;
		guint       sum = 0;
		gint        row;

		/* 3 rows of 3 taps each, weights stored row by row */
		for (row = 0; row < 3; ++row, w += 3)
		{
			const guchar *p = src[row] + column;

			sum += p[0] * w[0];
			sum += p[1] * w[1];
			sum += p[2] * w[2];
		}

		*dest = (sum + 0xffff) >> 16;
	}

	return dest;
}

/***************************************************************************
 *
 *   process_pixel_3_3
 *
 *   Special process_pixel() for
 *     filter.x.n = 3
 *     filter.y.n = 3
 *   which is given when we're downscaling to 1/2 and 2/3
 *
 *   Computes a single destination pixel from the 3x3 source window
 *    whose left column is x_start. Columns outside 0..src_width-1 are
 *    clamped to the nearest valid column, so this variant is safe at
 *    the left and right image borders.
 *
 *   weights:   9 weights, stored row by row
 *   dest:      where the resulting pixel is written
 *   dest_x:    not used
 *   src:       the three source rows
 *   x_start:   leftmost source column of the window (may be out of range)
 *   src_width: number of valid columns in each source row
 *
 ***************************************************************************/

static inline void
process_pixel_3_3 (int *weights, guchar *dest, int dest_x, 
                   guchar **src, int x_start, int src_width)
{
	guint pixval = 0;
	gint  i, j;
  
	for (i = 0; i < 3; i++)
	{
		for (j = 0; j < 3; j++)
		{
			gint col = x_start + j;

			/* clamp to the valid column range */
			if (col < 0)
				col = 0;
			else if (col >= src_width)
				col = src_width - 1;

			/* Unsigned on purpose: 0xff * weight * pixel can reach
			 * 255 * 65536 * 255, which does not fit a signed int
			 * but does fit 32 unsigned bits. */
			pixval += (0xffu * (guint) *(weights++)) * src[i][col];
		}
	}

	*dest = (pixval + 0xffffff) >> 24; /* the former scale_pixel() */
}

/***************************************************************************
 *
 *   pixops_process_3_3
 *
 *   Special pixops_process() to scale down using the tiled 
 *    box weights (so that offset = 0). This is what GdkPixbuf 
 *    uses when downscaling with the bilinear method.
 *
 *   filter.x.n = 3
 *   filter.y.n = 3
 *   filter.x.offset = 0
 *   filter.y.offset = 0
 *
 *   Scales one plane of src_width x src_height bytes (rows
 *    src_rowstride bytes apart) into dest_buf, writing dest_width
 *    bytes into each of dest_height rows that are dest_rowstride
 *    bytes apart. filter_weights is indexed by the sub-pixel phase
 *    of y, then of x, with 3 * 3 weights per entry.
 *
 ***************************************************************************/

static void
pixops_process_3_3 (guchar         *dest_buf,
                    int             dest_width,
                    int             dest_height,
                    int             dest_rowstride,
                    const guchar   *src_buf,
                    int             src_width,
                    int             src_height,
                    int             src_rowstride,
                    gdouble         scale_x,
                    gdouble         scale_y,
                    int            *filter_weights)
{
  int i, x, y; /* X and Y position in source (fixed_point) */
  
  /* the three source rows contributing to the current destination row */
  guchar *line_bufs[3];

  int x_step = (1 << SCALE_SHIFT) / scale_x; /* X step in source (fixed point) */
  int y_step = (1 << SCALE_SHIFT) / scale_y; /* Y step in source (fixed point) */

  /* Compute the index where we run off the end of the source buffer. The furthest
   * source pixel we access at index i is:
   *
   *  (i * x_step + scaled_x_offset) >> SCALE_SHIFT + filter->x.n - 1
   *
   * So, run_end_index is the smallest i for which this pixel is src_width, i.e, for which:
   *
   *  i * x_step >= ((src_width - filter->x.n + 1) << SCALE_SHIFT) - scaled_x_offset
   *
   * (In this specialised version scaled_x_offset is 0 and filter->x.n is 3.)
   */
#define MYDIV(a,b) ((a) > 0 ? (a) / (b) : ((a) - (b) + 1) / (b))    /* Division so that -1/5 = -1 */
  
	int run_end_x = ((src_width - 3 + 1) << SCALE_SHIFT);
	int run_end_index = MYDIV (run_end_x + x_step - 1, x_step);
	run_end_index = MIN (run_end_index, dest_width);

	y = 0;
	for (i = 0; i < dest_height; i++)
	{
		int dest_x;
		int y_start = y >> SCALE_SHIFT;
		int x_start;
		/* weight sets for the vertical sub-pixel phase of this row */
		int *run_weights = filter_weights +
                         ((y >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) 
                         * 3 * 3 * SUBSAMPLE;
		guchar *new_outbuf;
      
		guchar *outbuf = dest_buf + dest_rowstride * i;
		guchar *outbuf_end = outbuf + dest_width;

		/* pick the three source rows; near the bottom edge rows past
		 * the end of the image are replaced by the last row */
		if (G_LIKELY (y_start < src_height-3))
		{
				line_bufs[0] = (guchar *)src_buf + src_rowstride * (y_start++);
				line_bufs[1] = (guchar *)src_buf + src_rowstride * (y_start++);
				line_bufs[2] = (guchar *)src_buf + src_rowstride * (y_start++);
		}
		else
		{
			gint j;
			for (j=0; j < 3; j++)
			{
				if (G_LIKELY (y_start < src_height))
					line_bufs[j] = (guchar *)src_buf + src_rowstride * y_start;
				else
					line_bufs[j] = (guchar *)src_buf + src_rowstride * (src_height - 1);

				y_start++;
			}
		}

		dest_x = 0;
		x = 0;
		x_start = x >> SCALE_SHIFT;

		/* left edge: pixels whose window starts left of the image.
		 * As x starts at 0 here, x_start is 0 and this loop body
		 * is never entered. */
		while (x_start < 0 && outbuf < outbuf_end)
		{
			process_pixel_3_3 (run_weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * 3 * 3, 
			                   outbuf, dest_x, 
			                   line_bufs, 
			                   x >> SCALE_SHIFT, src_width);
	  
			x += x_step;
			x_start = x >> SCALE_SHIFT;
			dest_x++;
			outbuf++;
		}

		/* middle part: the window lies fully inside the source row,
		 * so the variant without clamping can be used */
		new_outbuf = scale_line_3_3 (run_weights, outbuf, dest_x,
		                             dest_buf + dest_rowstride * i + run_end_index,
		                             line_bufs, x, x_step, src_width);

		dest_x += (new_outbuf - outbuf);

		x = dest_x * x_step;
		outbuf = new_outbuf;

		/* right edge: remaining pixels, with clamping to the last column */
		while (outbuf < outbuf_end)
		{
			process_pixel_3_3 (run_weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * 3 * 3, 
			                   outbuf, dest_x,
			                   line_bufs, 
			                   x >> SCALE_SHIFT, src_width);
	  
			x += x_step;
			++dest_x;
			++outbuf;
		}

		y += y_step;
	}
#undef MYDIV
}


/***************************************************************************
 *
 *   scale_line
 *
 *   Generic scale_line
 *
 *   Writes one destination pixel per step from dest up to (excluding)
 *    dest_end, using an n_x by n_y filter. The source position starts
 *    at x_init and advances by x_step (both fixed point); its
 *    sub-pixel phase selects the weight set. No clamping is done, so
 *    n_x columns starting at the integer source position must be
 *    readable in each of the n_y rows in src. dest_x and src_width are
 *    not used.
 *
 *   Returns the position after the last pixel written.
 *
 ***************************************************************************/

static inline guchar *
scale_line (int *weights, int n_x, int n_y,
            guchar *dest, int dest_x, guchar *dest_end, 
            guchar **src, int x_init, int x_step, int src_width)
{
	gint pos;

	for (pos = x_init; dest < dest_end; pos += x_step, ++dest)
	{
		const gint  column = pos >> SCALE_SHIFT;
		const gint  phase = (pos >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK;
		const gint *set = weights + phase * n_x * n_y;
		guint       sum = 0;
		gint        row, tap;

		for (row = 0; row < n_y; ++row)
		{
			const guchar *p = src[row] + column;
			const gint   *w = set + n_x * row;

			for (tap = 0; tap < n_x; ++tap)
				sum += p[tap] * w[tap];
		}

		*dest = (sum + 0xffff) >> 16;
	}

	return dest;
}


/***************************************************************************
 *
 *   process_pixel
 *
 *   Generic process_pixel
 *
 *   Computes a single destination pixel from the n_x by n_y source
 *    window whose left column is x_start. Columns outside
 *    0..src_width-1 are clamped to the nearest valid column, so this
 *    variant is safe at the left and right image borders.
 *
 *   weights:   n_x * n_y weights, stored row by row
 *   n_x, n_y:  filter size
 *   dest:      where the resulting pixel is written
 *   dest_x:    not used
 *   src:       the n_y source rows
 *   x_start:   leftmost source column of the window (may be out of range)
 *   src_width: number of valid columns in each source row
 *
 ***************************************************************************/

static inline void
process_pixel (int *weights, int n_x, int n_y,
               guchar *dest, int dest_x, 
               guchar **src, int x_start, int src_width)
{
	guint pixval = 0;
	gint  i, j;
  
	for (i = 0; i < n_y; i++)
	{
		const gint *line_weights = weights + n_x * i;

		for (j = 0; j < n_x; j++)
		{
			gint col = x_start + j;

			/* clamp to the valid column range */
			if (col < 0)
				col = 0;
			else if (col >= src_width)
				col = src_width - 1;

			/* Unsigned on purpose: 0xff * weight * pixel can reach
			 * 255 * 65536 * 255, which does not fit a signed int
			 * but does fit 32 unsigned bits. */
			pixval += (0xffu * (guint) line_weights[j]) * src[i][col];
		}
	}

	*dest = (pixval + 0xffffff) >> 24; /* the former scale_pixel() */
}

/***************************************************************************
 *
 *   pixops_process
 *
 *   Generic scaling routine
 *
 *   Scales one plane of src_width x src_height bytes (rows
 *    src_rowstride bytes apart) into dest_buf, writing dest_width
 *    bytes into each of dest_height rows that are dest_rowstride
 *    bytes apart. The filter size and offsets are taken from filter;
 *    filter_weights is indexed by the sub-pixel phase of y, then of x,
 *    with filter->x.n * filter->y.n weights per entry.
 *
 ***************************************************************************/

static inline void
pixops_process (guchar         *dest_buf,
                int             dest_width,
                int             dest_height,
                int             dest_rowstride,
                const guchar   *src_buf,
                int             src_width,
                int             src_height,
                int             src_rowstride,
                double          scale_x,
                double          scale_y,
                int            *filter_weights,
                PixopsFilter   *filter)
{
  int i, j;
  int x, y; /* X and Y position in source (fixed_point) */
  
  /* the filter->y.n source rows contributing to the current destination
   * row; freed again at the end of this function */
  guchar **line_bufs = g_new (guchar *, filter->y.n);

  int x_step = (1 << SCALE_SHIFT) / scale_x; /* X step in source (fixed point) */
  int y_step = (1 << SCALE_SHIFT) / scale_y; /* Y step in source (fixed point) */

  /* horizontal filter offset converted to fixed point */
  int scaled_x_offset = floor (filter->x.offset * (1 << SCALE_SHIFT));

  /* Compute the index where we run off the end of the source buffer. The furthest
   * source pixel we access at index i is:
   *
   *  (i * x_step + scaled_x_offset) >> SCALE_SHIFT + filter->x.n - 1
   *
   * So, run_end_index is the smallest i for which this pixel is src_width, i.e, for which:
   *
   *  i * x_step >= ((src_width - filter->x.n + 1) << SCALE_SHIFT) - scaled_x_offset
   *
   */
#define MYDIV(a,b) ((a) > 0 ? (a) / (b) : ((a) - (b) + 1) / (b))    /* Division so that -1/5 = -1 */
  
	int run_end_x = (((src_width - filter->x.n + 1) << SCALE_SHIFT) - scaled_x_offset);
	int run_end_index = MYDIV (run_end_x + x_step - 1, x_step);
	run_end_index = MIN (run_end_index, dest_width);

	/* start at the vertical filter offset (fixed point) */
	y = floor (filter->y.offset * (1 << SCALE_SHIFT));
	for (i = 0; i < dest_height; i++)
	{
		int dest_x;
		int y_start = y >> SCALE_SHIFT;
		int x_start;
		/* weight sets for the vertical sub-pixel phase of this row */
		int *run_weights = filter_weights +
                         ((y >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) *
                         filter->x.n * filter->y.n * SUBSAMPLE;
		guchar *new_outbuf;
      
		guchar *outbuf = dest_buf + dest_rowstride * i;
		guchar *outbuf_end = outbuf + dest_width;

		/* pick the source rows; rows above the image are replaced by
		 * the first row, rows below it by the last row */
		for (j=0; j<filter->y.n; j++)
		{
			if (y_start <  0)
				line_bufs[j] = (guchar *)src_buf;
			else if (y_start < src_height)
				line_bufs[j] = (guchar *)src_buf + src_rowstride * y_start;
			else
				line_bufs[j] = (guchar *)src_buf + src_rowstride * (src_height - 1);

			y_start++;
		}

		dest_x = 0;
		x = scaled_x_offset;
		x_start = x >> SCALE_SHIFT;

		/* left edge: pixels whose window starts left of the image,
		 * computed with clamping */
		while (x_start < 0 && outbuf < outbuf_end)
		{
			process_pixel (run_weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * (filter->x.n * filter->y.n), 
			               filter->x.n, filter->y.n,
			               outbuf, dest_x, 
			               line_bufs, 
			               x >> SCALE_SHIFT, src_width);
	  
			x += x_step;
			x_start = x >> SCALE_SHIFT;
			dest_x++;
			outbuf++;
		}

		/* middle part: the window lies fully inside the source row,
		 * so the variant without clamping can be used */
		new_outbuf = scale_line (run_weights, filter->x.n, filter->y.n,
		                         outbuf, dest_x,
		                         dest_buf + dest_rowstride * i + run_end_index,
		                         line_bufs, x, x_step, src_width);

		dest_x += (new_outbuf - outbuf);

		x = dest_x * x_step + scaled_x_offset;
		outbuf = new_outbuf;

		/* right edge: remaining pixels, with clamping to the last column */
		while (outbuf < outbuf_end)
		{
			process_pixel (run_weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * (filter->x.n * filter->y.n), 
			               filter->x.n, filter->y.n,
			               outbuf, dest_x,
			               line_bufs, 
			               x >> SCALE_SHIFT, src_width);
	  
			x += x_step;
			++dest_x;
			++outbuf;
		}

		y += y_step;
	}

	g_free (line_bufs);
#undef MYDIV
}

/***************************************************************************
 *
 *   correct_total
 *
 *   Adjusts the n_x * n_y integer weights in place so that their sum
 *    moves from total towards 65536. The difference is first offered
 *    as a whole to the weights (last one first); if no weight can take
 *    it without becoming negative, the difference divided by 2, 3, ...
 *    is tried in turn until it has been distributed or the step
 *    reaches zero.
 *
 ***************************************************************************/

static void 
correct_total (int *weights, 
               int  n_x, 
               int  n_y,
               int  total)
{
	const int correction = (int)(0.5 + 65536 * 1.0) - total;
	int       left = correction;  /* part not yet distributed */
	int       divisor, step, idx;

	if (correction == 0)
		return;

	for (divisor = 1, step = correction;
	     step != 0 && left != 0;
	     divisor++, step = correction / divisor)
	{
		for (idx = n_x * n_y - 1; idx >= 0 && step != 0 && left != 0; idx--)
		{
			/* never push a weight below zero */
			if (weights[idx] + step < 0)
				continue;

			weights[idx] += step;
			left -= step;

			/* less than one full step left: hand out the rest */
			if ((left > 0 && left < step) || (left < 0 && left > step))
				step = left;
		}
	}
}

/***************************************************************************
 *
 *   yuvscale_make_filter_table
 *
 *   Builds the integer weight table for a filter from its separate
 *    horizontal and vertical weight arrays.
 *
 *   The table consists of SUBSAMPLE * SUBSAMPLE cells of n_x * n_y ints
 *    each. The cell for vertical sub-sample position 'py' and horizontal
 *    sub-sample position 'px' starts at index
 *    (py * SUBSAMPLE + px) * n_x * n_y and is stored row by row
 *    (n_x entries per row). Every entry is the product of the matching
 *    x and y weight, scaled by 65536 and rounded; each cell is then
 *    passed through correct_total().
 *
 *   Returns the table, freshly allocated with g_new(); it is not freed
 *    here.
 *
 ***************************************************************************/

static gint *
yuvscale_make_filter_table (PixopsFilter *filter)
{
	const int n_x = filter->x.n;
	const int n_y = filter->y.n;
	int *table = g_new (int, SUBSAMPLE * SUBSAMPLE * n_x * n_y);
	int phase_y, phase_x;

	for (phase_y = 0; phase_y < SUBSAMPLE; phase_y++)
	{
		for (phase_x = 0; phase_x < SUBSAMPLE; phase_x++)
		{
			int *cell = table + ((phase_y * SUBSAMPLE) + phase_x) * n_x * n_y;
			int sum = 0;
			int row, col;

			for (row = 0; row < n_y; row++)
			{
				for (col = 0; col < n_x; col++)
				{
					/* + 0.5 so that the truncation below rounds */
					double scaled = filter->x.weights[(phase_x * n_x) + col]
					              * filter->y.weights[(phase_y * n_y) + row]
					              * 1.0 * 65536 + 0.5;
					int fixed = (int) scaled;

					cell[n_x * row + col] = fixed;
					sum += fixed;
				}
			}

			/* Rounding may leave the cell slightly off its target sum */
			correct_total (cell, n_x, n_y, sum);
		}
	}

	return table;
}

/***************************************************************************
 *
 *   yuvscale_tile_make_weights
 *
 *   Computes the weights for one dimension when the source is
 *    reconstructed by replication and then sampled with a box filter.
 *
 *   Fills in dim->n (= ceil (1/scale + 1)), dim->offset (= 0) and
 *    dim->weights, a newly allocated array of SUBSAMPLE * n doubles
 *    holding n weights for each sub-sample position.
 *
 *   For sub-sample position x the box spans [x, x + 1/scale). The weight
 *    of tap i is the length of the overlap between that box and the
 *    source interval [i, i + 1), multiplied by scale:
 *
 *               x
 *     ---------|--.-|----|--.-|-------  SRC
 *     ------------|---------|---------  DEST
 *
 ***************************************************************************/

static void
yuvscale_tile_make_weights (PixopsFilterDimension *dim, gdouble scale)
{
	gint      n = ceil (1.0 / scale + 1.0);
	gdouble  *table = g_new (gdouble, SUBSAMPLE * n);
	gint      phase, tap;

	dim->n = n;
	dim->offset = 0;
	dim->weights = table;

	for (phase = 0; phase < SUBSAMPLE; phase++)
	{
		/* Box edges for this sub-sample position */
		gdouble box_start = (gdouble)phase / SUBSAMPLE;
		gdouble box_end = box_start + 1 / scale;

		for (tap = 0; tap < n; tap++)
		{
			/* Stays zero when tap and box do not overlap */
			gdouble overlap = 0;

			if (tap < box_start)
			{
				/* Box begins inside this tap (if at all) */
				if (tap + 1 > box_start)
					overlap = (MIN (tap + 1, box_end) - box_start) * scale;
			}
			else if (box_end > tap)
			{
				/* Tap begins inside the box */
				overlap = (MIN (tap + 1, box_end) - tap) * scale;
			}

			table[phase * n + tap] = overlap;
		}
	}
}

/***************************************************************************
 *
 *   yuvscale_linear_box_half
 *
 *   Returns the integral from b0 to b1 of
 *
 *     f(x) = x; 0 <= x < 1
 *     f(x) = 0; otherwise
 *
 *   i.e. the interval [b0, b1] is clipped to [0, 1] and the integral of
 *    x over what is left, 0.5 * (x1^2 - x0^2), is returned. If nothing
 *    is left after clipping the result is 0.
 *
 *   Two of these are combined to compute the convolution of a box
 *    filter with a triangular spike.
 *
 ***************************************************************************/

static gdouble
yuvscale_linear_box_half (gdouble b0, gdouble b1)
{
	const gdouble ramp_start = 0.0;
	const gdouble ramp_end = 1.0;
	gdouble from, to;

	if (ramp_start < b0)
	{
		/* Interval starts at or beyond the end of the ramp */
		if (!(ramp_end > b0))
			return 0.0;

		from = b0;
	}
	else
	{
		/* Interval ends before the ramp begins */
		if (!(b1 > ramp_start))
			return 0.0;

		from = ramp_start;
	}

	to = MIN (ramp_end, b1);

	return 0.5 * (to*to - from*from);
}

/***************************************************************************
 *
 *   yuvscale_bilinear_box_make_weights
 *
 *   Computes the weights for one dimension when the source is
 *    reconstructed with bilinear interpolation and then sampled with a
 *    box filter.
 *
 *   Fills in dim->n (= ceil (1/scale + 3)), dim->offset (= -1.0) and
 *    dim->weights, a newly allocated array of SUBSAMPLE * n doubles
 *    holding n weights for each sub-sample position.
 *
 *   Each weight is the sum of two yuvscale_linear_box_half() integrals,
 *    multiplied by scale.
 *
 ***************************************************************************/

static void
yuvscale_bilinear_box_make_weights (PixopsFilterDimension *dim, gdouble scale)
{
	int n = ceil (1/scale + 3.0);
	double *table = g_new (double, SUBSAMPLE * n);
	int phase, tap;

	dim->offset = -1.0;
	dim->n = n;
	dim->weights = table;

	for (phase = 0; phase < SUBSAMPLE; phase++)
	{
		/* Box edges for this sub-sample position */
		double x = (double)phase / SUBSAMPLE;
		double a = x + 1 / scale;
		double *row = table + phase * n;

		for (tap = 0; tap < n; tap++)
		{
			double part_a = yuvscale_linear_box_half (0.5 + tap - a, 0.5 + tap - x);
			double part_b = yuvscale_linear_box_half (1.5 + x - tap, 1.5 + a - tap);

			row[tap] = (part_a + part_b) * scale;
		}
	}
}

/***************************************************************************
 *
 *   yuvscale_make_weights
 *
 *   Fills in the x and y dimensions of 'filter' for the given scale
 *    factors, using the weight generator that belongs to
 *    YUVSCALE_INTERPOLATION. Interpolation types without a generator
 *    (including PIXOPS_INTERP_NEAREST) end in g_assert_not_reached().
 *
 ***************************************************************************/

static void
yuvscale_make_weights (PixopsFilter *filter, gdouble scale_x, gdouble scale_y)
{
	switch (YUVSCALE_INTERPOLATION)
	{
		/* 'bilinear' shares the 'tiles' weights: for minification
		 *  the two are the same, and we never scale up here */
		case PIXOPS_INTERP_TILES:
		case PIXOPS_INTERP_BILINEAR:
			yuvscale_tile_make_weights (&filter->x, scale_x);
			yuvscale_tile_make_weights (&filter->y, scale_y);
			return;

		case PIXOPS_INTERP_HYPER:
			yuvscale_bilinear_box_make_weights (&filter->x, scale_x);
			yuvscale_bilinear_box_make_weights (&filter->y, scale_y);
			return;

		case PIXOPS_INTERP_NEAREST: /* not implemented */
		default:
			break;
	}

	g_assert_not_reached ();
}

/***************************************************************************
 *
 *   yuvscale_scale_plane_slow
 *
 *   Scales a single plane from 'src' into 'dest_start'.
 *
 *   If ys->process_func is set it does the work; otherwise the generic
 *    pixops_process() is called, which additionally receives
 *    &ys->filter. Both get the scale factors and filter weights stored
 *    in 'ys'.
 *
 ***************************************************************************/

static inline void
yuvscale_scale_plane_slow (ThYuvScale *ys, 
                           guint8 *src, guint8 *dest_start,
                           guint src_width, guint src_height, 
                           guint dest_width, guint dest_height, 
                           guint src_stride, guint dest_stride)
{
	/* Expected case: a dedicated routine has been selected */
	if (G_LIKELY (ys->process_func))
	{
		ys->process_func (dest_start, dest_width, dest_height, dest_stride, 
		                  src, src_width, src_height, src_stride, 
		                  ys->scale_x, ys->scale_y,
		                  ys->filter_weights);
		return;
	}

	/* No dedicated routine: use the generic one */
	pixops_process (dest_start, dest_width, dest_height, dest_stride,
	                src, src_width, src_height, src_stride,
	                ys->scale_x, ys->scale_y,
	                ys->filter_weights, &ys->filter);
}

/***************************************************************************
 *
 *   yuvscale_scale
 *
 *   Scales one picture from 'src_yuv' into 'dest_yuv', one plane after
 *    the other (Y, U, V). Plane positions and row strides come from the
 *    I420_* macros; the Y plane is scaled from from_width x from_height
 *    to to_width x to_height, the U and V planes at half of those
 *    dimensions (>> 1).
 *
 ***************************************************************************/

static void
yuvscale_scale (ThYuvScale *ys, guint8 *src_yuv, guint8 *dest_yuv)
{
	guint8 *src_plane;
	guint8 *dest_plane;

	/* Y plane */
	src_plane  = src_yuv + I420_Y_OFFSET (ys->from_width, ys->from_height);
	dest_plane = dest_yuv + I420_Y_OFFSET (ys->to_width, ys->to_height);
	yuvscale_scale_plane_slow (ys, src_plane, dest_plane,
	                           ys->from_width, ys->from_height,
	                           ys->to_width, ys->to_height,
	                           I420_Y_ROWSTRIDE (ys->from_width),
	                           I420_Y_ROWSTRIDE (ys->to_width));

	/* U plane */
	src_plane  = src_yuv + I420_U_OFFSET (ys->from_width, ys->from_height);
	dest_plane = dest_yuv + I420_U_OFFSET (ys->to_width, ys->to_height);
	yuvscale_scale_plane_slow (ys, src_plane, dest_plane,
	                           (ys->from_width >> 1), (ys->from_height >> 1), 
	                           (ys->to_width >> 1), (ys->to_height >> 1),
	                           I420_U_ROWSTRIDE (ys->from_width),
	                           I420_U_ROWSTRIDE (ys->to_width));

	/* V plane */
	src_plane  = src_yuv + I420_V_OFFSET (ys->from_width, ys->from_height);
	dest_plane = dest_yuv + I420_V_OFFSET (ys->to_width, ys->to_height);
	yuvscale_scale_plane_slow (ys, src_plane, dest_plane,
	                           (ys->from_width >> 1), (ys->from_height >> 1), 
	                           (ys->to_width >> 1), (ys->to_height >> 1),
	                           I420_V_ROWSTRIDE (ys->from_width),
	                           I420_V_ROWSTRIDE (ys->to_width));
}

/***************************************************************************
 *
 *   yuvscale_setup_pixops_func
 *
 *   Chooses the routine that will scale the planes and stores it in
 *    ys->process_func.
 *
 *   With bilinear interpolation, pixops_process_3_3() is chosen when
 *    both ys->scale_x and ys->scale_y lie within EPSILON of one of the
 *    scale factors listed below. In every other case ys->process_func
 *    is set to NULL, which selects the generic pixops_process().
 *
 *   The choice is reported via g_print().
 *
 ***************************************************************************/

#define EPSILON 0.01

static void
yuvscale_setup_pixops_func (ThYuvScale *ys)
{
	/* Scale factors handled by pixops_process_3_3(), in the order
	 *  in which they are tried:
	 *
	 *   TH_PICTURE_SIZE_MEDIUM_SMALL: 1/2, 0.500, dim.n = 3
	 *   TH_PICTURE_SIZE_MEDIUM_LARGE: 2/3, 0.666, dim.n = 3
	 *   TH_PICTURE_SIZE_LARGE:        3/4, 0.750, dim.n = 3
	 *   TH_PICTURE_SIZE_XLARGE:       5/6, 0.833, dim.n = 3
	 */
	static const double special_scales[] = {
		0.50, 0.666666666, 0.75, 0.8333333
	};
	unsigned int k;

	/* see if we can use any of our specialised routines */
	if (YUVSCALE_INTERPOLATION == PIXOPS_INTERP_BILINEAR)
	{
		for (k = 0; k < sizeof (special_scales) / sizeof (special_scales[0]); k++)
		{
			if (fabs (ys->scale_x - special_scales[k]) <= EPSILON
			 && fabs (ys->scale_y - special_scales[k]) <= EPSILON)
			{
				ys->process_func = pixops_process_3_3;
				g_print ("YuvScale: pixops_process_3_3() for scaling to %.3f\n", ys->scale_x);
				return;
			}
		}
	}

	/* nothing specialised fits: fall back to the generic pixops_process() */
	ys->process_func = NULL; 
	g_print ("YuvScale: using generic pixops_process() for scaling to %.3f\n", ys->scale_x);
}
