/*
 * Copyright 2009 Nicolai Haehnle.
 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "util/u_math.h"

#include "radeon_dataflow.h"

#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
#include "radeon_list.h"
#include "radeon_swizzle.h"
#include "radeon_variable.h"

/**
 * Context passed to src_clobbered_reads_cb().  It describes one register
 * write (file, index, channel mask) and carries the reader data whose
 * AbortOnRead field is set when that write overlaps a source being checked.
 */
struct src_clobbered_reads_cb_data {
	rc_register_file File;			/* register file of the write */
	unsigned int Index;			/* register index of the write */
	unsigned int Mask;			/* channel mask of the write */
	struct rc_reader_data * ReaderData;	/* reader data to flag */
};

/**
 * Callback invoked by presub_helper() once per reader source that should be
 * switched to a presubtract operand.  The arguments are, in order: the
 * instruction being replaced, the reader instruction, and the index of the
 * reader's source register to rewrite.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *,
						struct rc_instruction *,
						unsigned int);

static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
{
	struct rc_src_register combine;
	combine.File = inner.File;
	combine.Index = inner.Index;
	combine.RelAddr = inner.RelAddr;
	if (outer.Abs) {
		combine.Abs = 1;
		combine.Negate = outer.Negate;
	} else {
		combine.Abs = inner.Abs;
		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
		combine.Negate ^= outer.Negate;
	}
	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
	return combine;
}

/**
 * Read callback handed to rc_get_readers() by copy_propagate().  Sets
 * reader_data->Abort when the source of the writer (the MOV) must not be
 * forwarded into \p src of \p inst.
 */
static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
						struct rc_src_register * src)
{
	struct rc_reader_data * rd = data;
	const rc_register_file read_file = src->File;
	rc_register_file mov_src_file;
	int is_tex_or_kil;

	/* The reader must be able to take the writer's presubtract operation. */
	if (!rc_inst_can_use_presub(rd->C, inst,
			rd->Writer->U.I.PreSub.Opcode,
			rc_swizzle_to_writemask(src->Swizzle),
			src,
			&rd->Writer->U.I.PreSub.SrcReg[0],
			&rd->Writer->U.I.PreSub.SrcReg[1]))
		goto reject;

	/* XXX This could probably be handled better. */
	if (read_file == RC_FILE_ADDRESS)
		goto reject;

	mov_src_file = rd->Writer->U.I.SrcReg[0].File;

	/* R300/R400 is unhappy about propagating
	 *  0: MOV temp[1], -none.1111;
	 *  1: KIL temp[1];
	 * to
	 *  0: KIL -none.1111;
	 *
	 * R500 is fine with it.
	 */
	if (inst->U.I.Opcode == RC_OPCODE_KIL &&
	    mov_src_file == RC_FILE_NONE &&
	    !rd->C->is_r500)
		goto reject;

	/* These instructions cannot read from the constants file.
	 * see radeonTransformTEX()
	 */
	switch (inst->U.I.Opcode) {
	case RC_OPCODE_TEX:
	case RC_OPCODE_TXB:
	case RC_OPCODE_TXP:
	case RC_OPCODE_TXD:
	case RC_OPCODE_TXL:
	case RC_OPCODE_KIL:
		is_tex_or_kil = 1;
		break;
	default:
		is_tex_or_kil = 0;
		break;
	}
	if (is_tex_or_kil &&
	    mov_src_file != RC_FILE_TEMPORARY &&
	    mov_src_file != RC_FILE_INPUT &&
	    mov_src_file != RC_FILE_NONE)
		goto reject;

	return;

reject:
	rd->Abort = 1;
}

/**
 * Source-read callback: flags the reader data (AbortOnRead = all channels)
 * when \p src overlaps the register write described by \p data, or when
 * \p src is relatively addressed and the write targets the address file.
 */
static void src_clobbered_reads_cb(
	void * data,
	struct rc_instruction * inst,
	struct rc_src_register * src)
{
	struct src_clobbered_reads_cb_data * write = data;

	const int reads_written_reg =
		src->File == write->File
		&& src->Index == write->Index
		&& (rc_swizzle_to_writemask(src->Swizzle) & write->Mask);
	const int depends_on_address =
		src->RelAddr && write->File == RC_FILE_ADDRESS;

	if (reads_written_reg || depends_on_address)
		write->ReaderData->AbortOnRead = RC_MASK_XYZW;
}

/**
 * Write callback handed to rc_get_readers(): for the write (file, index,
 * mask), runs src_clobbered_reads_cb() over every source read by the writer
 * instruction stored in the reader data.
 */
static void is_src_clobbered_scan_write(
	void * data,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct rc_reader_data * rd = data;
	struct src_clobbered_reads_cb_data write = {
		.File = file,
		.Index = index,
		.Mask = mask,
		.ReaderData = rd,
	};

	rc_for_all_reads_src(rd->Writer, src_clobbered_reads_cb, &write);
}

static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
{
	struct rc_reader_data reader_data;
	unsigned int i;

	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
	    inst_mov->U.I.WriteALUResult)
		return;

	/* Get a list of all the readers of this MOV instruction. */
	reader_data.ExitOnAbort = 1;
	rc_get_readers(c, inst_mov, &reader_data,
		       copy_propagate_scan_read, NULL,
		       is_src_clobbered_scan_write);

	if (reader_data.Abort || reader_data.ReaderCount == 0)
		return;

	/* We can propagate SaturateMode if all the readers are MOV instructions
	 * without a presubtract operation, source negation and absolute.
	 * In that case, we just move SaturateMode to all readers. */
        if (inst_mov->U.I.SaturateMode) {
		for (i = 0; i < reader_data.ReaderCount; i++) {
			struct rc_instruction * inst = reader_data.Readers[i].Inst;

			if (inst->U.I.Opcode != RC_OPCODE_MOV ||
			    inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
			    inst->U.I.SrcReg[0].Abs ||
			    inst->U.I.SrcReg[0].Negate) {
				return;
			}
		}
	}

	/* Propagate the MOV instruction. */
	for (i = 0; i < reader_data.ReaderCount; i++) {
		struct rc_instruction * inst = reader_data.Readers[i].Inst;
		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);

		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
			inst->U.I.PreSub = inst_mov->U.I.PreSub;
		if (!inst->U.I.SaturateMode)
			inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
	}

	/* Finally, remove the original MOV instruction */
	rc_remove_instruction(inst_mov);
}

/**
 * Check if a source register is actually always the same
 * swizzle constant.
 *
 * @param src      source register to inspect
 * @param pswz     receives the constant swizzle shared by all used channels;
 *                 set to 0 on failure
 * @param pnegate  receives the negate bit shared by all used channels
 * @return 1 if \p src has file RC_FILE_NONE and every used channel selects
 *         the same constant swizzle with the same negate bit, 0 otherwise.
 *
 * If no channel is used at all the function still returns 1; in that case
 * *pswz is RC_SWIZZLE_UNUSED and *pnegate is 0.
 */
static int is_src_uniform_constant(struct rc_src_register src,
		rc_swizzle * pswz, unsigned int * pnegate)
{
	int have_used = 0;

	if (src.File != RC_FILE_NONE) {
		*pswz = 0;
		return 0;
	}

	for(unsigned int chan = 0; chan < 4; ++chan) {
		unsigned int swz = GET_SWZ(src.Swizzle, chan);
		/* Swizzles below 4 select a register channel, not a constant. */
		if (swz < 4) {
			*pswz = 0;
			return 0;
		}
		if (swz == RC_SWIZZLE_UNUSED)
			continue;

		if (!have_used) {
			*pswz = swz;
			*pnegate = GET_BIT(src.Negate, chan);
			have_used = 1;
		} else {
			if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
				*pswz = 0;
				return 0;
			}
		}
	}

	/* Every channel was unused: give the outputs defined values instead
	 * of leaving them unwritten for the caller to read. */
	if (!have_used) {
		*pswz = RC_SWIZZLE_UNUSED;
		*pnegate = 0;
	}

	return 1;
}

/**
 * Replace 0.0, 1.0 and 0.5 immediate constants by their
 * respective swizzles. Simplify instructions like ADD dst, src, 0;
 *
 * For every source of \p inst:
 *  - if no channel selects a register channel (swizzle <= 3), the source
 *    file is set to RC_FILE_NONE;
 *  - otherwise, if the source is a non-relative, in-range immediate
 *    constant, each channel whose absolute value is 0.0, 1.0 or (when
 *    c->has_half_swizzles is set) 0.5 is replaced by the matching constant
 *    swizzle, flipping the channel's negate bit for negative values.
 * The rewritten source is only stored if SwizzleCaps->IsNative() accepts it.
 * Finally, source slots beyond the opcode's NumSrcRegs are zeroed.
 */
static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
{
	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
	unsigned int i;

	/* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
	for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
		struct rc_constant * constant;
		struct rc_src_register newsrc;
		int have_real_reference;
		unsigned int chan;

		/* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
		for (chan = 0; chan < 4; ++chan)
			if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
				break;
		if (chan == 4) {
			inst->U.I.SrcReg[src].File = RC_FILE_NONE;
			continue;
		}

		/* Convert immediates to swizzles. */
		/* Only a directly addressed, in-range constant can be looked up. */
		if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
		    inst->U.I.SrcReg[src].RelAddr ||
		    inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
			continue;

		constant =
			&c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];

		if (constant->Type != RC_CONSTANT_IMMEDIATE)
			continue;

		/* Work on a copy so a rejected rewrite leaves the source untouched. */
		newsrc = inst->U.I.SrcReg[src];
		have_real_reference = 0;
		for (chan = 0; chan < 4; ++chan) {
			unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
			unsigned int newswz;
			float imm;
			float baseimm;

			/* Channels that do not select x/y/z/w stay as they are. */
			if (swz >= 4)
				continue;

			/* baseimm is the magnitude of the immediate. */
			imm = constant->u.Immediate[swz];
			baseimm = imm;
			if (imm < 0.0)
				baseimm = -baseimm;

			if (baseimm == 0.0) {
				newswz = RC_SWIZZLE_ZERO;
			} else if (baseimm == 1.0) {
				newswz = RC_SWIZZLE_ONE;
			} else if (baseimm == 0.5 && c->has_half_swizzles) {
				newswz = RC_SWIZZLE_HALF;
			} else {
				/* This channel still needs the real constant. */
				have_real_reference = 1;
				continue;
			}

			SET_SWZ(newsrc.Swizzle, chan, newswz);
			/* The constant swizzle carries the magnitude only, so the
			 * sign of a negative immediate moves into the negate bit
			 * (skipped when Abs is set on the source). */
			if (imm < 0.0 && !newsrc.Abs)
				newsrc.Negate ^= 1 << chan;
		}

		/* No channel reads the constant register any more. */
		if (!have_real_reference) {
			newsrc.File = RC_FILE_NONE;
			newsrc.Index = 0;
		}

		/* don't make the swizzle worse */
		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
			continue;

		inst->U.I.SrcReg[src] = newsrc;
	}

	/* In case this instruction has been converted, make sure all of the
	 * registers that are no longer used are empty. */
	opcode = rc_get_opcode_info(inst->U.I.Opcode);
	for(i = opcode->NumSrcRegs; i < 3; i++) {
		memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
	}
}

/**
 * Returns the writemask of the components read by \p src when \p src and
 * \p dst refer to the same register (same file and index); returns zero
 * when they refer to different registers.
 */
static unsigned int src_reads_dst_mask(struct rc_src_register src,
						struct rc_dst_register dst)
{
	const int same_register =
		dst.File == src.File && dst.Index == src.Index;

	if (same_register)
		return rc_swizzle_to_writemask(src.Swizzle);

	return 0;
}

/* Return 1 if any channel of the source register selects a constant
 * swizzle (0, 0.5 or 1.0).  Return 0 otherwise. */
static int src_has_const_swz(struct rc_src_register src) {
	unsigned int chan = 0;

	while (chan < 4) {
		switch (GET_SWZ(src.Swizzle, chan)) {
		case RC_SWIZZLE_ZERO:
		case RC_SWIZZLE_HALF:
		case RC_SWIZZLE_ONE:
			return 1;
		default:
			break;
		}
		chan++;
	}
	return 0;
}

static void presub_scan_read(
	void * data,
	struct rc_instruction * inst,
	struct rc_src_register * src)
{
	struct rc_reader_data * reader_data = data;
	rc_presubtract_op * presub_opcode = reader_data->CbData;

	if (!rc_inst_can_use_presub(reader_data->C,
			inst,
			*presub_opcode,
			reader_data->Writer->U.I.DstReg.WriteMask,
			src,
			&reader_data->Writer->U.I.SrcReg[0],
			&reader_data->Writer->U.I.SrcReg[1])) {
		reader_data->Abort = 1;
		return;
	}
}

/**
 * Collect all readers of \p inst_add and, if every one of them can use the
 * presubtract operation \p presub_opcode, call \p presub_replace for each
 * reader source that reads the result.
 *
 * @return 1 if the readers were rewritten, 0 if reader collection aborted
 *         or there were no readers (nothing is modified in that case).
 */
static int presub_helper(
	struct radeon_compiler * c,
	struct rc_instruction * inst_add,
	rc_presubtract_op presub_opcode,
	rc_presub_replace_fn presub_replace)
{
	struct rc_reader_data reader_data;
	rc_presubtract_op cb_op = presub_opcode;
	unsigned int r;

	reader_data.CbData = &cb_op;
	reader_data.ExitOnAbort = 1;
	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
						is_src_clobbered_scan_write);

	if (reader_data.ReaderCount == 0 || reader_data.Abort)
		return 0;

	for (r = 0; r < reader_data.ReaderCount; r++) {
		const struct rc_reader * reader = &reader_data.Readers[r];
		const struct rc_opcode_info * info =
				rc_get_opcode_info(reader->Inst->U.I.Opcode);
		unsigned int slot = 0;

		/* Find which source slot of the reader this read refers to. */
		while (slot < info->NumSrcRegs) {
			if (reader->U.I.Src == &reader->Inst->U.I.SrcReg[slot])
				presub_replace(inst_add, reader->Inst, slot);
			slot++;
		}
	}
	return 1;
}

/**
 * presub_helper() callback for ADD: make source \p src_index of
 * \p inst_reader read a presubtract ADD/SUB of the two sources of
 * \p inst_add instead of the ADD's result.
 */
static void presub_replace_add(
	struct rc_instruction * inst_add,
	struct rc_instruction * inst_reader,
	unsigned int src_index)
{
	const int neg0 = inst_add->U.I.SrcReg[0].Negate != 0;
	const int neg1 = inst_add->U.I.SrcReg[1].Negate != 0;
	/* Exactly one negated operand turns the addition into a subtraction. */
	const int is_sub = neg0 != neg1;
	/* When src1 is the negated operand the two operands are swapped. */
	const int swap = is_sub && neg1;
	const rc_presubtract_op presub_opcode =
			is_sub ? RC_PRESUB_SUB : RC_PRESUB_ADD;
	/* If both sources are negative we can move the negate to the presub. */
	const unsigned negate_mask =
			is_sub ? 0 : inst_add->U.I.SrcReg[0].Negate;

	assert(!(neg0 && neg1) ||
	       inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);

	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[swap ? 1 : 0];
	inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[swap ? 0 : 1];
	inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
	inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
	inst_reader->U.I.PreSub.Opcode = presub_opcode;

	/* Chain the reader's source through the first presub operand, then
	 * point it at the presubtract result. */
	inst_reader->U.I.SrcReg[src_index] =
			chain_srcregs(inst_reader->U.I.SrcReg[src_index],
					inst_reader->U.I.PreSub.SrcReg[0]);
	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
}

/**
 * Basic eligibility test shared by the presubtract peepholes.
 *
 * @return 1 if \p inst (an ADD or MAD) may be turned into a presubtract
 *         operation for its readers, 0 otherwise.
 */
static int is_presub_candidate(
	struct radeon_compiler * c,
	struct rc_instruction * inst)
{
	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
	unsigned int const_src_count = 0;
	unsigned int i;

	assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);

	/* The instruction must not already use a presubtract operation or
	 * any output modification. */
	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE)
		return 0;
	if (inst->U.I.SaturateMode || inst->U.I.WriteALUResult || inst->U.I.Omod)
		return 0;

	/* If first two sources use a constant swizzle, then we can't convert it to
	 * a presubtract operation.  In fact for the ADD and SUB presubtract
	 * operations neither source can contain a constant swizzle.  This
	 * specific case is checked in peephole_add_presub_add() when
	 * we make sure the swizzles for both sources are equal, so we
	 * don't need to worry about it here. */
	for (i = 0; i < 2; i++) {
		int chan;
		for (chan = 0; chan < 4; chan++) {
			rc_swizzle swz =
				get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
			if (swz == RC_SWIZZLE_ZERO
					|| swz == RC_SWIZZLE_HALF
					|| swz == RC_SWIZZLE_ONE) {
				const_src_count++;
				break;
			}
		}
	}
	if (const_src_count == 2)
		return 0;

	for (i = 0; i < info->NumSrcRegs; i++) {
		struct rc_src_register src = inst->U.I.SrcReg[i];

		/* A source that reads the destination register disqualifies. */
		if (src_reads_dst_mask(src, inst->U.I.DstReg))
			return 0;

		/* The swizzle must stay native when read from the presub file. */
		src.File = RC_FILE_PRESUB;
		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
			return 0;
	}
	return 1;
}

/**
 * Try to replace an ADD by a presubtract ADD/SUB in all of its readers.
 *
 * @return
 * 	0 if the ADD instruction is still part of the program.
 * 	1 if the ADD instruction is no longer part of the program.
 */
static int peephole_add_presub_add(
	struct radeon_compiler * c,
	struct rc_instruction * inst_add)
{
	const unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
	const unsigned src0_negate = inst_add->U.I.SrcReg[0].Negate;
	const unsigned src1_negate = inst_add->U.I.SrcReg[1].Negate;

	/* Both sources have to use the same swizzle. */
	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
		return 0;

	/* src0 and src1 can't have absolute values */
	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
		return 0;

	/* if src0 is negative, at least all bits of dstmask have to be set */
	if (src0_negate && (src0_negate & dstmask) != dstmask)
		return 0;

	/* if src1 is negative, at least all bits of dstmask have to be set */
	if (src1_negate && (src1_negate & dstmask) != dstmask)
		return 0;

	if (!is_presub_candidate(c, inst_add))
		return 0;

	if (!presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add))
		return 0;

	rc_remove_instruction(inst_add);
	return 1;
}

/**
 * presub_helper() callback for PRESUB_INV: make source \p src_index of
 * \p inst_reader read the presubtract result, using the second source of
 * \p inst_add (with its negate cleared) as the presubtract operand.
 */
static void presub_replace_inv(
	struct rc_instruction * inst_add,
	struct rc_instruction * inst_reader,
	unsigned int src_index)
{
	/* inst_add is only read here: it may remain part of the program. */
	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;

	inst_reader->U.I.SrcReg[src_index] =
		chain_srcregs(inst_reader->U.I.SrcReg[src_index],
			      inst_reader->U.I.PreSub.SrcReg[0]);
	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
}

/**
 * presub_helper() callback for PRESUB_BIAS: make source \p src_index of
 * \p inst_reader read the presubtract result, using the first source of
 * \p inst_mad (with its negate cleared) as the presubtract operand.
 */
static void presub_replace_bias(
	struct rc_instruction * inst_mad,
	struct rc_instruction * inst_reader,
	unsigned int src_index)
{
	/* inst_mad is only read here: it may remain part of the program. */
	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
	inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;

	inst_reader->U.I.SrcReg[src_index] =
		chain_srcregs(inst_reader->U.I.SrcReg[src_index],
			      inst_reader->U.I.PreSub.SrcReg[0]);
	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
}

/**
 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
 * Replace every read of TEMP[0] by the presubtract 1 - src0.  This requires
 * the first ADD source to carry the constant-1 swizzle in all written
 * channels.  Constant registers are not inspected for the value 1.0, so
 * run this after the constant_folding optimization.
 * @return
 * 	0 if the ADD instruction is still part of the program.
 * 	1 if the ADD instruction is no longer part of the program.
 */
static int peephole_add_presub_inv(
	struct radeon_compiler * c,
	struct rc_instruction * inst_add)
{
	const unsigned int wmask = inst_add->U.I.DstReg.WriteMask;
	unsigned int chan;

	if (!is_presub_candidate(c, inst_add))
		return 0;

	/* Check if src0 is 1. */
	/* XXX It would be nice to use is_src_uniform_constant here, but that
	 * function only works if the register's file is RC_FILE_NONE */
	for (chan = 0; chan < 4; chan++) {
		const unsigned int bit = 1u << chan;

		if ((wmask & bit) == 0)
			continue;
		if (GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, chan) != RC_SWIZZLE_ONE)
			return 0;
		if (inst_add->U.I.SrcReg[0].Negate & bit)
			return 0;
	}

	/* Check src1: negated in every written channel, no absolute value,
	 * no constant swizzles. */
	if ((inst_add->U.I.SrcReg[1].Negate & wmask) != wmask)
		return 0;
	if (inst_add->U.I.SrcReg[1].Abs)
		return 0;
	if (src_has_const_swz(inst_add->U.I.SrcReg[1]))
		return 0;

	if (!presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv))
		return 0;

	rc_remove_instruction(inst_add);
	return 1;
}

/**
 * PRESUB_BIAS: MAD TEMP[0], -TEMP[1], 2.0, 1.0
 * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The third
 * source of the MAD instruction must have the constant 1 swizzle, the
 * second source must be the value 2.0 (an inline constant or an immediate)
 * and the first source must be negated in every written channel.  This
 * function does not check const registers to see if their value is 1.0,
 * so it should be called after the constant_folding optimization.
 * @return
 * 	0 if the MAD instruction is still part of the program.
 * 	1 if the MAD instruction is no longer part of the program.
 */
static int peephole_mad_presub_bias(
	struct radeon_compiler * c,
	struct rc_instruction * inst_mad)
{
	unsigned int i, swz;

	if (!is_presub_candidate(c, inst_mad))
		return 0;

	/* Check if src2 is 1. */
	for(i = 0; i < 4; i++ ) {
		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
			continue;

		swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
		if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
			return 0;
	}

	/* Check if src1 is 2. */
	struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
	if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
		return 0;
	if (src1_reg.File == RC_FILE_INLINE) {
		if (rc_inline_to_float(src1_reg.Index) != 2.0f)
			return 0;
	} else {
		if (src1_reg.File != RC_FILE_CONSTANT)
			return 0;

		/* Same guard as constant_folding(): a relatively addressed
		 * source does not name a fixed constant slot, and an
		 * out-of-range index must not be used to index the table. */
		if (src1_reg.RelAddr ||
		    src1_reg.Index >= c->Program.Constants.Count)
			return 0;

		struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
		if (constant->Type != RC_CONSTANT_IMMEDIATE)
			return 0;
		for (i = 0; i < 4; i++) {
			if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
				continue;
			swz = GET_SWZ(src1_reg.Swizzle, i);
			/* Only x/y/z/w swizzles index the immediate vector. */
			if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
				return 0;
		}
	}

	/* Check src0: negated in every written channel, no absolute value,
	 * no constant swizzles. */
	if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
						inst_mad->U.I.DstReg.WriteMask
		|| inst_mad->U.I.SrcReg[0].Abs
		|| src_has_const_swz(inst_mad->U.I.SrcReg[0])) {

		return 0;
	}

	if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
		rc_remove_instruction(inst_mad);
		return 1;
	}
	return 0;
}

/**
 * Context for omod_filter_reader_cb() / omod_filter_writer_cb(): the
 * destination register being tracked and a flag the callbacks set when an
 * instruction reads or writes that register.
 */
struct peephole_mul_cb_data {
	struct rc_dst_register * Writer;	/* destination register to watch */
	unsigned int Clobbered;			/* set to 1 by the callbacks on a hit */
};

/**
 * Read-mask callback: sets Clobbered when the read (file, index, mask)
 * overlaps the watched destination register according to
 * rc_src_reads_dst_mask().
 */
static void omod_filter_reader_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data * cb = userdata;

	if (!rc_src_reads_dst_mask(file, mask, index,
			cb->Writer->File, cb->Writer->Index,
			cb->Writer->WriteMask))
		return;

	cb->Clobbered = 1;
}

/**
 * Write-mask callback: sets Clobbered when the write targets the watched
 * destination register (same file and index) and shares at least one
 * channel with its writemask.
 */
static void omod_filter_writer_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data * cb = userdata;
	const int same_register =
		file == cb->Writer->File && index == cb->Writer->Index;

	if (same_register && (mask & cb->Writer->WriteMask))
		cb->Clobbered = 1;
}

/**
 * Try to fold a MUL of a temporary by 2, 4, 8, 1/2, 1/4 or 1/8 into the
 * output modifier (Omod) of the instruction(s) that write that temporary,
 * retargeting those writers to the MUL's destination and removing the MUL.
 *
 * @param c         compiler context
 * @param inst_mul  the MUL instruction to fold
 * @param var_list  variable list used to look up the writers of the
 *                  MUL's temporary source
 * @return
 * 	0 if inst_mul is still part of the program.
 * 	1 if inst_mul is no longer part of the program.
 */
static int peephole_mul_omod(
	struct radeon_compiler * c,
	struct rc_instruction * inst_mul,
	struct rc_list * var_list)
{
	unsigned int chan = 0, swz, i;
	int const_index = -1;
	int temp_index = -1;
	float const_value;
	rc_omod_op omod_op = RC_OMOD_DISABLE;
	struct rc_list * writer_list;
	struct rc_variable * var;
	struct peephole_mul_cb_data cb_data;
	unsigned writemask_sum;

	/* Classify the two sources: exactly one must be a temporary and the
	 * other a constant (constant file or constant swizzle). */
	for (i = 0; i < 2; i++) {
		unsigned int j;
		if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY
			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
			return 0;
		}

		/* The only relevant case with constant swizzles we should check for
		 * is multiply by one half.
		 */
		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
			for (j = 0; j < 4; j++) {
				swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
				if (swz == RC_SWIZZLE_UNUSED) {
					continue;
				}
				if (swz != RC_SWIZZLE_HALF) {
					return 0;
				} else {
					omod_op = RC_OMOD_DIV_2;
				}
			}
		}

		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
			if (temp_index != -1) {
				/* The instruction has two temp sources */
				return 0;
			} else {
				temp_index = i;
				continue;
			}
		}
		/* If we get this far Src[i] must be a constant src */
		if (inst_mul->U.I.SrcReg[i].Negate) {
			return 0;
		}
		/* The constant src needs to read from the same swizzle */
		/* chan remembers the first used channel; it is the channel
		 * passed to rc_get_constant_value() below. */
		swz = RC_SWIZZLE_UNUSED;
		chan = 0;
		for (j = 0; j < 4; j++) {
			unsigned int j_swz =
				GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
			if (j_swz == RC_SWIZZLE_UNUSED) {
				continue;
			}
			if (swz == RC_SWIZZLE_UNUSED) {
				swz = j_swz;
				chan = j;
			} else if (j_swz != swz) {
				return 0;
			}
		}

		if (const_index != -1) {
			/* The instruction has two constant sources */
			return 0;
		} else {
			const_index = i;
		}
	}

	/* Two temporaries or two constants returned 0 above, so at this point
	 * both temp_index and const_index have been assigned. */

	/* No one-half swizzle was found: the factor has to come from an
	 * immediate constant that maps onto one of the omod operations. */
	if (omod_op == RC_OMOD_DISABLE) {
		if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
					inst_mul->U.I.SrcReg[const_index].Index)) {
			return 0;
		}
		const_value = rc_get_constant_value(c,
				inst_mul->U.I.SrcReg[const_index].Index,
				inst_mul->U.I.SrcReg[const_index].Swizzle,
				inst_mul->U.I.SrcReg[const_index].Negate,
				chan);

		if (const_value == 2.0f) {
			omod_op = RC_OMOD_MUL_2;
		} else if (const_value == 4.0f) {
			omod_op = RC_OMOD_MUL_4;
		} else if (const_value == 8.0f) {
			omod_op = RC_OMOD_MUL_8;
		} else if (const_value == (1.0f / 2.0f)) {
			omod_op = RC_OMOD_DIV_2;
		} else if (const_value == (1.0f / 4.0f)) {
			omod_op = RC_OMOD_DIV_4;
		} else if (const_value == (1.0f / 8.0f)) {
			omod_op = RC_OMOD_DIV_8;
		} else {
			return 0;
		}
	}

	/* NOTE(review): judging by its name, this presumably yields the
	 * writers of the temporary only when the MUL is its sole reader --
	 * confirm against radeon_variable.c. */
	writer_list = rc_variable_list_get_writers_one_reader(var_list,
		RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);

	if (!writer_list) {
		return 0;
	}

	cb_data.Clobbered = 0;
	cb_data.Writer = &inst_mul->U.I.DstReg;
	/* Validate every writer, and make sure no instruction between a
	 * writer and the MUL reads or writes the MUL's destination. */
	for (var = writer_list->Item; var; var = var->Friend) {
		struct rc_instruction * inst;
		const struct rc_opcode_info * info = rc_get_opcode_info(
				var->Inst->U.I.Opcode);
		if (info->HasTexture) {
			return 0;
		}
		if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
			return 0;
		}

		/* Empirical testing shows that DDX/DDY directly into output
		 * with non-identity omod is problematic.
		 */
		if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
			inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
			return 0;
		}

		/* Walk backwards from the MUL to this writer. */
		for (inst = inst_mul->Prev; inst != var->Inst;
							inst = inst->Prev) {
			rc_for_all_reads_mask(inst, omod_filter_reader_cb,
								&cb_data);
			rc_for_all_writes_mask(inst, omod_filter_writer_cb,
								&cb_data);
			if (cb_data.Clobbered) {
				break;
			}
		}
	}

	if (cb_data.Clobbered) {
		return 0;
	}

	writemask_sum = rc_variable_writemask_sum(writer_list->Item);

	/* rc_normal_rewrite_writemask can't expand a previous writemask to store
	 * more channels replicated.
	 */
	if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
		return 0;

	/* Rewrite the instructions */
	for (var = writer_list->Item; var; var = var->Friend) {
		struct rc_variable * writer = var;
		unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
		/* For each MUL channel that reads component swz of the
		 * temporary, record that channel in slot swz of the
		 * conversion swizzle. */
		for (chan = 0; chan < 4; chan++) {
			unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
			if (swz <= RC_SWIZZLE_W)
				SET_SWZ(conversion_swizzle, swz, chan);
		}
		/* The writer takes over the MUL's destination, its saturate
		 * mode and the scaling (as an output modifier). */
		writer->Inst->U.I.Omod = omod_op;
		writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
		writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
		rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
		writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
	}

	rc_remove_instruction(inst_mul);

	return 1;
}

/**
 * Run the presubtract peepholes that apply to \p inst: PRESUB_INV and then
 * PRESUB_ADD/SUB for ADD, PRESUB_BIAS for MAD.  Other opcodes are left
 * alone.  \p data is not used.
 * @return
 * 	0 if inst is still part of the program.
 * 	1 if inst is no longer part of the program.
 */
int
rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
{
	if (inst->U.I.Opcode == RC_OPCODE_ADD) {
		/* INV is tried first; ADD/SUB only if INV did not apply. */
		if (peephole_add_presub_inv(c, inst) ||
		    peephole_add_presub_add(c, inst))
			return 1;
		return 0;
	}

	if (inst->U.I.Opcode == RC_OPCODE_MAD) {
		if (peephole_mad_presub_bias(c, inst))
			return 1;
		return 0;
	}

	return 0;
}

/**
 * Merge two swizzles channel by channel: each channel takes the value from
 * \p swz1 unless it is unused there, in which case the value from \p swz2
 * is taken (which may itself be unused).
 */
static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2)
{
	unsigned int merged = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
	unsigned int chan;

	for (chan = 0; chan < 4; chan++) {
		unsigned int pick = GET_SWZ(swz1, chan);

		if (pick == RC_SWIZZLE_UNUSED)
			pick = GET_SWZ(swz2, chan);

		SET_SWZ(merged, chan, pick);
	}
	return merged;
}

/* Returns the negate bits of src with the bits of unused channels cleared. */
static unsigned int clean_negate(struct rc_src_register src)
{
	unsigned int used_channels = 0;
	unsigned int chan;

	for (chan = 0; chan < 4; chan++) {
		if (GET_SWZ(src.Swizzle, chan) != RC_SWIZZLE_UNUSED)
			used_channels |= 1 << chan;
	}
	return src.Negate & used_channels;
}

/* Union of the negate bits of both sources, counting only the channels
 * each source actually uses. */
static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
	const unsigned int first = clean_negate(src1);
	const unsigned int second = clean_negate(src2);

	return first | second;
}

/**
 * Returns \p orig_swz with every channel that is both unused and enabled in
 * \p wmask set to \p const_swz.  All other channels are left unchanged.
 */
static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
{
	unsigned int filled = orig_swz;
	unsigned int chan;

	for (chan = 0; chan < 4; chan++) {
		if (!(wmask & (1 << chan)))
			continue;
		if (GET_SWZ(filled, chan) != RC_SWIZZLE_UNUSED)
			continue;
		SET_SWZ(filled, chan, const_swz);
	}
	return filled;
}

/**
 * Look for a source of \p inst1 that matches some source of \p inst2 in
 * file, index, relative addressing and absolute-value flag (swizzle and
 * negate are not compared).
 *
 * @return the highest matching source index of \p inst1, or -1 if none.
 */
static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2)
{
	const unsigned int count1 = rc_get_opcode_info(inst1->U.I.Opcode)->NumSrcRegs;
	const unsigned int count2 = rc_get_opcode_info(inst2->U.I.Opcode)->NumSrcRegs;

	/* Scan inst1's sources from the last to the first so that the first
	 * hit is the highest matching index. */
	for (unsigned int i = count1; i-- > 0;) {
		for (unsigned int j = 0; j < count2; j++) {
			if (inst1->U.I.SrcReg[i].File != inst2->U.I.SrcReg[j].File)
				continue;
			if (inst1->U.I.SrcReg[i].Index != inst2->U.I.SrcReg[j].Index)
				continue;
			if (inst1->U.I.SrcReg[i].RelAddr != inst2->U.I.SrcReg[j].RelAddr)
				continue;
			if (inst1->U.I.SrcReg[i].Abs != inst2->U.I.SrcReg[j].Abs)
				continue;
			return (int)i;
		}
	}
	return -1;
}

/**
 * Merges two MOVs writing different channels of the same destination register
 * with the use of the constant swizzles.
 */
static bool merge_movs(
	struct radeon_compiler * c,
	struct rc_instruction * inst,
	struct rc_instruction * cur)
{
	/* We can merge two MOVs into MOV if one of them is from inline constant,
	 * i.e., constant swizzles and RC_FILE_NONE).
	 *
	 * For example
	 *   MOV temp[0].x none.1___
	 *   MOV temp[0].y input[0]._x__
	 *
	 * becomes
	 *   MOV temp[0].xy input[0].1x__
	 */
	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
	if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
		inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
		struct rc_src_register src;
		if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
			src = inst->U.I.SrcReg[0];
		else
			src = cur->U.I.SrcReg[0];
		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
						inst->U.I.SrcReg[0].Swizzle);
		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
		if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
			cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
			cur->U.I.SrcReg[0] = src;
			rc_remove_instruction(inst);
			return true;
		}
	}

	/* Handle the trivial case where the MOVs share a source.
	 *
	 * For example
	 *   MOV temp[0].x const[0].x
	 *   MOV temp[0].y const[0].z
	 *
	 * becomes
	 *   MOV temp[0].xy const[0].xz
	 */
	if (have_shared_source(inst, cur) == 0) {
		struct rc_src_register src = cur->U.I.SrcReg[0];
		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
						inst->U.I.SrcReg[0].Swizzle);

                if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
                        cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
                        cur->U.I.SrcReg[0] = src;
                        rc_remove_instruction(inst);
                        return true;
                }
	}

	/* Otherwise, we can convert the MOVs into ADD.
	 *
	 * For example
	 *   MOV temp[0].x const[0].x
	 *   MOV temp[0].y input[0].y
	 *
	 * becomes
	 *   ADD temp[0].xy const[0].x0 input[0].0y
	 */
	unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
	struct rc_src_register src0 = inst->U.I.SrcReg[0];
	struct rc_src_register src1 = cur->U.I.SrcReg[0];

	src0.Swizzle = fill_swizzle(src0.Swizzle,
				wmask, RC_SWIZZLE_ZERO);
	src1.Swizzle = fill_swizzle(src1.Swizzle,
				wmask, RC_SWIZZLE_ZERO);
	if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
		!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
		return false;

	cur->U.I.DstReg.WriteMask = wmask;
	cur->U.I.Opcode = RC_OPCODE_ADD;
	cur->U.I.SrcReg[0] = src0;
	cur->U.I.SrcReg[1] = src1;

	/* finally delete the original mov */
	rc_remove_instruction(inst);
	return true;
}

/**
 * This function will try to merge MOV and ADD/MUL instructions with the same
 * destination, making use of the constant swizzles.
 *
 * For example:
 *   MOV temp[0].x const[0].x
 *   MUL temp[0].yz const[1].yz const[2].yz
 *
 * becomes
 *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
 */
static int merge_mov_add_mul(
	struct radeon_compiler * c,
	struct rc_instruction * inst1,
	struct rc_instruction * inst2)
{
	struct rc_instruction * inst, * mov;
	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
		mov = inst1;
		inst = inst2;
	} else {
		mov = inst2;
		inst = inst1;
	}

	const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
	int shared_index = have_shared_source(inst, mov);
	unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;

	/* If there is a shared source, just merge the swizzles and be done with it. */
	if (shared_index != -1) {
		struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
		struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];

		shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
		shared_src.Swizzle = merge_swizzles(shared_src.Swizzle,
					mov->U.I.SrcReg[0].Swizzle);
		other_src.Negate = clean_negate(other_src);
		unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
		other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);

		if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
			!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
			return 0;

		inst2->U.I.Opcode = inst->U.I.Opcode;
		inst2->U.I.SrcReg[0] = shared_src;
		inst2->U.I.SrcReg[1] = other_src;

	/* TODO: we can do a bit better in the special case when one of the sources is none.
	 * Convert to MAD otherwise.
	 */
	} else {
		struct rc_src_register src0, src1, src2;
		if (is_mul) {
			src2 = mov->U.I.SrcReg[0];
			src0 = inst->U.I.SrcReg[0];
			src1 = inst->U.I.SrcReg[1];
		} else {
			src0 = mov->U.I.SrcReg[0];
			src1 = inst->U.I.SrcReg[0];
			src2 = inst->U.I.SrcReg[1];
		}
		/* The following login expects that the unused channels have empty negate bits. */
		src0.Negate = clean_negate(src0);
		src1.Negate = clean_negate(src1);
		src2.Negate = clean_negate(src2);

		src0.Swizzle = fill_swizzle(src0.Swizzle,
					wmask, RC_SWIZZLE_ONE);
		src1.Swizzle = fill_swizzle(src1.Swizzle,
					wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
		src2.Swizzle = fill_swizzle(src2.Swizzle,
					wmask, RC_SWIZZLE_ZERO);
		if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
			return 0;

		inst2->U.I.Opcode = RC_OPCODE_MAD;
		inst2->U.I.SrcReg[0] = src0;
		inst2->U.I.SrcReg[1] = src1;
		inst2->U.I.SrcReg[2] = src2;
	}
	inst2->U.I.DstReg.WriteMask = wmask;
	/* finally delete the original instruction */
	rc_remove_instruction(inst1);

	return 1;
}

/**
 * This function will try to merge MOV and MAD instructions with the same
 * destination, making use of the constant swizzles. This only works
 * if there is a shared source or one of the sources is RC_FILE_NONE.
 *
 * For example:
 *   MOV temp[0].x const[0].x
 *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
 *
 * becomes
 *   MAD temp[0].xyz const[0].xyz const[1].1yz input[0].0xw
 */
static bool merge_mov_mad(
	struct radeon_compiler * c,
	struct rc_instruction * inst1,
	struct rc_instruction * inst2)
{
	struct rc_instruction * mov, * mad;
	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
		mov = inst1;
		mad = inst2;
	} else {
		mov = inst2;
		mad = inst1;
	}

	int shared_index = have_shared_source(mad, mov);
	unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
	struct rc_src_register src[3];
	src[0] = mad->U.I.SrcReg[0];
	src[1] = mad->U.I.SrcReg[1];
	src[2] = mad->U.I.SrcReg[2];

	/* Shared source is the one for multiplication. */
	if (shared_index == 0 || shared_index == 1) {
		src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
		src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
		src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
				mov->U.I.SrcReg[0].Swizzle);
		src[1 - shared_index].Swizzle = fill_swizzle(
				src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);

	/* Shared source is the one for used for addition, or it is none. Additionally,
	 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
	 * because than we have the highest change the swizzles will be legal.
	 */
	} else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
			src[2].File == RC_FILE_NONE) {
		src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
		src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
		src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
		src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
		if (src[2].File == RC_FILE_NONE) {
			src[2].File = mov->U.I.SrcReg[0].File;
			src[2].Index = mov->U.I.SrcReg[0].Index;
			src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
			src[2].Abs = mov->U.I.SrcReg[0].Abs;
		}

	/* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
	 * fill the other one with ones and the reg for addition with zeros.
	 */
	} else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
		unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
		src[none_src] = mov->U.I.SrcReg[0];
		src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
		src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
				mad->U.I.SrcReg[none_src].Swizzle);
		src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
		src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
				wmask, RC_SWIZZLE_ONE);
		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
	} else {
		return false;
	}

	if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
		return false;

	inst2->U.I.Opcode = RC_OPCODE_MAD;
	inst2->U.I.SrcReg[0] = src[0];
	inst2->U.I.SrcReg[1] = src[1];
	inst2->U.I.SrcReg[2] = src[2];
	inst2->U.I.DstReg.WriteMask = wmask;
	rc_remove_instruction(inst1);
	return true;
}

/**
 * Return true if the two instructions carry the opcodes opcode1 and opcode2,
 * in either order.
 */
static bool inst_combination(
	struct rc_instruction * inst1,
	struct rc_instruction * inst2,
	rc_opcode opcode1,
	rc_opcode opcode2)
{
	const unsigned int first = inst1->U.I.Opcode;
	const unsigned int second = inst2->U.I.Opcode;

	if (first == opcode1 && second == opcode2)
		return true;

	return second == opcode1 && first == opcode2;
}

/**
 * Searches for instructions writing different channels of the same register that could
 * be merged together with the use of constant swizzles.
 *
 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
 */
static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst)
{
	unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
	unsigned int orig_dst_file = inst->U.I.DstReg.File;
	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
	const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);

	struct rc_instruction * cur = inst;
	while (cur!= &c->Program.Instructions) {
		cur = cur->Next;
		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);

		/* Keep it simple for now and stop when encountering any
		 * control flow.
		 */
		if (opcode->IsFlowControl)
			return;

		/* Stop when the original destination is overwritten */
		if (orig_dst_reg == cur->U.I.DstReg.Index &&
			orig_dst_file == cur->U.I.DstReg.File &&
			(orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
			return;

		/* Stop the search when the original instruction destination
		 * is used as a source for anything.
		 */
		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
			if (cur->U.I.SrcReg[i].File == orig_dst_file &&
				cur->U.I.SrcReg[i].Index == orig_dst_reg)
				return;
		}

		/* Stop the search when some of the original sources are touched. */
		for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
			if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
				inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
				return;
		}

		if (cur->U.I.DstReg.File == orig_dst_file &&
			cur->U.I.DstReg.Index == orig_dst_reg &&
			cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
			(cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {

			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
				if (merge_movs(c, inst, cur))
					return;
			}

			/* Skip the merge if one of the instructions writes just w channel
			 * and we are compiling a fragment shader. We can pair-schedule it together
			 * later anyway and it will also give the scheduler a bit more flexibility.
			 * Only check this after merging MOVs as when we manage to merge two MOVs
			 * into another MOV we can still copy propagate it away. So it is a win in
			 * that case.
			 */
			if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W ||
				inst->U.I.DstReg.WriteMask == RC_MASK_W))
				continue;

			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
				inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
				if (merge_mov_add_mul(c, inst, cur))
					return;
			}

			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
				if (merge_mov_mad(c, inst, cur))
					return;
			}
		}
	}
}

/**
 * Searches for duplicate ARLs/ARRs
 *
 * Only a very trivial case is optimized for now: every later load of the same
 * type (ARL when is_ARL is true, ARR otherwise) that reads the same register
 * with the same swizzle as inst is removed.
 *
 * The scan ends at the first point where this is no longer known to be safe:
 *  - control flow other than IF/ELSE/ENDIF blocks opened after inst,
 *  - leaving the block that contains inst (ELSE or ENDIF at depth 0), because
 *    inst then no longer executes on every path reaching later loads,
 *  - a write to any source channel read by inst,
 *  - an address load of the other type, or one reading a different source.
 */
static void merge_A0_loads(
	struct radeon_compiler * c,
	struct rc_instruction * inst,
	bool is_ARL)
{
	const unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
	const unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
	const unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
	/* Mask derived from the load's source swizzle; used as the set of
	 * source channels whose overwrite must stop the scan.
	 */
	const unsigned int A0_src_mask = rc_swizzle_to_writemask(A0_src_swizzle);
	int cf_depth = 0;

	/* The successor is saved before cur is examined so that removing cur
	 * neither skips the following instruction nor relies on the links of
	 * an unlinked node. The loop stops before touching the list head.
	 */
	struct rc_instruction * next;
	for (struct rc_instruction * cur = inst->Next;
	     cur != &c->Program.Instructions; cur = next) {
		next = cur->Next;
		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);

		/* Keep it simple for now and stop when encountering any
		 * control flow besides simple ifs.
		 */
		if (opcode->IsFlowControl) {
			switch (cur->U.I.Opcode) {
			case RC_OPCODE_IF:
				cf_depth++;
				break;
			case RC_OPCODE_ELSE:
				if (cf_depth < 1)
					return;
				break;
			case RC_OPCODE_ENDIF:
				/* An ENDIF that was not opened during this scan
				 * closes the block containing inst.
				 */
				if (cf_depth < 1)
					return;
				cf_depth--;
				break;
			default:
				return;
			}
		}

		/* Stop when one of the source channels is overwritten */
		if (A0_src_reg == cur->U.I.DstReg.Index &&
			A0_src_file == cur->U.I.DstReg.File &&
			(cur->U.I.DstReg.WriteMask & A0_src_mask) != 0)
			return;

		/* Wrong A0 load type. */
		if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
		    (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
			return;

		if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
			/* A load from a different source changes A0. */
			if (A0_src_reg != cur->U.I.SrcReg[0].Index ||
			    A0_src_file != cur->U.I.SrcReg[0].File ||
			    A0_src_swizzle != cur->U.I.SrcReg[0].Swizzle)
				return;

			/* Exact duplicate of inst: drop it and keep scanning. */
			rc_remove_instruction(cur);
		}
	}
}

/**
 * According to the GLSL spec, round is only 1.30 and up
 * so the only reason why we should ever see round is if it actually
 * is lowered ARR (from nine->ttn). In that case we want to reconstruct
 * the ARR instead of lowering the round.
 *
 * If every reader of the ROUND result is an ARL, all of them are turned into
 * ARR and the ROUND itself becomes a MOV. Otherwise nothing is modified.
 */
static void transform_vertex_ROUND(struct radeon_compiler* c,
	struct rc_instruction* inst)
{
	struct rc_reader_data readers;
	rc_get_readers(c, inst, &readers, NULL, NULL, NULL);

	assert(readers.ReaderCount > 0);
	/* The assert above disappears with NDEBUG. Without an explicit check
	 * an empty reader list would fall through and turn the ROUND into a
	 * MOV without converting any ARL, silently dropping the rounding.
	 * NOTE(review): Abort is treated here as "reader list not reliable";
	 * confirm against rc_get_readers.
	 */
	if (readers.Abort || readers.ReaderCount == 0)
		return;

	for (unsigned i = 0; i < readers.ReaderCount; i++) {
		struct rc_instruction *reader = readers.Readers[i].Inst;
		if (reader->U.I.Opcode != RC_OPCODE_ARL) {
			assert(!"Unable to convert ROUND+ARL to ARR\n");
			return;
		}
	}

	/* Only ARL readers, convert all to ARR */
	for (unsigned i = 0; i < readers.ReaderCount; i++) {
		readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
	}
	/* Switch ROUND to MOV and let copy propagate sort it out later. */
	inst->U.I.Opcode = RC_OPCODE_MOV;
}

/**
 * Walk the whole program and apply the A0 address register optimizations:
 * duplicate ARL/ARR loads are merged and ROUND instructions are handed to
 * transform_vertex_ROUND().
 */
static void optimize_A0_loads(struct radeon_compiler * c) {
	struct rc_instruction * const end = &c->Program.Instructions;
	struct rc_instruction * next;

	for (struct rc_instruction * cur = end->Next; cur != end; cur = next) {
		/* Fetch the successor before cur is handed to a transformation. */
		next = cur->Next;

		switch (cur->U.I.Opcode) {
		case RC_OPCODE_ARL:
			merge_A0_loads(c, cur, true);
			break;
		case RC_OPCODE_ARR:
			merge_A0_loads(c, cur, false);
			break;
		case RC_OPCODE_ROUND:
			transform_vertex_ROUND(c, cur);
			break;
		default:
			break;
		}
	}
}

/**
 * Run the optimization passes over the whole program, in this order:
 * constant folding, copy propagation of MOVs, A0 load optimizations (vertex
 * programs only), channel merging (R500 or vertex programs only), a second
 * copy propagation and finally the output modifier peephole (fragment
 * programs only).
 *
 * The user parameter is not used.
 */
void rc_optimize(struct radeon_compiler * c, void *user)
{
	struct rc_instruction * const end = &c->Program.Instructions;
	struct rc_instruction * inst;
	/* Every pass may change the instruction it is given, so the successor
	 * is always fetched before the current instruction is processed.
	 */
	struct rc_instruction * next;

	for (inst = end->Next; inst != end; inst = next) {
		next = inst->Next;
		constant_folding(c, inst);
	}

	/* Copy propagate simple movs away. */
	for (inst = end->Next; inst != end; inst = next) {
		next = inst->Next;
		if (inst->U.I.Opcode == RC_OPCODE_MOV)
			copy_propagate(c, inst);
	}

	if (c->type == RC_VERTEX_PROGRAM)
		optimize_A0_loads(c);

	/* Merge instructions writing different channels of the same register
	 * using the constant swizzles.
	 */
	if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
		for (inst = end->Next; inst != end; inst = next) {
			next = inst->Next;
			switch (inst->U.I.Opcode) {
			case RC_OPCODE_MOV:
			case RC_OPCODE_ADD:
			case RC_OPCODE_MAD:
			case RC_OPCODE_MUL:
				merge_channels(c, inst);
				break;
			default:
				break;
			}
		}
	}

	/* Copy propagate few extra movs from the merge_channels pass. */
	for (inst = end->Next; inst != end; inst = next) {
		next = inst->Next;
		if (inst->U.I.Opcode == RC_OPCODE_MOV)
			copy_propagate(c, inst);
	}

	if (c->type != RC_FRAGMENT_PROGRAM)
		return;

	/* Output modifiers. The variable list is fetched lazily and dropped
	 * again whenever the peephole reports a change, so that the next MUL
	 * gets a fresh one.
	 */
	struct rc_list * var_list = NULL;
	for (inst = end->Next; inst != end; inst = next) {
		next = inst->Next;
		if (inst->U.I.Opcode != RC_OPCODE_MUL)
			continue;

		if (var_list == NULL)
			var_list = rc_get_variables(c);
		if (peephole_mul_omod(c, inst, var_list))
			var_list = NULL;
	}
}
