st_glsl_to_tgsi.cpp 166 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
/*
 * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
 * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
 * Copyright © 2010 Intel Corporation
 * Copyright © 2011 Bryan Cain
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file glsl_to_tgsi.cpp
 *
 * Translate GLSL IR to TGSI.
 */

#include <stdio.h>
#include "main/compiler.h"
#include "ir.h"
#include "ir_visitor.h"
#include "ir_expression_flattening.h"
#include "glsl_types.h"
#include "glsl_parser_extras.h"
#include "../glsl/program.h"
#include "ir_optimization.h"
#include "ast.h"

#include "main/mtypes.h"
#include "main/shaderobj.h"
46
#include "main/uniforms.h"
47
#include "program/hash_table.h"
48 49 50

extern "C" {
#include "main/shaderapi.h"
51 52 53 54 55 56 57 58 59 60 61 62 63 64
#include "program/prog_instruction.h"
#include "program/prog_optimize.h"
#include "program/prog_print.h"
#include "program/program.h"
#include "program/prog_parameter.h"
#include "program/sampler.h"

#include "pipe/p_compiler.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_shader_tokens.h"
#include "pipe/p_state.h"
#include "util/u_math.h"
#include "tgsi/tgsi_ureg.h"
65
#include "tgsi/tgsi_info.h"
66 67 68 69
#include "st_context.h"
#include "st_program.h"
#include "st_glsl_to_tgsi.h"
#include "st_mesa_to_tgsi.h"
70
}
71

72
/** Register-file value used for immediates; aliases PROGRAM_FILE_MAX. */
#define PROGRAM_IMMEDIATE PROGRAM_FILE_MAX

/** Bitmask with one bit per constant-style register file. */
#define PROGRAM_ANY_CONST ((1 << PROGRAM_LOCAL_PARAM) |  \
                           (1 << PROGRAM_ENV_PARAM) |    \
                           (1 << PROGRAM_STATE_VAR) |    \
                           (1 << PROGRAM_CONSTANT) |     \
                           (1 << PROGRAM_UNIFORM))

/**
 * Maximum number of temporary registers.
 *
 * It is too big for stack allocated arrays -- it will cause stack overflow on
 * Windows and likely Mac OS X.
 */
#define MAX_TEMPS         4096

/**
 * Maximum number of arrays
 */
#define MAX_ARRAYS        256

/* will be 4 for GLSL 4.00 */
#define MAX_GLSL_TEXTURE_OFFSET 1

95 96 97 98 99 100
class st_src_reg;
class st_dst_reg;

static int swizzle_for_size(int size);

/**
101
 * This struct is a corresponding struct to TGSI ureg_src.
102 103 104 105 106 107 108 109 110 111 112 113
 */
class st_src_reg {
public:
   st_src_reg(gl_register_file file, int index, const glsl_type *type)
   {
      this->file = file;
      this->index = index;
      if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
         this->swizzle = swizzle_for_size(type->vector_elements);
      else
         this->swizzle = SWIZZLE_XYZW;
      this->negate = 0;
114
      this->index2D = 0;
115
      this->type = type ? type->base_type : GLSL_TYPE_ERROR;
116 117 118
      this->reladdr = NULL;
   }

119
   st_src_reg(gl_register_file file, int index, int type)
120
   {
121
      this->type = type;
122 123
      this->file = file;
      this->index = index;
124 125 126 127 128 129 130 131 132 133 134 135
      this->index2D = 0;
      this->swizzle = SWIZZLE_XYZW;
      this->negate = 0;
      this->reladdr = NULL;
   }

   st_src_reg(gl_register_file file, int index, int type, int index2D)
   {
      this->type = type;
      this->file = file;
      this->index = index;
      this->index2D = index2D;
136 137 138 139 140
      this->swizzle = SWIZZLE_XYZW;
      this->negate = 0;
      this->reladdr = NULL;
   }

141 142
   st_src_reg()
   {
143
      this->type = GLSL_TYPE_ERROR;
144 145
      this->file = PROGRAM_UNDEFINED;
      this->index = 0;
146
      this->index2D = 0;
147 148 149 150 151 152 153 154
      this->swizzle = 0;
      this->negate = 0;
      this->reladdr = NULL;
   }

   explicit st_src_reg(st_dst_reg reg);

   gl_register_file file; /**< PROGRAM_* from Mesa */
155
   int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
156
   int index2D;
157 158
   GLuint swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
   int negate; /**< NEGATE_XYZW mask from mesa */
159
   int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
160 161 162 163 164 165
   /** Register index should be offset by the integer in this reg. */
   st_src_reg *reladdr;
};

class st_dst_reg {
public:
166
   st_dst_reg(gl_register_file file, int writemask, int type)
167 168 169 170 171 172
   {
      this->file = file;
      this->index = 0;
      this->writemask = writemask;
      this->cond_mask = COND_TR;
      this->reladdr = NULL;
173
      this->type = type;
174 175 176 177
   }

   st_dst_reg()
   {
178
      this->type = GLSL_TYPE_ERROR;
179 180 181 182 183 184 185 186 187 188
      this->file = PROGRAM_UNDEFINED;
      this->index = 0;
      this->writemask = 0;
      this->cond_mask = COND_TR;
      this->reladdr = NULL;
   }

   explicit st_dst_reg(st_src_reg reg);

   gl_register_file file; /**< PROGRAM_* from Mesa */
189
   int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
190 191
   int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
   GLuint cond_mask:4;
192
   int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
193 194 195 196 197 198
   /** Register index should be offset by the integer in this reg. */
   st_src_reg *reladdr;
};

/**
 * Convert a destination register into a source register.
 *
 * File, index, type and relative-address pointer are copied; the result
 * reads all four channels un-negated and has no second index.
 */
st_src_reg::st_src_reg(st_dst_reg reg)
   : file(reg.file),
     index(reg.index),
     index2D(0),
     swizzle(SWIZZLE_XYZW),
     negate(0),
     type(reg.type),
     reladdr(reg.reladdr)
{
   /* empty */
}

/**
 * Convert a source register into a destination register.
 *
 * File, index, type and relative-address pointer are copied; the result
 * writes all four channels unconditionally (COND_TR).  The source's
 * swizzle, negate and index2D are not carried over.
 */
st_dst_reg::st_dst_reg(st_src_reg reg)
   : file(reg.file),
     index(reg.index),
     writemask(WRITEMASK_XYZW),
     cond_mask(COND_TR),
     type(reg.type),
     reladdr(reg.reladdr)
{
   /* empty */
}

class glsl_to_tgsi_instruction : public exec_node {
public:
220
   DECLARE_RALLOC_CXX_OPERATORS(glsl_to_tgsi_instruction)
221

222
   unsigned op;
223 224 225 226 227 228 229 230 231
   st_dst_reg dst;
   st_src_reg src[3];
   /** Pointer to the ir source this tree came from for debugging */
   ir_instruction *ir;
   GLboolean cond_update;
   bool saturate;
   int sampler; /**< sampler index */
   int tex_target; /**< One of TEXTURE_*_INDEX */
   GLboolean tex_shadow;
232 233
   struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
   unsigned tex_offset_num_offset;
234
   int dead_mask; /**< Used in dead code elimination */
235

236
   class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
};

/**
 * List node recording which register file and index back an ir_variable.
 */
class variable_storage : public exec_node {
public:
   variable_storage(ir_variable *var, gl_register_file file, int index)
   {
      this->file = file;
      this->index = index;
      this->var = var;
   }

   gl_register_file file;
   int index;
   ir_variable *var; /* variable that maps to this, if any */
};

252 253 254 255 256 257 258 259 260 261 262 263 264 265
/**
 * List node holding one immediate value of up to four components.
 */
class immediate_storage : public exec_node {
public:
   /**
    * Copy the first \p size components of \p values into this node.
    *
    * \param values  Source components; at least \p size must be readable.
    * \param size    Number of components to copy; must fit in values[4].
    * \param type    Data type tag stored alongside the values.
    */
   immediate_storage(gl_constant_value *values, int size, int type)
   {
      /* values[] holds four entries; a larger (or negative) size would make
       * the memcpy below write past the end of the array.
       */
      assert(size >= 0 && size <= 4);

      /* Zero everything first so unused trailing components have a defined
       * value instead of uninitialized memory.
       */
      memset(this->values, 0, sizeof(this->values));
      memcpy(this->values, values, size * sizeof(gl_constant_value));
      this->size = size;
      this->type = type;
   }

   gl_constant_value values[4];
   int size; /**< Number of components (1-4) */
   int type; /**< GL_FLOAT, GL_INT, GL_BOOL, or GL_UNSIGNED_INT */
};

266 267 268 269 270 271 272
/**
 * List node describing one function signature known to the visitor.
 */
class function_entry : public exec_node {
public:
   ir_function_signature *sig;

   /**
    * Identifier of this function signature used by the program.
    *
    * When TGSI instructions for function calls are generated, the address
    * of the first instruction of the function body is not yet known.  The
    * BranchTarget that is called is therefore a small integer, which is
    * rewritten during set_branchtargets().
    */
   int sig_id;

   /**
    * Pointer to first instruction of the function body.
    *
    * Set during function body emits after main() is processed.
    */
   glsl_to_tgsi_instruction *bgn_inst;

   /**
    * Index of the first instruction of the function body in actual TGSI.
    *
    * Set after conversion from glsl_to_tgsi_instruction to TGSI.
    */
   int inst;

   /** Storage for the return value. */
   st_src_reg return_reg;
};

298
struct glsl_to_tgsi_visitor : public ir_visitor {
299 300 301 302 303 304 305 306 307 308 309 310
public:
   glsl_to_tgsi_visitor();
   ~glsl_to_tgsi_visitor();

   function_entry *current_function;

   struct gl_context *ctx;
   struct gl_program *prog;
   struct gl_shader_program *shader_program;
   struct gl_shader_compiler_options *options;

   int next_temp;
311

312 313 314
   unsigned array_sizes[MAX_ARRAYS];
   unsigned next_array;

315
   int num_address_regs;
316
   int samplers_used;
317
   bool indirect_addr_consts;
318 319
   
   int glsl_version;
320
   bool native_integers;
321
   bool have_sqrt;
322 323 324

   variable_storage *find_variable_storage(ir_variable *var);

325 326 327
   int add_constant(gl_register_file file, gl_constant_value values[4],
                    int size, int datatype, GLuint *swizzle_out);

328 329 330 331 332 333
   function_entry *get_function_signature(ir_function_signature *sig);

   st_src_reg get_temp(const glsl_type *type);
   void reladdr_to_temp(ir_instruction *ir, st_src_reg *reg, int *num_reladdr);

   st_src_reg st_src_reg_for_float(float val);
334 335
   st_src_reg st_src_reg_for_int(int val);
   st_src_reg st_src_reg_for_type(int type, int val);
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361

   /**
    * \name Visit methods
    *
    * As typical for the visitor pattern, there must be one \c visit method for
    * each concrete subclass of \c ir_instruction.  Virtual base classes within
    * the hierarchy should not have \c visit methods.
    */
   /*@{*/
   virtual void visit(ir_variable *);
   virtual void visit(ir_loop *);
   virtual void visit(ir_loop_jump *);
   virtual void visit(ir_function_signature *);
   virtual void visit(ir_function *);
   virtual void visit(ir_expression *);
   virtual void visit(ir_swizzle *);
   virtual void visit(ir_dereference_variable  *);
   virtual void visit(ir_dereference_array *);
   virtual void visit(ir_dereference_record *);
   virtual void visit(ir_assignment *);
   virtual void visit(ir_constant *);
   virtual void visit(ir_call *);
   virtual void visit(ir_return *);
   virtual void visit(ir_discard *);
   virtual void visit(ir_texture *);
   virtual void visit(ir_if *);
362 363
   virtual void visit(ir_emit_vertex *);
   virtual void visit(ir_end_primitive *);
364 365 366 367 368 369 370
   /*@}*/

   st_src_reg result;

   /** List of variable_storage */
   exec_list variables;

371 372
   /** List of immediate_storage */
   exec_list immediates;
373
   unsigned num_immediates;
374

375 376 377 378 379 380 381
   /** List of function_entry */
   exec_list function_signatures;
   int next_signature_id;

   /** List of glsl_to_tgsi_instruction */
   exec_list instructions;

382
   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op);
383

384
   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
385 386
        		        st_dst_reg dst, st_src_reg src0);

387
   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
388 389
        		        st_dst_reg dst, st_src_reg src0, st_src_reg src1);

390
   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
391 392
        		        st_dst_reg dst,
        		        st_src_reg src0, st_src_reg src1, st_src_reg src2);
393 394 395 396
   
   unsigned get_opcode(ir_instruction *ir, unsigned op,
                    st_dst_reg dst,
                    st_src_reg src0, st_src_reg src1);
397 398 399 400

   /**
    * Emit the correct dot-product instruction for the type of arguments
    */
401 402 403 404 405
   glsl_to_tgsi_instruction *emit_dp(ir_instruction *ir,
                                     st_dst_reg dst,
                                     st_src_reg src0,
                                     st_src_reg src1,
                                     unsigned elements);
406

407
   void emit_scalar(ir_instruction *ir, unsigned op,
408 409
        	    st_dst_reg dst, st_src_reg src0);

410
   void emit_scalar(ir_instruction *ir, unsigned op,
411 412
        	    st_dst_reg dst, st_src_reg src0, st_src_reg src1);

413 414
   void emit_arl(ir_instruction *ir, st_dst_reg dst, st_src_reg src0);

415
   void emit_scs(ir_instruction *ir, unsigned op,
416 417
        	 st_dst_reg dst, const st_src_reg &src);

418 419
   bool try_emit_mad(ir_expression *ir,
              int mul_operand);
420 421
   bool try_emit_mad_for_and_not(ir_expression *ir,
              int mul_operand);
422
   bool try_emit_sat(ir_expression *ir);
423 424 425 426 427

   void emit_swz(ir_expression *ir);

   bool process_move_condition(ir_rvalue *ir);

428
   void simplify_cmp(void);
429

430 431 432 433 434 435 436 437
   void rename_temp_register(int index, int new_index);
   int get_first_temp_read(int index);
   int get_first_temp_write(int index);
   int get_last_temp_read(int index);
   int get_last_temp_write(int index);

   void copy_propagate(void);
   void eliminate_dead_code(void);
438
   int eliminate_dead_code_advanced(void);
439 440 441
   void merge_registers(void);
   void renumber_registers(void);

442 443 444
   void emit_block_mov(ir_assignment *ir, const struct glsl_type *type,
                       st_dst_reg *l, st_src_reg *r);

445 446 447
   void *mem_ctx;
};

448
static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
449

450
static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
451

452
static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT);
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481

static void
fail_link(struct gl_shader_program *prog, const char *fmt, ...) PRINTFLIKE(2, 3);

/**
 * Mark \p prog as failed to link and append a printf-style formatted
 * message to its info log.
 */
static void
fail_link(struct gl_shader_program *prog, const char *fmt, ...)
{
   va_list ap;

   prog->LinkStatus = GL_FALSE;

   va_start(ap, fmt);
   ralloc_vasprintf_append(&prog->InfoLog, fmt, ap);
   va_end(ap);
}

/**
 * Return the swizzle that reads the first \p size components of a register,
 * repeating the last valid component in the remaining slots.
 *
 * \param size  Number of components; asserted to be in the range 1..4.
 */
static int
swizzle_for_size(int size)
{
   static const int swizzle_by_count[4] = {
      MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
      MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
      MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
      MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
   };

   assert((size >= 1) && (size <= 4));

   const int slot = size - 1;
   return swizzle_by_count[slot];
}

482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
static bool
is_tex_instruction(unsigned opcode)
{
   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
   return info->is_tex;
}

static unsigned
num_inst_dst_regs(unsigned opcode)
{
   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
   return info->num_dst;
}

static unsigned
num_inst_src_regs(unsigned opcode)
{
   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
   return info->is_tex ? info->num_src - 1 : info->num_src;
}

503
glsl_to_tgsi_instruction *
504
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
505 506 507 508 509
        		 st_dst_reg dst,
        		 st_src_reg src0, st_src_reg src1, st_src_reg src2)
{
   glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
   int num_reladdr = 0, i;
510 511
   
   op = get_opcode(ir, op, dst, src0, src1);
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526

   /* If we have to do relative addressing, we want to load the ARL
    * reg directly for one of the regs, and preload the other reladdr
    * sources into temps.
    */
   num_reladdr += dst.reladdr != NULL;
   num_reladdr += src0.reladdr != NULL;
   num_reladdr += src1.reladdr != NULL;
   num_reladdr += src2.reladdr != NULL;

   reladdr_to_temp(ir, &src2, &num_reladdr);
   reladdr_to_temp(ir, &src1, &num_reladdr);
   reladdr_to_temp(ir, &src0, &num_reladdr);

   if (dst.reladdr) {
527
      emit_arl(ir, address_reg, *dst.reladdr);
528 529 530 531 532 533 534 535 536 537
      num_reladdr--;
   }
   assert(num_reladdr == 0);

   inst->op = op;
   inst->dst = dst;
   inst->src[0] = src0;
   inst->src[1] = src1;
   inst->src[2] = src2;
   inst->ir = ir;
538
   inst->dead_mask = 0;
539 540 541

   inst->function = NULL;
   
542
   if (op == TGSI_OPCODE_ARL || op == TGSI_OPCODE_UARL)
543 544 545 546 547 548 549 550 551 552 553 554
      this->num_address_regs = 1;
   
   /* Update indirect addressing status used by TGSI */
   if (dst.reladdr) {
      switch(dst.file) {
      case PROGRAM_LOCAL_PARAM:
      case PROGRAM_ENV_PARAM:
      case PROGRAM_STATE_VAR:
      case PROGRAM_CONSTANT:
      case PROGRAM_UNIFORM:
         this->indirect_addr_consts = true;
         break;
555 556 557
      case PROGRAM_IMMEDIATE:
         assert(!"immediates should not have indirect addressing");
         break;
558 559 560 561 562 563 564
      default:
         break;
      }
   }
   else {
      for (i=0; i<3; i++) {
         if(inst->src[i].reladdr) {
565
            switch(inst->src[i].file) {
566 567 568 569 570 571 572
            case PROGRAM_LOCAL_PARAM:
            case PROGRAM_ENV_PARAM:
            case PROGRAM_STATE_VAR:
            case PROGRAM_CONSTANT:
            case PROGRAM_UNIFORM:
               this->indirect_addr_consts = true;
               break;
573 574 575
            case PROGRAM_IMMEDIATE:
               assert(!"immediates should not have indirect addressing");
               break;
576 577 578 579 580 581 582 583
            default:
               break;
            }
         }
      }
   }

   this->instructions.push_tail(inst);
584

585 586 587 588 589
   return inst;
}


/** Two-source form: forwards to the three-source emit() with src2 undefined. */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
                           st_dst_reg dst, st_src_reg src0, st_src_reg src1)
{
   glsl_to_tgsi_instruction *const inst =
      emit(ir, op, dst, src0, src1, undef_src);

   return inst;
}

/**
 * One-source form: forwards to the three-source emit() with src1 and src2
 * undefined.  The destination must write at least one channel.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
                           st_dst_reg dst, st_src_reg src0)
{
   assert(dst.writemask != 0);

   glsl_to_tgsi_instruction *const inst =
      emit(ir, op, dst, src0, undef_src, undef_src);

   return inst;
}

/** Operand-less form: destination and all sources are left undefined. */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
{
   glsl_to_tgsi_instruction *const inst =
      emit(ir, op, undef_dst, undef_src, undef_src, undef_src);

   return inst;
}

610 611 612 613 614 615 616 617 618 619 620
/**
 * Selects the integer, unsigned integer, or float variant of an opcode
 * based on the operand types and the input opcode, and returns it.
 *
 * The operation is treated as float when either source is float or when
 * native integers are disabled; otherwise the type of src0 decides (with
 * bool handled as int).  Opcodes not listed in the switch are returned
 * unchanged.  No instruction is emitted here.
 */
unsigned
glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
                                 st_dst_reg dst,
                                 st_src_reg src0, st_src_reg src1)
{
   assert(src0.type != GLSL_TYPE_ARRAY);
   assert(src0.type != GLSL_TYPE_STRUCT);
   assert(src1.type != GLSL_TYPE_ARRAY);
   assert(src1.type != GLSL_TYPE_STRUCT);

   const bool any_float_src = (src0.type == GLSL_TYPE_FLOAT ||
                               src1.type == GLSL_TYPE_FLOAT);
   int type = GLSL_TYPE_FLOAT;

   if (!any_float_src && native_integers)
      type = (src0.type == GLSL_TYPE_BOOL) ? GLSL_TYPE_INT : src0.type;

#define case4(c, f, i, u) \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_INT) \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else \
         op = TGSI_OPCODE_##f; \
      break;

#define case3(f, i, u)  case4(f, f, i, u)
#define case2fi(f, i)   case4(f, f, i, i)
#define case2iu(i, u)   case4(i, LAST, i, u)

#define casecomp(c, f, i, u) \
   case TGSI_OPCODE_##c: \
      if (type == GLSL_TYPE_INT) \
         op = TGSI_OPCODE_##i; \
      else if (type == GLSL_TYPE_UINT) \
         op = TGSI_OPCODE_##u; \
      else if (native_integers) \
         op = TGSI_OPCODE_##f; \
      else \
         op = TGSI_OPCODE_##c; \
      break;

   switch(op) {
      case2fi(ADD, UADD);
      case2fi(MUL, UMUL);
      case2fi(MAD, UMAD);
      case3(DIV, IDIV, UDIV);
      case3(MAX, IMAX, UMAX);
      case3(MIN, IMIN, UMIN);
      case2iu(MOD, UMOD);

      casecomp(SEQ, FSEQ, USEQ, USEQ);
      casecomp(SNE, FSNE, USNE, USNE);
      casecomp(SGE, FSGE, ISGE, USGE);
      casecomp(SLT, FSLT, ISLT, USLT);

      case2iu(ISHR, USHR);

      case2fi(SSG, ISSG);
      case3(ABS, IABS, IABS);

      default: break;
   }

   /* case2iu maps the float variant to LAST: integer-only opcodes must not
    * be reached with a float type.
    */
   assert(op != TGSI_OPCODE_LAST);
   return op;
}

683
/**
 * Emit the dot-product instruction matching the operand width.
 *
 * \param ir        IR instruction being processed
 * \param dst       Destination register
 * \param src0      First operand
 * \param src1      Second operand
 * \param elements  Number of components in the operands; must be 2, 3 or 4
 *                  (selecting DP2, DP3 or DP4 respectively).
 * \return the emitted instruction.
 */
glsl_to_tgsi_instruction *
glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
                              st_dst_reg dst, st_src_reg src0, st_src_reg src1,
                              unsigned elements)
{
   static const unsigned dot_opcodes[] = {
      TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
   };

   /* elements is unsigned, so a value below 2 would wrap to a huge index;
    * anything above 4 would read past the end of the table.
    */
   assert(elements >= 2 && elements <= 4);

   return emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
}

/**
696
 * Emits TGSI scalar opcodes to produce unique answers across channels.
697
 *
698
 * Some TGSI opcodes are scalar-only, like ARB_fp/vp.  The src X
699 700 701 702 703
 * channel determines the result across all channels.  So to do a vec4
 * of this operation, we want to emit a scalar per source channel used
 * to produce dest channels.
 */
void
704
glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
705 706 707 708 709 710
        		        st_dst_reg dst,
        			st_src_reg orig_src0, st_src_reg orig_src1)
{
   int i, j;
   int done_mask = ~dst.writemask;

711
   /* TGSI RCP is a scalar operation splatting results to all channels,
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748
    * like ARB_fp/vp.  So emit as many RCPs as necessary to cover our
    * dst channels.
    */
   for (i = 0; i < 4; i++) {
      GLuint this_mask = (1 << i);
      glsl_to_tgsi_instruction *inst;
      st_src_reg src0 = orig_src0;
      st_src_reg src1 = orig_src1;

      if (done_mask & this_mask)
         continue;

      GLuint src0_swiz = GET_SWZ(src0.swizzle, i);
      GLuint src1_swiz = GET_SWZ(src1.swizzle, i);
      for (j = i + 1; j < 4; j++) {
         /* If there is another enabled component in the destination that is
          * derived from the same inputs, generate its value on this pass as
          * well.
          */
         if (!(done_mask & (1 << j)) &&
             GET_SWZ(src0.swizzle, j) == src0_swiz &&
             GET_SWZ(src1.swizzle, j) == src1_swiz) {
            this_mask |= (1 << j);
         }
      }
      src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
        			   src0_swiz, src0_swiz);
      src1.swizzle = MAKE_SWIZZLE4(src1_swiz, src1_swiz,
        			  src1_swiz, src1_swiz);

      inst = emit(ir, op, dst, src0, src1);
      inst->dst.writemask = this_mask;
      done_mask |= this_mask;
   }
}

void
749
glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
750 751 752 753 754 755 756 757 758
        		        st_dst_reg dst, st_src_reg src0)
{
   st_src_reg undef = undef_src;

   undef.swizzle = SWIZZLE_XXXX;

   emit_scalar(ir, op, dst, src0, undef);
}

759 760 761 762
/**
 * Emit an address-register load from \p src0 into \p dst.
 *
 * UARL is used when the source is a signed or unsigned integer, ARL
 * otherwise.  The instruction is emitted with a NULL ir pointer; the \p ir
 * argument is not used.
 */
void
glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
                               st_dst_reg dst, st_src_reg src0)
{
   const bool integer_src = (src0.type == GLSL_TYPE_INT ||
                             src0.type == GLSL_TYPE_UINT);
   const int op = integer_src ? TGSI_OPCODE_UARL : TGSI_OPCODE_ARL;

   emit(NULL, op, dst, src0);
}

771
/**
772
 * Emit an TGSI_OPCODE_SCS instruction
773
 *
774 775 776 777
 * The \c SCS opcode functions a bit differently than the other TGSI opcodes.
 * Instead of splatting its result across all four components of the 
 * destination, it writes one value to the \c x component and another value to 
 * the \c y component.
778 779
 *
 * \param ir        IR instruction being processed
780 781
 * \param op        Either \c TGSI_OPCODE_SIN or \c TGSI_OPCODE_COS depending 
 *                  on which value is desired.
782 783 784 785
 * \param dst       Destination register
 * \param src       Source register
 */
void
786
glsl_to_tgsi_visitor::emit_scs(ir_instruction *ir, unsigned op,
787 788 789 790 791 792 793 794 795 796
        		     st_dst_reg dst,
        		     const st_src_reg &src)
{
   /* Vertex programs cannot use the SCS opcode.
    */
   if (this->prog->Target == GL_VERTEX_PROGRAM_ARB) {
      emit_scalar(ir, op, dst, src);
      return;
   }

797
   const unsigned component = (op == TGSI_OPCODE_SIN) ? 0 : 1;
798 799 800 801
   const unsigned scs_mask = (1U << component);
   int done_mask = ~dst.writemask;
   st_src_reg tmp;

802
   assert(op == TGSI_OPCODE_SIN || op == TGSI_OPCODE_COS);
803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844

   /* If there are compnents in the destination that differ from the component
    * that will be written by the SCS instrution, we'll need a temporary.
    */
   if (scs_mask != unsigned(dst.writemask)) {
      tmp = get_temp(glsl_type::vec4_type);
   }

   for (unsigned i = 0; i < 4; i++) {
      unsigned this_mask = (1U << i);
      st_src_reg src0 = src;

      if ((done_mask & this_mask) != 0)
         continue;

      /* The source swizzle specified which component of the source generates
       * sine / cosine for the current component in the destination.  The SCS
       * instruction requires that this value be swizzle to the X component.
       * Replace the current swizzle with a swizzle that puts the source in
       * the X component.
       */
      unsigned src0_swiz = GET_SWZ(src.swizzle, i);

      src0.swizzle = MAKE_SWIZZLE4(src0_swiz, src0_swiz,
        			   src0_swiz, src0_swiz);
      for (unsigned j = i + 1; j < 4; j++) {
         /* If there is another enabled component in the destination that is
          * derived from the same inputs, generate its value on this pass as
          * well.
          */
         if (!(done_mask & (1 << j)) &&
             GET_SWZ(src0.swizzle, j) == src0_swiz) {
            this_mask |= (1 << j);
         }
      }

      if (this_mask != scs_mask) {
         glsl_to_tgsi_instruction *inst;
         st_dst_reg tmp_dst = st_dst_reg(tmp);

         /* Emit the SCS instruction.
          */
845
         inst = emit(ir, TGSI_OPCODE_SCS, tmp_dst, src0);
846 847 848 849 850 851 852
         inst->dst.writemask = scs_mask;

         /* Move the result of the SCS instruction to the desired location in
          * the destination.
          */
         tmp.swizzle = MAKE_SWIZZLE4(component, component,
        			     component, component);
853
         inst = emit(ir, TGSI_OPCODE_SCS, dst, tmp);
854 855 856 857
         inst->dst.writemask = this_mask;
      } else {
         /* Emit the SCS instruction to write directly to the destination.
          */
858
         glsl_to_tgsi_instruction *inst = emit(ir, TGSI_OPCODE_SCS, dst, src0);
859 860 861 862 863 864 865
         inst->dst.writemask = scs_mask;
      }

      done_mask |= this_mask;
   }
}

866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
/**
 * Register a constant value and return its index.
 *
 * For PROGRAM_CONSTANT the value is added to the program's parameter list.
 * Otherwise \p file must be PROGRAM_IMMEDIATE: the immediate list is
 * searched for an entry with the same size, type and values, and a new
 * entry is appended only when none matches.
 *
 * \return index of the (existing or new) constant.
 */
int
glsl_to_tgsi_visitor::add_constant(gl_register_file file,
                                   gl_constant_value values[4], int size,
                                   int datatype, GLuint *swizzle_out)
{
   if (file == PROGRAM_CONSTANT) {
      return _mesa_add_typed_unnamed_constant(this->prog->Parameters, values,
                                              size, datatype, swizzle_out);
   }

   assert(file == PROGRAM_IMMEDIATE);

   const size_t nbytes = size * sizeof(gl_constant_value);
   int position = 0;

   /* Reuse an identical immediate if one is already stored, rather than
    * adding a duplicate entry.
    */
   foreach_iter(exec_list_iterator, iter, this->immediates) {
      immediate_storage *existing = (immediate_storage *)iter.get();

      if (existing->size == size &&
          existing->type == datatype &&
          memcmp(existing->values, values, nbytes) == 0) {
         return position;
      }
      position++;
   }

   /* No match: append a new immediate; its index is the old list length. */
   immediate_storage *fresh =
      new(mem_ctx) immediate_storage(values, size, datatype);
   this->immediates.push_tail(fresh);
   this->num_immediates++;

   return position;
}

901
/** Return an immediate source register holding the float \p val. */
st_src_reg
glsl_to_tgsi_visitor::st_src_reg_for_float(float val)
{
   st_src_reg imm(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_FLOAT);
   union gl_constant_value payload;

   payload.f = val;
   imm.index = add_constant(imm.file, &payload, 1, GL_FLOAT, &imm.swizzle);

   return imm;
}

913
/**
 * Return an immediate source register holding the integer \p val.
 *
 * Only valid when native integers are enabled (asserted).
 */
st_src_reg
glsl_to_tgsi_visitor::st_src_reg_for_int(int val)
{
   assert(native_integers);

   st_src_reg imm(PROGRAM_IMMEDIATE, -1, GLSL_TYPE_INT);
   union gl_constant_value payload;

   payload.i = val;
   imm.index = add_constant(imm.file, &payload, 1, GL_INT, &imm.swizzle);

   return imm;
}

927
/**
 * Return an immediate holding \p val, as an integer when native integers
 * are enabled and \p type is not float, and as a float otherwise.
 */
st_src_reg
glsl_to_tgsi_visitor::st_src_reg_for_type(int type, int val)
{
   const bool want_int = native_integers && type != GLSL_TYPE_FLOAT;

   if (want_int)
      return st_src_reg_for_int(val);

   return st_src_reg_for_float(val);
}

937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971
static int
type_size(const struct glsl_type *type)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
972
   case GLSL_TYPE_INTERFACE:
973 974 975 976
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
      assert(!"Invalid type in type_size");
      break;
977
   }
978
   return 0;
979 980 981 982 983
}

/**
 * In the initial pass of codegen, we assign temporary numbers to
 * intermediate results.  (not SSA -- variable assignments will reuse
 * storage).
 *
 * Arrays and matrices are placed in the PROGRAM_ARRAY file unless
 * options->EmitNoIndirectTemp is set; everything else is placed in
 * PROGRAM_TEMPORARY, advancing next_temp by the size of the type.
 *
 * \param type  GLSL type of the value the temporary will hold
 * \return a source register describing the newly allocated storage
 */
st_src_reg
glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
{
   st_src_reg src;

   /* Without native integers every value is treated as float. */
   src.type = native_integers ? type->base_type : GLSL_TYPE_FLOAT;
   src.reladdr = NULL;
   src.negate = 0;

   if (!options->EmitNoIndirectTemp &&
       (type->is_array() || type->is_matrix())) {

      src.file = PROGRAM_ARRAY;
      /* The array id lives in the upper 16 bits of the index. */
      src.index = (next_array << 16) | 0x8000;
      array_sizes[next_array] = type_size(type);
      ++next_array;

   } else {
      src.file = PROGRAM_TEMPORARY;
      src.index = next_temp;
      next_temp += type_size(type);
   }

   if (type->is_array() || type->is_record()) {
      src.swizzle = SWIZZLE_NOOP;
   } else {
      src.swizzle = swizzle_for_size(type->vector_elements);
   }

   return src;
}

/**
 * Look up the storage record previously registered for an IR variable.
 *
 * \param var  the variable to search for (compared by pointer identity)
 * \return the matching entry of this->variables, or NULL if none exists
 */
variable_storage *
glsl_to_tgsi_visitor::find_variable_storage(ir_variable *var)
{
   foreach_iter(exec_list_iterator, it, this->variables) {
      variable_storage *const candidate = (variable_storage *)it.get();

      if (candidate->var != var)
         continue;

      return candidate;
   }

   return NULL;
}

/**
 * Handle a variable declaration.
 *
 * Two kinds of variables need work here:
 *  - gl_FragCoord: its origin / pixel-center layout qualifiers are copied
 *    into the fragment program.
 *  - built-in ("gl_"-prefixed) uniforms: their state slots are registered
 *    as state references.  If every slot uses the identity swizzle the
 *    variable is referenced directly in PROGRAM_STATE_VAR; otherwise each
 *    slot is copied with a MOV into freshly allocated temporary storage.
 *
 * \param ir  the variable declaration being visited
 */
void
glsl_to_tgsi_visitor::visit(ir_variable *ir)
{
   if (strcmp(ir->name, "gl_FragCoord") == 0) {
      struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;

      fp->OriginUpperLeft = ir->origin_upper_left;
      fp->PixelCenterInteger = ir->pixel_center_integer;
   }

   if (ir->mode == ir_var_uniform && strncmp(ir->name, "gl_", 3) == 0) {
      unsigned int i;
      const ir_state_slot *const slots = ir->state_slots;
      assert(ir->state_slots != NULL);

      /* Check if this statevar's setup in the STATE file exactly
       * matches how we'll want to reference it as a
       * struct/array/whatever.  If not, then we need to move it into
       * temporary storage and hope that it'll get copy-propagated
       * out.
       */
      for (i = 0; i < ir->num_state_slots; i++) {
         if (slots[i].swizzle != SWIZZLE_XYZW) {
            break;
         }
      }

      variable_storage *storage;
      st_dst_reg dst;
      if (i == ir->num_state_slots) {
         /* We'll set the index later. */
         storage = new(mem_ctx) variable_storage(ir, PROGRAM_STATE_VAR, -1);
         this->variables.push_tail(storage);

         dst = undef_dst;
      } else {
         /* The variable_storage constructor allocates slots based on the size
          * of the type.  However, this had better match the number of state
          * elements that we're going to copy into the new temporary.
          */
         assert((int) ir->num_state_slots == type_size(ir->type));

         dst = st_dst_reg(get_temp(ir->type));

         storage = new(mem_ctx) variable_storage(ir, dst.file, dst.index);

         this->variables.push_tail(storage);
      }


      for (unsigned int slot = 0; slot < ir->num_state_slots; slot++) {
         int index = _mesa_add_state_reference(this->prog->Parameters,
                                               (gl_state_index *)slots[slot].tokens);

         if (storage->file == PROGRAM_STATE_VAR) {
            /* Direct reference: the first slot fixes the base index and
             * the remaining slots must follow it contiguously.
             */
            if (storage->index == -1) {
               storage->index = index;
            } else {
               assert(index == storage->index + (int)slot);
            }
         } else {
            /* We use GLSL_TYPE_FLOAT here regardless of the actual type of
             * the data being moved since MOV does not care about the type of
             * data it is moving, and we don't want to declare registers with
             * array or struct types.
             */
            st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
            src.swizzle = slots[slot].swizzle;
            emit(ir, TGSI_OPCODE_MOV, dst, src);
            /* even a float takes up a whole vec4 reg in a struct/array. */
            dst.index++;
         }
      }

      /* Sanity check: the copy loop must have filled every register that
       * was allocated for the temporary.
       */
      if (storage->file == PROGRAM_TEMPORARY &&
          dst.index != storage->index + (int) ir->num_state_slots) {
         fail_link(this->shader_program,
                   "failed to load builtin uniform `%s'  (%d/%d regs loaded)\n",
                   ir->name, dst.index - storage->index,
                   type_size(ir->type));
      }
   }
}

/**
 * Emit a loop.
 *
 * The sequence produced is:
 *   1. an assignment of ir->from to the loop counter, if present;
 *   2. BGNLOOP;
 *   3. a conditional break comparing the counter with ir->to, if present;
 *   4. the loop body;
 *   5. an addition of ir->increment to the counter, if present;
 *   6. ENDLOOP.
 *
 * The helper IR nodes built for steps 1, 3 and 5 exist only long enough to
 * be visited and are deleted immediately afterwards.
 *
 * \param ir  the loop being visited
 */
void
glsl_to_tgsi_visitor::visit(ir_loop *ir)
{
   ir_dereference_variable *counter = NULL;

   if (ir->counter != NULL)
      counter = new(ir) ir_dereference_variable(ir->counter);

   if (ir->from != NULL) {
      assert(ir->counter != NULL);

      ir_assignment *a = new(ir) ir_assignment(counter, ir->from, NULL);

      a->accept(this);
      delete a;
   }

   emit(NULL, TGSI_OPCODE_BGNLOOP);

   if (ir->to) {
      ir_expression *e =
         new(ir) ir_expression(ir->cmp, glsl_type::bool_type,
                               counter, ir->to);
      ir_if *if_stmt =  new(ir) ir_if(e);

      ir_loop_jump *brk = new(ir) ir_loop_jump(ir_loop_jump::jump_break);

      if_stmt->then_instructions.push_tail(brk);

      if_stmt->accept(this);

      delete if_stmt;
      delete e;
      delete brk;
   }

   visit_exec_list(&ir->body_instructions, this);

   if (ir->increment) {
      ir_expression *e =
         new(ir) ir_expression(ir_binop_add, counter->type,
                               counter, ir->increment);

      ir_assignment *a = new(ir) ir_assignment(counter, e, NULL);

      a->accept(this);
      delete a;
      delete e;
   }

   emit(NULL, TGSI_OPCODE_ENDLOOP);
}

/**
 * Emit a loop jump: BRK for a break, CONT for a continue.
 *
 * \param ir  the jump being visited
 */
void
glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(NULL, TGSI_OPCODE_BRK);
      break;
   case ir_loop_jump::jump_continue:
      emit(NULL, TGSI_OPCODE_CONT);
      break;
   }
}


/**
 * Function signatures are not expected to be visited directly; reaching
 * this method trips an assertion.
 *
 * \param ir  unused
 */
void
glsl_to_tgsi_visitor::visit(ir_function_signature *ir)
{
   /* Silence the unused-parameter warning before bailing out. */
   (void)ir;
   assert(0);
}

/**
 * Visit a function definition.
 *
 * Only main() is translated: its parameterless signature is looked up and
 * every instruction in its body is visited in order.
 *
 * \param ir  the function being visited
 */
void
glsl_to_tgsi_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to glsl_to_tgsi.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(NULL, &empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         /* Named 'inst' so it does not shadow the 'ir' parameter. */
         ir_instruction *inst = (ir_instruction *)iter.get();

         inst->accept(this);
      }
   }
}

1214
bool
1215 1216 1217 1218
glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
{
   int nonmul_operand = 1 - mul_operand;
   st_src_reg a, b, c;
1219
   st_dst_reg result_dst;
1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232

   ir_expression *expr = ir->operands[mul_operand]->as_expression();
   if (!expr || expr->operation != ir_binop_mul)
      return false;

   expr->operands[0]->accept(this);
   a = this->result;
   expr->operands[1]->accept(this);
   b = this->result;
   ir->operands[nonmul_operand]->accept(this);
   c = this->result;

   this->result = get_temp(ir->type);
1233 1234 1235
   result_dst = st_dst_reg(this->result);
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
   emit(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
1236 1237 1238 1239

   return true;
}

1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
/**
 * Emit MAD(a, -b, a) instead of AND(a, NOT(b))
 *
 * The logic values are 1.0 for true and 0.0 for false.  Logical-and is
 * implemented using multiplication, and logical-or is implemented using
 * addition.  Logical-not can be implemented as (true - x), or (1.0 - x).
 * As result, the logical expression (a & !b) can be rewritten as:
 *
 *     - a * !b
 *     - a * (1 - b)
 *     - (a * 1) - (a * b)
 *     - a + -(a * b)
 *     - a + (a * -b)
 *
 * This final expression can be implemented as a single MAD(a, -b, a)
 * instruction.
 */
bool
glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operand)
{
   const int other_operand = 1 - try_operand;
   st_src_reg a, b;

   ir_expression *expr = ir->operands[try_operand]->as_expression();
   if (!expr || expr->operation != ir_unop_logic_not)
      return false;

   ir->operands[other_operand]->accept(this);
   a = this->result;
   expr->operands[0]->accept(this);
   b = this->result;

   b.negate = ~b.negate;

   this->result = get_temp(ir->type);
   emit(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);

   return true;
}

1280
bool
1281 1282
glsl_to_tgsi_visitor::try_emit_sat(ir_expression *ir)
{
1283
   /* Emit saturates in the vertex shader only if SM 3.0 is supported.
1284
    */
1285 1286
   if (this->prog->Target == GL_VERTEX_PROGRAM_ARB &&
       !st_context(this->ctx)->has_shader_model3) {
1287
      return false;
1288
   }
1289 1290 1291 1292 1293 1294 1295 1296

   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   st_src_reg src = this->result;

1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322
   /* If we generated an expression instruction into a temporary in
    * processing the saturate's operand, apply the saturate to that
    * instruction.  Otherwise, generate a MOV to do the saturate.
    *
    * Note that we have to be careful to only do this optimization if
    * the instruction in question was what generated src->result.  For
    * example, ir_dereference_array might generate a MUL instruction
    * to create the reladdr, and return us a src reg using that
    * reladdr.  That MUL result is not the value we're trying to
    * saturate.
    */
   ir_expression *sat_src_expr = sat_src->as_expression();
   if (sat_src_expr && (sat_src_expr->operation == ir_binop_mul ||
			sat_src_expr->operation == ir_binop_add ||
			sat_src_expr->operation == ir_binop_dot)) {
      glsl_to_tgsi_instruction *new_inst;
      new_inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
      new_inst->saturate = true;
   } else {
      this->result = get_temp(ir->type);
      st_dst_reg result_dst = st_dst_reg(this->result);
      result_dst.writemask = (1 << ir->type->vector_elements) - 1;
      glsl_to_tgsi_instruction *inst;
      inst = emit(ir, TGSI_OPCODE_MOV, result_dst, src);
      inst->saturate = true;
   }
1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333

   return true;
}

/**
 * Resolve the relative addressing of one source operand.
 *
 * Does nothing if \p reg is not relatively addressed.  Otherwise the
 * address register is loaded from the operand's reladdr and, unless this
 * is the last outstanding relatively-addressed operand
 * (*num_reladdr == 1), the operand is copied into a fresh vec4 temporary
 * which then replaces \p reg.  The counter is decremented in either case.
 *
 * \param ir           instruction the emitted code is attributed to
 * \param reg          operand to process; may be replaced by a temporary
 * \param num_reladdr  in/out count of relatively-addressed operands that
 *                     remain to be processed
 */
void
glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
                                      st_src_reg *reg, int *num_reladdr)
{
   if (!reg->reladdr)
      return;

   emit_arl(ir, address_reg, *reg->reladdr);

   if (*num_reladdr != 1) {
      st_src_reg temp = get_temp(glsl_type::vec4_type);

      emit(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
      *reg = temp;
   }

   (*num_reladdr)--;
}

void
glsl_to_tgsi_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   st_src_reg op[Elements(ir->operands)];
   st_src_reg result_src;
   st_dst_reg result_dst;

1354
   /* Quick peephole: Emit MAD(a, b, c) instead of ADD(MUL(a, b), c)
1355 1356 1357 1358 1359 1360 1361
    */
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 1))
         return;
      if (try_emit_mad(ir, 0))
         return;
   }
1362 1363 1364 1365 1366 1367 1368 1369 1370 1371

   /* Quick peephole: Emit OPCODE_MAD(-a, -b, a) instead of AND(a, NOT(b))
    */
   if (ir->operation == ir_binop_logic_and) {
      if (try_emit_mad_for_and_not(ir, 1))
	 return;
      if (try_emit_mad_for_and_not(ir, 0))
	 return;
   }

1372 1373 1374
   if (try_emit_sat(ir))
      return;

1375 1376
   if (ir->operation == ir_quadop_vector)
      assert(!"ir_quadop_vector should have been lowered");
1377 1378 1379 1380 1381 1382

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = PROGRAM_UNDEFINED;
      ir->operands[operand]->accept(this);
      if (this->result.file == PROGRAM_UNDEFINED) {
         printf("Failed to get tree for expression operand:\n");
1383 1384
         ir->operands[operand]->print();
         printf("\n");
1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
        		     ir->operands[1]->type->vector_elements);
   }

   this->result.file = PROGRAM_UNDEFINED;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = get_temp(ir->type);
   /* convenience for the emit functions below. */
   result_dst = st_dst_reg(result_src);
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
1417
      if (result_dst.type != GLSL_TYPE_FLOAT)
1418
         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
1419 1420 1421 1422 1423 1424 1425 1426 1427
      else {
         /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
          * older GPUs implement SEQ using multiple instructions (i915 uses two
          * SGE instructions and a MUL instruction).  Since our logic values are
          * 0.0 and 1.0, 1-x also implements !x.
          */
         op[0].negate = ~op[0].negate;
         emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
      }
1428 1429
      break;
   case ir_unop_neg:
1430
      if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
1431 1432 1433 1434 1435
         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
      else {
         op[0].negate = ~op[0].negate;
         result_src = op[0];
      }
1436 1437
      break;
   case ir_unop_abs:
1438
      emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
1439 1440
      break;
   case ir_unop_sign:
1441
      emit(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
1442 1443
      break;
   case ir_unop_rcp:
1444
      emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
1445 1446 1447
      break;

   case ir_unop_exp2:
1448
      emit_scalar(ir, TGSI_OPCODE_EX2, result_dst, op[0]);
1449 1450 1451 1452 1453 1454
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_log2:
1455
      emit_scalar(ir, TGSI_OPCODE_LG2, result_dst, op[0]);
1456 1457
      break;
   case ir_unop_sin:
1458
      emit_scalar(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1459 1460
      break;
   case ir_unop_cos:
1461
      emit_scalar(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1462 1463
      break;
   case ir_unop_sin_reduced:
1464
      emit_scs(ir, TGSI_OPCODE_SIN, result_dst, op[0]);
1465 1466
      break;
   case ir_unop_cos_reduced:
1467
      emit_scs(ir, TGSI_OPCODE_COS, result_dst, op[0]);
1468 1469 1470
      break;

   case ir_unop_dFdx:
1471
      emit(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
1472 1473
      break;
   case ir_unop_dFdy:
1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494
   {
      /* The X component contains 1 or -1 depending on whether the framebuffer
       * is a FBO or the window system buffer, respectively.
       * It is then multiplied with the source operand of DDY.
       */
      static const gl_state_index transform_y_state[STATE_LENGTH]
         = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM };

      unsigned transform_y_index =
         _mesa_add_state_reference(this->prog->Parameters,
                                   transform_y_state);

      st_src_reg transform_y = st_src_reg(PROGRAM_STATE_VAR,
                                          transform_y_index,
                                          glsl_type::vec4_type);
      transform_y.swizzle = SWIZZLE_XXXX;

      st_src_reg temp = get_temp(glsl_type::vec4_type);

      emit(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
      emit(ir, TGSI_OPCODE_DDY, result_dst, temp);
1495
      break;
1496
   }
1497 1498

   case ir_unop_noise: {
1499 1500 1501 1502 1503 1504
      /* At some point, a motivated person could add a better
       * implementation of noise.  Currently not even the nvidia
       * binary drivers do anything more than this.  In any case, the
       * place to do this is in the GL state tracker, not the poor
       * driver.
       */
1505
      emit(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
1506 1507 1508 1509
      break;
   }

   case ir_binop_add:
1510
      emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
1511 1512
      break;
   case ir_binop_sub:
1513
      emit(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
1514 1515 1516
      break;

   case ir_binop_mul:
1517
      emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
1518 1519
      break;
   case ir_binop_div:
1520 1521 1522 1523 1524
      if (result_dst.type == GLSL_TYPE_FLOAT)
         assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      else
         emit(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
      break;
1525
   case ir_binop_mod:
1526 1527 1528 1529
      if (result_dst.type == GLSL_TYPE_FLOAT)
         assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      else
         emit(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
1530 1531 1532
      break;

   case ir_binop_less:
1533
      emit(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
1534 1535
      break;
   case ir_binop_greater:
1536
      emit(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
1537 1538
      break;
   case ir_binop_lequal:
1539
      emit(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
1540 1541
      break;
   case ir_binop_gequal:
1542
      emit(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
1543 1544
      break;
   case ir_binop_equal:
1545
      emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
1546 1547
      break;
   case ir_binop_nequal:
1548
      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
1549 1550 1551 1552 1553
      break;
   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {