brw_fs.cpp 123 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
22
23
24
 */

/** @file brw_fs.cpp
25
 *
26
27
28
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
29
30
31
 */

extern "C" {
32
33
34

#include <sys/types.h>

35
#include "util/hash_table.h"
36
37
#include "main/macros.h"
#include "main/shaderobj.h"
38
#include "main/fbobject.h"
39
40
#include "program/prog_parameter.h"
#include "program/prog_print.h"
41
#include "util/register_allocate.h"
42
#include "program/hash_table.h"
43
44
45
46
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
47
#include "brw_fs.h"
48
#include "brw_cfg.h"
49
#include "brw_dead_control_flow.h"
50
#include "main/uniforms.h"
51
#include "brw_fs_live_variables.h"
52
#include "glsl/glsl_types.h"
53
#include "program/sampler.h"
54

55
/**
 * Common initializer shared by all fs_inst constructors.
 *
 * Zeroes the whole instruction, installs the opcode/dst/src array, then
 * derives exec_size (when the caller passed 0), per-source effective
 * widths, and regs_written from the register operands.
 *
 * @param opcode    instruction opcode
 * @param exec_size execution width, or 0 to infer it from dst/src widths
 * @param dst       destination register (must not be IMM or UNIFORM)
 * @param src       source array; ownership stays with the caller/ctor
 * @param sources   number of valid entries in src
 */
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              fs_reg *src, int sources)
{
   /* Start from a fully-zeroed instruction so every field has a defined
    * value before the selective assignments below.
    */
   memset(this, 0, sizeof(*this));

   this->opcode = opcode;
   this->dst = dst;
   this->src = src;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (int i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            /* All non-scalar sources must agree on the width. */
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   /* Compute the effective (post-replication) width of each source:
    * scalar (width 1) sources are broadcast to the full execution size.
    */
   for (int i = 0; i < sources; ++i) {
      switch (this->src[i].file) {
      case BAD_FILE:
         this->src[i].effective_width = 8;
         break;
      case GRF:
      case HW_REG:
      case ATTR:
         assert(this->src[i].width > 0);
         if (this->src[i].width == 1) {
            this->src[i].effective_width = this->exec_size;
         } else {
            this->src[i].effective_width = this->src[i].width;
         }
         break;
      case IMM:
      case UNIFORM:
         this->src[i].effective_width = this->exec_size;
         break;
      default:
         unreachable("Invalid source register file");
      }
   }
   this->dst.effective_width = this->exec_size;

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      /* Round the byte footprint of the destination region up to whole
       * 32-byte registers.
       */
      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

141
142
143
144
145
146
147
148
149
150
151
152
/** Default constructor: an 8-wide NOP with no sources. */
fs_inst::fs_inst()
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   init(BRW_OPCODE_NOP, 8, dst, srcs, 0);
}

/** Zero-source instruction with an explicit execution width. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   init(opcode, exec_size, reg_undef, srcs, 0);
}

/** Zero-source instruction; exec size inferred from the destination. */
fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   init(opcode, 0, dst, srcs, 0);
}

/** One-source instruction with an explicit execution width. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   init(opcode, exec_size, dst, srcs, 1);
}

/** One-source instruction; exec size inferred from the registers. */
fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   init(opcode, 0, dst, srcs, 1);
}

/** Two-source instruction with an explicit execution width. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   srcs[1] = src1;
   init(opcode, exec_size, dst, srcs, 2);
}

/** Two-source instruction; exec size inferred from the registers. */
fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   srcs[1] = src1;
   init(opcode, 0, dst, srcs, 2);
}

/** Three-source instruction with an explicit execution width. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   srcs[1] = src1;
   srcs[2] = src2;
   init(opcode, exec_size, dst, srcs, 3);
}

/** Three-source instruction; exec size inferred from the registers. */
fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   fs_reg *srcs = ralloc_array(this, fs_reg, 3);
   srcs[0] = src0;
   srcs[1] = src1;
   srcs[2] = src2;
   init(opcode, 0, dst, srcs, 3);
}

/** Caller-supplied source array; exec size inferred. */
fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
{
   init(opcode, 0, dst, src, sources);
}

/** Caller-supplied source array with an explicit execution width. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   init(opcode, exec_width, dst, src, sources);
}

/** Copy constructor: bitwise copy, then deep-copy the source array. */
fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   /* The memcpy aliased that.src; give this instruction its own array. */
   this->src = ralloc_array(this, fs_reg, that.sources);

   for (int s = 0; s < that.sources; s++)
      this->src[s] = that.src[s];
}

233
234
235
236
237
238
239
240
241
/** Grow or shrink the source array to hold exactly num_sources entries. */
void
fs_inst::resize_sources(uint8_t num_sources)
{
   /* Reallocate only when the count actually changes. */
   if (this->sources == num_sources)
      return;

   this->src = reralloc(this, this->src, fs_reg, num_sources);
   this->sources = num_sources;
}

242
243
/* Helpers that define one fs_visitor method per ALU opcode.  Each method
 * allocates a new fs_inst out of mem_ctx and returns it without emitting
 * it into the instruction stream; the caller does that.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

/* Like ALU2, but for opcodes that also implicitly write the accumulator. */
#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

/* Instantiate the helper methods for every supported ALU opcode. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
303

304
305
/** Gen4 predicated IF: the branch condition comes from the predicate. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *if_inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   if_inst->predicate = predicate;
   return if_inst;
}

313
/** Gen6 IF with embedded comparison. */
314
fs_inst *
315
316
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
317
{
318
   assert(brw->gen == 6);
319
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320
321
322
323
324
325
326
327
328
329
330
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons,
    * so force the dst type to match src0 there.  gen5 does the comparison
    * on the execution type (resolved source types), so dst type doesn't
    * matter.  gen6 does comparison and then uses the result as if it was
    * the dst type with no conversion, which happens to mostly work out for
    * float-interpreted-as-int since our comparisons are for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   fs_inst *cmp = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   cmp->conditional_mod = condition;

   return cmp;
}

363
364
365
/**
 * Build a LOAD_PAYLOAD gathering `sources` registers into dst.
 *
 * The execution size is the widest of the destination and all sources;
 * regs_written is the sum of whole registers covered by the sources.
 */
fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
{
   uint8_t exec_size = dst.width;
   for (int s = 0; s < sources; ++s) {
      /* Each source width must be a multiple of the destination width. */
      assert(src[s].width % dst.width == 0);
      if (src[s].width > exec_size)
         exec_size = src[s].width;
   }

   fs_inst *load = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
                                        dst, src, sources);
   load->regs_written = 0;
   for (int s = 0; s < sources; ++s) {
      /* The LOAD_PAYLOAD instruction only really makes sense if we are
       * dealing with whole registers.  If this ever changes, we can deal
       * with it later.
       */
      int bytes = src[s].effective_width * type_sz(src[s].type);
      assert(bytes % 32 == 0);
      load->regs_written += (bytes + 31) / 32;
   }

   return load;
}

389
/**
 * Build the instruction sequence for a pull-constant load whose offset is
 * computed at run time (a non-constant array index into a uniform array).
 *
 * @param dst            where the loaded component ends up
 * @param surf_index     constant-buffer surface index
 * @param varying_offset per-channel offset register
 * @param const_offset   compile-time byte offset added to varying_offset
 * @return the list of instructions to emit (ADD, load message, MOV)
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (brw->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   /* The message returns a full vec4 per 8-wide slot (doubled by the gen4
    * SIMD16 workaround above).
    */
   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   /* Pre-gen7 the message goes through MRFs and needs a header. */
   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Pick the requested component out of the returned vec4. */
   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}

452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *mov = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   mov->ir = NULL;
   mov->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   mov->exec_size = 8;

   return mov;
}

472
bool
473
fs_inst::equals(fs_inst *inst) const
474
475
476
477
478
479
480
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
481
           predicate == inst->predicate &&
482
483
484
485
486
487
488
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
489
           exec_size == inst->exec_size &&
490
491
492
           offset == inst->offset);
}

493
bool
494
fs_inst::overwrites_reg(const fs_reg &reg) const
495
496
497
498
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset  &&
499
           reg.reg_offset < dst.reg_offset + regs_written);
500
501
}

502
bool
503
fs_inst::is_send_from_grf() const
504
{
505
506
507
508
509
510
511
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
512
   case SHADER_OPCODE_UNTYPED_ATOMIC:
513
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
514
   case SHADER_OPCODE_URB_WRITE_SIMD8:
515
516
517
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
518
519
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
520
521
522
523
524
525
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
526
527
528
}

bool
529
fs_inst::can_do_source_mods(struct brw_context *brw)
530
{
531
   if (brw->gen == 6 && is_math())
532
533
      return false;

534
   if (is_send_from_grf())
535
536
      return false;

537
   if (!backend_instruction::can_do_source_mods())
538
539
      return false;

540
541
542
   return true;
}

543
544
545
546
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
547
   stride = 1;
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
563
   this->fixed_hw_reg.dw1.f = f;
564
   this->width = 1;
565
566
567
568
569
570
571
572
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
573
   this->fixed_hw_reg.dw1.d = i;
574
   this->width = 1;
575
576
577
578
579
580
581
582
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
583
   this->fixed_hw_reg.dw1.ud = u;
584
   this->width = 1;
585
586
}

587
588
589
590
591
592
593
594
595
/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

596
597
598
599
600
601
602
603
604
605
606
607
/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

608
/** Fixed brw_reg. */
609
610
611
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
612
   this->file = HW_REG;
613
   this->fixed_hw_reg = fixed_hw_reg;
614
   this->type = fixed_hw_reg.type;
615
   this->width = 1 << fixed_hw_reg.width;
616
617
618
619
620
621
622
623
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
624
           subreg_offset == r.subreg_offset &&
625
626
627
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
628
           !reladdr && !r.reladdr &&
629
630
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
631
           stride == r.stride);
632
633
}

634
635
636
637
638
639
640
641
642
/** Replicate a single sub-register component across all channels. */
fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   /* Smearing a fixed HW register or an immediate makes no sense. */
   assert(file != HW_REG && file != IMM);
   this->subreg_offset = subreg * type_sz(type);
   this->stride = 0;
   return *this;
}

643
644
645
646
647
648
bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

649
650
int
fs_visitor::type_size(const struct glsl_type *type)
651
652
653
654
655
656
657
658
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
659
      return type->components();
660
661
662
663
664
665
666
667
668
669
670
671
672
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
673
674
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
675
   case GLSL_TYPE_IMAGE:
676
677
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
Ian Romanick's avatar
Ian Romanick committed
678
   case GLSL_TYPE_INTERFACE:
679
      unreachable("not reached");
680
   }
681
682

   return 0;
683
684
}

685
686
687
/**
 * Read the GPU TIMESTAMP architecture register into a fresh GRF and return
 * a scalar (smeared component 0) view of its low 32 bits.
 */
fs_reg
fs_visitor::get_timestamp()
{
   /* The TIMESTAMP ARF read below is only wired up for gen7+ here. */
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   return dst;
}

/** Record the start-of-shader timestamp for shader-time profiling. */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

/**
 * Emit the end-of-shader timestamp read and accumulate the elapsed time
 * into the shader-time buffer, tracking timestamp resets separately.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the per-dispatch-width bucket triple (time, written, reset). */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   /* diff = end - start, done as end + (-start). */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   /* A reset happened; count it instead of recording a bogus time. */
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
775
   int shader_time_index =
776
      brw_get_shader_time_index(brw, shader_prog, prog, type);
777
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
778

779
780
781
782
783
   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);
784

785
786
   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                             fs_reg(), payload, offset, value));
787
788
}

789
void
790
fs_visitor::vfail(const char *format, va_list va)
791
{
792
   char *msg;
793

794
795
   if (failed)
      return;
796

797
798
799
800
801
802
803
804
   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
805
      fprintf(stderr, "%s",  msg);
806
807
808
   }
}

809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
/** Variadic front-end for vfail(): mark the compile as failed. */
void
fs_visitor::fail(const char *format, ...)
{
   va_list args;

   va_start(args, format);
   vfail(format, args);
   va_end(args);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

840
      if (brw->perf_debug) {
841
842
843
844
845
846
847
848
849
850
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}

851
852
853
/* Convenience emit() overloads: build an fs_inst out of mem_ctx and hand
 * it to the fs_inst*-taking emit(), which appends it to the stream.
 */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dispatch_width);
   return emit(inst);
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dst);
   return emit(inst);
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dst, src0);
   return emit(inst);
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dst, src0, src1);
   return emit(inst);
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2);
   return emit(inst);
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   fs_inst *inst = new(mem_ctx) fs_inst(opcode, dst, src, sources);
   return emit(inst);
}

890
891
892
893
894
895
896
897
898
/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
899
fs_inst::is_partial_write() const
900
{
901
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
902
903
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
904
905
}

906
int
907
fs_inst::regs_read(fs_visitor *v, int arg) const
908
909
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
910
      return mlen;
911
912
   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
      return mlen;
913
914
   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
      return mlen;
915
916
   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
      return mlen;
917
918
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
      return mlen;
919
920
   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
      return mlen;
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
940
941
942
   }
}

943
/** An instruction reads the flag register iff it is predicated. */
bool
fs_inst::reads_flag() const
{
   return predicate;
}

/**
 * An instruction writes the flag register when it has a conditional mod
 * (except SEL/IF/WHILE, which only consume the condition) or when it is
 * MOV_DISPATCH_TO_FLAGS, which writes flags directly.
 */
bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

958
959
960
961
962
963
964
965
966
967
968
969
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   /* base_mrf == -1 means the message is sourced from the GRF instead. */
   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   /* Single-operand math: one payload register per 8 channels. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Two-operand math: two payload registers per 8 channels. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   /* Texturing: the extra payload MOVs land in one implied MRF. */
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   /* These messages send their payload from the GRF, not MRFs. */
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}

1020
1021
1022
int
fs_visitor::virtual_grf_alloc(int size)
{
1023
   if (virtual_grf_array_size <= virtual_grf_count) {
1024
1025
1026
1027
      if (virtual_grf_array_size == 0)
	 virtual_grf_array_size = 16;
      else
	 virtual_grf_array_size *= 2;
1028
1029
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
				   virtual_grf_array_size);
1030
   }
1031
1032
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
1033
1034
}

1035
/** Fixed HW reg constructor. */
1036
fs_reg::fs_reg(enum register_file file, int reg)
1037
{
1038
   init();
1039
   this->file = file;
1040
   this->reg = reg;
1041
   this->type = BRW_REGISTER_TYPE_F;
1042
1043
1044
1045
1046
1047
1048
1049

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
1050
1051
}

1052
/** Fixed HW reg constructor. */
1053
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1054
1055
1056
{
   init();
   this->file = file;
1057
   this->reg = reg;
1058
   this->type = type;
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
1078
1079
}

1080
/** Automatic reg constructor. */
1081
fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1082
1083
{
   init();
1084
   int reg_width = v->dispatch_width / 8;
1085
1086

   this->file = GRF;
1087
   this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1088
1089
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
1090
1091
   this->width = v->dispatch_width;
   assert(this->width == 8 || this->width == 16);
1092
1093
}

1094
1095
1096
1097
1098
1099
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
void
import_uniforms_callback(const void *key,
			 void *data,
			 void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

1114
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1115
1116
1117
 * This brings in those uniform definitions
 */
void
1118
fs_visitor::import_uniforms(fs_visitor *v)
1119
{
1120
   hash_table_call_foreach(v->variable_ht,
1121
1122
			   import_uniforms_callback,
			   variable_ht);
1123
   this->push_constant_loc = v->push_constant_loc;
1124
   this->pull_constant_loc = v->pull_constant_loc;
1125
   this->uniforms = v->uniforms;
1126
   this->param_size = v->param_size;
1127
1128
}

1129
1130
1131
1132
1133
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
1134
1135
void
fs_visitor::setup_uniform_values(ir_variable *ir)
1136
{
1137
   int namelen = strlen(ir->name);
1138

1139
1140
1141
1142
1143
1144
   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
1145
   unsigned params_before = uniforms;
1146
1147
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149
1150
1151
1152
1153

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
1154
1155
      }

1156
1157
1158
      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;
1159

1160
      for (unsigned i = 0; i < slots; i++) {
1161
         stage_prog_data->param[uniforms++] = &storage->storage[i];
1162
1163
      }
   }
1164