Commits on Source (24)
19.3.0-rc5
19.3.0-rc6
mesa (19.3.0~rc6-1) UNRELEASED; urgency=medium
* New upstream release candidate.
* control, rules: Don't build mesa-vulkan-drivers on mips*.
-- Timo Aaltonen <tjaalton@debian.org> Thu, 05 Dec 2019 09:21:11 +0200
mesa (19.3.0~rc5-1) experimental; urgency=medium
* New upstream release candidate.
......
@@ -443,7 +443,7 @@ Description: Mesa VDPAU video acceleration drivers
Package: mesa-vulkan-drivers
Section: libs
Architecture: amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32
Architecture: amd64 arm64 armel armhf i386 powerpc ppc64 ppc64el s390x sparc64 x32
Pre-Depends: ${misc:Pre-Depends}
Depends:
libvulkan1,
......
@@ -103,7 +103,7 @@ else
# radv needs LLVM and the Vulkan loader, so only build on the subset of
# arches where we have LLVM enabled and where the Vulkan loader is built.
ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64))
ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 powerpc ppc64 ppc64el s390x sparc64))
VULKAN_DRIVERS += amd,
endif
......
@@ -392,7 +392,7 @@ void insert_NOPs_gfx8_9(Program* program)
}
}
void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
void handle_instruction_gfx10(Program *program, NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& old_instructions,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
@@ -403,6 +403,9 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
instr->format == Format::SCRATCH || instr->format == Format::DS) {
/* Remember all SGPRs that are read by the VMEM instruction */
mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
ctx.sgprs_read_by_VMEM.set(exec);
if (program->wave_size == 64)
ctx.sgprs_read_by_VMEM.set(exec_hi);
} else if (instr->isSALU() || instr->format == Format::SMEM) {
/* Check if SALU writes an SGPR that was previously read by the VALU */
if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
@@ -535,7 +538,7 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
}
}
void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
void handle_block_gfx10(Program *program, NOP_ctx_gfx10& ctx, Block& block)
{
if (block.instructions.empty())
return;
@@ -544,7 +547,7 @@ void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
instructions.reserve(block.instructions.size());
for (aco_ptr<Instruction>& instr : block.instructions) {
handle_instruction_gfx10(ctx, instr, block.instructions, instructions);
handle_instruction_gfx10(program, ctx, instr, block.instructions, instructions);
instructions.emplace_back(std::move(instr));
}
@@ -569,7 +572,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : program->blocks[idx].linear_preds)
loop_block_ctx.join(all_ctx[b]);
handle_block_gfx10(loop_block_ctx, program->blocks[idx]);
handle_block_gfx10(program, loop_block_ctx, program->blocks[idx]);
/* We only need to continue if the loop header context changed */
if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
@@ -584,7 +587,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : block.linear_preds)
ctx.join(all_ctx[b]);
handle_block_gfx10(ctx, block);
handle_block_gfx10(program, ctx, block);
}
}
......
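Aside on the hunks above: they thread a Program pointer into the GFX10 hazard pass so the VMEM tracking can mark the exec mask as read, and exec_hi only when the shader runs in wave64. A minimal standalone sketch of that idea, with an illustrative bitset size and hypothetical register indices rather than the actual ACO definitions:

#include <bitset>

struct NOP_ctx_sketch {
    std::bitset<128> sgprs_read_by_VMEM;   /* size chosen for illustration */
};

/* hypothetical hardware register indices, for illustration only */
constexpr unsigned exec_lo = 126;
constexpr unsigned exec_hi = 127;

void mark_vmem_reads_exec(NOP_ctx_sketch &ctx, unsigned wave_size)
{
    /* every VMEM access implicitly reads the exec mask... */
    ctx.sgprs_read_by_VMEM.set(exec_lo);
    /* ...and its high half is only live state in wave64 */
    if (wave_size == 64)
        ctx.sgprs_read_by_VMEM.set(exec_hi);
}

A later SALU or SMEM write to either half of exec then trips the same check_written_regs() test as a write to any other SGPR the VMEM instruction read.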
@@ -1976,8 +1976,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_i2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 32) {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
if (src.regClass() == s1) {
Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else if (src.regClass() == v1) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
@@ -6572,11 +6576,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
}
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
std::vector<Operand> args;
if (has_offset)
args.emplace_back(Operand(offset));
@@ -6592,7 +6591,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
if (has_lod)
args.emplace_back(lod);
Operand arg;
Temp arg;
if (args.size() > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
unsigned size = 0;
@@ -6604,12 +6603,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
Temp tmp = bld.tmp(rc);
vec->definitions[0] = Definition(tmp);
ctx->block->instructions.emplace_back(std::move(vec));
arg = Operand(tmp);
arg = tmp;
} else {
assert(args[0].isTemp());
arg = Operand(as_vgpr(ctx, args[0].getTemp()));
arg = as_vgpr(ctx, args[0].getTemp());
}
/* we don't need the bias, sample index, compare value or offset to be
* computed in WQM but if the p_create_vector copies the coordinates, then it
* needs to be in WQM */
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
//FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
@@ -6741,7 +6748,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
tex->operands[0] = arg;
tex->operands[0] = Operand(arg);
tex->operands[1] = Operand(resource);
tex->operands[2] = Operand(sampler);
tex->dim = dim;
......
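A note on the nir_op_i2i64 change above: the new lowering builds the 64-bit value from the source plus its arithmetic-shift-right-by-31 high word, where the old p_create_vector with Operand(0u) only zero-extended and so mishandled negative sources. A small host-side sketch of the same sign extension, assuming the usual arithmetic right shift and two's-complement representation:

#include <stdint.h>
#include <stdio.h>

static int64_t sext_from_halves(int32_t src)
{
    int32_t high = src >> 31;  /* arithmetic shift assumed: 0 or -1 */
    uint64_t bits = ((uint64_t)(uint32_t)high << 32) | (uint32_t)src;
    return (int64_t)bits;      /* two's-complement reinterpretation */
}

int main(void)
{
    for (int v = -5; v <= 5; v += 5)
        printf("%d -> %lld (direct cast: %lld)\n",
               v, (long long)sext_from_halves(v), (long long)(int64_t)v);
    return 0;
}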
@@ -130,8 +130,7 @@ struct InstrPred {
return false;
}
}
if (a->format == Format::PSEUDO_BRANCH)
return false;
if (a->isVOP3()) {
VOP3A_instruction* a3 = static_cast<VOP3A_instruction*>(a);
VOP3A_instruction* b3 = static_cast<VOP3A_instruction*>(b);
@@ -147,7 +146,8 @@ struct InstrPred {
if (a->isDPP()) {
DPP_instruction* aDPP = static_cast<DPP_instruction*>(a);
DPP_instruction* bDPP = static_cast<DPP_instruction*>(b);
return aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
return aDPP->pass_flags == bDPP->pass_flags &&
aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
aDPP->bank_mask == bDPP->bank_mask &&
aDPP->row_mask == bDPP->row_mask &&
aDPP->bound_ctrl == bDPP->bound_ctrl &&
@@ -156,6 +156,7 @@ struct InstrPred {
aDPP->neg[0] == bDPP->neg[0] &&
aDPP->neg[1] == bDPP->neg[1];
}
switch (a->format) {
case Format::VOPC: {
/* Since the results depend on the exec mask, these shouldn't
@@ -191,7 +192,7 @@ struct InstrPred {
/* this is fine since they are only used for vertex input fetches */
MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
return aM->can_reorder == bM->can_reorder &&
return aM->can_reorder && bM->can_reorder &&
aM->barrier == bM->barrier &&
aM->dfmt == bM->dfmt &&
aM->nfmt == bM->nfmt &&
@@ -208,6 +209,10 @@ struct InstrPred {
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
return false;
case Format::DS: {
/* we already handle potential issue with permute/swizzle above */
@@ -276,6 +281,10 @@ void process_block(vn_ctx& ctx, Block& block)
op.setTemp(it->second);
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
if (instr->definitions.empty()) {
new_instructions.emplace_back(std::move(instr));
continue;
@@ -288,10 +297,6 @@ void process_block(vn_ctx& ctx, Block& block)
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
instr->pass_flags = ctx.exec_id;
std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index);
@@ -303,6 +308,7 @@ void process_block(vn_ctx& ctx, Block& block)
if (dominates(ctx, res.first->second, block.index)) {
for (unsigned i = 0; i < instr->definitions.size(); i++) {
assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass());
assert(instr->definitions[i].isTemp());
ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp();
}
} else {
......
@@ -759,11 +759,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
/* count variables to be moved and check war_hint */
bool war_hint = false;
for (unsigned j = reg_lo; j <= reg_hi; j++) {
if (reg_file[j] != 0)
bool linear_vgpr = false;
for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
if (reg_file[j] != 0) {
k++;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].second & (1 << 6))
linear_vgpr = true;
}
war_hint |= ctx.war_hint[j];
}
if (linear_vgpr || (war_hint && !best_war_hint))
continue;
/* count operands in wrong positions */
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
@@ -775,7 +782,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
k += instr->operands[j].size();
}
bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint))
if (k > num_moves || (!aligned && k == num_moves))
continue;
best_pos = reg_lo;
@@ -961,7 +968,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
handle_live_in = [&](Temp val, Block *block) -> Temp {
std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
if (preds.size() == 0 && block->index != 0) {
if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) {
renames[block->index][val.id()] = val;
return val;
}
......
@@ -3953,8 +3953,43 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
{
LLVMValueRef result, tmp;
if (ctx->chip_class >= GFX10) {
result = inclusive ? src : identity;
if (inclusive) {
result = src;
} else if (ctx->chip_class >= GFX10) {
/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
LLVMValueRef active, tmp1, tmp2;
LLVMValueRef tid = ac_get_thread_id(ctx);
tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
if (maxprefix > 32) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 32, false), "");
tmp2 = LLVMBuildSelect(ctx->builder, active,
ac_build_readlane(ctx, src,
LLVMConstInt(ctx->i32, 31, false)),
tmp2, "");
active = LLVMBuildOr(ctx->builder, active,
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 0x1f, false), ""),
LLVMConstInt(ctx->i32, 0x10, false), ""), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
} else if (maxprefix > 16) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 16, false), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
}
result = src;
} else if (ctx->chip_class >= GFX8) {
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
result = src;
} else {
if (!inclusive)
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
@@ -3984,33 +4019,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
return result;
if (ctx->chip_class >= GFX10) {
/* dpp_row_bcast{15,31} are not supported on gfx10. */
LLVMBuilderRef builder = ctx->builder;
LLVMValueRef tid = ac_get_thread_id(ctx);
LLVMValueRef cc;
/* TODO-GFX10: Can we get better code-gen by putting this into
* a branch so that LLVM generates EXEC mask manipulations? */
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
LLVMValueRef active;
tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 16, false), ""),
ctx->i32_0, "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
if (maxprefix <= 32)
return result;
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid,
LLVMConstInt(ctx->i32, 32, false), "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}
......
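For context on the ac_build_scan hunks above: an exclusive scan is just an inclusive scan of the inputs shifted right by one lane, with the identity fed into lane 0, which is what the dpp_row_sr(1)/permlane16/readlane sequence reconstructs on GFX10 where dpp_wf_sr1 is unavailable. A CPU-side sketch of the two scan flavours over a tiny "wave" (illustration only, not the LLVM builder code):

#include <stdio.h>

#define LANES 8  /* tiny wave for illustration */

/* inclusive scan: lane i sees the sum of lanes 0..i
 * exclusive scan: lane i sees the sum of lanes 0..i-1 (identity for lane 0) */
static void scan_add(const int *src, int *incl, int *excl, int identity)
{
    int acc = identity;
    for (int i = 0; i < LANES; i++) {
        excl[i] = acc;    /* everything strictly before this lane */
        acc += src[i];
        incl[i] = acc;    /* ...including this lane */
    }
}

int main(void)
{
    int src[LANES] = {3, 1, 4, 1, 5, 9, 2, 6};
    int incl[LANES], excl[LANES];
    scan_add(src, incl, excl, 0);
    for (int i = 0; i < LANES; i++)
        printf("lane %d: src=%d incl=%d excl=%d\n", i, src[i], incl[i], excl[i]);
    return 0;
}

On real hardware the shift happens across lanes of the wave, which is why the GFX10 path above special-cases the row-boundary lanes (16, 32, 48) with permlane16 and readlane.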
@@ -1097,25 +1097,32 @@ void radv_GetPhysicalDeviceFeatures2(
return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
}
void radv_GetPhysicalDeviceProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties* pProperties)
static size_t
radv_max_descriptor_set_size()
{
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
VkSampleCountFlags sample_counts = 0xf;
/* make sure that the entire descriptor set is addressable with a signed
* 32-bit int. So the sum of all limits scaled by descriptor size has to
* be at most 2 GiB. the combined image & samples object count as one of
* both. This limit is for the pipeline layout, not for the set layout, but
* there is no set limit, so we just set a pipeline limit. I don't think
* any app is going to hit this soon. */
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
return ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS
- MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
}
void radv_GetPhysicalDeviceProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties* pProperties)
{
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
VkSampleCountFlags sample_counts = 0xf;
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
VkPhysicalDeviceLimits limits = {
.maxImageDimension1D = (1 << 14),
@@ -1394,13 +1401,7 @@ void radv_GetPhysicalDeviceProperties2(
properties->robustBufferAccessUpdateAfterBind = false;
properties->quadDivergentImplicitLod = false;
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
@@ -3855,8 +3856,7 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&wait_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]);
if (point)
--point->wait_count;
point->wait_count -= 2;
pthread_mutex_unlock(&wait_sems[i]->timeline.mutex);
}
}
@@ -3865,11 +3865,9 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&signal_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]);
if (point) {
signal_sems[i]->timeline.highest_submitted =
MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
point->wait_count--;
}
point->wait_count -= 2;
radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list);
pthread_mutex_unlock(&signal_sems[i]->timeline.mutex);
}
@@ -5458,8 +5456,6 @@ radv_timeline_wait_locked(struct radv_device *device,
if (!point)
return VK_SUCCESS;
point->wait_count++;
pthread_mutex_unlock(&timeline->mutex);
bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout);
......
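To make the radv_max_descriptor_set_size() arithmetic above concrete: with the radv limits assumed below (the real values live in radv_private.h and may differ), the 2 GiB budget divided by the worst-case per-descriptor footprint works out to roughly 9.6 million descriptors per pipeline layout. A throwaway sketch of the same formula:

#include <stdio.h>

/* assumed values for illustration only -- see radv_private.h for the real ones */
#define MAX_DYNAMIC_BUFFERS             24
#define MAX_INLINE_UNIFORM_BLOCK_SIZE   4096
#define MAX_INLINE_UNIFORM_BLOCK_COUNT  16

int main(void)
{
    unsigned long long budget = (1ull << 31)
        - 16ull * MAX_DYNAMIC_BUFFERS
        - (unsigned long long)MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT;
    /* worst-case bytes per descriptor: uniform buffer + storage buffer +
     * sampler + sampled image + storage image, as in the divisor above */
    unsigned per_descriptor = 32 + 32 + 32 + 64 + 64;
    printf("max_descriptor_set_size = %llu\n", budget / per_descriptor);
    return 0;
}

The helper itself only exists so that radv_GetPhysicalDeviceProperties and radv_GetPhysicalDeviceProperties2 stop duplicating that expression.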
@@ -1101,15 +1101,32 @@ radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
int ps_iter_samples = 1;
uint32_t mask = 0xffff;
if (vkms)
if (vkms) {
ms->num_samples = vkms->rasterizationSamples;
else
ms->num_samples = 1;
if (vkms)
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
if (vkms && !vkms->sampleShadingEnable && pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
*
* "Sample shading is enabled for a graphics pipeline:
*
* - If the interface of the fragment shader entry point of the
* graphics pipeline includes an input variable decorated
* with SampleId or SamplePosition. In this case
* minSampleShadingFactor takes the value 1.0.
* - Else if the sampleShadingEnable member of the
* VkPipelineMultisampleStateCreateInfo structure specified
* when creating the graphics pipeline is set to VK_TRUE. In
* this case minSampleShadingFactor takes the value of
* VkPipelineMultisampleStateCreateInfo::minSampleShading.
*
* Otherwise, sample shading is considered disabled."
*/
if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
ps_iter_samples = ms->num_samples;
} else {
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
}
} else {
ms->num_samples = 1;
}
const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
......
@@ -151,6 +151,15 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
((wrmask >> (i * 4)) & 0xf) << comp;
}
static void
set_writes_memory(const nir_shader *nir, struct radv_shader_info *info)
{
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
}
static void
gather_intrinsic_store_deref_info(const nir_shader *nir,
const nir_intrinsic_instr *instr,
@@ -308,10 +317,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_image_deref_atomic_xor ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_exchange ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) {
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
}
break;
}
@@ -326,17 +332,28 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_ssbo_atomic_comp_swap:
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
break;
case nir_intrinsic_load_deref:
gather_intrinsic_load_deref_info(nir, instr, info);
break;
case nir_intrinsic_store_deref:
gather_intrinsic_store_deref_info(nir, instr, info);
/* fallthrough */
case nir_intrinsic_deref_atomic_add:
case nir_intrinsic_deref_atomic_imin:
case nir_intrinsic_deref_atomic_umin:
case nir_intrinsic_deref_atomic_imax:
case nir_intrinsic_deref_atomic_umax:
case nir_intrinsic_deref_atomic_and:
case nir_intrinsic_deref_atomic_or:
case nir_intrinsic_deref_atomic_xor:
case nir_intrinsic_deref_atomic_exchange:
case nir_intrinsic_deref_atomic_comp_swap: {
if (nir_src_as_deref(instr->src[0])->mode & (nir_var_mem_global | nir_var_mem_ssbo))
set_writes_memory(nir, info);
break;
}
default:
break;
}
......
@@ -1435,6 +1435,9 @@ builtin_variable_generator::add_varying(int slot, const glsl_type *type,
void
builtin_variable_generator::generate_varyings()
{
struct gl_shader_compiler_options *options =
&state->ctx->Const.ShaderCompilerOptions[state->stage];
/* gl_Position and gl_PointSize are not visible from fragment shaders. */
if (state->stage != MESA_SHADER_FRAGMENT) {
add_varying(VARYING_SLOT_POS, vec4_t, GLSL_PRECISION_HIGH, "gl_Position");
@@ -1526,6 +1529,9 @@ builtin_variable_generator::generate_varyings()
var->data.sample = fields[i].sample;
var->data.patch = fields[i].patch;
var->init_interface_type(per_vertex_out_type);
var->data.invariant = fields[i].location == VARYING_SLOT_POS &&
options->PositionAlwaysInvariant;
}
}
}
......
@@ -184,6 +184,9 @@ get_flat_type(const nir_shader *shader, nir_variable *old_vars[MAX_SLOTS][4],
if (num_vars <= 1)
return NULL;
if (slots == 1)
return glsl_vector_type(base, 4);
else
return glsl_array_type(glsl_vector_type(base, 4), slots, 0);
}
@@ -340,6 +343,9 @@ build_array_deref_of_new_var_flat(nir_shader *shader,
deref = nir_build_deref_array(b, deref, index);
}
if (!glsl_type_is_array(deref->type))
return deref;
bool vs_in = shader->info.stage == MESA_SHADER_VERTEX &&
new_var->data.mode == nir_var_shader_in;
return nir_build_deref_array(
......
@@ -38,6 +38,7 @@ DRI_CONF_SECTION_END
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false")
DRI_CONF_GLSL_ZERO_INIT("false")
DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false")
DRI_CONF_ALLOW_RGB10_CONFIGS("true")
DRI_CONF_ALLOW_FP16_CONFIGS("false")
DRI_CONF_SECTION_END
@@ -498,7 +498,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
struct etna_resource *rsc;
struct etna_resource_level *level;
struct pipe_resource *prsc;
struct pipe_resource *ptiled = NULL;
DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
"nr_samples=%u, usage=%u, bind=%x, flags=%x",
@@ -572,8 +571,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
fail:
etna_resource_destroy(pscreen, prsc);
if (ptiled)
etna_resource_destroy(pscreen, ptiled);
return NULL;
}
......
@@ -1660,6 +1660,7 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
STATIC_ASSERT(IRIS_MEMZONE_SHADER_START == 0ull);
const uint64_t _4GB = 1ull << 32;
const uint64_t _2GB = 1ul << 31;
/* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
@@ -1669,9 +1670,16 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
IRIS_MEMZONE_SURFACE_START,
_4GB_minus_1 - IRIS_MAX_BINDERS * IRIS_BINDER_SIZE);
/* TODO: Why does limiting to 2GB help some state items on gen12?
* - CC Viewport Pointer
* - Blend State Pointer
* - Color Calc State Pointer
*/
const uint64_t dynamic_pool_size =
(devinfo->gen >= 12 ? _2GB : _4GB_minus_1) - IRIS_BORDER_COLOR_POOL_SIZE;
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC],
IRIS_MEMZONE_DYNAMIC_START + IRIS_BORDER_COLOR_POOL_SIZE,
_4GB_minus_1 - IRIS_BORDER_COLOR_POOL_SIZE);
dynamic_pool_size);
/* Leave the last 4GB out of the high vma range, so that no state
* base address + size can overflow 48 bits.
......
@@ -933,6 +933,25 @@ panfrost_batch_submit(struct panfrost_batch *batch)
if (ret)
fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret);
/* We must reset the damage info of our render targets here even
* though a damage reset normally happens when the DRI layer swaps
* buffers. That's because there can be implicit flushes the GL
* app is not aware of, and those might impact the damage region: if
* part of the damaged portion is drawn during those implicit flushes,
* you have to reload those areas before next draws are pushed, and
* since the driver can't easily know what's been modified by the draws
* it flushed, the easiest solution is to reload everything.
*/
for (unsigned i = 0; i < batch->key.nr_cbufs; i++) {
struct panfrost_resource *res;
if (!batch->key.cbufs[i])
continue;
res = pan_resource(batch->key.cbufs[i]->texture);
panfrost_resource_reset_damage(res);
}
out:
panfrost_freeze_batch(batch);
panfrost_free_batch(batch);
......
@@ -48,7 +48,7 @@
#include "pan_util.h"
#include "pan_tiling.h"
static void
void
panfrost_resource_reset_damage(struct panfrost_resource *pres)
{
/* We set the damage extent to the full resource size but keep the
......
@@ -135,6 +135,9 @@ void
panfrost_blit_wallpaper(struct panfrost_context *ctx,
struct pipe_box *box);
void
panfrost_resource_reset_damage(struct panfrost_resource *pres);
void
panfrost_resource_set_damage_region(struct pipe_screen *screen,
struct pipe_resource *res,
......