Commits on Source (24)
19.3.0-rc5
19.3.0-rc6
mesa (19.3.0~rc6-1) UNRELEASED; urgency=medium
* New upstream release candidate.
* control, rules: Don't build mesa-vulkan-drivers on mips*.
-- Timo Aaltonen <tjaalton@debian.org> Thu, 05 Dec 2019 09:21:11 +0200
mesa (19.3.0~rc5-1) experimental; urgency=medium
* New upstream release candidate.
......
@@ -443,7 +443,7 @@ Description: Mesa VDPAU video acceleration drivers
Package: mesa-vulkan-drivers
Section: libs
Architecture: amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64 x32
Architecture: amd64 arm64 armel armhf i386 powerpc ppc64 ppc64el s390x sparc64 x32
Pre-Depends: ${misc:Pre-Depends}
Depends:
libvulkan1,
......
@@ -103,7 +103,7 @@ else
# radv needs LLVM and the Vulkan loader, so only build on the subset of
# arches where we have LLVM enabled and where the Vulkan loader is built.
ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x sparc64))
ifneq (,$(filter $(DEB_HOST_ARCH), amd64 arm64 armel armhf i386 powerpc ppc64 ppc64el s390x sparc64))
VULKAN_DRIVERS += amd,
endif
......
@@ -392,7 +392,7 @@ void insert_NOPs_gfx8_9(Program* program)
}
}
void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
void handle_instruction_gfx10(Program *program, NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& old_instructions,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
@@ -403,6 +403,9 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
instr->format == Format::SCRATCH || instr->format == Format::DS) {
/* Remember all SGPRs that are read by the VMEM instruction */
mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
ctx.sgprs_read_by_VMEM.set(exec);
if (program->wave_size == 64)
ctx.sgprs_read_by_VMEM.set(exec_hi);
} else if (instr->isSALU() || instr->format == Format::SMEM) {
/* Check if SALU writes an SGPR that was previously read by the VALU */
if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
@@ -535,7 +538,7 @@ void handle_instruction_gfx10(NOP_ctx_gfx10 &ctx, aco_ptr<Instruction>& instr,
}
}
void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
void handle_block_gfx10(Program *program, NOP_ctx_gfx10& ctx, Block& block)
{
if (block.instructions.empty())
return;
@@ -544,7 +547,7 @@ void handle_block_gfx10(NOP_ctx_gfx10& ctx, Block& block)
instructions.reserve(block.instructions.size());
for (aco_ptr<Instruction>& instr : block.instructions) {
handle_instruction_gfx10(ctx, instr, block.instructions, instructions);
handle_instruction_gfx10(program, ctx, instr, block.instructions, instructions);
instructions.emplace_back(std::move(instr));
}
@@ -569,7 +572,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : program->blocks[idx].linear_preds)
loop_block_ctx.join(all_ctx[b]);
handle_block_gfx10(loop_block_ctx, program->blocks[idx]);
handle_block_gfx10(program, loop_block_ctx, program->blocks[idx]);
/* We only need to continue if the loop header context changed */
if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
@@ -584,7 +587,7 @@ void mitigate_hazards_gfx10(Program *program)
for (unsigned b : block.linear_preds)
ctx.join(all_ctx[b]);
handle_block_gfx10(ctx, block);
handle_block_gfx10(program, ctx, block);
}
}
......
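Aside on the hunks above: they thread a Program pointer into the GFX10 hazard pass so the VMEM tracking can mark the exec mask as read, and exec_hi only when the shader runs in wave64. A minimal standalone sketch of that idea, with an illustrative bitset size and hypothetical register indices rather than the actual ACO definitions:

#include <bitset>

struct NOP_ctx_sketch {
    std::bitset<128> sgprs_read_by_VMEM;   /* size chosen for illustration */
};

/* hypothetical hardware register indices, for illustration only */
constexpr unsigned exec_lo = 126;
constexpr unsigned exec_hi = 127;

void mark_vmem_reads_exec(NOP_ctx_sketch &ctx, unsigned wave_size)
{
    /* every VMEM access implicitly reads the exec mask... */
    ctx.sgprs_read_by_VMEM.set(exec_lo);
    /* ...and its high half is only live state in wave64 */
    if (wave_size == 64)
        ctx.sgprs_read_by_VMEM.set(exec_hi);
}

A later SALU or SMEM write to either half of exec then trips the same check_written_regs() test as a write to any other SGPR the VMEM instruction read.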
@@ -1976,8 +1976,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
}
case nir_op_i2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 32) {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
if (src.regClass() == s1) {
Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else if (src.regClass() == v1) {
Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
} else {
fprintf(stderr, "Unimplemented NIR instr bit size: ");
nir_print_instr(&instr->instr, stderr);
@@ -6572,11 +6576,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
}
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
std::vector<Operand> args;
if (has_offset)
args.emplace_back(Operand(offset));
@@ -6592,7 +6591,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
if (has_lod)
args.emplace_back(lod);
Operand arg;
Temp arg;
if (args.size() > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
unsigned size = 0;
@@ -6604,12 +6603,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
Temp tmp = bld.tmp(rc);
vec->definitions[0] = Definition(tmp);
ctx->block->instructions.emplace_back(std::move(vec));
arg = Operand(tmp);
arg = tmp;
} else {
assert(args[0].isTemp());
arg = Operand(as_vgpr(ctx, args[0].getTemp()));
arg = as_vgpr(ctx, args[0].getTemp());
}
/* we don't need the bias, sample index, compare value or offset to be
* computed in WQM but if the p_create_vector copies the coordinates, then it
* needs to be in WQM */
if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
//FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
@@ -6741,7 +6748,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
}
tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
tex->operands[0] = arg;
tex->operands[0] = Operand(arg);
tex->operands[1] = Operand(resource);
tex->operands[2] = Operand(sampler);
tex->dim = dim;
......
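A note on the nir_op_i2i64 change above: the new lowering builds the 64-bit value from the source plus its arithmetic-shift-right-by-31 high word, where the old p_create_vector with Operand(0u) only zero-extended and so mishandled negative sources. A small host-side sketch of the same sign extension, assuming the usual arithmetic right shift and two's-complement representation:

#include <stdint.h>
#include <stdio.h>

static int64_t sext_from_halves(int32_t src)
{
    int32_t high = src >> 31;  /* arithmetic shift assumed: 0 or -1 */
    uint64_t bits = ((uint64_t)(uint32_t)high << 32) | (uint32_t)src;
    return (int64_t)bits;      /* two's-complement reinterpretation */
}

int main(void)
{
    for (int v = -5; v <= 5; v += 5)
        printf("%d -> %lld (direct cast: %lld)\n",
               v, (long long)sext_from_halves(v), (long long)(int64_t)v);
    return 0;
}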
@@ -130,8 +130,7 @@ struct InstrPred {
return false;
}
}
if (a->format == Format::PSEUDO_BRANCH)
return false;
if (a->isVOP3()) {
VOP3A_instruction* a3 = static_cast<VOP3A_instruction*>(a);
VOP3A_instruction* b3 = static_cast<VOP3A_instruction*>(b);
@@ -147,7 +146,8 @@ struct InstrPred {
if (a->isDPP()) {
DPP_instruction* aDPP = static_cast<DPP_instruction*>(a);
DPP_instruction* bDPP = static_cast<DPP_instruction*>(b);
return aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
return aDPP->pass_flags == bDPP->pass_flags &&
aDPP->dpp_ctrl == bDPP->dpp_ctrl &&
aDPP->bank_mask == bDPP->bank_mask &&
aDPP->row_mask == bDPP->row_mask &&
aDPP->bound_ctrl == bDPP->bound_ctrl &&
@@ -156,6 +156,7 @@ struct InstrPred {
aDPP->neg[0] == bDPP->neg[0] &&
aDPP->neg[1] == bDPP->neg[1];
}
switch (a->format) {
case Format::VOPC: {
/* Since the results depend on the exec mask, these shouldn't
@@ -191,7 +192,7 @@ struct InstrPred {
/* this is fine since they are only used for vertex input fetches */
MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
return aM->can_reorder == bM->can_reorder &&
return aM->can_reorder && bM->can_reorder &&
aM->barrier == bM->barrier &&
aM->dfmt == bM->dfmt &&
aM->nfmt == bM->nfmt &&
@@ -208,6 +209,10 @@ struct InstrPred {
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
return false;
case Format::DS: {
/* we already handle potential issue with permute/swizzle above */
@@ -276,6 +281,10 @@ void process_block(vn_ctx& ctx, Block& block)
op.setTemp(it->second);
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
if (instr->definitions.empty()) {
new_instructions.emplace_back(std::move(instr));
continue;
@@ -288,10 +297,6 @@ void process_block(vn_ctx& ctx, Block& block)
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
}
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper)
ctx.exec_id++;
instr->pass_flags = ctx.exec_id;
std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index);
@@ -303,6 +308,7 @@ void process_block(vn_ctx& ctx, Block& block)
if (dominates(ctx, res.first->second, block.index)) {
for (unsigned i = 0; i < instr->definitions.size(); i++) {
assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass());
assert(instr->definitions[i].isTemp());
ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp();
}
} else {
......
@@ -759,11 +759,18 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
/* count variables to be moved and check war_hint */
bool war_hint = false;
for (unsigned j = reg_lo; j <= reg_hi; j++) {
if (reg_file[j] != 0)
bool linear_vgpr = false;
for (unsigned j = reg_lo; j <= reg_hi && !linear_vgpr; j++) {
if (reg_file[j] != 0) {
k++;
/* we cannot split live ranges of linear vgprs */
if (ctx.assignments[reg_file[j]].second & (1 << 6))
linear_vgpr = true;
}
war_hint |= ctx.war_hint[j];
}
if (linear_vgpr || (war_hint && !best_war_hint))
continue;
/* count operands in wrong positions */
for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) {
@@ -775,7 +782,7 @@ PhysReg get_reg_create_vector(ra_ctx& ctx,
k += instr->operands[j].size();
}
bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0;
if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint))
if (k > num_moves || (!aligned && k == num_moves))
continue;
best_pos = reg_lo;
@@ -961,7 +968,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
handle_live_in = [&](Temp val, Block *block) -> Temp {
std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
if (preds.size() == 0 && block->index != 0) {
if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) {
renames[block->index][val.id()] = val;
return val;
}
......
@@ -3953,8 +3953,43 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
{
LLVMValueRef result, tmp;
if (ctx->chip_class >= GFX10) {
result = inclusive ? src : identity;
if (inclusive) {
result = src;
} else if (ctx->chip_class >= GFX10) {
/* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
LLVMValueRef active, tmp1, tmp2;
LLVMValueRef tid = ac_get_thread_id(ctx);
tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
if (maxprefix > 32) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 32, false), "");
tmp2 = LLVMBuildSelect(ctx->builder, active,
ac_build_readlane(ctx, src,
LLVMConstInt(ctx->i32, 31, false)),
tmp2, "");
active = LLVMBuildOr(ctx->builder, active,
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 0x1f, false), ""),
LLVMConstInt(ctx->i32, 0x10, false), ""), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
} else if (maxprefix > 16) {
active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
LLVMConstInt(ctx->i32, 16, false), "");
src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
}
result = src;
} else if (ctx->chip_class >= GFX8) {
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
result = src;
} else {
if (!inclusive)
src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
@@ -3984,33 +4019,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
return result;
if (ctx->chip_class >= GFX10) {
/* dpp_row_bcast{15,31} are not supported on gfx10. */
LLVMBuilderRef builder = ctx->builder;
LLVMValueRef tid = ac_get_thread_id(ctx);
LLVMValueRef cc;
/* TODO-GFX10: Can we get better code-gen by putting this into
* a branch so that LLVM generates EXEC mask manipulations? */
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
LLVMValueRef active;
tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
LLVMBuildAnd(ctx->builder, tid,
LLVMConstInt(ctx->i32, 16, false), ""),
ctx->i32_0, "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
if (maxprefix <= 32)
return result;
if (inclusive)
tmp = result;
else
tmp = ac_build_alu_op(ctx, result, src, op);
tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
tmp = ac_build_alu_op(ctx, result, tmp, op);
cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid,
LLVMConstInt(ctx->i32, 32, false), "");
result = LLVMBuildSelect(builder, cc, tmp, result, "");
tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
}
......
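For context on the ac_build_scan hunks above: an exclusive scan is just an inclusive scan of the inputs shifted right by one lane, with the identity fed into lane 0, which is what the dpp_row_sr(1)/permlane16/readlane sequence reconstructs on GFX10 where dpp_wf_sr1 is unavailable. A CPU-side sketch of the two scan flavours over a tiny "wave" (illustration only, not the LLVM builder code):

#include <stdio.h>

#define LANES 8  /* tiny wave for illustration */

/* inclusive scan: lane i sees the sum of lanes 0..i
 * exclusive scan: lane i sees the sum of lanes 0..i-1 (identity for lane 0) */
static void scan_add(const int *src, int *incl, int *excl, int identity)
{
    int acc = identity;
    for (int i = 0; i < LANES; i++) {
        excl[i] = acc;    /* everything strictly before this lane */
        acc += src[i];
        incl[i] = acc;    /* ...including this lane */
    }
}

int main(void)
{
    int src[LANES] = {3, 1, 4, 1, 5, 9, 2, 6};
    int incl[LANES], excl[LANES];
    scan_add(src, incl, excl, 0);
    for (int i = 0; i < LANES; i++)
        printf("lane %d: src=%d incl=%d excl=%d\n", i, src[i], incl[i], excl[i]);
    return 0;
}

On real hardware the shift happens across lanes of the wave, which is why the GFX10 path above special-cases the row-boundary lanes (16, 32, 48) with permlane16 and readlane.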
@@ -1097,25 +1097,32 @@ void radv_GetPhysicalDeviceFeatures2(
return radv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
}
void radv_GetPhysicalDeviceProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties* pProperties)
static size_t
radv_max_descriptor_set_size()
{
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
VkSampleCountFlags sample_counts = 0xf;
/* make sure that the entire descriptor set is addressable with a signed
* 32-bit int. So the sum of all limits scaled by descriptor size has to
* be at most 2 GiB. the combined image & samples object count as one of
* both. This limit is for the pipeline layout, not for the set layout, but
* there is no set limit, so we just set a pipeline limit. I don't think
* any app is going to hit this soon. */
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
return ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS
- MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
}
void radv_GetPhysicalDeviceProperties(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties* pProperties)
{
RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
VkSampleCountFlags sample_counts = 0xf;
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
VkPhysicalDeviceLimits limits = {
.maxImageDimension1D = (1 << 14),
@@ -1394,13 +1401,7 @@ void radv_GetPhysicalDeviceProperties2(
properties->robustBufferAccessUpdateAfterBind = false;
properties->quadDivergentImplicitLod = false;
size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
(32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
32 /* storage buffer, 32 due to potential space wasted on alignment */ +
32 /* sampler, largest when combined with image */ +
64 /* sampled image */ +
64 /* storage image */);
size_t max_descriptor_set_size = radv_max_descriptor_set_size();
properties->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
properties->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
@@ -3855,8 +3856,7 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&wait_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]);
if (point)
--point->wait_count;
point->wait_count -= 2;
pthread_mutex_unlock(&wait_sems[i]->timeline.mutex);
}
}
@@ -3865,11 +3865,9 @@ radv_finalize_timelines(struct radv_device *device,
pthread_mutex_lock(&signal_sems[i]->timeline.mutex);
struct radv_timeline_point *point =
radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]);
if (point) {
signal_sems[i]->timeline.highest_submitted =
MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
point->wait_count--;
}
point->wait_count -= 2;
radv_timeline_trigger_waiters_locked(&signal_sems[i]->timeline, processing_list);
pthread_mutex_unlock(&signal_sems[i]->timeline.mutex);
}
@@ -5458,8 +5456,6 @@ radv_timeline_wait_locked(struct radv_device *device,
if (!point)
return VK_SUCCESS;
point->wait_count++;
pthread_mutex_unlock(&timeline->mutex);
bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout);
......
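To make the radv_max_descriptor_set_size() arithmetic above concrete: with the radv limits assumed below (the real values live in radv_private.h and may differ), the 2 GiB budget divided by the worst-case per-descriptor footprint works out to roughly 9.6 million descriptors per pipeline layout. A throwaway sketch of the same formula:

#include <stdio.h>

/* assumed values for illustration only -- see radv_private.h for the real ones */
#define MAX_DYNAMIC_BUFFERS             24
#define MAX_INLINE_UNIFORM_BLOCK_SIZE   4096
#define MAX_INLINE_UNIFORM_BLOCK_COUNT  16

int main(void)
{
    unsigned long long budget = (1ull << 31)
        - 16ull * MAX_DYNAMIC_BUFFERS
        - (unsigned long long)MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT;
    /* worst-case bytes per descriptor: uniform buffer + storage buffer +
     * sampler + sampled image + storage image, as in the divisor above */
    unsigned per_descriptor = 32 + 32 + 32 + 64 + 64;
    printf("max_descriptor_set_size = %llu\n", budget / per_descriptor);
    return 0;
}

The helper itself only exists so that radv_GetPhysicalDeviceProperties and radv_GetPhysicalDeviceProperties2 stop duplicating that expression.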
@@ -1101,15 +1101,32 @@ radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
int ps_iter_samples = 1;
uint32_t mask = 0xffff;
if (vkms)
if (vkms) {
ms->num_samples = vkms->rasterizationSamples;
else
ms->num_samples = 1;
if (vkms)
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
if (vkms && !vkms->sampleShadingEnable && pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
/* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
*
* "Sample shading is enabled for a graphics pipeline:
*
* - If the interface of the fragment shader entry point of the
* graphics pipeline includes an input variable decorated
* with SampleId or SamplePosition. In this case
* minSampleShadingFactor takes the value 1.0.
* - Else if the sampleShadingEnable member of the
* VkPipelineMultisampleStateCreateInfo structure specified
* when creating the graphics pipeline is set to VK_TRUE. In
* this case minSampleShadingFactor takes the value of
* VkPipelineMultisampleStateCreateInfo::minSampleShading.
*
* Otherwise, sample shading is considered disabled."
*/
if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.force_persample) {
ps_iter_samples = ms->num_samples;
} else {
ps_iter_samples = radv_pipeline_get_ps_iter_samples(vkms);
}
} else {
ms->num_samples = 1;
}
const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
......
@@ -151,6 +151,15 @@ set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
((wrmask >> (i * 4)) & 0xf) << comp;
}
static void
set_writes_memory(const nir_shader *nir, struct radv_shader_info *info)
{
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
}
static void
gather_intrinsic_store_deref_info(const nir_shader *nir,
const nir_intrinsic_instr *instr,
@@ -308,10 +317,7 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_image_deref_atomic_xor ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_exchange ||
instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) {
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
}
break;
}
@@ -326,17 +332,28 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_ssbo_atomic_comp_swap:
if (nir->info.stage == MESA_SHADER_FRAGMENT)
info->ps.writes_memory = true;
else if (nir->info.stage == MESA_SHADER_GEOMETRY)
info->gs.writes_memory = true;
set_writes_memory(nir, info);
break;
case nir_intrinsic_load_deref:
gather_intrinsic_load_deref_info(nir, instr, info);
break;
case nir_intrinsic_store_deref:
gather_intrinsic_store_deref_info(nir, instr, info);
/* fallthrough */
case nir_intrinsic_deref_atomic_add:
case nir_intrinsic_deref_atomic_imin:
case nir_intrinsic_deref_atomic_umin:
case nir_intrinsic_deref_atomic_imax:
case nir_intrinsic_deref_atomic_umax:
case nir_intrinsic_deref_atomic_and:
case nir_intrinsic_deref_atomic_or:
case nir_intrinsic_deref_atomic_xor:
case nir_intrinsic_deref_atomic_exchange:
case nir_intrinsic_deref_atomic_comp_swap: {
if (nir_src_as_deref(instr->src[0])->mode & (nir_var_mem_global | nir_var_mem_ssbo))
set_writes_memory(nir, info);
break;
}
default:
break;
}
......
@@ -1435,6 +1435,9 @@ builtin_variable_generator::add_varying(int slot, const glsl_type *type,
void
builtin_variable_generator::generate_varyings()
{
struct gl_shader_compiler_options *options =
&state->ctx->Const.ShaderCompilerOptions[state->stage];
/* gl_Position and gl_PointSize are not visible from fragment shaders. */
if (state->stage != MESA_SHADER_FRAGMENT) {
add_varying(VARYING_SLOT_POS, vec4_t, GLSL_PRECISION_HIGH, "gl_Position");
@@ -1526,6 +1529,9 @@ builtin_variable_generator::generate_varyings()
var->data.sample = fields[i].sample;
var->data.patch = fields[i].patch;
var->init_interface_type(per_vertex_out_type);
var->data.invariant = fields[i].location == VARYING_SLOT_POS &&
options->PositionAlwaysInvariant;
}
}
}
......
@@ -184,6 +184,9 @@ get_flat_type(const nir_shader *shader, nir_variable *old_vars[MAX_SLOTS][4],
if (num_vars <= 1)
return NULL;
if (slots == 1)
return glsl_vector_type(base, 4);
else
return glsl_array_type(glsl_vector_type(base, 4), slots, 0);
}
@@ -340,6 +343,9 @@ build_array_deref_of_new_var_flat(nir_shader *shader,
deref = nir_build_deref_array(b, deref, index);
}
if (!glsl_type_is_array(deref->type))
return deref;
bool vs_in = shader->info.stage == MESA_SHADER_VERTEX &&
new_var->data.mode == nir_var_shader_in;
return nir_build_deref_array(
......
@@ -38,6 +38,7 @@ DRI_CONF_SECTION_END
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER("false")
DRI_CONF_GLSL_ZERO_INIT("false")
DRI_CONF_VS_POSITION_ALWAYS_INVARIANT("false")
DRI_CONF_ALLOW_RGB10_CONFIGS("true")
DRI_CONF_ALLOW_FP16_CONFIGS("false")
DRI_CONF_SECTION_END
@@ -498,7 +498,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
struct etna_resource *rsc;
struct etna_resource_level *level;
struct pipe_resource *prsc;
struct pipe_resource *ptiled = NULL;
DBG("target=%d, format=%s, %ux%ux%u, array_size=%u, last_level=%u, "
"nr_samples=%u, usage=%u, bind=%x, flags=%x",
@@ -572,8 +571,6 @@ etna_resource_from_handle(struct pipe_screen *pscreen,
fail:
etna_resource_destroy(pscreen, prsc);
if (ptiled)
etna_resource_destroy(pscreen, ptiled);
return NULL;
}
......
@@ -1660,6 +1660,7 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
STATIC_ASSERT(IRIS_MEMZONE_SHADER_START == 0ull);
const uint64_t _4GB = 1ull << 32;
const uint64_t _2GB = 1ul << 31;
/* The STATE_BASE_ADDRESS size field can only hold 1 page shy of 4GB */
const uint64_t _4GB_minus_1 = _4GB - PAGE_SIZE;
@@ -1669,9 +1670,16 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd, bool bo_reuse)
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
IRIS_MEMZONE_SURFACE_START,
_4GB_minus_1 - IRIS_MAX_BINDERS * IRIS_BINDER_SIZE);
/* TODO: Why does limiting to 2GB help some state items on gen12?
* - CC Viewport Pointer
* - Blend State Pointer
* - Color Calc State Pointer
*/
const uint64_t dynamic_pool_size =
(devinfo->gen >= 12 ? _2GB : _4GB_minus_1) - IRIS_BORDER_COLOR_POOL_SIZE;
util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC],
IRIS_MEMZONE_DYNAMIC_START + IRIS_BORDER_COLOR_POOL_SIZE,
_4GB_minus_1 - IRIS_BORDER_COLOR_POOL_SIZE);
dynamic_pool_size);
/* Leave the last 4GB out of the high vma range, so that no state
* base address + size can overflow 48 bits.
......
@@ -933,6 +933,25 @@ panfrost_batch_submit(struct panfrost_batch *batch)
if (ret)
fprintf(stderr, "panfrost_batch_submit failed: %d\n", ret);
/* We must reset the damage info of our render targets here even
* though a damage reset normally happens when the DRI layer swaps
* buffers. That's because there can be implicit flushes the GL
* app is not aware of, and those might impact the damage region: if
* part of the damaged portion is drawn during those implicit flushes,
* you have to reload those areas before next draws are pushed, and
* since the driver can't easily know what's been modified by the draws
* it flushed, the easiest solution is to reload everything.
*/
for (unsigned i = 0; i < batch->key.nr_cbufs; i++) {
struct panfrost_resource *res;
if (!batch->key.cbufs[i])
continue;
res = pan_resource(batch->key.cbufs[i]->texture);
panfrost_resource_reset_damage(res);
}
out:
panfrost_freeze_batch(batch);
panfrost_free_batch(batch);
......
@@ -48,7 +48,7 @@
#include "pan_util.h"
#include "pan_tiling.h"
static void
void
panfrost_resource_reset_damage(struct panfrost_resource *pres)
{
/* We set the damage extent to the full resource size but keep the
......
@@ -135,6 +135,9 @@ void
panfrost_blit_wallpaper(struct panfrost_context *ctx,
struct pipe_box *box);
void
panfrost_resource_reset_damage(struct panfrost_resource *pres);
void
panfrost_resource_set_damage_region(struct pipe_screen *screen,
struct pipe_resource *res,
......