José Fonseca · Dylan Baker · Lina Versace · Lina Versace · Samuel Pitoiset · Dylan Baker
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -52,6 +52,7 @@ LOCAL_CFLAGS += \
 	-DHAVE___BUILTIN_EXPECT \
 	-DHAVE___BUILTIN_FFS \
 	-DHAVE___BUILTIN_FFSLL \
+	-DHAVE_DLFCN_H \
 	-DHAVE_FUNC_ATTRIBUTE_FLATTEN \
 	-DHAVE_FUNC_ATTRIBUTE_UNUSED \
 	-DHAVE_FUNC_ATTRIBUTE_FORMAT \

--- a/VERSION
+++ b/VERSION
-18.1.4
+18.1.5
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -30,3 +30,18 @@ cac7ab1192eefdd8d8b3f25053fb006b5c330eb8
 # stable branch
 #
 a2f5292c82ad07731d633b36a663e46adc181db9
+
+# This patch required manual backport, which was provided as
+# 3953467ee7851792c1d4b1c9435499545a7da9fc
+#
+4a67ce886a7b3def5f66c1aedf9e5436d157a03c
+
+# This patch required manual backport, which was provided as
+# 31677c5aa867e457cd06ae25150be2155e8da3c6
+#
+1f616a840eac02241c585d28e9dac8f19a297f39
+
+# Jason de-nominated this because it "a) shouldn't be needed and b) is horribly
+# broken"
+#
+11712b9ca17e4e1a819dcb7d020e19c6da77bc90
--- a/configure.ac
+++ b/configure.ac
@@ -880,6 +880,7 @@ AC_HEADER_MAJOR
 AC_CHECK_HEADER([xlocale.h], [DEFINES="$DEFINES -DHAVE_XLOCALE_H"])
 AC_CHECK_HEADER([sys/sysctl.h], [DEFINES="$DEFINES -DHAVE_SYS_SYSCTL_H"])
 AC_CHECK_HEADERS([endian.h])
+AC_CHECK_HEADER([dlfcn.h], [DEFINES="$DEFINES -DHAVE_DLFCN_H"])
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
 AC_CHECK_FUNC([mkostemp], [DEFINES="$DEFINES -DHAVE_MKOSTEMP"])
 AC_CHECK_FUNC([timespec_get], [DEFINES="$DEFINES -DHAVE_TIMESPEC_GET"])

--- a/debian/changelog
+++ b/debian/changelog
+mesa (18.1.5-1) unstable; urgency=medium
+
+  * New upstream release.
+
+ -- Timo Aaltonen <tjaalton@debian.org>  Mon, 30 Jul 2018 14:30:06 +0300
+
 mesa (18.1.4-1) unstable; urgency=medium

  [ Emilio Pozuelo Monfort ]

--- a/docs/relnotes/18.1.4.html
+++ b/docs/relnotes/18.1.4.html
@@ -31,8 +31,8 @@ Compatibility contexts may report a lower version depending on each driver.

 <h2>SHA256 checksums</h2>
 <pre>
-TBD
-TBD
+SHA256: 8acd42e4ac4d1e96ed22344073b3d4fef03d10f225f4eaf3f88c001dfc10e2db  mesa-18.1.4.tar.gz
+SHA256: 3061488b5d85504092cf4343816cfb2d96f2ad9bc2edec31fc96933d184cf58b  mesa-18.1.4.tar.xz
 </pre>



--- a/docs/relnotes/18.1.5.html
+++ b/docs/relnotes/18.1.5.html
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.1.5 Release Notes / July 27 2018</h1>
+
+<p>
+Mesa 18.1.5 is a bug fix release which fixes bugs found since the 18.1.4 release.
+</p>
+<p>
+Mesa 18.1.5 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+TBD
+</pre>
+
+
+<h2>New features</h2>
+
+<p>None</p>
+
+<h2>Bug fixes</h2>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103274">Bug 103274</a> - BRW allocates too much heap memory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107275">Bug 107275</a> - NIR segfaults after spirv-opt</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107295">Bug 107295</a> - Access violation on glDrawArrays with count &gt;= 2048</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107312">Bug 107312</a> - Mesa-git RPM build fails after commit 8cacf38f527d42e41441ef8c25d95d4b2f4e8602</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107366">Bug 107366</a> - NIR verification crashes on piglit tests</li>
+
+</ul>
+
+<h2>Changes</h2>
+<p>Alex Smith (1):</p>
+<ul>
+  <li>anv: Pay attention to VK_ACCESS_MEMORY_(READ|WRITE)_BIT</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (7):</p>
+<ul>
+  <li>radv: Select correct entries for binning.</li>
+  <li>radv: Fix number of samples used for binning.</li>
+  <li>radv: Disable disabled color buffers in rbplus opts.</li>
+  <li>nir: Do not use continue block after removing it.</li>
+  <li>util/disk_cache: Fix disk_cache_get_function_timestamp with disabled cache.</li>
+  <li>nir: Fix end of function without return warning/error.</li>
+  <li>radv: Still enable inmemory &amp; API level caching if disk cache is not enabled.</li>
+</ul>
+
+<p>Chad Versace (2):</p>
+<ul>
+  <li>anv/android: Fix type error in call to vk_errorf()</li>
+  <li>anv/android: Fix Autotools build for VK_ANDROID_native_buffer</li>
+</ul>
+
+<p>Chih-Wei Huang (1):</p>
+<ul>
+  <li>Android: fix a missing nir_intrinsics.h error</li>
+</ul>
+
+<p>Danylo Piliaiev (1):</p>
+<ul>
+  <li>i965: Sweep NIR after linking phase to free held memory</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>r600: enable tess_input_info for TES</li>
+</ul>
+
+<p>Dylan Baker (5):</p>
+<ul>
+  <li>docs: Add sha256 sums for 18.1.4 tarballs</li>
+  <li>cherry-ignore: add 4a67ce886a7b3def5f66c1aedf9e5436d157a03c</li>
+  <li>cherry-ignore: Add 1f616a840eac02241c585d28e9dac8f19a297f39</li>
+  <li>cherry-ignore: add 11712b9ca17e4e1a819dcb7d020e19c6da77bc90</li>
+  <li>bump version to 18.1.5</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>vc4: Don't automatically reallocate a PERSISTENT-mapped buffer.</li>
+  <li>meson: Move xvmc test tools from unit tests to installed tools.</li>
+</ul>
+
+<p>Harish Krupo (1):</p>
+<ul>
+  <li>egl: Fix missing clamping in eglSetDamageRegionKHR</li>
+</ul>
+
+<p>Jan Vesely (3):</p>
+<ul>
+  <li>radeonsi: Refuse to accept code with unhandled relocations</li>
+  <li>clover: Report error when pipe driver fails to create compute state</li>
+  <li>clover: Catch errors from executing event action</li>
+</ul>
+
+<p>Jason Ekstrand (6):</p>
+<ul>
+  <li>anv: Stop setting 3DSTATE_PS_EXTRA::PixelShaderHasUAV</li>
+  <li>nir/serialize: Alloc constants off the variable</li>
+  <li>blorp: Handle the RGB workaround more like other workarounds</li>
+  <li>intel/blorp: Handle 3-component formats in clears</li>
+  <li>intel/compiler: Account for built-in uniforms in analyze_ubo_ranges</li>
+  <li>spirv: Fix a couple of image atomic load/store bugs</li>
+</ul>
+
+<p>José Fonseca (1):</p>
+<ul>
+  <li>gallium/tests: Don't ignore S3TC errors.</li>
+</ul>
+
+<p>Karol Herbst (1):</p>
+<ul>
+  <li>nir: fix printing of vec16 type</li>
+</ul>
+
+<p>Lepton Wu (1):</p>
+<ul>
+  <li>virgl: Fix flush in virgl_encoder_inline_write.</li>
+</ul>
+
+<p>Lucas Stach (1):</p>
+<ul>
+  <li>st/mesa: call resource_changed when binding a EGLImage to a texture</li>
+</ul>
+
+<p>Mauro Rossi (2):</p>
+<ul>
+  <li>radv: winsys/amdgpu: include missing pthread.h header</li>
+  <li>android: util/disk_cache: fix building errors in gallium drivers</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>gallium: Check pipe_screen::resource_changed before dereferencing it</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>draw: force draw pipeline if there's more than 65535 vertices</li>
+</ul>
+
+<p>Samuel Iglesias Gonsálvez (1):</p>
+<ul>
+  <li>anv: fix assert in anv_CmdBindDescriptorSets()</li>
+</ul>
+
+<p>Samuel Pitoiset (3):</p>
+<ul>
+  <li>radv: make sure to wait for CP DMA when needed</li>
+  <li>radv: emit a dummy ZPASS_DONE to prevent GPU hangs on GFX9</li>
+  <li>radv: fix a memleak for merged shaders on GFX9</li>
+</ul>
+
+</div>
+</body>
+</html>
--- a/meson.build
+++ b/meson.build
@@ -54,7 +54,7 @@ with_osmesa = get_option('osmesa')
 with_swr_arches = get_option('swr-arches').split(',')
 with_tools = get_option('tools').split(',')
 if with_tools.contains('all')
-  with_tools = ['freedreno', 'glsl', 'intel', 'nir', 'nouveau']
+  with_tools = ['freedreno', 'glsl', 'intel', 'nir', 'nouveau', 'xvmc']
 endif
 if get_option('texture-float')
  pre_args += '-DTEXTURE_FLOAT_ENABLED'
@@ -928,7 +928,7 @@ elif cc.has_header_symbol('sys/mkdev.h', 'major')
  pre_args += '-DMAJOR_IN_MKDEV'
 endif

-foreach h : ['xlocale.h', 'sys/sysctl.h', 'linux/futex.h', 'endian.h']
+foreach h : ['xlocale.h', 'sys/sysctl.h', 'linux/futex.h', 'endian.h', 'dlfcn.h']
  if cc.compiles('#include <@0@>'.format(h), name : '@0@'.format(h))
    pre_args += '-DHAVE_@0@'.format(h.to_upper().underscorify())
  endif

--- a/meson_options.txt
+++ b/meson_options.txt
@@ -284,5 +284,5 @@ option(
  'tools',
  type : 'string',
  value : '',
-  description : 'Comma delimited list of tools to build. choices : freedreno,glsl,intel,nir,nouveau or all'
+  description : 'Comma delimited list of tools to build. choices : freedreno,glsl,intel,nir,nouveau,xvmc or all'
 )
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -319,11 +319,21 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 	}

 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
+		unsigned eop_bug_offset;
 		void *fence_ptr;
+
 		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
 					     &cmd_buffer->gfx9_fence_offset,
 					     &fence_ptr);
 		cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
+
+		/* Allocate a buffer for the EOP bug on GFX9. */
+		radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 0,
+					     &eop_bug_offset, &fence_ptr);
+		cmd_buffer->gfx9_eop_bug_va =
+			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+		cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
 	}

 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
@@ -473,7 +483,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
 				       cmd_buffer->device->physical_device->rad_info.chip_class,
 				       ptr, va,
 				       radv_cmd_buffer_uses_mec(cmd_buffer),
-				       flags);
+				       flags, cmd_buffer->gfx9_eop_bug_va);
 	}

 	if (unlikely(cmd_buffer->device->trace_bo))
@@ -681,8 +691,11 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
 	unsigned sx_blend_opt_control = 0;

 	for (unsigned i = 0; i < subpass->color_count; ++i) {
-		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
+		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
+			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
+			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
 			continue;
+		}

 		int idx = subpass->color_attachments[i].attachment;
 		struct radv_color_buffer_info *cb = &framebuffer->attachments[idx].cb;
@@ -796,6 +809,10 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
 		}
 	}

+	for (unsigned i = subpass->color_count; i < 8; ++i) {
+		sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
+		sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
+	}
 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
 	radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
 	radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
@@ -2500,6 +2517,11 @@ VkResult radv_EndCommandBuffer(
 		si_emit_cache_flush(cmd_buffer);
 	}

+	/* Make sure CP DMA is idle at the end of IBs because the kernel
+	 * doesn't wait for it.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
@@ -4054,6 +4076,11 @@ void radv_CmdPipelineBarrier(
 					     0);
 	}

+	/* Make sure CP DMA is idle because the driver might have performed a
+	 * DMA operation for copying or filling buffers/images.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	cmd_buffer->state.flush_bits |= dst_flush_bits;
 }

@@ -4070,6 +4097,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,

 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18);

+	/* Make sure CP DMA is idle because the driver might have performed a
+	 * DMA operation for copying or filling buffers/images.
+	 */
+	si_cp_dma_wait_for_idle(cmd_buffer);
+
 	/* TODO: this is overkill. Probably should figure something out from
 	 * the stage mask. */

@@ -4078,7 +4110,8 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
 				   cmd_buffer->device->physical_device->rad_info.chip_class,
 				   radv_cmd_buffer_uses_mec(cmd_buffer),
 				   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-				   1, va, 2, value);
+				   1, va, 2, value,
+				   cmd_buffer->gfx9_eop_bug_va);

 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }

--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2181,7 +2181,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			                       RADV_CMD_FLAG_INV_ICACHE |
 			                       RADV_CMD_FLAG_INV_SMEM_L1 |
 			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2);
+			                       RADV_CMD_FLAG_INV_GLOBAL_L2, 0);
 		} else if (i == 1) {
 			si_cs_emit_cache_flush(cs,
 			                       queue->device->physical_device->rad_info.chip_class,
@@ -2191,7 +2191,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
 			                       RADV_CMD_FLAG_INV_ICACHE |
 			                       RADV_CMD_FLAG_INV_SMEM_L1 |
 			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2);
+			                       RADV_CMD_FLAG_INV_GLOBAL_L2, 0);
 		}

 		if (!queue->device->ws->cs_finalize(cs))

--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2154,7 +2154,7 @@ void radv_create_shaders(struct radv_pipeline *pipeline,

 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
 		free(codes[i]);
-		if (modules[i]) {
+		if (nir[i]) {
 			if (!pipeline->device->keep_shader_info)
 				ralloc_free(nir[i]);

@@ -2437,7 +2437,7 @@ radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCr
 	                       pipeline->device->physical_device->rad_info.max_se);
 	unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);

-	unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_mode_cntl_1);
+	unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
 	unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
 	unsigned effective_samples = total_samples;
 	unsigned color_bytes_per_pixel = 0;
@@ -2462,7 +2462,7 @@ radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCr
 	}

 	const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
-	while(color_entry->bpp <= color_bytes_per_pixel)
+	while(color_entry[1].bpp <= color_bytes_per_pixel)
 		++color_entry;

 	extent = color_entry->extent;
@@ -2476,7 +2476,7 @@ radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCr
 		unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;

 		const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
-		while(ds_entry->bpp <= ds_bytes_per_pixel)
+		while(ds_entry[1].bpp <= ds_bytes_per_pixel)
 			++ds_entry;

 		extent.width = MIN2(extent.width, ds_entry->extent.width);

--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -248,7 +248,6 @@ radv_is_cache_disabled(struct radv_device *device)
 	 * MESA_GLSL_CACHE_DISABLE=1, and when VK_AMD_shader_info is requested.
 	 */
 	return (device->instance->debug_flags & RADV_DEBUG_NO_CACHE) ||
-	       !device->physical_device->disk_cache ||
 	       device->keep_shader_info;
 }

@@ -271,7 +270,7 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
 		/* Don't cache when we want debug info, since this isn't
 		 * present in the cache.
 		 */
-		if (radv_is_cache_disabled(device)) {
+		if (radv_is_cache_disabled(device) || !device->physical_device->disk_cache) {
 			pthread_mutex_unlock(&cache->mutex);
 			return false;
 		}

--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -972,6 +972,9 @@ struct radv_cmd_state {
 	uint32_t last_num_instances;
 	uint32_t last_first_instance;
 	uint32_t last_vertex_offset;
+
+	/* True while CP DMA may still be busy, i.e. no sync since the last DMA. */
+	bool dma_is_busy;
 };

 struct radv_cmd_pool {
@@ -1034,6 +1037,7 @@ struct radv_cmd_buffer {
 	uint32_t gfx9_fence_offset;
 	struct radeon_winsys_bo *gfx9_fence_bo;
 	uint32_t gfx9_fence_idx;
+	uint64_t gfx9_eop_bug_va;

 	/**
 	 * Whether a query pool has been resetted and we have to flush caches.
@@ -1066,7 +1070,8 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
 				unsigned data_sel,
 				uint64_t va,
 				uint32_t old_fence,
-				uint32_t new_fence);
+				uint32_t new_fence,
+				uint64_t gfx9_eop_bug_va);

 void si_emit_wait_fence(struct radeon_winsys_cs *cs,
 			bool predicated,
@@ -1076,7 +1081,8 @@ void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 			    enum chip_class chip_class,
 			    uint32_t *fence_ptr, uint64_t va,
 			    bool is_mec,
-			    enum radv_cmd_flush_bits flush_bits);
+			    enum radv_cmd_flush_bits flush_bits,
+			    uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
@@ -1086,6 +1092,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                        unsigned size);
 void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 			    uint64_t size, unsigned value);
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
+
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
 bool
 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,

--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -1169,7 +1169,8 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
 					   cmd_buffer->device->physical_device->rad_info.chip_class,
 					   radv_cmd_buffer_uses_mec(cmd_buffer),
 					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-					   1, avail_va, 0, 1);
+					   1, avail_va, 0, 1,
+					   cmd_buffer->gfx9_eop_bug_va);
 		break;
 	default:
 		unreachable("ending unhandled query type");
@@ -1292,13 +1293,15 @@ void radv_CmdWriteTimestamp(
 						   cmd_buffer->device->physical_device->rad_info.chip_class,
 						   mec,
 						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-						   3, query_va, 0, 0);
+						   3, query_va, 0, 0,
+						   cmd_buffer->gfx9_eop_bug_va);
 			si_cs_emit_write_event_eop(cs,
 						   false,
 						   cmd_buffer->device->physical_device->rad_info.chip_class,
 						   mec,
 						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-						   1, avail_va, 0, 1);
+						   1, avail_va, 0, 1,
+						   cmd_buffer->gfx9_eop_bug_va);
 			break;
 		}
 		query_va += pool->stride;

--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -852,7 +852,8 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
 				unsigned data_sel,
 				uint64_t va,
 				uint32_t old_fence,
-				uint32_t new_fence)
+				uint32_t new_fence,
+				uint64_t gfx9_eop_bug_va)
 {
 	unsigned op = EVENT_TYPE(event) |
 		EVENT_INDEX(5) |
@@ -860,6 +861,17 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
 	unsigned is_gfx8_mec = is_mec && chip_class < GFX9;

 	if (chip_class >= GFX9 || is_gfx8_mec) {
+		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+		 * counters) must immediately precede every timestamp event to
+		 * prevent a GPU hang on GFX9.
+		 */
+		if (chip_class == GFX9) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+			radeon_emit(cs, gfx9_eop_bug_va);
+			radeon_emit(cs, gfx9_eop_bug_va >> 32);
+		}
+
 		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, predicated));
 		radeon_emit(cs, op);
 		radeon_emit(cs, EOP_DATA_SEL(data_sel));
@@ -941,7 +953,8 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 		       uint32_t *flush_cnt,
 		       uint64_t flush_va,
                       bool is_mec,
-                       enum radv_cmd_flush_bits flush_bits)
+                       enum radv_cmd_flush_bits flush_bits,
+		       uint64_t gfx9_eop_bug_va)
 {
 	unsigned cp_coher_cntl = 0;
 	uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
@@ -971,7 +984,8 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 							   chip_class,
 							   is_mec,
 							   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-							   0, 0, 0, 0, 0);
+							   0, 0, 0, 0, 0,
+							   gfx9_eop_bug_va);
 			}
 		}
 		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
@@ -1057,7 +1071,8 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
 		uint32_t old_fence = (*flush_cnt)++;

 		si_cs_emit_write_event_eop(cs, false, chip_class, false, cb_db_event, tc_flags, 1,
-					   flush_va, old_fence, *flush_cnt);
+					   flush_va, old_fence, *flush_cnt,
+					   gfx9_eop_bug_va);
 		si_emit_wait_fence(cs, false, flush_va, *flush_cnt, 0xffffffff);
 	}

@@ -1149,7 +1164,8 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 	                       cmd_buffer->device->physical_device->rad_info.chip_class,
 			       ptr, va,
 	                       radv_cmd_buffer_uses_mec(cmd_buffer),
-	                       cmd_buffer->state.flush_bits);
+	                       cmd_buffer->state.flush_bits,
+			       cmd_buffer->gfx9_eop_bug_va);


 	if (unlikely(cmd_buffer->device->trace_bo))
@@ -1214,7 +1230,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
 	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint32_t header = 0, command = 0;

-	assert(size);
 	assert(size <= cp_dma_max_byte_count(cmd_buffer));

 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
@@ -1273,11 +1288,16 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
 	 * indices. If we wanted to execute CP DMA in PFP, this packet
 	 * should precede it.
 	 */
-	if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+	if (flags & CP_DMA_SYNC) {
+		if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
 			radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
 			radeon_emit(cs, 0);
 		}

+		/* CP will see the sync flag and wait for all DMAs to complete. */
+		cmd_buffer->state.dma_is_busy = false;
+	}
+
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_cmd_buffer_trace_emit(cmd_buffer);
 }
@@ -1339,6 +1359,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
 	uint64_t main_src_va, main_dest_va;
 	uint64_t skipped_size = 0, realign_size = 0;

+	/* Assume that we are not going to sync after the last DMA operation. */
+	cmd_buffer->state.dma_is_busy = true;

 	if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
 	    cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
@@ -1402,6 +1424,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,

 	assert(va % 4 == 0 && size % 4 == 0);

+	/* Assume that we are not going to sync after the last DMA operation. */
+	cmd_buffer->state.dma_is_busy = true;
+
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
 		unsigned dma_flags = CP_DMA_CLEAR;
@@ -1417,6 +1442,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 	}
 }

+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+	if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+		return;
+
+	if (!cmd_buffer->state.dma_is_busy)
+		return;
+
+	/* Issue a dummy DMA that copies zero bytes.
+	 *
+	 * The DMA engine will see that there's no work to do and skip this
+	 * DMA request, however, the CP will see the sync flag and still wait
+	 * for all DMAs to complete.
+	 */
+	si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+	cmd_buffer->state.dma_is_busy = false;
+}
+
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
 	(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |		   \

--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
@@ -33,6 +33,7 @@
 #include "addrlib/addrinterface.h"
 #include <amdgpu.h>
 #include "util/list.h"
+#include <pthread.h>

 struct radv_amdgpu_winsys {
 	struct radeon_winsys base;

--- a/src/compiler/Android.nir.gen.mk
+++ b/src/compiler/Android.nir.gen.mk
@@ -46,6 +46,7 @@ LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \
 # Modules using libmesa_nir must set LOCAL_GENERATED_SOURCES to this
 MESA_GEN_NIR_H := $(addprefix $(call local-generated-sources-dir)/, \
 	nir/nir_opcodes.h \
+	nir/nir_intrinsics.h \
 	nir/nir_builder_opcodes.h)

 nir_builder_opcodes_gen := $(LOCAL_PATH)/nir/nir_builder_opcodes_h.py

--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -24,6 +24,28 @@
 #include "nir.h"
 #include "nir_control_flow.h"

+/**
+ * Gets the single block that jumps back to the loop header. Already assumes
+ * there is exactly one such block.
+ */
+static nir_block*
+find_continue_block(nir_loop *loop)
+{
+   nir_block *header_block = nir_loop_first_block(loop);
+   nir_block *prev_block =
+      nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node));
+
+   assert(header_block->predecessors->entries == 2);
+
+   struct set_entry *pred_entry;
+   set_foreach(header_block->predecessors, pred_entry) {
+      if (pred_entry->key != prev_block)
+         return (nir_block*)pred_entry->key;
+   }
+
+   unreachable("Continue block not found!");
+}
+
 /**
 * This optimization detects if statements at the tops of loops where the
 * condition is a phi node of two constants and moves half of the if to above
@@ -95,12 +117,7 @@ opt_peel_loop_initial_if(nir_loop *loop)
   if (header_block->predecessors->entries != 2)
      return false;

-   nir_block *continue_block = NULL;
-   struct set_entry *pred_entry;
-   set_foreach(header_block->predecessors, pred_entry) {
-      if (pred_entry->key != prev_block)
-         continue_block = (void *)pred_entry->key;
-   }
+   nir_block *continue_block = find_continue_block(loop);

   nir_cf_node *if_node = nir_cf_node_next(&header_block->cf_node);
   if (!if_node || if_node->type != nir_cf_node_if)
@@ -191,6 +208,10 @@ opt_peel_loop_initial_if(nir_loop *loop)
   nir_cf_reinsert(&tmp, nir_before_cf_node(&loop->cf_node));

   nir_cf_reinsert(&header, nir_after_block_before_jump(continue_block));
+
+   /* Get continue block again as the previous reinsert might have removed the block. */
+   continue_block = find_continue_block(loop);
+
   nir_cf_extract(&tmp, nir_before_cf_list(continue_list),
                        nir_after_cf_list(continue_list));
   nir_cf_reinsert(&tmp, nir_after_block_before_jump(continue_block));

--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -87,6 +87,7 @@ print_register(nir_register *reg, print_state *state)

 static const char *sizes[] = { "error", "vec1", "vec2", "vec3", "vec4",
                               "error", "error", "error", "vec8",
+                               "error", "error", "error", "error",
                               "error", "error", "error", "vec16"};

 static void