Commits on Source (23)
......@@ -70,7 +70,14 @@ def main():
name, ext = os.path.splitext(name)
finally:
os.chdir(ret)
# Remove meson-created master .so and symlinks
os.unlink(master)
name, ext = os.path.splitext(master)
while ext != '.so':
if os.path.lexists(name):
os.unlink(name)
name, ext = os.path.splitext(name)
if __name__ == '__main__':
......
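For reference, the loop added above walks the symlink chain by repeatedly splitting off the trailing version extension until the bare '.so' name is reached, unlinking each intermediate name. A minimal, standalone sketch of that cleanup logic in Python (the example path is hypothetical; this is not the full install_megadrivers.py flow):

import os

def remove_meson_links(master):
    # Remove the meson-created master .so plus every shorter symlink name,
    # e.g. 'libfoo_dri.so.0.0.0' -> 'libfoo_dri.so.0.0' -> 'libfoo_dri.so.0'
    # -> 'libfoo_dri.so', stopping once splitext() yields the '.so' extension.
    if os.path.lexists(master):
        os.unlink(master)
    name, ext = os.path.splitext(master)
    while ext != '.so':
        if os.path.lexists(name):
            os.unlink(name)
        name, ext = os.path.splitext(name)

# Hypothetical usage after meson has produced the megadriver in the build dir:
# remove_meson_links('/path/to/build/libfoo_dri.so.0.0.0')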
......@@ -31,7 +31,8 @@ Compatibility contexts may report a lower version depending on each driver.
<h2>SHA256 checksums</h2>
<pre>
TBD
SHA256: eb972fc11d4e1261d34ec0b91a701f158d4870c0428fb108353ae7eab64b1118 mesa-19.0.2.tar.gz
SHA256: 1a2edc3ce56906a676c91e6851298db45903df1f5cb9827395a922c1452db802 mesa-19.0.2.tar.xz
</pre>
......
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 19.0.3 Release Notes / April 24, 2019</h1>
<p>
Mesa 19.0.3 is a bug fix release which fixes bugs found since the 19.0.2 release.
</p>
<p>
Mesa 19.0.3 implements the OpenGL 4.5 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.5. OpenGL
4.5 is <strong>only</strong> available if requested at context creation.
Compatibility contexts may report a lower version depending on each driver.
</p>
<h2>SHA256 checksums</h2>
<pre>
TBD
</pre>
<h2>New features</h2>
<p>N/A</p>
<h2>Bug fixes</h2>
<ul>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108879">Bug 108879</a> - [CIK] [regression] All opencl apps hangs indefinitely in si_create_context</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=110201">Bug 110201</a> - [ivb] mesa 19.0.0 breaks rendering in kitty</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=110356">Bug 110356</a> - install_megadrivers.py creates new dangling symlink [bisected]</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=110441">Bug 110441</a> - [llvmpipe] complex-loop-analysis-bug regression</li>
</ul>
<h2>Changes</h2>
<p>Andres Gomez (1):</p>
<ul>
<li>glsl/linker: location aliasing requires types to have the same width</li>
</ul>
<p>Bas Nieuwenhuizen (1):</p>
<ul>
<li>ac: Move has_local_buffers disable to radeonsi.</li>
</ul>
<p>Chia-I Wu (1):</p>
<ul>
<li>virgl: fix fence fd version check</li>
</ul>
<p>Danylo Piliaiev (1):</p>
<ul>
<li>intel/compiler: Do not reswizzle dst if instruction writes to flag register</li>
</ul>
<p>Dylan Baker (2):</p>
<ul>
<li>docs: Add sha256 sums for 19.0.2</li>
<li>Bump version for 19.0.3</li>
</ul>
<p>Eric Anholt (1):</p>
<ul>
<li>nir: Fix deref offset calculation for structs.</li>
</ul>
<p>Eric Engestrom (1):</p>
<ul>
<li>meson: remove meson-created megadrivers symlinks</li>
</ul>
<p>Jason Ekstrand (2):</p>
<ul>
<li>anv/pipeline: Fix MEDIA_VFE_STATE::PerThreadScratchSpace on gen7</li>
<li>anv: Add a #define for the max binding table size</li>
</ul>
<p>Juan A. Suarez Romero (1):</p>
<ul>
<li>meson: Add dependency on genxml to anvil genfiles</li>
</ul>
<p>Kenneth Graunke (2):</p>
<ul>
<li>glsl: Set location on structure-split sampler uniform variables</li>
<li>Revert "glsl: Set location on structure-split sampler uniform variables"</li>
</ul>
<p>Lionel Landwerlin (2):</p>
<ul>
<li>anv: fix uninitialized pthread cond clock domain</li>
<li>intel/devinfo: fix missing num_thread_per_eu on ICL</li>
</ul>
<p>Lubomir Rintel (2):</p>
<ul>
<li>gallivm: guess CPU features also on ARM</li>
<li>gallivm: disable NEON instructions if they are not supported</li>
</ul>
<p>Marek Olšák (1):</p>
<ul>
<li>radeonsi: use CP DMA for the null const buffer clear on CIK</li>
</ul>
<p>Rhys Perry (1):</p>
<ul>
<li>nir,ac/nir: fix cube_face_coord</li>
</ul>
<p>Roland Scheidegger (1):</p>
<ul>
<li>gallivm: fix bogus assert in get_indirect_index</li>
</ul>
<p>Samuel Pitoiset (2):</p>
<ul>
<li>ac/nir: only use the new raw/struct image atomic intrinsics with LLVM 9+</li>
<li>radv: do not load vertex attributes that are not provided by the pipeline</li>
</ul>
</div>
</body>
</html>
......@@ -367,9 +367,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
info->has_ctx_priority = info->drm_minor >= 22;
/* TODO: Enable this once the kernel handles it efficiently. */
info->has_local_buffers = info->drm_minor >= 20 &&
!info->has_dedicated_vram;
info->has_local_buffers = info->drm_minor >= 20;
info->kernel_flushes_hdp_before_ib = true;
info->htile_cmask_support_1d_tiling = true;
info->si_TA_CS_BC_BASE_ADDR_allowed = true;
......
......@@ -1019,10 +1019,17 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
LLVMValueRef in[3];
for (unsigned chan = 0; chan < 3; chan++)
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema",
ctx->ac.f32, in, 3, AC_FUNC_ATTR_READNONE);
results[0] = ac_build_fdiv(&ctx->ac, results[0], ma);
results[1] = ac_build_fdiv(&ctx->ac, results[1], ma);
LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5);
results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, "");
results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, "");
result = ac_build_gather_values(&ctx->ac, results, 2);
break;
}
......@@ -2532,7 +2539,10 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
ctx->ac.i32_0, ""); /* vindex */
params[param_count++] = ctx->ac.i32_0; /* voffset */
if (HAVE_LLVM >= 0x800) {
if (HAVE_LLVM >= 0x900) {
/* XXX: The new raw/struct atomic intrinsics are buggy
* with LLVM 8, see r358579.
*/
params[param_count++] = ctx->ac.i32_0; /* soffset */
params[param_count++] = ctx->ac.i32_0; /* slc */
......
......@@ -2027,10 +2027,32 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
if (ctx->options->key.vs.vertex_attribute_provided & (1u << attrib_index)) {
input = ac_build_buffer_load_format(&ctx->ac, t_list,
buffer_index,
ctx->ac.i32_0,
num_channels, false, true);
} else {
/* Per the Vulkan spec, it's invalid to consume vertex
* attributes that are not provided by the pipeline but
* some (invalid) apps appear to do that. Fill the
input array with (e.g. (0, 0, 0, 1)) to work around
* the problem and to avoid possible GPU hangs.
*/
LLVMValueRef chan[4];
/* The input_usage mask might be 0 if input variables
* are not removed by the compiler.
*/
num_channels = CLAMP(num_channels, 1, 4);
for (unsigned i = 0; i < num_channels; i++) {
chan[i] = i == 3 ? ctx->ac.f32_1 : ctx->ac.f32_0;
chan[i] = ac_to_float(&ctx->ac, chan[i]);
}
input = ac_build_gather_values(&ctx->ac, chan, num_channels);
}
input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels);
......
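The comment in the hunk above captures the reasoning: inputs the shader reads but the pipeline never describes are replaced with a constant (0, 0, 0, 1) instead of being fetched from a missing binding. A small illustrative sketch of that default-fill behaviour; the helper name and the fetch callback are hypothetical stand-ins for the LLVM-building code (ac_build_buffer_load_format / ac_build_gather_values):

def vs_input_value(provided_mask, attrib_index, num_channels, fetch):
    # If the pipeline describes this attribute, load it normally.
    if provided_mask & (1 << attrib_index):
        return fetch(attrib_index, num_channels)
    # Otherwise fill with (0, 0, 0, 1); the usage mask may ask for 0 channels
    # when dead inputs were not removed, so clamp like the patch does.
    num_channels = max(1, min(num_channels, 4))
    return (0.0, 0.0, 0.0, 1.0)[:num_channels]

# Hypothetical usage: attributes 0 and 1 provided, attribute 2 missing.
print(vs_input_value(0b011, 2, 4, lambda i, n: (1.0,) * n))  # (0.0, 0.0, 0.0, 1.0)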
......@@ -1922,6 +1922,8 @@ radv_generate_graphics_pipeline_key(struct radv_pipeline *pipeline,
}
key.vertex_alpha_adjust |= adjust << (2 * location);
}
key.vertex_attribute_provided |= 1 << location;
}
if (pCreateInfo->pTessellationState)
......@@ -1950,6 +1952,7 @@ radv_fill_shader_keys(struct radv_shader_variant_key *keys,
{
keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs;
keys[MESA_SHADER_VERTEX].vs.alpha_adjust = key->vertex_alpha_adjust;
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_provided = key->vertex_attribute_provided;
for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i)
keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i];
......
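The key plumbing above records, per pipeline, which vertex attribute locations were actually described: a one-bit-per-location mask built at pipeline creation and consulted per attribute at shader compile time. A tiny sketch (locations chosen arbitrarily for illustration):

def build_provided_mask(locations):
    # Mirrors key.vertex_attribute_provided |= 1 << location
    mask = 0
    for location in locations:
        mask |= 1 << location
    return mask

mask = build_provided_mask([0, 1, 3])
assert mask & (1 << 1)        # attribute 1 is provided
assert not (mask & (1 << 2))  # attribute 2 is not, so the shader gets the default fill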
......@@ -365,6 +365,7 @@ struct radv_pipeline_cache {
struct radv_pipeline_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
uint32_t vertex_attribute_provided;
uint64_t vertex_alpha_adjust;
unsigned tess_input_vertices;
uint32_t col_format;
......
......@@ -66,6 +66,9 @@ struct radv_vs_variant_key {
uint32_t instance_rate_inputs;
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
/* Mask of vertex attributes that are provided by the pipeline. */
uint32_t vertex_attribute_provided;
/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
* so we may need to fix it up. */
uint64_t alpha_adjust;
......
......@@ -167,6 +167,14 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state,
} else {
var = nir_variable_create(state->shader, nir_var_uniform, type, name);
var->data.binding = binding;
/* Don't set var->data.location. The old structure location could be
* used to index into gl_uniform_storage, assuming the full structure
* was walked in order. With the new split variables, this invariant
* no longer holds and there's no meaningful way to start from a base
* location and access a particular array element. Just leave it 0.
*/
_mesa_hash_table_insert_pre_hashed(state->remap_table, hash, name, var);
}
......
......@@ -424,28 +424,14 @@ compute_variable_location_slot(ir_variable *var, gl_shader_stage stage)
struct explicit_location_info {
ir_variable *var;
unsigned numerical_type;
bool base_type_is_integer;
unsigned base_type_bit_size;
unsigned interpolation;
bool centroid;
bool sample;
bool patch;
};
static inline unsigned
get_numerical_type(const glsl_type *type)
{
/* From the OpenGL 4.6 spec, section 4.4.1 Input Layout Qualifiers, Page 68,
* (Location aliasing):
*
* "Further, when location aliasing, the aliases sharing the location
* must have the same underlying numerical type (floating-point or
* integer)
*/
if (type->is_float() || type->is_double())
return GLSL_TYPE_FLOAT;
return GLSL_TYPE_INT;
}
static bool
check_location_aliasing(struct explicit_location_info explicit_locations[][4],
ir_variable *var,
......@@ -461,14 +447,23 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
gl_shader_stage stage)
{
unsigned last_comp;
if (type->without_array()->is_record()) {
/* The component qualifier can't be used on structs so just treat
* all component slots as used.
unsigned base_type_bit_size;
const glsl_type *type_without_array = type->without_array();
const bool base_type_is_integer =
glsl_base_type_is_integer(type_without_array->base_type);
const bool is_struct = type_without_array->is_record();
if (is_struct) {
/* structs don't have a defined underlying base type so just treat all
* component slots as used and set the bit size to 0. If there is
* location aliasing, we'll fail anyway later.
*/
last_comp = 4;
base_type_bit_size = 0;
} else {
unsigned dmul = type->without_array()->is_64bit() ? 2 : 1;
last_comp = component + type->without_array()->vector_elements * dmul;
unsigned dmul = type_without_array->is_64bit() ? 2 : 1;
last_comp = component + type_without_array->vector_elements * dmul;
base_type_bit_size =
glsl_base_type_get_bit_size(type_without_array->base_type);
}
while (location < location_limit) {
......@@ -478,8 +473,22 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
&explicit_locations[location][comp];
if (info->var) {
/* Component aliasing is not allowed */
if (comp >= component && comp < last_comp) {
if (info->var->type->without_array()->is_record() || is_struct) {
/* Structs cannot share location since they are incompatible
* with any other underlying numerical type.
*/
linker_error(prog,
"%s shader has multiple %sputs sharing the "
"same location that don't have the same "
"underlying numerical type. Struct variable '%s', "
"location %u\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ? "in" : "out",
is_struct ? var->name : info->var->name,
location);
return false;
} else if (comp >= component && comp < last_comp) {
/* Component aliasing is not allowed */
linker_error(prog,
"%s shader has multiple %sputs explicitly "
"assigned to location %d and component %d\n",
......@@ -488,27 +497,52 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
location, comp);
return false;
} else {
/* For all other used components we need to have matching
* types, interpolation and auxiliary storage
/* From the OpenGL 4.60.5 spec, section 4.4.1 Input Layout
* Qualifiers, Page 67, (Location aliasing):
*
* " Further, when location aliasing, the aliases sharing the
* location must have the same underlying numerical type
* and bit width (floating-point or integer, 32-bit versus
* 64-bit, etc.) and the same auxiliary storage and
* interpolation qualification."
*/
/* If the underlying numerical type isn't integer, implicitly
* it will be float or else we would have failed by now.
*/
if (info->numerical_type !=
get_numerical_type(type->without_array())) {
if (info->base_type_is_integer != base_type_is_integer) {
linker_error(prog,
"Varyings sharing the same location must "
"have the same underlying numerical type. "
"Location %u component %u\n",
location, comp);
"%s shader has multiple %sputs sharing the "
"same location that don't have the same "
"underlying numerical type. Location %u "
"component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
"in" : "out", location, comp);
return false;
}
if (info->base_type_bit_size != base_type_bit_size) {
linker_error(prog,
"%s shader has multiple %sputs sharing the "
"same location that don't have the same "
"underlying numerical bit size. Location %u "
"component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
"in" : "out", location, comp);
return false;
}
if (info->interpolation != interpolation) {
linker_error(prog,
"%s shader has multiple %sputs at explicit "
"location %u with different interpolation "
"settings\n",
"%s shader has multiple %sputs sharing the "
"same location that don't have the same "
"interpolation qualification. Location %u "
"component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
"in" : "out", location);
"in" : "out", location, comp);
return false;
}
......@@ -516,17 +550,20 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4],
info->sample != sample ||
info->patch != patch) {
linker_error(prog,
"%s shader has multiple %sputs at explicit "
"location %u with different aux storage\n",
"%s shader has multiple %sputs sharing the "
"same location that don't have the same "
"auxiliary storage qualification. Location %u "
"component %u.\n",
_mesa_shader_stage_to_string(stage),
var->data.mode == ir_var_shader_in ?
"in" : "out", location);
"in" : "out", location, comp);
return false;
}
}
} else if (comp >= component && comp < last_comp) {
info->var = var;
info->numerical_type = get_numerical_type(type->without_array());
info->base_type_is_integer = base_type_is_integer;
info->base_type_bit_size = base_type_bit_size;
info->interpolation = interpolation;
info->centroid = centroid;
info->sample = sample;
......
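Condensing the rule the linker now enforces: two non-struct variables may alias a location only when they agree on underlying numerical type (integer vs. float) and on bit size; structs never alias anything. A simplified sketch follows (interpolation and auxiliary-storage checks omitted; the dict keys stand in for the ir_variable/glsl_type queries used above):

def may_share_location(a, b):
    # Structs have no single underlying numerical type, so they can never
    # share a location with anything.
    if a['is_struct'] or b['is_struct']:
        return False
    # Same integer-vs-float class and same bit width are both required.
    return (a['is_integer'] == b['is_integer'] and
            a['bit_size'] == b['bit_size'])

flt32 = {'is_struct': False, 'is_integer': False, 'bit_size': 32}
int32 = {'is_struct': False, 'is_integer': True,  'bit_size': 32}
flt64 = {'is_struct': False, 'is_integer': False, 'bit_size': 64}
print(may_share_location(flt32, int32))  # False: different numerical type
print(may_share_location(flt32, flt64))  # False: different bit size
print(may_share_location(flt32, flt32))  # True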
......@@ -31,6 +31,7 @@
#include "shader_enums.h"
#include "blob.h"
#include "c11/threads.h"
#include "util/macros.h"
#ifdef __cplusplus
#include "main/config.h"
......@@ -114,6 +115,42 @@ static inline bool glsl_base_type_is_integer(enum glsl_base_type type)
type == GLSL_TYPE_IMAGE;
}
static inline unsigned int
glsl_base_type_get_bit_size(const enum glsl_base_type base_type)
{
switch (base_type) {
case GLSL_TYPE_BOOL:
return 1;
case GLSL_TYPE_INT:
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT: /* TODO handle mediump */
case GLSL_TYPE_SUBROUTINE:
return 32;
case GLSL_TYPE_FLOAT16:
case GLSL_TYPE_UINT16:
case GLSL_TYPE_INT16:
return 16;
case GLSL_TYPE_UINT8:
case GLSL_TYPE_INT8:
return 8;
case GLSL_TYPE_DOUBLE:
case GLSL_TYPE_INT64:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_IMAGE:
case GLSL_TYPE_SAMPLER:
return 64;
default:
unreachable("unknown base type");
}
return 0;
}
enum glsl_sampler_dim {
GLSL_SAMPLER_DIM_1D = 0,
GLSL_SAMPLER_DIM_2D,
......
......@@ -215,7 +215,7 @@ nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref,
unsigned field_offset =
struct_type_get_field_offset(parent->type, size_align,
(*p)->strct.index);
nir_iadd(b, offset, nir_imm_int(b, field_offset));
offset = nir_iadd(b, offset, nir_imm_int(b, field_offset));
} else {
unreachable("Unsupported deref type");
}
......
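The one-line change above is the whole fix: NIR builder helpers such as nir_iadd return a new SSA value rather than modifying an operand, so discarding the return value silently drops the struct field offset. A toy Python illustration of the bug class (iadd here is a stand-in, not the NIR API):

def iadd(a, b):
    # Like nir_iadd: produces a new value, never mutates its inputs.
    return a + b

def offset_buggy(field_offsets):
    offset = 0
    for fo in field_offsets:
        iadd(offset, fo)            # result discarded -- this was the bug
    return offset

def offset_fixed(field_offsets):
    offset = 0
    for fo in field_offsets:
        offset = iadd(offset, fo)   # keep the returned value, as in the patch
    return offset

print(offset_buggy([8, 4]))  # 0 (wrong)
print(offset_fixed([8, 4]))  # 12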
......@@ -404,12 +404,21 @@ dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
......
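The constant-folding expression above fully specifies the corrected opcode: pick the major axis, take the face-specific (s, t) numerators, then divide by ma = 2 * (major component) and bias by 0.5. The same math as a runnable Python reference, transcribed directly from the definition above (assumes a non-zero direction vector):

def cube_face_coord(x, y, z):
    ax, ay, az = abs(x), abs(y), abs(z)
    ma = 0.0
    s = t = 0.0
    if ax >= ay and ax >= az: ma = 2 * x
    if ay >= ax and ay >= az: ma = 2 * y
    if az >= ax and az >= ay: ma = 2 * z
    if x >= 0 and ax >= ay and ax >= az: s, t = -z, -y
    if x <  0 and ax >= ay and ax >= az: s, t =  z, -y
    if y >= 0 and ay >= ax and ay >= az: s, t =  x,  z
    if y <  0 and ay >= ax and ay >= az: s, t =  x, -z
    if z >= 0 and az >= ax and az >= ay: s, t =  x, -y
    if z <  0 and az >= ax and az >= ay: s, t = -x, -y
    return s / ma + 0.5, t / ma + 0.5

# A direction straight down +Z hits the centre of the +Z face:
print(cube_face_coord(0.0, 0.0, 1.0))  # (0.5, 0.5)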
......@@ -97,37 +97,7 @@ unsigned glsl_atomic_size(const struct glsl_type *type);
static inline unsigned
glsl_get_bit_size(const struct glsl_type *type)
{
switch (glsl_get_base_type(type)) {
case GLSL_TYPE_BOOL:
return 1;
case GLSL_TYPE_INT:
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT: /* TODO handle mediump */
case GLSL_TYPE_SUBROUTINE:
return 32;
case GLSL_TYPE_FLOAT16:
case GLSL_TYPE_UINT16:
case GLSL_TYPE_INT16:
return 16;
case GLSL_TYPE_UINT8:
case GLSL_TYPE_INT8:
return 8;
case GLSL_TYPE_DOUBLE:
case GLSL_TYPE_INT64:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_IMAGE:
case GLSL_TYPE_SAMPLER:
return 64;
default:
unreachable("unknown base type");
}
return 0;
return glsl_base_type_get_bit_size(glsl_get_base_type(type));
}
bool glsl_type_is_16bit(const struct glsl_type *type);
......
......@@ -556,11 +556,11 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
llvm::SmallVector<std::string, 16> MAttrs;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
#if HAVE_LLVM >= 0x0400
/* llvm-3.7+ implements sys::getHostCPUFeatures for x86,
* which allows us to enable/disable code generation based
* on the results of cpuid.
#if HAVE_LLVM >= 0x0400 && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_ARM))
/* llvm-3.3+ implements sys::getHostCPUFeatures for Arm
* and llvm-3.7+ for x86, which allows us to enable/disable
* code generation based on the results of cpuid on these
* architectures.
*/
llvm::StringMap<bool> features;
llvm::sys::getHostCPUFeatures(features);
......@@ -570,7 +570,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
++f) {
MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str());
}
#else
#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/*
* We need to unset attributes because sometimes LLVM mistakenly assumes
* certain features are present given the processor name.
......@@ -625,6 +625,12 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
MAttrs.push_back("-avx512vl");
#endif
#endif
#if defined(PIPE_ARCH_ARM)
if (!util_cpu_caps.has_neon) {
MAttrs.push_back("-neon");
MAttrs.push_back("-crypto");
MAttrs.push_back("-vfp2");
}
#endif
#if defined(PIPE_ARCH_PPC)
......
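The gallivm change above does two things: it now also feeds the host's detected CPU features to LLVM on Arm (as "+feature" / "-feature" strings), and it force-disables neon, crypto and vfp2 when util_cpu_caps reports no NEON support. A data-only sketch of how such an attribute list is assembled (feature names are illustrative; the real code uses llvm::sys::getHostCPUFeatures):

def build_mattrs(host_features, is_arm, has_neon):
    # Turn each detected feature into an LLVM machine attribute string.
    mattrs = [('+' if present else '-') + name
              for name, present in host_features.items()]
    if is_arm and not has_neon:
        # Mirror the new PIPE_ARCH_ARM block: never let LLVM emit NEON
        # (or NEON-dependent) instructions on such cores.
        mattrs += ['-neon', '-crypto', '-vfp2']
    return mattrs

print(build_mattrs({'vfp3': True, 'thumb2': True}, is_arm=True, has_neon=False))
# ['+vfp3', '+thumb2', '-neon', '-crypto', '-vfp2']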
......@@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
* larger than the declared size but smaller than the buffer size.
*/
if (reg_file != TGSI_FILE_CONSTANT) {
assert(index_limit > 0);
assert(index_limit >= 0);
max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
uint_bld->type, index_limit);
......
......@@ -272,7 +272,7 @@ void vi_dcc_clear_level(struct si_context *sctx,
}
si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
&clear_value, 4, SI_COHERENCY_CB_META);
&clear_value, 4, SI_COHERENCY_CB_META, false);
}
/* Set the same micro tile mode as the destination of the last MSAA resolve.
......@@ -505,7 +505,7 @@ static void si_do_fast_color_clear(struct si_context *sctx,
uint32_t clear_value = 0xCCCCCCCC;
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size,
&clear_value, 4, SI_COHERENCY_CB_META);
&clear_value, 4, SI_COHERENCY_CB_META, false);
fmask_decompress_needed = true;
}
......@@ -533,7 +533,7 @@ static void si_do_fast_color_clear(struct si_context *sctx,
uint32_t clear_value = 0;
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size,
&clear_value, 4, SI_COHERENCY_CB_META);
&clear_value, 4, SI_COHERENCY_CB_META, false);
eliminate_needed = true;
}
......
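All three call sites above now pass false for the new force_cpdma argument; the parameter itself (added in si_compute_blit.c below) only decides whether the compute-shader clear path may be taken or CP DMA must be used. A condensed sketch of that dispatch condition, lifted from the hunk below (names outside the condition are hypothetical):

def use_compute_clear(clear_value_size, offset, size, chip_class_at_most_vi,
                      force_cpdma=False):
    # Wide clear values always need the compute path; 4-byte, dword-aligned
    # clears use it for large sizes or pre-Vega chips unless CP DMA is forced.
    if clear_value_size > 4:
        return True
    return (not force_cpdma and
            clear_value_size == 4 and
            offset % 4 == 0 and
            (size > 32 * 1024 or chip_class_at_most_vi))

print(use_compute_clear(4, 0, 64 * 1024, False))                    # True
print(use_compute_clear(4, 0, 64 * 1024, False, force_cpdma=True))  # False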
......@@ -177,7 +177,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx,
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, uint32_t *clear_value,
uint32_t clear_value_size, enum si_coherency coher)
uint32_t clear_value_size, enum si_coherency coher,
bool force_cpdma)
{
if (!size)
return;
......@@ -241,7 +242,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
* about buffer placements.
*/
if (clear_value_size > 4 ||
(clear_value_size == 4 &&
(!force_cpdma &&
clear_value_size == 4 &&
offset % 4 == 0 &&
(size > 32*1024 || sctx->chip_class <= VI))) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
......@@ -282,7 +284,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx,
coher = SI_COHERENCY_SHADER;
si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
clear_value_size, coher);
clear_value_size, coher, false);
}
void si_copy_buffer(struct si_context *sctx,
......