Commits on Source (10)
libssw (1.1-6) unstable; urgency=medium
* Release to unstable.
-- Michael R. Crusoe <michael.crusoe@gmail.com> Fri, 13 Dec 2019 13:24:05 +0100
libssw (1.1-6~0expsimde2) experimental; urgency=medium
* Fix some simde c/c++ compat
-- Michael R. Crusoe <michael.crusoe@gmail.com> Fri, 13 Dec 2019 12:11:05 +0100
libssw (1.1-6~0expsimde1) experimental; urgency=medium
* Include patch from upstream SIMDE for SH4
-- Michael R. Crusoe <michael.crusoe@gmail.com> Fri, 13 Dec 2019 08:12:10 +0100
libssw (1.1-6~0expsimde0) experimental; urgency=medium
* Enable for all architectures using simde
* Add myself as an uploader
-- Michael R. Crusoe <michael.crusoe@gmail.com> Wed, 11 Dec 2019 15:14:03 +0100
libssw (1.1-4) unstable; urgency=medium
* Source upload to enable testing migration.
......
Source: libssw
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
-Uploaders: Sascha Steinbiss <satta@debian.org>
+Uploaders: Sascha Steinbiss <satta@debian.org>,
+           Michael R. Crusoe <michael.crusoe@gmail.com>
Section: science
Priority: optional
Build-Depends: debhelper (>= 12~),
@@ -16,7 +17,7 @@ Vcs-Git: https://salsa.debian.org/med-team/libssw.git
Homepage: https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
Package: libssw0
-Architecture: any-amd64 x32 ppc64el
+Architecture: any
Multi-Arch: same
Section: libs
Depends: ${shlibs:Depends},
@@ -35,11 +36,10 @@ Description: fast SIMD parallelized implementation of the Smith-Waterman algorit
the sub-optimal alignment score and location heuristically.
Package: libssw-dev
-Architecture: any-amd64 x32 ppc64el
+Architecture: any
Multi-Arch: same
Section: libdevel
-Depends: ${shlibs:Depends},
-         ${misc:Depends},
+Depends: ${misc:Depends},
libssw0 (= ${binary:Version})
Pre-Depends: ${misc:Pre-Depends}
Provides: libssw-dev
@@ -50,7 +50,7 @@ Description: Development headers and static libraries for libssw
algorithm at the instruction level.
Package: libssw-java
-Architecture: any-amd64 x32 ppc64el
+Architecture: any
Section: java
Depends: ${java:Depends},
${shlibs:Depends},
@@ -63,7 +63,7 @@ Description: Java bindings for libssw
instruction level.
Package: ssw-align
-Architecture: any-amd64 x32 ppc64el
+Architecture: any
Depends: ${shlibs:Depends},
${misc:Depends},
libssw0 (= ${binary:Version})
......
@@ -15,6 +15,10 @@ Files: debian/*
Copyright: © 2016 Sascha Steinbiss <satta@debian.org>
License: MIT
+Files: debian/include/simde/*
+Copyright: 2013-2019, Evan Nemerson <evan@nemerson.com>
+License: MIT
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
/* Check (assertions)
* Portable Snippets - https://github.com/nemequ/portable-snippets
* Created by Evan Nemerson <evan@nemerson.com>
*
* To the extent possible under law, the authors have waived all
* copyright and related or neighboring rights to this code. For
* details, see the Creative Commons Zero 1.0 Universal license at
* https://creativecommons.org/publicdomain/zero/1.0/
*/
#if !defined(SIMDE_CHECK_H)
#define SIMDE_CHECK_H
#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
# define SIMDE_NDEBUG 1
#endif
#include <stdint.h>
#if !defined(_WIN32)
# define SIMDE_SIZE_MODIFIER "z"
# define SIMDE_CHAR_MODIFIER "hh"
# define SIMDE_SHORT_MODIFIER "h"
#else
# if defined(_M_X64) || defined(__amd64__)
# define SIMDE_SIZE_MODIFIER "I64"
# else
# define SIMDE_SIZE_MODIFIER ""
# endif
# define SIMDE_CHAR_MODIFIER ""
# define SIMDE_SHORT_MODIFIER ""
#endif
#if defined(_MSC_VER) && (_MSC_VER >= 1500)
# define SIMDE__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127))
# define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
#else
# define SIMDE__PUSH_DISABLE_MSVC_C4127
# define SIMDE__POP_DISABLE_MSVC_C4127
#endif
#if !defined(simde_errorf)
# include <stdio.h>
# include <stdlib.h>
# define simde_errorf(format, ...) (fprintf(stderr, format, __VA_ARGS__), abort())
#endif
#define simde_error(msg) simde_errorf("%s", msg)
#if defined(SIMDE_NDEBUG)
# if defined(SIMDE_CHECK_FAIL_DEFINED)
# define simde_assert(expr)
# else
# if defined(HEDLEY_ASSUME)
# define simde_assert(expr) HEDLEY_ASSUME(expr)
# elif HEDLEY_GCC_VERSION_CHECK(4,5,0)
# define simde_assert(expr) ((void) (!!(expr) ? 1 : (__builtin_unreachable(), 1)))
# elif HEDLEY_MSVC_VERSION_CHECK(13,10,0)
# define simde_assert(expr) __assume(expr)
# else
# define simde_assert(expr)
# endif
# endif
# define simde_assert_true(expr) simde_assert(expr)
# define simde_assert_false(expr) simde_assert(!(expr))
# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) simde_assert(((a) op (b)))
# define simde_assert_double_equal(a, b, precision)
# define simde_assert_string_equal(a, b)
# define simde_assert_string_not_equal(a, b)
# define simde_assert_memory_equal(size, a, b)
# define simde_assert_memory_not_equal(size, a, b)
#else
# define simde_assert(expr) \
do { \
if (!HEDLEY_LIKELY(expr)) { \
simde_error("assertion failed: " #expr "\n"); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_true(expr) \
do { \
if (!HEDLEY_LIKELY(expr)) { \
simde_error("assertion failed: " #expr " is not true\n"); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_false(expr) \
do { \
if (!HEDLEY_LIKELY(!(expr))) { \
simde_error("assertion failed: " #expr " is not false\n"); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \
do { \
T simde_tmp_a_ = (a); \
T simde_tmp_b_ = (b); \
if (!(simde_tmp_a_ op simde_tmp_b_)) { \
simde_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")\n", \
#a, #op, #b, simde_tmp_a_, #op, simde_tmp_b_); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_double_equal(a, b, precision) \
do { \
const double simde_tmp_a_ = (a); \
const double simde_tmp_b_ = (b); \
const double simde_tmp_diff_ = ((simde_tmp_a_ - simde_tmp_b_) < 0) ? \
-(simde_tmp_a_ - simde_tmp_b_) : \
(simde_tmp_a_ - simde_tmp_b_); \
if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \
simde_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)\n", \
#a, #b, simde_tmp_a_, simde_tmp_b_); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# include <string.h>
# define simde_assert_string_equal(a, b) \
do { \
const char* simde_tmp_a_ = a; \
const char* simde_tmp_b_ = b; \
if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != 0)) { \
simde_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \
#a, #b, simde_tmp_a_, simde_tmp_b_); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_string_not_equal(a, b) \
do { \
const char* simde_tmp_a_ = a; \
const char* simde_tmp_b_ = b; \
if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == 0)) { \
simde_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \
#a, #b, simde_tmp_a_, simde_tmp_b_); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_memory_equal(size, a, b) \
do { \
const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \
const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \
const size_t simde_tmp_size_ = (size); \
if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) != 0) { \
size_t simde_tmp_pos_; \
for (simde_tmp_pos_ = 0 ; simde_tmp_pos_ < simde_tmp_size_ ; simde_tmp_pos_++) { \
if (simde_tmp_a_[simde_tmp_pos_] != simde_tmp_b_[simde_tmp_pos_]) { \
simde_errorf("assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER "u\n", \
#a, #b, simde_tmp_pos_); \
break; \
} \
} \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
# define simde_assert_memory_not_equal(size, a, b) \
do { \
const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \
const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \
const size_t simde_tmp_size_ = (size); \
if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) == 0) { \
simde_errorf("assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER "u bytes)\n", \
#a, #b, simde_tmp_size_); \
} \
SIMDE__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
SIMDE__POP_DISABLE_MSVC_C4127
#endif
#define simde_assert_type(T, fmt, a, op, b) \
simde_assert_type_full("", "", T, fmt, a, op, b)
#define simde_assert_char(a, op, b) \
simde_assert_type_full("'\\x", "'", char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
#define simde_assert_uchar(a, op, b) \
simde_assert_type_full("'\\x", "'", unsigned char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
#define simde_assert_short(a, op, b) \
simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b)
#define simde_assert_ushort(a, op, b) \
simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b)
#define simde_assert_int(a, op, b) \
simde_assert_type(int, "d", a, op, b)
#define simde_assert_uint(a, op, b) \
simde_assert_type(unsigned int, "u", a, op, b)
#define simde_assert_long(a, op, b) \
simde_assert_type(long int, "ld", a, op, b)
#define simde_assert_ulong(a, op, b) \
simde_assert_type(unsigned long int, "lu", a, op, b)
#define simde_assert_llong(a, op, b) \
simde_assert_type(long long int, "lld", a, op, b)
#define simde_assert_ullong(a, op, b) \
simde_assert_type(unsigned long long int, "llu", a, op, b)
#define simde_assert_size(a, op, b) \
simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b)
#define simde_assert_float(a, op, b) \
simde_assert_type(float, "f", a, op, b)
#define simde_assert_double(a, op, b) \
simde_assert_type(double, "g", a, op, b)
#define simde_assert_ptr(a, op, b) \
simde_assert_type(const void*, "p", a, op, b)
#define simde_assert_int8(a, op, b) \
simde_assert_type(int8_t, PRIi8, a, op, b)
#define simde_assert_uint8(a, op, b) \
simde_assert_type(uint8_t, PRIu8, a, op, b)
#define simde_assert_int16(a, op, b) \
simde_assert_type(int16_t, PRIi16, a, op, b)
#define simde_assert_uint16(a, op, b) \
simde_assert_type(uint16_t, PRIu16, a, op, b)
#define simde_assert_int32(a, op, b) \
simde_assert_type(int32_t, PRIi32, a, op, b)
#define simde_assert_uint32(a, op, b) \
simde_assert_type(uint32_t, PRIu32, a, op, b)
#define simde_assert_int64(a, op, b) \
simde_assert_type(int64_t, PRIi64, a, op, b)
#define simde_assert_uint64(a, op, b) \
simde_assert_type(uint64_t, PRIu64, a, op, b)
#define simde_assert_ptr_equal(a, b) \
simde_assert_ptr(a, ==, b)
#define simde_assert_ptr_not_equal(a, b) \
simde_assert_ptr(a, !=, b)
#define simde_assert_null(ptr) \
simde_assert_ptr(ptr, ==, NULL)
#define simde_assert_not_null(ptr) \
simde_assert_ptr(ptr, !=, NULL)
#define simde_assert_ptr_null(ptr) \
simde_assert_ptr(ptr, ==, NULL)
#define simde_assert_ptr_not_null(ptr) \
simde_assert_ptr(ptr, !=, NULL)
#endif /* !defined(SIMDE_CHECK_H) */
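A minimal usage sketch of the assertion macros above — not part of the upstream header. The include paths are assumptions (in this packaging the files live under debian/include/simde/), and the program must be built with -DSIMDE_DEBUG, otherwise the header defaults to SIMDE_NDEBUG and the checks compile away:
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include "hedley.h"   /* provides HEDLEY_LIKELY/HEDLEY_UNLIKELY used by check.h */
#include "check.h"
int main(void) {
  size_t n = 16;
  int8_t score = -3;
  simde_assert_size(n, ==, 16);                 /* printed with SIMDE_SIZE_MODIFIER on failure */
  simde_assert_int8(score, <, 0);               /* printed with PRIi8 from <inttypes.h> */
  simde_assert_double_equal(0.1 + 0.2, 0.3, 9); /* tolerated difference: 1e-9 */
  simde_assert_string_equal("ssw", "ssw");
  return 0;
}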
/* Architecture detection
* Created by Evan Nemerson <evan@nemerson.com>
*
* To the extent possible under law, the authors have waived all
* copyright and related or neighboring rights to this code. For
* details, see the Creative Commons Zero 1.0 Universal license at
* <https://creativecommons.org/publicdomain/zero/1.0/>
*
* Different compilers define different preprocessor macros for the
* same architecture. This is an attempt to provide a single
* interface which is usable on any compiler.
*
* In general, a macro named SIMDE_ARCH_* is defined for each
* architecture the CPU supports. When there are multiple possible
* versions, we try to define the macro to the target version. For
* example, if you want to check for i586+, you could do something
* like:
*
* #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5)
* ...
* #endif
*
* You could also just check that SIMDE_ARCH_X86 >= 5 without checking
* if it's defined first, but some compilers may emit a warning about
* an undefined macro being used (e.g., GCC with -Wundef).
*
* This was originally created for SIMDe
* <https://github.com/nemequ/simde> (hence the prefix), but this
* header has no dependencies and may be used anywhere. It is
* originally based on information from
* <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
* has been enhanced with additional information.
*
* If you improve this file, or find a bug, please file the issue at
* <https://github.com/nemequ/simde/issues>. If you copy this into
* your project, even if you change the prefix, please keep the links
* to SIMDe intact so others know where to report issues, submit
* enhancements, and find the latest version. */
#if !defined(SIMDE_ARCH_H)
#define SIMDE_ARCH_H
/* Alpha
<https://en.wikipedia.org/wiki/DEC_Alpha> */
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
# if defined(__alpha_ev6__)
# define SIMDE_ARCH_ALPHA 6
# elif defined(__alpha_ev5__)
# define SIMDE_ARCH_ALPHA 5
# elif defined(__alpha_ev4__)
# define SIMDE_ARCH_ALPHA 4
# else
# define SIMDE_ARCH_ALPHA 1
# endif
#endif
/* Atmel AVR
<https://en.wikipedia.org/wiki/Atmel_AVR> */
#if defined(__AVR_ARCH__)
# define SIMDE_ARCH_AVR __AVR_ARCH__
#endif
/* AMD64 / x86_64
<https://en.wikipedia.org/wiki/X86-64> */
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
# define SIMDE_ARCH_AMD64 1
#endif
/* ARM
<https://en.wikipedia.org/wiki/ARM_architecture> */
#if defined(__ARM_ARCH_8A__)
# define SIMDE_ARCH_ARM 82
#elif defined(__ARM_ARCH_8R__)
# define SIMDE_ARCH_ARM 81
#elif defined(__ARM_ARCH_8__)
# define SIMDE_ARCH_ARM 80
#elif defined(__ARM_ARCH_7S__)
# define SIMDE_ARCH_ARM 74
#elif defined(__ARM_ARCH_7M__)
# define SIMDE_ARCH_ARM 73
#elif defined(__ARM_ARCH_7R__)
# define SIMDE_ARCH_ARM 72
#elif defined(__ARM_ARCH_7A__)
# define SIMDE_ARCH_ARM 71
#elif defined(__ARM_ARCH_7__)
# define SIMDE_ARCH_ARM 70
#elif defined(__ARM_ARCH)
# define SIMDE_ARCH_ARM (__ARM_ARCH * 10)
#elif defined(_M_ARM)
# define SIMDE_ARCH_ARM (_M_ARM * 10)
#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || defined(_ARM) || defined(_M_ARM)
# define SIMDE_ARCH_ARM 1
#endif
/* AArch64
<https://en.wikipedia.org/wiki/ARM_architecture> */
#if defined(__aarch64__) || defined(_M_ARM64)
# define SIMDE_ARCH_AARCH64 10
#endif
/* Blackfin
<https://en.wikipedia.org/wiki/Blackfin> */
#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__)
# define SIMDE_ARCH_BLACKFIN 1
#endif
/* CRIS
<https://en.wikipedia.org/wiki/ETRAX_CRIS> */
#if defined(__CRIS_arch_version)
# define SIMDE_ARCH_CRIS __CRIS_arch_version
#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || defined(__CRIS__)
# define SIMDE_ARCH_CRIS 1
#endif
/* Convex
<https://en.wikipedia.org/wiki/Convex_Computer> */
#if defined(__convex_c38__)
# define SIMDE_ARCH_CONVEX 38
#elif defined(__convex_c34__)
# define SIMDE_ARCH_CONVEX 34
#elif defined(__convex_c32__)
# define SIMDE_ARCH_CONVEX 32
#elif defined(__convex_c2__)
# define SIMDE_ARCH_CONVEX 2
#elif defined(__convex__)
# define SIMDE_ARCH_CONVEX 1
#endif
/* Adapteva Epiphany
<https://en.wikipedia.org/wiki/Adapteva_Epiphany> */
#if defined(__epiphany__)
# define SIMDE_ARCH_EPIPHANY 1
#endif
/* Fujitsu FR-V
<https://en.wikipedia.org/wiki/FR-V_(microprocessor)> */
#if defined(__frv__)
# define SIMDE_ARCH_FRV 1
#endif
/* H8/300
<https://en.wikipedia.org/wiki/H8_Family> */
#if defined(__H8300__)
# define SIMDE_ARCH_H8300
#endif
/* HP/PA / PA-RISC
<https://en.wikipedia.org/wiki/PA-RISC> */
#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || defined(_PA_RISC2_0)
# define SIMDE_ARCH_HPPA 20
#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1)
# define SIMDE_ARCH_HPPA 11
#elif defined(_PA_RISC1_0)
# define SIMDE_ARCH_HPPA 10
#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
# define SIMDE_ARCH_HPPA 1
#endif
/* x86
<https://en.wikipedia.org/wiki/X86> */
#if defined(_M_IX86)
# define SIMDE_ARCH_X86 (_M_IX86 / 100)
#elif defined(__I86__)
# define SIMDE_ARCH_X86 __I86__
#elif defined(i686) || defined(__i686) || defined(__i686__)
# define SIMDE_ARCH_X86 6
#elif defined(i586) || defined(__i586) || defined(__i586__)
# define SIMDE_ARCH_X86 5
#elif defined(i486) || defined(__i486) || defined(__i486__)
# define SIMDE_ARCH_X86 4
#elif defined(i386) || defined(__i386) || defined(__i386__)
# define SIMDE_ARCH_X86 3
#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__)
# define SIMDE_ARCH_X86 3
#endif
/* Itanium
<https://en.wikipedia.org/wiki/Itanium> */
#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || defined(__itanium__)
# define SIMDE_ARCH_IA64 1
#endif
/* Renesas M32R
<https://en.wikipedia.org/wiki/M32R> */
#if defined(__m32r__) || defined(__M32R__)
# define SIMDE_ARCH_M32R
#endif
/* Motorola 68000
<https://en.wikipedia.org/wiki/Motorola_68000> */
#if defined(__mc68060__) || defined(__MC68060__)
# define SIMDE_ARCH_M68K 68060
#elif defined(__mc68040__) || defined(__MC68040__)
# define SIMDE_ARCH_M68K 68040
#elif defined(__mc68030__) || defined(__MC68030__)
# define SIMDE_ARCH_M68K 68030
#elif defined(__mc68020__) || defined(__MC68020__)
# define SIMDE_ARCH_M68K 68020
#elif defined(__mc68010__) || defined(__MC68010__)
# define SIMDE_ARCH_M68K 68010
#elif defined(__mc68000__) || defined(__MC68000__)
# define SIMDE_ARCH_M68K 68000
#endif
/* Xilinx MicroBlaze
<https://en.wikipedia.org/wiki/MicroBlaze> */
#if defined(__MICROBLAZE__) || defined(__microblaze__)
# define SIMDE_ARCH_MICROBLAZE
#endif
/* MIPS
<https://en.wikipedia.org/wiki/MIPS_architecture> */
#if defined(_MIPS_ISA_MIPS64R2)
# define SIMDE_ARCH_MIPS 642
#elif defined(_MIPS_ISA_MIPS64)
# define SIMDE_ARCH_MIPS 640
#elif defined(_MIPS_ISA_MIPS32R2)
# define SIMDE_ARCH_MIPS 322
#elif defined(_MIPS_ISA_MIPS32)
# define SIMDE_ARCH_MIPS 320
#elif defined(_MIPS_ISA_MIPS4)
# define SIMDE_ARCH_MIPS 4
#elif defined(_MIPS_ISA_MIPS3)
# define SIMDE_ARCH_MIPS 3
#elif defined(_MIPS_ISA_MIPS2)
# define SIMDE_ARCH_MIPS 2
#elif defined(_MIPS_ISA_MIPS1)
# define SIMDE_ARCH_MIPS 1
#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__)
# define SIMDE_ARCH_MIPS 1
#endif
/* Matsushita MN10300
<https://en.wikipedia.org/wiki/MN103> */
#if defined(__MN10300__) || defined(__mn10300__)
# define SIMDE_ARCH_MN10300 1
#endif
/* POWER
<https://en.wikipedia.org/wiki/IBM_POWER_Instruction_Set_Architecture> */
#if defined(_M_PPC)
# define SIMDE_ARCH_POWER _M_PPC
#elif defined(_ARCH_PWR8)
# define SIMDE_ARCH_POWER 800
#elif defined(_ARCH_PWR7)
# define SIMDE_ARCH_POWER 700
#elif defined(_ARCH_PWR6)
# define SIMDE_ARCH_POWER 600
#elif defined(_ARCH_PWR5)
# define SIMDE_ARCH_POWER 500
#elif defined(_ARCH_PWR4)
# define SIMDE_ARCH_POWER 400
#elif defined(_ARCH_440) || defined(__ppc440__)
# define SIMDE_ARCH_POWER 440
#elif defined(_ARCH_450) || defined(__ppc450__)
# define SIMDE_ARCH_POWER 450
#elif defined(_ARCH_601) || defined(__ppc601__)
# define SIMDE_ARCH_POWER 601
#elif defined(_ARCH_603) || defined(__ppc603__)
# define SIMDE_ARCH_POWER 603
#elif defined(_ARCH_604) || defined(__ppc604__)
# define SIMDE_ARCH_POWER 604
#elif defined(_ARCH_605) || defined(__ppc605__)
# define SIMDE_ARCH_POWER 605
#elif defined(_ARCH_620) || defined(__ppc620__)
# define SIMDE_ARCH_POWER 620
#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || defined(__ppc)
# define SIMDE_ARCH_POWER 1
#endif
/* SPARC
<https://en.wikipedia.org/wiki/SPARC> */
#if defined(__sparc_v9__) || defined(__sparcv9)
# define SIMDE_ARCH_SPARC 9
#elif defined(__sparc_v8__) || defined(__sparcv8)
# define SIMDE_ARCH_SPARC 8
#elif defined(__sparc_v7__) || defined(__sparcv7)
# define SIMDE_ARCH_SPARC 7
#elif defined(__sparc_v6__) || defined(__sparcv6)
# define SIMDE_ARCH_SPARC 6
#elif defined(__sparc_v5__) || defined(__sparcv5)
# define SIMDE_ARCH_SPARC 5
#elif defined(__sparc_v4__) || defined(__sparcv4)
# define SIMDE_ARCH_SPARC 4
#elif defined(__sparc_v3__) || defined(__sparcv3)
# define SIMDE_ARCH_SPARC 3
#elif defined(__sparc_v2__) || defined(__sparcv2)
# define SIMDE_ARCH_SPARC 2
#elif defined(__sparc_v1__) || defined(__sparcv1)
# define SIMDE_ARCH_SPARC 1
#elif defined(__sparc__) || defined(__sparc)
# define SIMDE_ARCH_SPARC 1
#endif
/* SuperH
<https://en.wikipedia.org/wiki/SuperH> */
#if defined(__sh5__) || defined(__SH5__)
# define SIMDE_ARCH_SUPERH 5
#elif defined(__sh4__) || defined(__SH4__)
# define SIMDE_ARCH_SUPERH 4
#elif defined(__sh3__) || defined(__SH3__)
# define SIMDE_ARCH_SUPERH 3
#elif defined(__sh2__) || defined(__SH2__)
# define SIMDE_ARCH_SUPERH 2
#elif defined(__sh1__) || defined(__SH1__)
# define SIMDE_ARCH_SUPERH 1
#elif defined(__sh__) || defined(__SH__)
# define SIMDE_ARCH_SUPERH 1
#endif
/* IBM System z
<https://en.wikipedia.org/wiki/IBM_System_z> */
#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__)
# define SIMDE_ARCH_SYSTEMZ
#endif
/* TMS320 DSP
<https://en.wikipedia.org/wiki/Texas_Instruments_TMS320> */
#if defined(_TMS320C6740) || defined(__TMS320C6740__)
# define SIMDE_ARCH_TMS320 6740
#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__)
# define SIMDE_ARCH_TMS320 6701
#elif defined(_TMS320C6700) || defined(__TMS320C6700__)
# define SIMDE_ARCH_TMS320 6700
#elif defined(_TMS320C6600) || defined(__TMS320C6600__)
# define SIMDE_ARCH_TMS320 6600
#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__)
# define SIMDE_ARCH_TMS320 6401
#elif defined(_TMS320C6400) || defined(__TMS320C6400__)
# define SIMDE_ARCH_TMS320 6400
#elif defined(_TMS320C6200) || defined(__TMS320C6200__)
# define SIMDE_ARCH_TMS320 6200
#elif defined(_TMS320C55X) || defined(__TMS320C55X__)
# define SIMDE_ARCH_TMS320 550
#elif defined(_TMS320C54X) || defined(__TMS320C54X__)
# define SIMDE_ARCH_TMS320 540
#elif defined(_TMS320C28X) || defined(__TMS320C28X__)
# define SIMDE_ARCH_TMS320 280
#endif
/* Xtensa
<https://en.wikipedia.org/wiki/> */
#if defined(__xtensa__) || defined(__XTENSA__)
# define SIMDE_ARCH_XTENSA 1
#endif
#endif /* !defined(SIMDE_ARCH_H) */
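A short, hypothetical example of consuming these macros, following the defined()-before-compare pattern the header comment recommends so that -Wundef stays quiet on architectures where a macro is absent (the include path is an assumption):
#include <stdio.h>
#include "simde-arch.h"
static const char* guess_isa(void) {
#if defined(SIMDE_ARCH_AMD64)
  return "x86-64";
#elif defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 6)
  return "i686 or newer";
#elif defined(SIMDE_ARCH_ARM) && (SIMDE_ARCH_ARM >= 80)
  return "ARMv8";
#elif defined(SIMDE_ARCH_POWER)
  return "POWER";
#else
  return "something else";
#endif
}
int main(void) {
  printf("compiled for: %s\n", guess_isa());
  return 0;
}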
/* Copyright (c) 2017-2019 Evan Nemerson <evan@nemerson.com>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#if !defined(SIMDE_COMMON_H)
#define SIMDE_COMMON_H
#include "hedley.h"
#include "check.h"
#include "simde-arch.h"
#if \
HEDLEY_HAS_ATTRIBUTE(aligned) || \
HEDLEY_GCC_VERSION_CHECK(2,95,0) || \
HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \
HEDLEY_IBM_VERSION_CHECK(11,1,0) || \
HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
HEDLEY_PGI_VERSION_CHECK(19,4,0) || \
HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \
HEDLEY_TI_VERSION_CHECK(8,1,0)
# define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
#elif defined(_MSC_VER) && (!defined(_M_IX86) || defined(_M_AMD64))
# define SIMDE_ALIGN(alignment) __declspec(align(alignment))
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define SIMDE_ALIGN(alignment) _Alignas(alignment)
#elif defined(__cplusplus) && (__cplusplus >= 201103L)
# define SIMDE_ALIGN(alignment) alignas(alignment)
#else
# define SIMDE_ALIGN(alignment)
#endif
#define simde_assert_aligned(alignment, val) \
simde_assert_int(((uintptr_t) (val)) % (alignment), ==, 0)
/* TODO: this should really do something like
HEDLEY_STATIC_CAST(T, (simde_assert_int(alignment, v), v))
but I need to think about how to handle it in all compilers...
may end up moving to Hedley, too. */
#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned)
# define SIMDE_CAST_ALIGN(alignment, T, v) ((T) __builtin_assume_aligned(v, alignment))
#elif HEDLEY_HAS_WARNING("-Wcast-align")
# define SIMDE_CAST_ALIGN(alignment, T, v) \
HEDLEY_DIAGNOSTIC_PUSH \
_Pragma("clang diagnostic ignored \"-Wcast-align\"") \
HEDLEY_STATIC_CAST(T, v) \
HEDLEY_DIAGNOSTIC_POP
#else
# define SIMDE_CAST_ALIGN(alignment, T, v) HEDLEY_STATIC_CAST(T, v)
#endif
#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size,4,6,0)
# define SIMDE__ENABLE_GCC_VEC_EXT
#endif
#if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L)))
# define SIMDE_ENABLE_OPENMP
#endif
#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk)
# define SIMDE_ENABLE_CILKPLUS
#endif
#if defined(SIMDE_ENABLE_OPENMP)
# define SIMDE__VECTORIZE _Pragma("omp simd")
# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
# define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
#elif defined(SIMDE_ENABLE_CILKPLUS)
# define SIMDE__VECTORIZE _Pragma("simd")
# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
# define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
#elif defined(__INTEL_COMPILER)
# define SIMDE__VECTORIZE _Pragma("simd")
# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
# define SIMDE__VECTORIZE_ALIGNED(a)
#elif defined(__clang__)
# define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)")
# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_ALIGNED(a)
#elif HEDLEY_GCC_VERSION_CHECK(4,9,0)
# define SIMDE__VECTORIZE _Pragma("GCC ivdep")
# define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_ALIGNED(a)
#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0)
# define SIMDE__VECTORIZE _Pragma("_CRI ivdep")
# define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_ALIGNED(a)
#else
# define SIMDE__VECTORIZE
# define SIMDE__VECTORIZE_SAFELEN(l)
# define SIMDE__VECTORIZE_REDUCTION(r)
# define SIMDE__VECTORIZE_ALIGNED(a)
#endif
#if HEDLEY_GCC_HAS_ATTRIBUTE(unused,3,1,0)
# define SIMDE__UNUSED __attribute__((__unused__))
#else
# define SIMDE__UNUSED
#endif
#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial,4,3,0)
# define SIMDE__ARTIFICIAL __attribute__((__artificial__))
#else
# define SIMDE__ARTIFICIAL
#endif
/* Intended for checking coverage, you should never use this in
production. */
#if defined(SIMDE_NO_INLINE)
# define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static
#else
# define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static
#endif
#if defined(_MSC_VER)
# define SIMDE__BEGIN_DECLS HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS
# define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS
#else
# define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS
# define SIMDE__END_DECLS HEDLEY_END_C_DECLS
#endif
#if defined(__SIZEOF_INT128__)
# define SIMDE__HAVE_INT128
typedef __int128 simde_int128;
typedef unsigned __int128 simde_uint128;
#endif
/* TODO: we should at least make an attempt to detect the correct
types for simde_float32/float64 instead of just assuming float and
double. */
#if !defined(SIMDE_FLOAT32_TYPE)
# define SIMDE_FLOAT32_TYPE float
# define SIMDE_FLOAT32_C(value) value##f
#else
# define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value)
#endif
typedef SIMDE_FLOAT32_TYPE simde_float32;
HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4, "Unable to find 32-bit floating-point type.");
#if !defined(SIMDE_FLOAT64_TYPE)
# define SIMDE_FLOAT64_TYPE double
# define SIMDE_FLOAT64_C(value) value
#else
# define SIMDE_FLOAT64_C(value) ((SIMDE_FLOAT64_TYPE) value)
#endif
typedef SIMDE_FLOAT64_TYPE simde_float64;
HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating-point type.");
/* Whether to assume that the compiler can auto-vectorize reasonably
well. This will cause SIMDe to attempt to compose vector
operations from simpler vector operations instead of minimizing
serial work.
As an example, consider the _mm_add_ss(a, b) function from SSE,
which returns { a0 + b0, a1, a2, a3 }. This pattern is repeated
for other operations (sub, mul, etc.).
The naïve implementation would result in loading a0 and b0, adding
them into a temporary variable, then splicing that value into a new
vector with the remaining elements from a.
On platforms which support vectorization, it's generally faster to
simply perform the operation on the entire vector to avoid having
to move data between SIMD registers and non-SIMD registers.
Basically, instead of the temporary variable being (a0 + b0) it
would be a vector of (a + b), which is then combined with a to form
the result.
By default, SIMDe will prefer the pure-vector versions if we detect
a vector ISA extension, but this can be overridden by defining
SIMDE_NO_ASSUME_VECTORIZATION. You can also define
SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the
vectorized version. */
#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && !defined(SIMDE_ASSUME_VECTORIZATION)
# if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || defined(__ALTIVEC__)
# define SIMDE_ASSUME_VECTORIZATION
# endif
#endif
/* GCC and clang have built-in functions to handle shuffling of
vectors, but the implementations are slightly different. This
macro is just an abstraction over them. Note that elem_size is in
bits but vec_size is in bytes. */
#if HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
# define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__)
#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER)
# define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) __builtin_shuffle(a, b, (int##elem_size##_t __attribute__((__vector_size__(vec_size)))) { __VA_ARGS__ })
#endif
#if HEDLEY_GCC_HAS_BUILTIN(__builtin_convertvector,9,0,0)
# define SIMDE__CONVERT_VECTOR(to, from) ((to) = __builtin_convertvector((from), __typeof__(to)))
#endif
#if HEDLEY_HAS_WARNING("-Wbad-function-cast")
# define SIMDE_CONVERT_FTOI(T,v) \
HEDLEY_DIAGNOSTIC_PUSH \
_Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \
((T) (v)) \
HEDLEY_DIAGNOSTIC_POP
#else
# define SIMDE_CONVERT_FTOI(T,v) ((T) (v))
#endif
/* Some algorithms are iterative, and fewer iterations means less
accuracy. Lower values here will result in faster, but less
accurate, calculations for some functions. */
#if !defined(SIMDE_ACCURACY_ITERS)
# define SIMDE_ACCURACY_ITERS 2
#endif
#if defined(SIMDE__ASSUME_ALIGNED)
# undef SIMDE__ASSUME_ALIGNED
#endif
#if HEDLEY_INTEL_VERSION_CHECK(9,0,0)
# define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align)
#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0)
# define SIMDE__ASSUME_ALIGNED(ptr, align) __assume((((char*) ptr) - ((char*) 0)) % (align) == 0)
#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned,4,7,0)
# define SIMDE__ASSUME_ALIGNED(ptr, align) (ptr = (__typeof__(ptr)) __builtin_assume_aligned((ptr), align))
#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume)
# define SIMDE__ASSUME_ALIGNED(ptr, align) __builtin_assume((((char*) ptr) - ((char*) 0)) % (align) == 0)
#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable,4,5,0)
# define SIMDE__ASSUME_ALIGNED(ptr, align) ((((char*) ptr) - ((char*) 0)) % (align) == 0) ? (1) : (__builtin_unreachable(), 0)
#else
# define SIMDE__ASSUME_ALIGNED(ptr, align)
#endif
/* This is only to help us implement functions like _mm_undefined_ps. */
#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
# undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif
#if HEDLEY_HAS_WARNING("-Wuninitialized")
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wuninitialized\"")
#elif HEDLEY_GCC_VERSION_CHECK(4,2,0)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wuninitialized\"")
#elif HEDLEY_PGI_VERSION_CHECK(19,10,0)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549")
#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)")
#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)")
#elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) && defined(__cplusplus)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,unassigned)")
/* #elif \
HEDLEY_TI_VERSION_CHECK(16,9,9) || \
HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551") */
#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)")
#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0)
# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ __pragma(warning(disable:4700))
#endif
/* Sometimes we run into problems with specific versions of compilers
which make the native versions unusable for us. Often this is due
to missing functions, sometimes buggy implementations, etc. These
macros are how we check for specific bugs. As they are fixed we'll
start only defining them for problematic compiler versions. */
#if !defined(SIMDE_IGNORE_COMPILER_BUGS)
# if !HEDLEY_GCC_VERSION_CHECK(4,9,0)
# define SIMDE_BUG_GCC_REV_208793
# endif
# if !HEDLEY_GCC_VERSION_CHECK(5,0,0)
# define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */
# endif
# if !HEDLEY_GCC_VERSION_CHECK(4,6,0)
# define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
# endif
# if !HEDLEY_GCC_VERSION_CHECK(10,0,0)
# define SIMDE_BUG_GCC_REV_274313
# endif
# if defined(HEDLEY_EMSCRIPTEN_VERSION)
# define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
# define SIMDE_BUG_EMSCRIPTEN_5242
# endif
#endif
#if !defined(__cplusplus)
#define SIMDE_F64_ALL_SET (((union { uint64_t u64; simde_float64 f64; }) { .u64 = ~UINT64_C(0x0) }).f64)
#define SIMDE_F64_ALL_UNSET (((union { uint64_t u64; simde_float64 f64; }) { .u64 = UINT64_C(0x0) }).f64)
#define SIMDE_F32_ALL_SET (((union { uint32_t u32; simde_float32 f32; }) { .u32 = ~UINT32_C(0x0) }).f32)
#define SIMDE_F32_ALL_UNSET (((union { uint32_t u32; simde_float32 f32; }) { .u32 = UINT32_C(0x0) }).f32)
#else
static const union { uint64_t u64; simde_float64 f64; } simde_f64_all_set = { .u64 = ~UINT64_C(0) };
static const union { uint64_t u64; simde_float64 f64; } simde_f64_all_unset = { .u64 = UINT64_C(0) };
static const union { uint32_t u32; simde_float32 f32; } simde_f32_all_set = { .u32 = ~UINT32_C(0) };
static const union { uint32_t u32; simde_float32 f32; } simde_f32_all_unset = { .u32 = UINT32_C(0) };
# define SIMDE_F64_ALL_SET (simde_f64_all_set.f64)
# define SIMDE_F64_ALL_UNSET (simde_f64_all_unset.f64)
# define SIMDE_F32_ALL_SET (simde_f32_all_set.f32)
# define SIMDE_F32_ALL_UNSET (simde_f32_all_unset.f32)
#endif
#endif /* !defined(SIMDE_COMMON_H) */
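To illustrate the SIMDE_ASSUME_VECTORIZATION trade-off described in the comment above, here is a hypothetical sketch; the simde_float32x4 type and both helpers are illustrative, not SIMDe's real implementation, and the code relies on the GCC/Clang vector_size extension:
#include <stdio.h>
typedef float simde_float32x4 __attribute__((vector_size(16)));
/* Scalar route: touch only lane 0, keep the rest of 'a' as-is. */
static simde_float32x4 add_ss_scalar(simde_float32x4 a, simde_float32x4 b) {
  a[0] += b[0];
  return a;
}
/* Whole-vector route: one SIMD add over all four lanes, then keep lane 0
   of the sum and lanes 1-3 of 'a'.  On SIMD hardware this tends to avoid
   moving data between vector and scalar registers. */
static simde_float32x4 add_ss_vector(simde_float32x4 a, simde_float32x4 b) {
  simde_float32x4 sum = a + b;
  a[0] = sum[0];
  return a;
}
int main(void) {
  simde_float32x4 a = { 1, 2, 3, 4 };
  simde_float32x4 b = { 10, 20, 30, 40 };
  simde_float32x4 r1 = add_ss_scalar(a, b);
  simde_float32x4 r2 = add_ss_vector(a, b);
  printf("%g %g\n", (double) r1[0], (double) r2[0]);  /* both print 11 */
  return 0;
}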
@@ -2,8 +2,8 @@ Description: build command line aligner
This patch ensures that 'ssw_test' is called 'ssw-align' in Debian
and also links against libssw.
Author: Sascha Steinbiss <satta@debian.org>
--- a/src/Makefile
+++ b/src/Makefile
--- libssw.orig/src/Makefile
+++ libssw/src/Makefile
@@ -4,9 +4,10 @@
#CXXFLAGS := $(CFLAGS)
LOBJS = ssw.o
@@ -37,8 +37,8 @@ Author: Sascha Steinbiss <satta@debian.org>
$(CC) -o $@ $(filter-out %.h,$^) $(CPPFLAGS) $(CFLAGS) -lm -lz $(LDFLAGS)
$(EXAMPLE_CPP): example.cpp $(LOBJS) $(LCPPOBJS)
--- a/src/main.c
+++ b/src/main.c
--- libssw.orig/src/main.c
+++ libssw/src/main.c
@@ -297,7 +297,7 @@
}
if (optind + 2 > argc) {
......
build_all_libs.patch
hardening.patch
rename_tool.patch
+simde.patch
Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: use the simde header library for greater compatibility
Forwarded: https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/pull/69
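The porting pattern the patch applies is mechanical: swap <emmintrin.h> for SIMDe's sse2.h header and prefix the SSE2 types and intrinsics with simde_/simde__. A small, hypothetical example of the resulting style, using only intrinsics that also appear in the diff below (the include path is the one used in this packaging):
#include <stdio.h>
#include "../debian/include/simde/x86/sse2.h"
/* Count how many of the 16 byte lanes of a and b are equal. */
static int count_equal_bytes(simde__m128i a, simde__m128i b) {
  simde__m128i eq = simde_mm_cmpeq_epi8(a, b);  /* 0xFF in each matching lane */
  int mask = simde_mm_movemask_epi8(eq);        /* one bit per byte lane */
  int n = 0;
  while (mask) { n += mask & 1; mask >>= 1; }
  return n;
}
int main(void) {
  simde__m128i a = simde_mm_set1_epi8(7);
  simde__m128i b = simde_mm_set1_epi8(7);
  printf("%d\n", count_equal_bytes(a, b));      /* prints 16 */
  return 0;
}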
--- libssw.orig/src/ssw.c
+++ libssw/src/ssw.c
@@ -35,7 +35,7 @@
*
*/
-#include <emmintrin.h>
+#include "../debian/include/simde/x86/sse2.h"
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
@@ -76,8 +76,8 @@
} cigar;
struct _profile{
- __m128i* profile_byte; // 0: none
- __m128i* profile_word; // 0: none
+ simde__m128i* profile_byte; // 0: none
+ simde__m128i* profile_word; // 0: none
const int8_t* read;
const int8_t* mat;
int32_t readLen;
@@ -86,7 +86,7 @@
};
/* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */
-static __m128i* qP_byte (const int8_t* read_num,
+static simde__m128i* qP_byte (const int8_t* read_num,
const int8_t* mat,
const int32_t readLen,
const int32_t n, /* the edge length of the squre matrix mat */
@@ -96,7 +96,7 @@
Each piece is 8 bit. Split the read into 16 segments.
Calculat 16 segments in parallel.
*/
- __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+ simde__m128i* vProfile = (simde__m128i*)malloc(n * segLen * sizeof(simde__m128i));
int8_t* t = (int8_t*)vProfile;
int32_t nt, i, j, segNum;
@@ -126,7 +126,7 @@
int32_t readLen,
const uint8_t weight_gapO, /* will be used as - */
const uint8_t weight_gapE, /* will be used as - */
- const __m128i* vProfile,
+ const simde__m128i* vProfile,
uint8_t terminate, /* the best alignment score: used to terminate
the matrix calculation when locating the
alignment beginning point. If this score
@@ -134,11 +134,11 @@
uint8_t bias, /* Shift 0 point to a positive value. */
int32_t maskLen) {
-#define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
- (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
- (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
- (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
- (m) = _mm_extract_epi16((vm), 0)
+#define max16(m, vm) (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 8)); \
+ (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 4)); \
+ (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 2)); \
+ (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 1)); \
+ (m) = simde_mm_extract_epi16((vm), 0)
uint8_t max = 0; /* the max alignment score */
int32_t end_read = readLen - 1;
@@ -152,26 +152,26 @@
int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
/* Define 16 byte 0 vector. */
- __m128i vZero = _mm_set1_epi32(0);
+ simde__m128i vZero = simde_mm_set1_epi32(0);
- __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+ simde__m128i* pvHStore = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvHLoad = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvE = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvHmax = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
int32_t i, j;
/* 16 byte insertion begin vector */
- __m128i vGapO = _mm_set1_epi8(weight_gapO);
+ simde__m128i vGapO = simde_mm_set1_epi8(weight_gapO);
/* 16 byte insertion extension vector */
- __m128i vGapE = _mm_set1_epi8(weight_gapE);
+ simde__m128i vGapE = simde_mm_set1_epi8(weight_gapE);
/* 16 byte bias vector */
- __m128i vBias = _mm_set1_epi8(bias);
+ simde__m128i vBias = simde_mm_set1_epi8(bias);
- __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
- __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
- __m128i vTemp;
+ simde__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+ simde__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+ simde__m128i vTemp;
int32_t edge, begin = 0, end = refLen, step = 1;
/* outer loop to process the reference sequence */
@@ -182,84 +182,84 @@
}
for (i = begin; LIKELY(i != end); i += step) {
int32_t cmp;
- __m128i e, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
+ simde__m128i e, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
Any errors to vH values will be corrected in the Lazy_F loop.
*/
- __m128i vH = pvHStore[segLen - 1];
- vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
- const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+ simde__m128i vH = pvHStore[segLen - 1];
+ vH = simde_mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
+ const simde__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
/* Swap the 2 H buffers. */
- __m128i* pv = pvHLoad;
+ simde__m128i* pv = pvHLoad;
pvHLoad = pvHStore;
pvHStore = pv;
/* inner loop to process the query sequence */
for (j = 0; LIKELY(j < segLen); ++j) {
- vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
- vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
+ vH = simde_mm_adds_epu8(vH, simde_mm_load_si128(vP + j));
+ vH = simde_mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
/* Get max from vH, vE and vF. */
- e = _mm_load_si128(pvE + j);
- vH = _mm_max_epu8(vH, e);
- vH = _mm_max_epu8(vH, vF);
- vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+ e = simde_mm_load_si128(pvE + j);
+ vH = simde_mm_max_epu8(vH, e);
+ vH = simde_mm_max_epu8(vH, vF);
+ vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
/* Save vH values. */
- _mm_store_si128(pvHStore + j, vH);
+ simde_mm_store_si128(pvHStore + j, vH);
/* Update vE value. */
- vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
- e = _mm_subs_epu8(e, vGapE);
- e = _mm_max_epu8(e, vH);
- _mm_store_si128(pvE + j, e);
+ vH = simde_mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
+ e = simde_mm_subs_epu8(e, vGapE);
+ e = simde_mm_max_epu8(e, vH);
+ simde_mm_store_si128(pvE + j, e);
/* Update vF value. */
- vF = _mm_subs_epu8(vF, vGapE);
- vF = _mm_max_epu8(vF, vH);
+ vF = simde_mm_subs_epu8(vF, vGapE);
+ vF = simde_mm_max_epu8(vF, vH);
/* Load the next vH. */
- vH = _mm_load_si128(pvHLoad + j);
+ vH = simde_mm_load_si128(pvHLoad + j);
}
/* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
/* reset pointers to the start of the saved data */
j = 0;
- vH = _mm_load_si128 (pvHStore + j);
+ vH = simde_mm_load_si128 (pvHStore + j);
/* the computed vF value is for the given column. since */
/* we are at the end, we need to shift the vF value over */
/* to the next column. */
- vF = _mm_slli_si128 (vF, 1);
- vTemp = _mm_subs_epu8 (vH, vGapO);
- vTemp = _mm_subs_epu8 (vF, vTemp);
- vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
- cmp = _mm_movemask_epi8 (vTemp);
+ vF = simde_mm_slli_si128 (vF, 1);
+ vTemp = simde_mm_subs_epu8 (vH, vGapO);
+ vTemp = simde_mm_subs_epu8 (vF, vTemp);
+ vTemp = simde_mm_cmpeq_epi8 (vTemp, vZero);
+ cmp = simde_mm_movemask_epi8 (vTemp);
while (cmp != 0xffff)
{
- vH = _mm_max_epu8 (vH, vF);
- vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
- _mm_store_si128 (pvHStore + j, vH);
- vF = _mm_subs_epu8 (vF, vGapE);
+ vH = simde_mm_max_epu8 (vH, vF);
+ vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
+ simde_mm_store_si128 (pvHStore + j, vH);
+ vF = simde_mm_subs_epu8 (vF, vGapE);
j++;
if (j >= segLen)
{
j = 0;
- vF = _mm_slli_si128 (vF, 1);
+ vF = simde_mm_slli_si128 (vF, 1);
}
- vH = _mm_load_si128 (pvHStore + j);
+ vH = simde_mm_load_si128 (pvHStore + j);
- vTemp = _mm_subs_epu8 (vH, vGapO);
- vTemp = _mm_subs_epu8 (vF, vTemp);
- vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
- cmp = _mm_movemask_epi8 (vTemp);
+ vTemp = simde_mm_subs_epu8 (vH, vGapO);
+ vTemp = simde_mm_subs_epu8 (vF, vTemp);
+ vTemp = simde_mm_cmpeq_epi8 (vTemp, vZero);
+ cmp = simde_mm_movemask_epi8 (vTemp);
}
- vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
- vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
- cmp = _mm_movemask_epi8(vTemp);
+ vMaxScore = simde_mm_max_epu8(vMaxScore, vMaxColumn);
+ vTemp = simde_mm_cmpeq_epi8(vMaxMark, vMaxScore);
+ cmp = simde_mm_movemask_epi8(vTemp);
if (cmp != 0xffff) {
uint8_t temp;
vMaxMark = vMaxScore;
@@ -327,13 +327,13 @@
return bests;
}
-static __m128i* qP_word (const int8_t* read_num,
+static simde__m128i* qP_word (const int8_t* read_num,
const int8_t* mat,
const int32_t readLen,
const int32_t n) {
int32_t segLen = (readLen + 7) / 8;
- __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+ simde__m128i* vProfile = (simde__m128i*)malloc(n * segLen * sizeof(simde__m128i));
int16_t* t = (int16_t*)vProfile;
int32_t nt, i, j;
int32_t segNum;
@@ -357,14 +357,14 @@
int32_t readLen,
const uint8_t weight_gapO, /* will be used as - */
const uint8_t weight_gapE, /* will be used as - */
- const __m128i* vProfile,
+ const simde__m128i* vProfile,
uint16_t terminate,
int32_t maskLen) {
-#define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
- (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
- (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
- (m) = _mm_extract_epi16((vm), 0)
+#define max8(m, vm) (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 8)); \
+ (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 4)); \
+ (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 2)); \
+ (m) = simde_mm_extract_epi16((vm), 0)
uint16_t max = 0; /* the max alignment score */
int32_t end_read = readLen - 1;
@@ -378,23 +378,23 @@
int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
/* Define 16 byte 0 vector. */
- __m128i vZero = _mm_set1_epi32(0);
+ simde__m128i vZero = simde_mm_set1_epi32(0);
- __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
- __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+ simde__m128i* pvHStore = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvHLoad = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvE = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
+ simde__m128i* pvHmax = (simde__m128i*) calloc(segLen, sizeof(simde__m128i));
int32_t i, j, k;
/* 16 byte insertion begin vector */
- __m128i vGapO = _mm_set1_epi16(weight_gapO);
+ simde__m128i vGapO = simde_mm_set1_epi16(weight_gapO);
/* 16 byte insertion extension vector */
- __m128i vGapE = _mm_set1_epi16(weight_gapE);
+ simde__m128i vGapE = simde_mm_set1_epi16(weight_gapE);
- __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
- __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
- __m128i vTemp;
+ simde__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+ simde__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+ simde__m128i vTemp;
int32_t edge, begin = 0, end = refLen, step = 1;
/* outer loop to process the reference sequence */
@@ -405,66 +405,66 @@
}
for (i = begin; LIKELY(i != end); i += step) {
int32_t cmp;
- __m128i e, vF = vZero; /* Initialize F value to 0.
+ simde__m128i e, vF = vZero; /* Initialize F value to 0.
Any errors to vH values will be corrected in the Lazy_F loop.
*/
- __m128i vH = pvHStore[segLen - 1];
- vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
+ simde__m128i vH = pvHStore[segLen - 1];
+ vH = simde_mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
/* Swap the 2 H buffers. */
- __m128i* pv = pvHLoad;
+ simde__m128i* pv = pvHLoad;
- __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
+ simde__m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
- const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+ const simde__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
pvHLoad = pvHStore;
pvHStore = pv;
/* inner loop to process the query sequence */
for (j = 0; LIKELY(j < segLen); j ++) {
- vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
+ vH = simde_mm_adds_epi16(vH, simde_mm_load_si128(vP + j));
/* Get max from vH, vE and vF. */
- e = _mm_load_si128(pvE + j);
- vH = _mm_max_epi16(vH, e);
- vH = _mm_max_epi16(vH, vF);
- vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+ e = simde_mm_load_si128(pvE + j);
+ vH = simde_mm_max_epi16(vH, e);
+ vH = simde_mm_max_epi16(vH, vF);
+ vMaxColumn = simde_mm_max_epi16(vMaxColumn, vH);
/* Save vH values. */
- _mm_store_si128(pvHStore + j, vH);
+ simde_mm_store_si128(pvHStore + j, vH);
/* Update vE value. */
- vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
- e = _mm_subs_epu16(e, vGapE);
- e = _mm_max_epi16(e, vH);
- _mm_store_si128(pvE + j, e);
+ vH = simde_mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
+ e = simde_mm_subs_epu16(e, vGapE);
+ e = simde_mm_max_epi16(e, vH);
+ simde_mm_store_si128(pvE + j, e);
/* Update vF value. */
- vF = _mm_subs_epu16(vF, vGapE);
- vF = _mm_max_epi16(vF, vH);
+ vF = simde_mm_subs_epu16(vF, vGapE);
+ vF = simde_mm_max_epi16(vF, vH);
/* Load the next vH. */
- vH = _mm_load_si128(pvHLoad + j);
+ vH = simde_mm_load_si128(pvHLoad + j);
}
/* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
for (k = 0; LIKELY(k < 8); ++k) {
- vF = _mm_slli_si128 (vF, 2);
+ vF = simde_mm_slli_si128 (vF, 2);
for (j = 0; LIKELY(j < segLen); ++j) {
- vH = _mm_load_si128(pvHStore + j);
- vH = _mm_max_epi16(vH, vF);
- vMaxColumn = _mm_max_epi16(vMaxColumn, vH); //newly added line
- _mm_store_si128(pvHStore + j, vH);
- vH = _mm_subs_epu16(vH, vGapO);
- vF = _mm_subs_epu16(vF, vGapE);
- if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
+ vH = simde_mm_load_si128(pvHStore + j);
+ vH = simde_mm_max_epi16(vH, vF);
+ vMaxColumn = simde_mm_max_epi16(vMaxColumn, vH); //newly added line
+ simde_mm_store_si128(pvHStore + j, vH);
+ vH = simde_mm_subs_epu16(vH, vGapO);
+ vF = simde_mm_subs_epu16(vF, vGapE);
+ if (UNLIKELY(! simde_mm_movemask_epi8(simde_mm_cmpgt_epi16(vF, vH)))) goto end;
}
}
end:
- vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
- vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
- cmp = _mm_movemask_epi8(vTemp);
+ vMaxScore = simde_mm_max_epi16(vMaxScore, vMaxColumn);
+ vTemp = simde_mm_cmpeq_epi16(vMaxMark, vMaxScore);
+ cmp = simde_mm_movemask_epi8(vTemp);
if (cmp != 0xffff) {
uint16_t temp;
vMaxMark = vMaxScore;
@@ -801,7 +801,7 @@
const int32_t maskLen) {
alignment_end* bests = 0, *bests_reverse = 0;
- __m128i* vP = 0;
+ simde__m128i* vP = 0;
int32_t word = 0, band_width = 0, readLen = prof->readLen;
int8_t* read_reverse = 0;
cigar* path;
--- libssw.orig/src/ssw.h
+++ libssw/src/ssw.h
@@ -14,7 +14,6 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
-#include <emmintrin.h>
#ifdef __cplusplus
extern "C" {
--- libssw.orig/src/main.c
+++ libssw/src/main.c
@@ -6,7 +6,6 @@
#include <stdlib.h>
#include <stdint.h>
-#include <emmintrin.h>
#include <zlib.h>
#include <stdio.h>
#include <time.h>
@@ -2,6 +2,9 @@
# DH_VERBOSE := 1
export DEB_BUILD_MAINT_OPTIONS = hardening=+bindnow
+export DEB_CFLAGS_MAINT_APPEND += -DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
+export DEB_CXXFLAGS_MAINT_APPEND += -DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
ifeq ($(DEB_BUILD_ARCH), ppc64el)
export DEB_CFLAGS_MAINT_APPEND = -DNO_WARN_X86_INTRINSICS
export DEB_CXXFLAGS_MAINT_APPEND = -DNO_WARN_X86_INTRINSICS
......
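For context on the rules change above: the -DSIMDE_ENABLE_OPENMP / -fopenmp-simd pair switches the SIMDE__VECTORIZE macros in simde-common.h (shown earlier in this diff) to emit omp simd pragmas, so SIMDe's serial fallback loops become vectorization hints. A minimal sketch of the effect, with the macro re-declared locally purely for illustration:
#include <stdint.h>
#include <stdio.h>
/* Mirrors the SIMDE__VECTORIZE definition from simde-common.h above. */
#if defined(SIMDE_ENABLE_OPENMP)
#  define SIMDE__VECTORIZE _Pragma("omp simd")
#else
#  define SIMDE__VECTORIZE
#endif
/* Serial fallback in the spirit of _mm_adds_epu8: with -fopenmp-simd the
   pragma tells the compiler the loop is safe to vectorize. */
static void saturating_add_u8(uint8_t *r, const uint8_t *a, const uint8_t *b, int n) {
  SIMDE__VECTORIZE
  for (int i = 0; i < n; i++) {
    unsigned int s = (unsigned int) a[i] + b[i];
    r[i] = (uint8_t) (s > 255 ? 255 : s);
  }
}
int main(void) {
  uint8_t a[16], b[16], r[16];
  for (int i = 0; i < 16; i++) { a[i] = (uint8_t) (i * 10); b[i] = 200; }
  saturating_add_u8(r, a, b, 16);
  for (int i = 0; i < 16; i++) printf("%u ", (unsigned) r[i]);
  printf("\n");
  return 0;
}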