Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
+2018-12-26  Martin C. Frith  <Martin C. Frith>
+
+	* src/tantan.cc:
+	Make tantan repeat-finding faster
+	[44199bff9cef] [tip]
+
+	* doc/Makefile, doc/last-parallel.txt, doc/last-split.txt, doc/last-
+	tutorial.txt:
+	Update the documents a bit
+	[6727ae8b7a12]
+
 2018-12-10  Martin C. Frith  <Martin C. Frith>

 	* doc/last-dotplot.txt, scripts/last-dotplot:
 	Add last-dotplot --maxseqs option
-	[ed0fb9b1eb40] [tip]
+	[ed0fb9b1eb40]

 	* src/split/last-split.cc:
 	Make last-split -n keep E-values

--- a/debian/changelog
+++ b/debian/changelog
+last-align (963-1) unstable; urgency=medium
+
+  * New upstream version
+  * debhelper 12
+  * Standards-Version: 4.3.0
+  * debian/tests/control: Replace needs-recommends by explicitly
+    adding recommended package python-pil to test depends
+
+ -- Andreas Tille <tille@debian.org>  Fri, 11 Jan 2019 22:49:18 +0100
+
 last-align (961-1) unstable; urgency=medium

  [ Jelmer Vernooĳ ]

--- a/debian/compat
+++ b/debian/compat
-11
+12
--- a/debian/control
+++ b/debian/control
@@ -4,11 +4,11 @@ Uploaders: Charles Plessy <plessy@debian.org>,
           Andreas Tille <tille@debian.org>
 Section: science
 Priority: optional
-Build-Depends: debhelper (>= 11~),
+Build-Depends: debhelper (>= 12~),
               help2man,
               python-pil,
               zlib1g-dev
-Standards-Version: 4.2.1
+Standards-Version: 4.3.0
 Vcs-Browser: https://salsa.debian.org/med-team/last-align
 Vcs-Git: https://salsa.debian.org/med-team/last-align.git
 Homepage: http://last.cbrc.jp/

--- a/debian/tests/control
+++ b/debian/tests/control
 Tests: run-unit-test
-Depends: @
-Restrictions: allow-stderr, needs-recommends
+Depends: @, python-pil
+Restrictions: allow-stderr
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -15,6 +15,7 @@ ${DOCS}: last-doc.css Makefile

 # Ugh!  Is there a better way?
 RST_CSS = `locate html4css1.css | tail -n1`
+#RST_CSS = html4css1.css

 RSTFLAGS = --initial-header-level=2 --no-compact-lists	\
 --no-compact-field-lists --option-limit=0 --no-doc-info

--- a/doc/last-parallel.html
+++ b/doc/last-parallel.html
@@ -364,11 +364,11 @@ parallel-fasta &quot;lastal mydb&quot; &lt; queries.fa &gt; myalns.maf
 </pre>
 <p>Instead of this:</p>
 <pre class="literal-block">
-lastal -Q1 -D100 db q.fastq | last-split &gt; out.maf
+lastal -Q1 db q.fastq | last-split &gt; out.maf
 </pre>
 <p>try this:</p>
 <pre class="literal-block">
-parallel-fastq &quot;lastal -Q1 -D100 db | last-split&quot; &lt; q.fastq &gt; out.maf
+parallel-fastq &quot;lastal -Q1 db | last-split&quot; &lt; q.fastq &gt; out.maf
 </pre>
 <p>Instead of this:</p>
 <pre class="literal-block">

--- a/doc/last-parallel.txt
+++ b/doc/last-parallel.txt
@@ -53,11 +53,11 @@ try this::

 Instead of this::

-  lastal -Q1 -D100 db q.fastq | last-split > out.maf
+  lastal -Q1 db q.fastq | last-split > out.maf

 try this::

-  parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+  parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf

 Instead of this::


--- a/doc/last-split.html
+++ b/doc/last-split.html
@@ -334,7 +334,7 @@ format), and the genome is in &quot;genome.fasta&quot; (in fasta format).  We
 can do the alignment like this:</p>
 <pre class="literal-block">
 lastdb -uNEAR -R01 db genome.fasta
-lastal -Q1 -D100 db q.fastq | last-split &gt; out.maf
+lastal -Q1 db q.fastq | last-split &gt; out.maf
 </pre>
 </div>
 <div class="section" id="spliced-alignment-of-rna-reads-to-a-genome">
@@ -397,7 +397,7 @@ matches.</p>
 <h2>Going faster by parallelization</h2>
 <p>For example, split alignment of DNA reads to a genome:</p>
 <pre class="literal-block">
-parallel-fastq &quot;lastal -Q1 -D100 db | last-split&quot; &lt; q.fastq &gt; out.maf
+parallel-fastq &quot;lastal -Q1 db | last-split&quot; &lt; q.fastq &gt; out.maf
 </pre>
 <p>This requires GNU parallel to be installed
 (<a class="reference external" href="http://www.gnu.org/software/parallel/">http://www.gnu.org/software/parallel/</a>).</p>
@@ -558,7 +558,7 @@ middle of the query, we can do &quot;spliced&quot; alignment without considering
 splice signals or favouring cis-splices:</p>
 <pre class="literal-block">
 lastdb -uNEAR -R01 db genome.fasta
-lastal -Q1 -D100 db q.fastq | last-split -c0 &gt; out.maf
+lastal -Q1 db q.fastq | last-split -c0 &gt; out.maf
 </pre>
 </div>
 </div>

--- a/doc/last-split.txt
+++ b/doc/last-split.txt
@@ -21,7 +21,7 @@ format), and the genome is in "genome.fasta" (in fasta format).  We
 can do the alignment like this::

  lastdb -uNEAR -R01 db genome.fasta
-  lastal -Q1 -D100 db q.fastq | last-split > out.maf
+  lastal -Q1 db q.fastq | last-split > out.maf

 Spliced alignment of RNA reads to a genome
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -78,7 +78,7 @@ Going faster by parallelization

 For example, split alignment of DNA reads to a genome::

-  parallel-fastq "lastal -Q1 -D100 db | last-split" < q.fastq > out.maf
+  parallel-fastq "lastal -Q1 db | last-split" < q.fastq > out.maf

 This requires GNU parallel to be installed
 (http://www.gnu.org/software/parallel/).
@@ -173,7 +173,7 @@ middle of the query, we can do "spliced" alignment without considering
 splice signals or favouring cis-splices::

  lastdb -uNEAR -R01 db genome.fasta
-  lastal -Q1 -D100 db q.fastq | last-split -c0 > out.maf
+  lastal -Q1 db q.fastq | last-split -c0 > out.maf

 Options
 -------

--- a/doc/last-tutorial.html
+++ b/doc/last-tutorial.html
@@ -379,20 +379,28 @@ lastdb -p -cR01 protdb proteins.fa
 lastal -F15 protdb dnas.fa
 </pre>
 </div>
-<div class="section" id="example-4-find-short-protein-alignments">
-<h2>Example 4: Find short protein alignments</h2>
+<div class="section" id="example-4-find-high-similarity-and-short-protein-alignments">
+<h2>Example 4: Find high-similarity, and short, protein alignments</h2>
 <p>LAST uses a <a class="reference external" href="last-matrices.html">scoring scheme</a> to find
-similarities.  Some scoring schemes are good for long-and-weak
-similarities, others for short-and-strong similarities.  If we seek
-very short similarities, weak ones are hopeless (statistically
-insignificant), so we had better focus on strong ones.  The PAM30
-scoring scheme may work well:</p>
+similarities.  Some scoring schemes are tuned for weak similarities,
+others for strong similarities.  The PAM30 scoring scheme finds strong
+protein similarities:</p>
 <pre class="literal-block">
 lastdb -p -cR01 invdb invertebrate.fa
 lastal -pPAM30 invdb vertebrate.fa
 </pre>
-<p>(How short is &quot;very short&quot;?  It depends on the amount of sequence data
-we are searching, but perhaps roughly less than 40 amino acids.)</p>
+<p>This has two advantages:</p>
+<ul>
+<li><p class="first">It omits weak alignments, or alignment parts (occasionally a strong
+similarity is flanked by a weak similarity).</p>
+</li>
+<li><p class="first">It can find short similarities.  If we seek very short similarities,
+weak ones are hopeless (statistically insignificant), so we had
+better focus on strong ones.  (How short is &quot;very short&quot;?  It
+depends on the amount of sequence data we are searching, but perhaps
+roughly less than 40 amino acids.)</p>
+</li>
+</ul>
 </div>
 <div class="section" id="example-5-align-human-dna-sequences-to-the-human-genome">
 <h2>Example 5: Align human DNA sequences to the human genome</h2>
@@ -446,13 +454,14 @@ for smaller genomes).</p>
 <p>DNA sequences are not always perfectly accurate, and they are
 sometimes provided in fastq format, which indicates the reliability of
 each base.  LAST can use this information to improve alignment
-accuracy.  (It assumes the reliabilities reflect substitution errors,
-not insertion/deletion errors: if that is not true, it may be better
-to use fasta format.)  Option -Q1 indicates fastq-sanger format:</p>
+accuracy.  Option -Q1 indicates fastq-sanger format:</p>
 <pre class="literal-block">
 lastdb -uNEAR -R01 humandb human/chr*.fa
-lastal -Q1 -D100 humandb queries.fastq | last-split &gt; myalns.maf
+lastal -Q1 humandb queries.fastq | last-split &gt; myalns.maf
 </pre>
+<p><strong>Assumption:</strong> LAST assumes the reliabilities reflect substitution
+errors, not insertion/deletion errors.  If that is not true, you can
+tell it to ignore the reliability data with -Q0.</p>
 </div>
 <div class="section" id="fastq-format-confusion">
 <h2>Fastq format confusion</h2>
@@ -490,7 +499,19 @@ the two reads in a pair to match (e.g.) different chromosomes.</p>
 </div>
 <div class="section" id="example-9-compare-the-human-and-chimp-genomes">
 <h2>Example 9: Compare the human and chimp genomes</h2>
-<p>See <a class="reference external" href="https://github.com/mcfrith/last-genome-alignments">here</a>.</p>
+<p>See <a class="reference external" href="https://github.com/mcfrith/last-genome-alignments">here</a>.  But
+that recipe is <em>extremely</em> slow-and-accurate.  You can <a class="reference external" href="last-tuning.html">tune</a> it to compare huge, high-similarity genomes with
+moderate run time and memory use:</p>
+<ul>
+<li><p class="first">Omit the sensitivity-boosting lastal -m option.</p>
+</li>
+<li><p class="first">Add -W99 (or so) to the lastdb options.</p>
+</li>
+<li><p class="first">If the &quot;reference&quot; genome (the one given to lastdb) is &gt; 4 GB, it's
+probably more efficient to use lastdb8 and lastal8, instead of
+lastdb and lastal.</p>
+</li>
+</ul>
 </div>
 <div class="section" id="example-10-ambiguity-of-alignment-columns">
 <h2>Example 10: Ambiguity of alignment columns</h2>

--- a/doc/last-tutorial.txt
+++ b/doc/last-tutorial.txt
@@ -70,21 +70,27 @@ score penalty of 15 for frameshifts::
  lastdb -p -cR01 protdb proteins.fa
  lastal -F15 protdb dnas.fa

-Example 4: Find short protein alignments
----------------------------------------
+Example 4: Find high-similarity, and short, protein alignments
+--------------------------------------------------------------

 LAST uses a `scoring scheme <last-matrices.html>`_ to find
-similarities.  Some scoring schemes are good for long-and-weak
-similarities, others for short-and-strong similarities.  If we seek
-very short similarities, weak ones are hopeless (statistically
-insignificant), so we had better focus on strong ones.  The PAM30
-scoring scheme may work well::
+similarities.  Some scoring schemes are tuned for weak similarities,
+others for strong similarities.  The PAM30 scoring scheme finds strong
+protein similarities::

  lastdb -p -cR01 invdb invertebrate.fa
  lastal -pPAM30 invdb vertebrate.fa

-(How short is "very short"?  It depends on the amount of sequence data
-we are searching, but perhaps roughly less than 40 amino acids.)
+This has two advantages:
+
+* It omits weak alignments, or alignment parts (occasionally a strong
+  similarity is flanked by a weak similarity).
+
+* It can find short similarities.  If we seek very short similarities,
+  weak ones are hopeless (statistically insignificant), so we had
+  better focus on strong ones.  (How short is "very short"?  It
+  depends on the amount of sequence data we are searching, but perhaps
+  roughly less than 40 amino acids.)

 Example 5: Align human DNA sequences to the human genome
 --------------------------------------------------------
@@ -138,12 +144,14 @@ Example 7: Align human fastq sequences to the human genome
 DNA sequences are not always perfectly accurate, and they are
 sometimes provided in fastq format, which indicates the reliability of
 each base.  LAST can use this information to improve alignment
-accuracy.  (It assumes the reliabilities reflect substitution errors,
-not insertion/deletion errors: if that is not true, it may be better
-to use fasta format.)  Option -Q1 indicates fastq-sanger format::
+accuracy.  Option -Q1 indicates fastq-sanger format::

  lastdb -uNEAR -R01 humandb human/chr*.fa
-  lastal -Q1 -D100 humandb queries.fastq | last-split > myalns.maf
+  lastal -Q1 humandb queries.fastq | last-split > myalns.maf
+
+**Assumption:** LAST assumes the reliabilities reflect substitution
+errors, not insertion/deletion errors.  If that is not true, you can
+tell it to ignore the reliability data with -Q0.

 Fastq format confusion
 ----------------------
@@ -183,7 +191,18 @@ See `here <https://github.com/mcfrith/last-genome-alignments>`_.
 Example 9: Compare the human and chimp genomes
 ----------------------------------------------

-See `here <https://github.com/mcfrith/last-genome-alignments>`_.
+See `here <https://github.com/mcfrith/last-genome-alignments>`_.  But
+that recipe is *extremely* slow-and-accurate.  You can `tune
+<last-tuning.html>`_ it to compare huge, high-similarity genomes with
+moderate run time and memory use:
+
+* Omit the sensitivity-boosting lastal -m option.
+
+* Add -W99 (or so) to the lastdb options.
+
+* If the "reference" genome (the one given to lastdb) is > 4 GB, it's
+  probably more efficient to use lastdb8 and lastal8, instead of
+  lastdb and lastal.

 Example 10: Ambiguity of alignment columns
 ------------------------------------------

--- a/src/tantan.cc
+++ b/src/tantan.cc
@@ -20,9 +20,9 @@ void multiplyAll(std::vector<double> &v, double factor) {
 }

 double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) {
-  if (probMult < 1 || probMult > 1)
+  if (probMult < 1 || probMult > 1) {
    return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset));
-  else
+  }
  return 1.0 / maxRepeatOffset;
 }

@@ -64,6 +64,7 @@ struct Tantan {
  double b2fLast;  // background state to last foreground state

  double backgroundProb;
+  std::vector<double> b2fProbs;  // background state to each foreground state
  std::vector<double> foregroundProbs;
  std::vector<double> insertionProbs;

@@ -99,7 +100,7 @@ struct Tantan {
    //f2g = firstGapProb;
    //g2f = 1 - otherGapProb;
    oneGapProb = firstGapProb * (1 - otherGapProb);
-    endGapProb = firstGapProb * 1;
+    endGapProb = firstGapProb * (maxRepeatOffset > 1);
    f2f0 = 1 - repeatEndProb;
    f2f1 = 1 - repeatEndProb - firstGapProb;
    f2f2 = 1 - repeatEndProb - firstGapProb * 2;
@@ -110,9 +111,16 @@ struct Tantan {
    b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset);
    b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset);

+    b2fProbs.resize(maxRepeatOffset);
    foregroundProbs.resize(maxRepeatOffset);
    insertionProbs.resize(maxRepeatOffset - 1);

+    double p = b2fFirst;
+    for (int i = 0; i < maxRepeatOffset; ++i) {
+      b2fProbs[i] = p;
+      p *= b2fDecay;
+    }
+
    scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize);
  }

@@ -125,8 +133,7 @@ struct Tantan {
  double forwardTotal() {
    double fromForeground = std::accumulate(foregroundProbs.begin(),
                                            foregroundProbs.end(), 0.0);
-    fromForeground *= f2b;
-    double total = backgroundProb * b2b + fromForeground;
+    double total = backgroundProb * b2b + fromForeground * f2b;
    assert(total > 0);
    return total;
  }
@@ -148,9 +155,6 @@ struct Tantan {
    double f = *foregroundPtr;
    double fromForeground = f;

-    if (insertionProbs.empty()) {
-      *foregroundPtr = fromBackground + f * f2f0;
-    } else {
    double *insertionPtr = &insertionProbs.back();
    double i = *insertionPtr;
    *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb;
@@ -174,10 +178,8 @@ struct Tantan {
    fromForeground += f;
    *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb;
    *insertionPtr = f;
-    }

-    fromForeground *= f2b;
-    backgroundProb = backgroundProb * b2b + fromForeground;
+    backgroundProb = backgroundProb * b2b + fromForeground * f2b;
  }

  void calcBackwardTransitionProbsWithGaps() {
@@ -186,9 +188,6 @@ struct Tantan {
    double f = *foregroundPtr;
    double toForeground = f;

-    if (insertionProbs.empty()) {
-      *foregroundPtr = toBackground + f2f0 * f;
-    } else {
    double *insertionPtr = &insertionProbs.front();
    double i = *insertionPtr;
    *foregroundPtr = toBackground + f2f1 * f + i;
@@ -213,30 +212,24 @@ struct Tantan {
    toForeground += f;
    *foregroundPtr = toBackground + f2f1 * f + d;
    *insertionPtr = endGapProb * f;
-    }

-    toForeground *= b2fLast;
-    backgroundProb = b2b * backgroundProb + toForeground;
+    backgroundProb = b2b * backgroundProb + b2fLast * toForeground;
  }

  void calcForwardTransitionProbs() {
    if (endGapProb > 0) return calcForwardTransitionProbsWithGaps();

-    double fromBackground = backgroundProb * b2fLast;
+    double b = backgroundProb;
    double fromForeground = 0;
-    double *foregroundPtr = END(foregroundProbs);
    double *foregroundBeg = BEG(foregroundProbs);

-    while (foregroundPtr > foregroundBeg) {
-      --foregroundPtr;
-      double f = *foregroundPtr;
+    for (int i = 0; i < maxRepeatOffset; ++i) {
+      double f = foregroundBeg[i];
      fromForeground += f;
-      *foregroundPtr = fromBackground + f * f2f0;
-      fromBackground *= b2fGrowth;
+      foregroundBeg[i] = b * b2fProbs[i] + f * f2f0;
    }

-    fromForeground *= f2b;
-    backgroundProb = backgroundProb * b2b + fromForeground;
+    backgroundProb = b * b2b + fromForeground * f2b;
  }

  void calcBackwardTransitionProbs() {
@@ -244,18 +237,14 @@ struct Tantan {

    double toBackground = f2b * backgroundProb;
    double toForeground = 0;
-    double *foregroundPtr = BEG(foregroundProbs);
-    double *foregroundEnd = END(foregroundProbs);
+    double *foregroundBeg = BEG(foregroundProbs);

-    while (foregroundPtr < foregroundEnd) {
-      toForeground *= b2fGrowth;
-      double f = *foregroundPtr;
-      toForeground += f;
-      *foregroundPtr = toBackground + f2f0 * f;
-      ++foregroundPtr;
+    for (int i = 0; i < maxRepeatOffset; ++i) {
+      double f = foregroundBeg[i];
+      toForeground += b2fProbs[i] * f;
+      foregroundBeg[i] = toBackground + f2f0 * f;
    }

-    toForeground *= b2fLast;
    backgroundProb = b2b * backgroundProb + toForeground;
  }

@@ -281,12 +270,17 @@ struct Tantan {
    }
  }

-  void calcEmissionProbs() {
-    const double *lrRow = likelihoodRatioMatrix[*seqPtr];
+  bool isNearSeqBeg() {
+    return seqPtr - seqBeg < maxRepeatOffset;
+  }

-    bool isNearSeqBeg = (seqPtr - seqBeg < maxRepeatOffset);
-    const uchar *seqStop = isNearSeqBeg ? seqBeg : seqPtr - maxRepeatOffset;
+  const uchar *seqFurthestBack() {
+    return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset;
+  }

+  void calcEmissionProbs() {
+    const double *lrRow = likelihoodRatioMatrix[*seqPtr];
+    const uchar *seqStop = seqFurthestBack();
    double *foregroundPtr = BEG(foregroundProbs);
    const uchar *offsetPtr = seqPtr;


--- a/src/version.hh
+++ b/src/version.hh
-"961"
+"963"