Skip to content
Commits on Source (12)
/*******************************************************************************************
*
* Using overlap pile for each read compute estimated coverage of the underlying genome
* generating a .covr track containing a histogram of the coverage of every unmasked
* trace-point tile.
*
* Author: Gene Myers
* Date : January 2018
*
*******************************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "DB.h"
#include "align.h"
#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif
#undef COVER_DEBUG
static char *Usage = "[-v] [-H<int>] [-m<track>]+ <source:db> <overlaps:las> ...";
#define MAX_COVER 1000
static int VERBOSE;
static int HGAP_MIN; // Under this length do not process for HGAP
typedef struct
{ char *name;
int64 *idx;
int *data;
} Mask;
static int NUM_MASK;
static Mask *MASKS;
static int64 Cov_Hist[MAX_COVER+1]; // [0..MAX_COVER] counts of trace-interval coverages
static char *Cov_Name = "Coverage Histogram";
static char *Hgap_Name = "Hgap threshold";
static int TRACE_SPACING; // Trace spacing (from .las file)
static int TBYTES; // Bytes per trace segment (from .las file)
static DAZZ_DB _DB, *DB = &_DB; // Data base
static int DB_FIRST; // First read of DB to process
static int DB_LAST; // Last read of DB to process (+1)
static int DB_PART; // 0 if all, otherwise block #
static DAZZ_READ *Reads; // Data base reads array
static FILE *CV_AFILE; // .covr.anno
static char *CV_ANAME; // ".covr.anno"
// Statistics
static int64 nreads, totlen;
// For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING
static void HISTOGRAM_COVER(int aread, Overlap *ovls, int novl)
{ static int nmax = 0;
static int *local;
static int *cover;
static int *maskd;
int alen, atick;
int bread, cssr;
int t2, t1;
int i, j, a, e;
alen = DB->reads[aread].rlen;
atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING;
if (alen < HGAP_MIN)
return;
#if defined(COVER_DEBUG)
printf("AREAD %d",aread);
if (novl == 0)
printf(" EMPTY");
printf("\n");
#endif
// COVERAGE
// Allocate or expand data structures for cover calculation as needed
if (nmax == 0)
{ nmax = (DB->maxlen + (TRACE_SPACING-1))/TRACE_SPACING;
local = (int *) Malloc(nmax*sizeof(int),"Allocating bread cover");
cover = (int *) Malloc(nmax*sizeof(int),"Allocating aread cover");
maskd = (int *) Malloc(nmax*sizeof(int),"Allocating aread cover");
if (local == NULL || cover == NULL || maskd == NULL)
exit (1);
for (i = nmax-1; i >= 0; i--)
local[i] = cover[i] = maskd[i] = 0;
}
// For every segment, fill histogram of match diffs for every one of the
// atick intervals, building separate histograms, hist & cist, for forward
// and reverse B-hits
t2 = TRACE_SPACING/2;
t1 = t2-1;
for (i = 0; i < novl; i = j)
{ bread = ovls[i].bread;
for (j = bread+1; j < DB_LAST; j++)
if ((Reads[j].flags & DB_CSS) == 0)
break;
cssr = j;
for (j = i; j < novl; j++)
if (ovls[j].bread < cssr)
{ e = (ovls[j].path.aepos + t2) / TRACE_SPACING;
for (a = (ovls[j].path.abpos + t1) / TRACE_SPACING; a < e; a++)
local[a] = 1;
}
else
break;
for (j = i; j < novl; j++)
if (ovls[j].bread < cssr)
{ e = (ovls[j].path.aepos + t2) / TRACE_SPACING;
for (a = (ovls[j].path.abpos + t1) / TRACE_SPACING; a < e; a++)
if (local[a] > 0)
{ local[a] = 0;
cover[a] += 1;
}
}
else
break;
}
for (i = 0; i < NUM_MASK; i++)
{ Mask *m = (Mask *) (MASKS+i);
int k, u, e;
#ifdef COVER_DEBUG
printf(" %s:",m->name);
#endif
for (k = m->idx[aread]; k < m->idx[aread+1]; k += 2)
{ e = m->data[k+1];
#ifdef COVER_DEBUG
printf(" [%d..%d]",m->data[k],e);
#endif
e = (e + (TRACE_SPACING-1))/TRACE_SPACING;
for (u = m->data[k]/TRACE_SPACING; u < e; u++)
maskd[u] = 1;
}
#ifdef COVER_DEBUG
printf("\n");
#endif
}
#ifdef COVER_DEBUG
printf("Mask: ");
for (a = 0; a < atick; a++)
printf("%d",maskd[a]);
printf("\n");
#endif
for (a = 0; a < atick; a++)
{ if (maskd[a] == 0)
{ e = cover[a];
if (e >= MAX_COVER)
Cov_Hist[MAX_COVER] += 1;
else
Cov_Hist[e] += 1;
}
cover[a] = 0;
}
for (i = 0; i < NUM_MASK; i++)
{ Mask *m = (Mask *) (MASKS+i);
int k, u, e;
for (k = m->idx[aread]; k < m->idx[aread+1]; k += 2)
{ e = m->data[k+1];
e = (e + (TRACE_SPACING-1))/TRACE_SPACING;
for (u = m->data[k]/TRACE_SPACING; u < e; u++)
maskd[u] = 0;
}
}
if (VERBOSE)
{ nreads += 1;
totlen += alen;
}
}
// Read in each successive pile and call ACTION on it. Read in the traces only if
// "trace" is nonzero
static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace)
{ static Overlap *ovls = NULL;
static int omax = 500;
static uint16 *paths = NULL;
static int pmax = 100000;
int64 i, j, novl;
int n, a;
int pcur;
int max;
if (ovls == NULL)
{ ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer");
if (ovls == NULL)
exit (1);
}
if (trace && paths == NULL)
{ paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer");
if (paths == NULL)
exit (1);
}
rewind(input);
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
if (TRACE_SPACING <= TRACE_XOVR)
TBYTES = sizeof(uint8);
else
TBYTES = sizeof(uint16);
if (Read_Overlap(input,ovls) != 0)
ovls[0].aread = INT32_MAX;
else if (trace)
{ if (ovls[0].path.tlen > pmax)
{ pmax = 1.2*(ovls[0].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
if (paths == NULL) exit (1);
}
fread(paths,TBYTES,ovls[0].path.tlen,input);
if (TBYTES == 1)
{ ovls[0].path.trace = paths;
Decompress_TraceTo16(ovls);
}
}
else
fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR);
if (ovls[0].aread < DB_FIRST)
{ fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
Prog_Name,DB_PART);
exit (1);
}
pcur = 0;
n = max = 0;
for (j = DB_FIRST; j < DB_LAST; j++)
{ ovls[0] = ovls[n];
a = ovls[0].aread;
if (a != j)
n = 0;
else
{ if (trace)
memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
n = 1;
pcur = ovls[0].path.tlen;
while (1)
{ if (Read_Overlap(input,ovls+n) != 0)
{ ovls[n].aread = INT32_MAX;
break;
}
if (trace)
{ if (pcur + ovls[n].path.tlen > pmax)
{ pmax = 1.2*(pcur+ovls[n].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
if (paths == NULL) exit (1);
}
fread(paths+pcur,TBYTES,ovls[n].path.tlen,input);
if (TBYTES == 1)
{ ovls[n].path.trace = paths+pcur;
Decompress_TraceTo16(ovls+n);
}
}
else
fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR);
if (ovls[n].aread != a)
break;
pcur += ovls[n].path.tlen;
n += 1;
if (n >= omax)
{ omax = 1.2*n + 100;
ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer");
if (ovls == NULL) exit (1);
}
}
if (n >= max)
max = n;
pcur = 0;
for (i = 0; i < n; i++)
{ ovls[i].path.trace = paths+pcur;
pcur += ovls[i].path.tlen;
}
}
ACTION(j,ovls,n);
}
if (ovls[n].aread < INT32_MAX)
{ fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
Prog_Name,DB_PART);
exit (1);
}
return (max);
}
int main(int argc, char *argv[])
{ char *root, *dpwd;
int64 novl, hgap64;
int c;
DAZZ_EXTRA ex_covr, ex_hgap;
// Process arguments
{ int i, j, k;
int flags[128];
char *eptr;
int mmax;
ARG_INIT("DAScover")
HGAP_MIN = 0;
NUM_MASK = 0;
mmax = 10;
MASKS = (Mask *) Malloc(mmax*sizeof(Mask),"Allocating mask array");
if (MASKS == NULL)
exit (1);
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
ARG_FLAGS("v")
break;
case 'm':
if (NUM_MASK >= mmax)
{ mmax = 1.2*NUM_MASK + 10;
MASKS = (Mask *) Realloc(MASKS,mmax*sizeof(Mask),"Reallocating mask array");
if (MASKS == NULL)
exit (1);
}
MASKS[NUM_MASK++].name = argv[i]+2;
break;
case 'H':
ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)")
break;
}
else
argv[j++] = argv[i];
argc = j;
VERBOSE = flags['v'];
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -H: HGAP minimum length threshold.\n");
fprintf(stderr," -m: repeat masks, stats not collected over these intervals\n");
exit (1);
}
}
// Open trimmed DB and any mask tracks
{ DAZZ_TRACK *track;
int64 *anno;
int status, kind;
int i, j, k;
status = Open_DB(argv[1],DB);
if (status < 0)
exit (1);
if (status == 1)
{ fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
exit (1);
}
if (DB->part)
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
exit (1);
}
Trim_DB(DB);
Reads = DB->reads;
k = 0;
for (i = 0; i < NUM_MASK; i++)
{ status = Check_Track(DB,MASKS[i].name,&kind);
switch (status)
{ case 0:
fprintf(stderr,"%s: [WARNING] %s track is for the *un*trimmed DB?\n",
Prog_Name,MASKS[i].name);
continue;
case -1:
fprintf(stderr,"%s: [WARNING] %s track size not correct for trimmed DB.\n",
Prog_Name,MASKS[i].name);
continue;
case -2:
fprintf(stderr,"%s: [WARNING] -m%s option given but no track found.\n",
Prog_Name,MASKS[i].name);
continue;
default:
if (kind != MASK_TRACK)
{ fprintf(stderr,"%s: [WARNING] %s track is not a mask track.\n",
Prog_Name,MASKS[i].name);
continue;
}
break;
}
track = Load_Track(DB,MASKS[i].name);
anno = (int64 *) (track->anno);
for (j = 0; j <= DB->nreads; j++)
anno[j] /= sizeof(int);
MASKS[k].name = MASKS[i].name;
MASKS[k].idx = anno;
MASKS[k].data = (int *) (track->data);
k += 1;
}
NUM_MASK = k;
}
// Determine if overlap block is being processed and if so get first and last read
// from .db file
ex_covr.vtype = DB_INT;
ex_covr.nelem = MAX_COVER+1;
ex_covr.accum = DB_SUM;
ex_covr.name = Cov_Name;
ex_covr.value = Cov_Hist;
ex_hgap.vtype = DB_INT;
ex_hgap.nelem = 1;
ex_hgap.accum = DB_EXACT;
ex_hgap.name = Hgap_Name;
hgap64 = HGAP_MIN;
ex_hgap.value = &hgap64;
dpwd = PathTo(argv[1]);
root = Root(argv[1],".db");
for (c = 2; c < argc; c++)
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
{ FILE *dbfile;
char buffer[2*MAX_NAME+100];
char *p, *eptr;
int i, part, nfiles, nblocks, cutoff, all, oindx;
int64 size;
p = rindex(Block_Arg_Root(parse),'.');
if (p != NULL)
{ part = strtol(p+1,&eptr,10);
if (*eptr == '\0' && eptr != p+1)
{ dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r");
if (dbfile == NULL)
exit (1);
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
SYSTEM_READ_ERROR
for (i = 0; i < nfiles; i++)
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
SYSTEM_READ_ERROR
for (i = 1; i <= part; i++)
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
}
}
}
// Set up cover extra's track
if (DB_PART > 0)
CV_ANAME = Strdup(Catenate(dpwd,PATHSEP,root,
Numbered_Suffix(".",DB_PART,".covr.anno")),"Allocating cover anno name");
else
CV_ANAME = Strdup(Catenate(dpwd,PATHSEP,root,".covr.anno"),"Allocating cover anno name");
CV_AFILE = Fopen(CV_ANAME,"w");
if (CV_ANAME == NULL || CV_AFILE == NULL)
exit (1);
{ int size, length;
length = 0;
size = 1;
fwrite(&length,sizeof(int),1,CV_AFILE);
fwrite(&size,sizeof(int),1,CV_AFILE);
}
// Get trace point spacing information
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
// Initialize statistics gathering
if (VERBOSE)
{ nreads = 0;
totlen = 0;
printf("\nDAScover %s %s\n",argv[1],argv[c]);
}
{ int i;
for (i = 0; i <= MAX_COVER; i++)
Cov_Hist[i] = 0;
}
// Process each read pile
make_a_pass(input,HISTOGRAM_COVER,0);
// If verbose output statistics summary to stdout
if (VERBOSE)
{ int i, cover;
int64 ssum, stotal;
printf("\nInput: ");
Print_Number(nreads,7,stdout);
printf("reads, ");
Print_Number(totlen,12,stdout);
printf(" bases");
if (HGAP_MIN > 0)
{ printf(" (another ");
Print_Number((DB_LAST-DB_FIRST) - nreads,0,stdout);
printf(" were < H-length)");
}
printf("\n");
stotal = 0;
for (i = 0; i <= MAX_COVER; i++)
stotal += Cov_Hist[i];
printf("\nCoverage Histogram\n\n");
ssum = Cov_Hist[MAX_COVER];
if (ssum > 0)
printf(" %4d: %9lld %5.1f%%\n\n",
MAX_COVER,Cov_Hist[MAX_COVER],(100.*ssum)/stotal);
stotal -= ssum;
ssum = 0;
for (i = MAX_COVER-1; i >= 0; i--)
if (Cov_Hist[i] > 0)
{ ssum += Cov_Hist[i];
printf(" %4d: %9lld %5.1f%%\n",
i,Cov_Hist[i],(100.*ssum)/stotal);
}
i = 0;
while (Cov_Hist[i+1] < Cov_Hist[i])
i += 1;
for (cover = i++; i < MAX_COVER; i++)
if (Cov_Hist[cover] < Cov_Hist[i])
cover = i;
printf("\n Coverage is estimated at %d\n\n",cover);
}
// Output coverage histogram
Write_Extra(CV_AFILE,&ex_covr);
Write_Extra(CV_AFILE,&ex_hgap);
fclose(CV_AFILE);
free(CV_ANAME);
fclose(input);
}
Free_Block_Arg(parse);
}
free(dpwd);
free(root);
Close_DB(DB);
free(Prog_Name);
exit (0);
}
......@@ -75,8 +75,6 @@ static char *Usage = "[-v] [-x<int>] <source:db> <target:db>";
#define LOWQ 0 // Gap is spanned by many LAs and patchable
#define SPAN 1 // Gap has many paired LAs and patchable
#define SPLIT 2 // Gap is a chimer or an unpatchable gap
#define ADPRE 3 // Gap is an adatper break and prefix should be removed
#define ADSUF 4 // Gap is an adatper break and suffix should be removed
// Global Variables (must exist across the processing of each pile)
......@@ -205,9 +203,6 @@ int main(int argc, char *argv[])
char *pwd2, *root2;
int VERBOSE;
int CUTOFF;
int GOOD_QV;
int BAD_QV;
int COVERAGE;
int HGAP_MIN;
int nfiles; // contents of source .db file
......@@ -253,6 +248,9 @@ int main(int argc, char *argv[])
if (argc != 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -x: minimum length for edited reads\n");
exit (1);
}
}
......@@ -374,64 +372,35 @@ int main(int argc, char *argv[])
track = Load_Track(DB,"trim");
if (track != NULL)
{ FILE *afile;
int size, tracklen, extra;
char *aname;
int extra, tracklen, size;
DAZZ_EXTRA ex_hgap;
TRIM_IDX = (int64 *) track->anno;
TRIM = (int *) track->data;
for (i = 0; i <= nreads; i++)
TRIM_IDX[i] /= sizeof(int);
// if newer .trim tracks with -g, -b, -c, -H meta data, grab it
// Get HGAP minimum from .trim extras
aname = Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
fread(&tracklen,sizeof(int),1,afile);
fread(&size,sizeof(int),1,afile);
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
fseeko(afile,-extra,SEEK_END);
if (extra == 4*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
fread(&COVERAGE,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
}
else if (extra == 3*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
fread(&COVERAGE,sizeof(int),1,afile);
HGAP_MIN = 0;
}
else if (extra == 2*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
COVERAGE = -1;
HGAP_MIN = 0;
}
else
{ fprintf(stderr,"%s: trim annotation is out of date, rerun DAStrim\n",Prog_Name);
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
fclose(afile);
{ int a, t, x;
int tb, te;
x = 0;
for (a = 0; a < DB->nreads; a++)
{ tb = TRIM_IDX[a];
te = TRIM_IDX[a+1];
if (tb+2 < te)
{ if (TRIM[tb+2] == ADPRE)
tb += 3;
if (TRIM[te-3] == ADSUF)
te -= 3;
}
TRIM_IDX[a] = x;
for (t = tb; t < te; t++)
TRIM[x++] = TRIM[t];
}
TRIM_IDX[DB->nreads] = x;
}
HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0];
}
else
{ fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
......
......@@ -106,6 +106,8 @@ int main(int argc, char *argv[])
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -p: Pretty print (vs easy to parse).\n");
exit (1);
}
}
......
......@@ -13,7 +13,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "DB.h"
#include "align.h"
......@@ -42,8 +41,6 @@ static int VERBOSE;
#define LOWQ 0 // Gap is spanned by many LAs and patchable
#define SPAN 1 // Gap has many paired LAs and patchable
#define SPLIT 2 // Gap is a chimer or an unpatchable gap
#define ADPRE 3 // Gap is an adatper break and prefix should be removed
#define ADSUF 4 // Gap is an adatper break and suffix should be removed
#define NOPAT 3 // Gap could not be patched (internal only)
#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
......@@ -635,9 +632,7 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
}
int main(int argc, char *argv[])
{ FILE *input;
char *root, *dpwd;
char *las, *lpwd;
{ char *root, *dpwd;
int64 novl;
DAZZ_TRACK *track;
int c;
......@@ -665,6 +660,8 @@ int main(int argc, char *argv[])
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
exit (1);
}
}
......@@ -699,57 +696,52 @@ int main(int argc, char *argv[])
track = Load_Track(DB,"trim");
if (track != NULL)
{ FILE *afile;
int size, tracklen, extra;
char *aname;
int extra, tracklen, size;
DAZZ_EXTRA ex_hgap, ex_cest, ex_good, ex_bad;
TRIM_IDX = (int64 *) track->anno;
TRIM = (int *) track->data;
for (i = 0; i <= DB->nreads; i++)
TRIM_IDX[i] /= sizeof(int);
// if newer .trim tracks with -g, -b, -c, -H meta data, grab it
// Get HGAP minimum, and good and bad qv thresholds from .trim extras
aname = Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
fread(&tracklen,sizeof(int),1,afile);
fread(&size,sizeof(int),1,afile);
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
fseeko(afile,-extra,SEEK_END);
if (extra == 4*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
else if (extra == 2*sizeof(int) || extra == 3*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
HGAP_MIN = 0;
ex_cest.nelem = 0;
if (Read_Extra(afile,aname,&ex_cest) != 0)
{ fprintf(stderr,"%s: Coverage estimate extra missing from .trim track?\n",Prog_Name);
exit (1);
}
else
{ fprintf(stderr,"%s: trim annotation is out of date, rerun DAStrim\n",Prog_Name);
ex_good.nelem = 0;
if (Read_Extra(afile,aname,&ex_good) != 0)
{ fprintf(stderr,"%s: Good QV threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
ex_bad.nelem = 0;
if (Read_Extra(afile,aname,&ex_bad) != 0)
{ fprintf(stderr,"%s: Bad QV threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
fclose(afile);
{ int a, t, x;
int tb, te;
x = 0;
for (a = 0; a < DB->nreads; a++)
{ tb = TRIM_IDX[a];
te = TRIM_IDX[a+1];
if (tb+2 < te)
{ if (TRIM[tb+2] == ADPRE)
tb += 3;
if (TRIM[te-3] == ADSUF)
te -= 3;
}
TRIM_IDX[a] = x;
for (t = tb; t < te; t++)
TRIM[x++] = TRIM[t];
}
TRIM_IDX[DB->nreads] = x;
}
HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0];
GOOD_QV = (int) ((int64 *) (ex_good.value))[0];
BAD_QV = (int) ((int64 *) (ex_bad.value))[0];
}
else
{ fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
......@@ -757,25 +749,21 @@ int main(int argc, char *argv[])
}
}
// Initialize statistics gathering
if (VERBOSE)
{ npatch = 0;
fpatch = 0;
printf("\nDASpatch -g%d -b%d %s",GOOD_QV,BAD_QV,argv[1]);
for (c = 2; c < argc; c++)
printf(" %s",argv[c]);
printf("\n");
}
// For each .las block/file
dpwd = PathTo(argv[1]);
root = Root(argv[1],".db");
for (c = 2; c < argc; c++)
{ las = Root(argv[c],".las");
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
// Determine if a .las block is being processed and if so get first and last read
// from .db file
......@@ -786,11 +774,7 @@ int main(int argc, char *argv[])
int i, part, nfiles, nblocks, cutoff, all, oindx;
int64 size;
DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
p = rindex(las,'.');
p = rindex(Block_Arg_Root(parse),'.');
if (p != NULL)
{ part = strtol(p+1,&eptr,10);
if (*eptr == '\0' && eptr != p+1)
......@@ -813,12 +797,11 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
}
}
}
// Set up QV trimming track
// Set up patch track
{ int len, size;
......@@ -843,40 +826,39 @@ int main(int argc, char *argv[])
fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE);
}
// Open overlap file
lpwd = PathTo(argv[c]);
if (DB_PART)
input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
else
input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
if (input == NULL)
exit (1);
free(lpwd);
free(las);
// Get trace point spacing information
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING;
make_a_pass(input,PATCH_GAPS,1);
// Clean up
// Initialize statistics gathering
fclose(PR_AFILE);
fclose(PR_DFILE);
if (VERBOSE)
{ npatch = 0;
fpatch = 0;
printf("\nDASpatch -g%d -b%d %s %s\n",GOOD_QV,BAD_QV,argv[1],argv[c]);
}
// Process each read pile
make_a_pass(input,PATCH_GAPS,1);
// If verbose output statistics summary to stdout
if (VERBOSE)
{ if (fpatch == 0)
printf("%s: All %d patches were successful\n",Prog_Name,npatch);
printf(" All %d patches were successful\n",npatch);
else
printf("%s: %d out of %d total patches failed\n",Prog_Name,fpatch,npatch);
printf(" %d out of %d total patches failed\n",fpatch,npatch);
}
fclose(PR_AFILE);
fclose(PR_DFILE);
fclose(input);
}
Free_Block_Arg(parse);
}
free(dpwd);
......
......@@ -23,7 +23,7 @@
#undef QV_DEBUG
static char *Usage = "[-v] [-H<int>] -c<int> <source:db> <overlaps:las> ...";
static char *Usage = "[-v] [-c<int>] <source:db> <overlaps:las> ...";
#define MAXQV 50 // Max QV score is 50
#define MAXQV1 51
......@@ -32,6 +32,7 @@ static char *Usage = "[-v] [-H<int>] -c<int> <source:db> <overlaps:las> ...";
#define PARTIAL .20 // Partial terminal segments covering this percentage are scored
static int VERBOSE;
static int COVERAGE; // Estimated coverage of genome
static int QV_DEEP; // # of best diffs to average for QV score
static int HGAP_MIN; // Under this length do not process for HGAP
......@@ -52,6 +53,7 @@ static int64 QV_INDEX; // Current index into .qual.data file
static int64 nreads, totlen;
static int64 qgram[MAXQV1], sgram[MAXQV1];
// For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING
static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
......@@ -202,7 +204,6 @@ static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
}
#endif
if (VERBOSE)
for (v = 0; v <= MAXQV; v++)
sgram[v] += tick[v] + cick[v];
......@@ -259,14 +260,12 @@ static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
#endif
}
// Accumulate qv histogram (if VERBOSE) and append qv's to .qual file
// Accumulate qv histogram and append qv's to .qual file
if (VERBOSE)
{ for (i = 0; i < atick; i++)
for (i = 0; i < atick; i++)
qgram[qvec[i]] += 1;
nreads += 1;
totlen += alen;
}
fwrite(qvec,sizeof(uint8),atick,QV_DFILE);
QV_INDEX += atick;
......@@ -392,11 +391,17 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
}
int main(int argc, char *argv[])
{ FILE *input;
char *root, *dpwd;
char *las, *lpwd;
{ char *root, *dpwd;
int64 novl;
int c, COVERAGE;
int c;
DAZZ_EXTRA ex_hgap, ex_covr;
DAZZ_EXTRA ex_cest, ex_qvs, ex_dif;
char *cest_name = "Coverage Estimate";
char *qvs_name = "Histogram of QVs";
char *dif_name = "Histogram of Tile Differences";
int64 cover64;
// Process arguments
......@@ -407,7 +412,6 @@ int main(int argc, char *argv[])
ARG_INIT("DASqv")
COVERAGE = -1;
HGAP_MIN = 0;
j = 1;
for (i = 1; i < argc; i++)
......@@ -419,9 +423,6 @@ int main(int argc, char *argv[])
case 'c':
ARG_POSITIVE(COVERAGE,"Voting depth")
break;
case 'H':
ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)")
break;
}
else
argv[j++] = argv[i];
......@@ -431,25 +432,11 @@ int main(int argc, char *argv[])
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -c: Use this as the average coverage (not DAScover estimate).\n");
exit (1);
}
if (COVERAGE < 0)
{ fprintf(stderr,"%s: Must supply -c parameter\n",Prog_Name);
exit (1);
}
else
{ if (COVERAGE >= 40)
QV_DEEP = COVERAGE/8;
else if (COVERAGE >= 20)
QV_DEEP = 5;
else if (COVERAGE >= 4)
QV_DEEP = COVERAGE/4;
else
{ fprintf(stderr,"%s: Average coverage is too low (< 4X), cannot infer qv's\n",Prog_Name);
exit (1);
}
}
}
// Open trimmed DB
......@@ -470,30 +457,103 @@ int main(int argc, char *argv[])
Trim_DB(DB);
}
// Initialize statistics gathering
// Get .covr track information
if (VERBOSE)
{ FILE *afile;
char *aname;
int extra, cmax;
int64 *cgram;
aname = Strdup(Catenate(DB->path,".","covr",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
{ fprintf(stderr,"%s: Must have a 'covr' track, run DAScover\n",Prog_Name);
exit (1);
}
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - sizeof(int)*2;
fseeko(afile,-extra,SEEK_END);
ex_covr.nelem = 0;
if (Read_Extra(afile,aname,&ex_covr) != 0)
{ fprintf(stderr,"%s: Histogram extra missing from .covr track?\n",Prog_Name);
exit (1);
}
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .covr track?\n",Prog_Name);
exit (1);
}
fclose(afile);
HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0];
cgram = (int64 *) (ex_covr.value);
cmax = ex_covr.nelem - 1;
if (COVERAGE < 0)
{ int i;
nreads = 0;
totlen = 0;
for (i = 0; i <= MAXQV; i++)
qgram[i] = sgram[i] = 0;
i = 0;
while (cgram[i+1] < cgram[i])
i += 1;
for (COVERAGE = i++; i < cmax; i++)
if (cgram[COVERAGE] < cgram[i])
COVERAGE = i;
}
printf("\nDASqv -c%d %s",COVERAGE,argv[1]);
for (i = 2; i < argc; i++)
printf(" %s",argv[i]);
printf("\n");
if (COVERAGE >= 40)
QV_DEEP = COVERAGE/8;
else if (COVERAGE >= 20)
QV_DEEP = 5;
else if (COVERAGE >= 4)
QV_DEEP = COVERAGE/4;
else
{ fprintf(stderr,"%s: Average coverage is too low (< 4X), cannot infer qv's\n",Prog_Name);
exit (1);
}
}
// Determine if overlap block is being processed and if so get first and last read
// from .db file
// Setup extras
ex_cest.vtype = DB_INT; // Estimated coverage (same for every .las)
ex_cest.nelem = 1;
ex_cest.accum = DB_EXACT;
ex_cest.name = cest_name;
cover64 = COVERAGE;
ex_cest.value = &cover64;
ex_qvs.vtype = DB_INT; // Histogram of MAXQV1 trace-point diff counts
ex_qvs.nelem = MAXQV1;
ex_qvs.accum = DB_SUM;
ex_qvs.name = qvs_name;
ex_qvs.value = &qgram;
ex_dif.vtype = DB_INT; // Histogram of MAXQV1 intrinisic qv counts
ex_dif.nelem = MAXQV1;
ex_dif.accum = DB_SUM;
ex_dif.name = dif_name;
ex_dif.value = &sgram;
// For each .las file do
dpwd = PathTo(argv[1]);
root = Root(argv[1],".db");
for (c = 2; c < argc; c++)
{ las = Root(argv[c],".las");
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
// Determine if overlap block is being processed and if so get first and last read
// from .db file
{ FILE *dbfile;
char buffer[2*MAX_NAME+100];
......@@ -501,11 +561,7 @@ int main(int argc, char *argv[])
int i, part, nfiles, nblocks, cutoff, all, oindx;
int64 size;
DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
p = rindex(las,'.');
p = rindex(Block_Arg_Root(parse),'.');
if (p != NULL)
{ part = strtol(p+1,&eptr,10);
if (*eptr == '\0' && eptr != p+1)
......@@ -528,7 +584,6 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
}
}
}
......@@ -558,34 +613,43 @@ int main(int argc, char *argv[])
fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE);
}
// Open overlap file
lpwd = PathTo(argv[c]);
if (DB_PART > 0)
input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
else
input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
if (input == NULL)
exit (1);
free(lpwd);
free(las);
// Get trace point spacing information
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
// Initialize statistics gathering
{ int i;
nreads = 0;
totlen = 0;
for (i = 0; i <= MAXQV; i++)
qgram[i] = sgram[i] = 0;
}
if (VERBOSE)
{ printf("\n\nDASqv");
if (HGAP_MIN > 0)
printf(" -H%d",HGAP_MIN);
printf(" -c%d %s %s\n\n",COVERAGE,argv[1],argv[c]);
fflush(stdout);
}
// Process each read pile
make_a_pass(input,CALCULATE_QVS,1);
fwrite(&COVERAGE,sizeof(int),1,QV_AFILE);
fwrite(&HGAP_MIN,sizeof(int),1,QV_AFILE);
// Write out extras and close .qual track
Write_Extra(QV_AFILE,&ex_hgap);
Write_Extra(QV_AFILE,&ex_cest);
Write_Extra(QV_AFILE,&ex_qvs);
Write_Extra(QV_AFILE,&ex_dif);
fclose(QV_AFILE);
fclose(QV_DFILE);
}
fclose(input);
// If verbose output statistics summary to stdout
......@@ -639,6 +703,10 @@ int main(int argc, char *argv[])
printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval);
}
}
Free_Block_Arg(parse);
}
// Clean up
......
......@@ -1011,6 +1011,9 @@ int main(int argc, char *argv[])
if (argc != 5)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -l: minimum length alignment length.\n");
exit (1);
}
}
......
......@@ -14,7 +14,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "DB.h"
#include "align.h"
......@@ -40,7 +39,7 @@
// Command format and global parameter variables
static char *Usage = " [-v] -g<int> -b<int> <source:db> <overlaps:las> ...";
static char *Usage = " [-v] [-g<int>] [-b<int>] <source:db> <overlaps:las> ...";
static int COVERAGE; // estimated coverage
static int BAD_QV; // qv >= and you are "bad"
......@@ -53,13 +52,8 @@ static int VERBOSE;
#define LOWQ 0 // Gap is spanned by many LAs and patchable
#define SPAN 1 // Gap has many paired LAs and patchable
#define SPLIT 2 // Gap is a chimer or an unpatchable gap
#define ADPRE 3 // Gap is due to adaptemer, trim prefix interval to left
#define ADSUF 4 // Gap is due to adaptemer, trim suffix interval to right
#define ADAPT 3 // Gap is due to adaptemer (internal only)
static int AdPre = ADPRE;
static int AdSuf = ADSUF;
// Good patch constants
#define MIN_BLOCK 500 // Minimum length of a good patch
......@@ -1970,10 +1964,8 @@ static void GAPS(int aread, Overlap *ovls, int novl)
return;
}
if (VERBOSE)
{ nreads += 1;
nreads += 1;
totlen += alen;
}
// Partition into HQ-blocks
......@@ -1992,10 +1984,8 @@ static void GAPS(int aread, Overlap *ovls, int novl)
// No blocks? ==> nothing to do
if (nblk <= 0)
{ if (VERBOSE)
{ nelim += 1;
nelimbp += alen;
}
#ifdef ANNOTATE
fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
......@@ -2084,8 +2074,7 @@ static void GAPS(int aread, Overlap *ovls, int novl)
// Accummulate statistics
if (VERBOSE)
{ if (block[0].beg > 0)
if (block[0].beg > 0)
{ n5trm += 1;
n5trmbp += block[0].beg;
}
......@@ -2117,7 +2106,6 @@ static void GAPS(int aread, Overlap *ovls, int novl)
nchimbp += block[i].beg - block[i-1].end;
}
}
}
#ifdef ANNOTATE
fwrite(&(block[abeg].beg),sizeof(int),1,KP_DFILE);
......@@ -2134,12 +2122,6 @@ static void GAPS(int aread, Overlap *ovls, int novl)
// Output .trim track for this read
if (abeg > 0)
{ fwrite(&(block[0].beg),sizeof(int),1,TR_DFILE);
fwrite(&(block[abeg-1].end),sizeof(int),1,TR_DFILE);
fwrite(&AdPre,sizeof(int),1,TR_DFILE);
TR_INDEX += 3*sizeof(int);
}
fwrite(&(block[abeg].beg),sizeof(int),1,TR_DFILE);
fwrite(&(block[abeg].end),sizeof(int),1,TR_DFILE);
TR_INDEX += 2*sizeof(int);
......@@ -2149,12 +2131,6 @@ static void GAPS(int aread, Overlap *ovls, int novl)
fwrite(&(block[i].end),sizeof(int),1,TR_DFILE);
TR_INDEX += 3*sizeof(int);
}
if (aend < nblk)
{ fwrite(&AdSuf,sizeof(int),1,TR_DFILE);
fwrite(&(block[aend].beg),sizeof(int),1,TR_DFILE);
fwrite(&(block[nblk-1].end),sizeof(int),1,TR_DFILE);
TR_INDEX += 3*sizeof(int);
}
fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
}
}
......@@ -2280,13 +2256,19 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
}
int main(int argc, char *argv[])
{ FILE *input;
char *root, *dpwd;
char *las, *lpwd;
{ char *root, *dpwd;
int64 novl;
DAZZ_TRACK *track;
int c;
DAZZ_EXTRA ex_hgap, ex_cest;
DAZZ_EXTRA ex_good, ex_bad, ex_trim;
char *good_name = "Good QV threshold";
char *bad_name = "Bad QV threshold";
char *trim_name = "Trimming statistics";
int64 good64, bad64, tstats[18];
// Process arguments
{ int i, j, k;
......@@ -2320,20 +2302,10 @@ int main(int argc, char *argv[])
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
if (GOOD_QV < 0)
{ fprintf(stderr,"%s: Must supply -g parameter\n",Prog_Name);
exit (1);
}
if (BAD_QV < 0)
{ fprintf(stderr,"%s: Must supply -b parameter\n",Prog_Name);
exit (1);
}
if (GOOD_QV > BAD_QV)
{ fprintf(stderr,"%s: Good QV threshold (%d) > Bad QV threshold (%d) ?\n",
Prog_Name,GOOD_QV,BAD_QV);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -g: Use as good qv threshold (and not auto-estimate).\n");
fprintf(stderr," -b: Use as bad qv threshold (and not auto-estimate).\n");
exit (1);
}
}
......@@ -2354,80 +2326,137 @@ int main(int argc, char *argv[])
exit (1);
}
Trim_DB(DB);
}
// Get .qual track and extras
track = Load_Track(DB,"qual");
if (track != NULL)
{ FILE *afile;
int size, tracklen, extra;
char *aname;
int extra, tracklen, size;
DAZZ_EXTRA ex_qvs, ex_dif;
QV_IDX = (int64 *) track->anno;
QV = (uint8 *) track->data;
// if newer .qual tracks with -c meta data, grab it
aname = Strdup(Catenate(DB->path,".","qual",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
fread(&tracklen,sizeof(int),1,afile);
fread(&size,sizeof(int),1,afile);
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
fseeko(afile,-extra,SEEK_END);
if (extra == 2*sizeof(int))
{ fread(&COVERAGE,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .qual track?\n",Prog_Name);
exit (1);
}
else if (extra == sizeof(int))
{ fread(&COVERAGE,sizeof(int),1,afile);
HGAP_MIN = 0;
ex_cest.nelem = 0;
if (Read_Extra(afile,aname,&ex_cest) != 0)
{ fprintf(stderr,"%s: Coverage estimate extra missing from .qual track?\n",Prog_Name);
exit (1);
}
else
{ COVERAGE = -1;
HGAP_MIN = 0;
ex_qvs.nelem = 0;
if (Read_Extra(afile,aname,&ex_qvs) != 0)
{ fprintf(stderr,"%s: QV histogram extra missing from .qual track?\n",Prog_Name);
exit (1);
}
ex_dif.nelem = 0;
if (Read_Extra(afile,aname,&ex_dif) != 0)
{ fprintf(stderr,"%s: Differences histogram extra missing from .qual track?\n",Prog_Name);
exit (1);
}
fclose(afile);
COVERAGE = (int) ((int64 *) (ex_cest.value))[0];
HGAP_MIN = (int) ((int64 *) (ex_hgap.value))[0];
// Compute -g and -b parameters
{ int64 qsum, qtotal;
int64 *qgram;
int i, maxqv;
int gv, bv;
qgram = (int64 *) (ex_qvs.value);
maxqv = ex_qvs.nelem - 1;
qtotal = 0;
for (i = 0; i < maxqv; i++)
qtotal += qgram[i];
bv = gv = -1;
qsum = 0;
for (i = maxqv-1; i >= 0; i--)
if (qgram[i] > 0)
{ qsum += qgram[i];
if ((100.*qsum)/qtotal > 7. && bv < 0)
bv = i+1;
if ((100.*qsum)/qtotal > 20. && gv < 0)
gv = i+1;
}
if (GOOD_QV < 0)
GOOD_QV = gv;
if (BAD_QV < 0)
BAD_QV = bv;
}
}
else
{ fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
exit (1);
}
if (GOOD_QV > BAD_QV)
{ fprintf(stderr,"%s: Good QV threshold (%d) > Bad QV threshold (%d) ?\n",
Prog_Name,GOOD_QV,BAD_QV);
exit (1);
}
// Initialize statistics gathering
// Setup extras
if (VERBOSE)
{ nreads = 0;
totlen = 0;
nelim = 0;
n5trm = 0;
n3trm = 0;
natrm = 0;
nelimbp = 0;
n5trmbp = 0;
n3trmbp = 0;
natrmbp = 0;
ex_good.vtype = DB_INT; // Good QV threshold
ex_good.nelem = 1;
ex_good.accum = DB_EXACT;
ex_good.name = good_name;
good64 = GOOD_QV;
ex_good.value = &good64;
ngaps = 0;
nlowq = 0;
nspan = 0;
nchim = 0;
ngapsbp = 0;
nlowqbp = 0;
nspanbp = 0;
nchimbp = 0;
ex_bad.vtype = DB_INT; // Bad QV threshold
ex_bad.nelem = 1;
ex_bad.accum = DB_EXACT;
ex_bad.name = bad_name;
bad64 = BAD_QV;
ex_bad.value = &bad64;
printf("\nDAStrim -g%d -b%d %s", GOOD_QV,BAD_QV,argv[1]);
for (c = 2; c < argc; c++)
printf(" %s",argv[c]);
printf("\n");
}
ex_trim.vtype = DB_INT; // Trim statistics
ex_trim.nelem = 16;
ex_trim.accum = DB_SUM;
ex_trim.name = trim_name;
ex_trim.value = &tstats;
// Determine if overlap block is being processed and if so get first and last read
// from .db file
// For each .las file do
dpwd = PathTo(argv[1]);
root = Root(argv[1],".db");
for (c = 2; c < argc; c++)
{ las = Root(argv[c],".las");
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
// Determine if overlap block is being processed and if so get first and last read
// from .db file
{ FILE *dbfile;
char buffer[2*MAX_NAME+100];
......@@ -2435,11 +2464,7 @@ int main(int argc, char *argv[])
int i, part, nfiles, nblocks, cutoff, all, oindx;
int64 size;
DB_PART = 0;
DB_FIRST = 0;
DB_LAST = DB->nreads;
p = rindex(las,'.');
p = rindex(Block_Arg_Root(parse),'.');
if (p != NULL)
{ part = strtol(p+1,&eptr,10);
if (*eptr == '\0' && eptr != p+1)
......@@ -2462,7 +2487,6 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
}
}
}
......@@ -2506,32 +2530,67 @@ int main(int argc, char *argv[])
SETUP(KP_AFILE,KP_DFILE,KP_INDEX,".keep.anno",".keep.data",0)
#endif
// Open overlap file
lpwd = PathTo(argv[c]);
if (DB_PART)
input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
else
input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
if (input == NULL)
exit (1);
free(lpwd);
free(las);
// Get trace point spacing information
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
make_a_pass(input,GAPS,1);
// Initialize statistics gathering
nreads = 0;
totlen = 0;
nelim = 0;
n5trm = 0;
n3trm = 0;
natrm = 0;
nelimbp = 0;
n5trmbp = 0;
n3trmbp = 0;
natrmbp = 0;
ngaps = 0;
nlowq = 0;
nspan = 0;
nchim = 0;
ngapsbp = 0;
nlowqbp = 0;
nspanbp = 0;
nchimbp = 0;
// Clean up
if (VERBOSE)
{ printf("\nDAStrim");
if (HGAP_MIN > 0)
printf(" -H%d",HGAP_MIN);
printf(" -c%d -g%d -b%d %s %s\n",COVERAGE,GOOD_QV,BAD_QV,argv[1],argv[c]);
}
fwrite(&GOOD_QV,sizeof(int),1,TR_AFILE);
fwrite(&BAD_QV,sizeof(int),1,TR_AFILE);
fwrite(&COVERAGE,sizeof(int),1,TR_AFILE);
fwrite(&HGAP_MIN,sizeof(int),1,TR_AFILE);
// Process each read pile
make_a_pass(input,GAPS,1);
// Write out extras and close .trim track
tstats[ 0] = nelim;
tstats[ 1] = n5trm;
tstats[ 2] = n3trm;
tstats[ 3] = natrm;
tstats[ 4] = nelimbp;
tstats[ 5] = n5trmbp;
tstats[ 6] = n3trmbp;
tstats[ 7] = natrmbp;
tstats[ 8] = ngaps;
tstats[ 9] = nlowq;
tstats[10] = nspan;
tstats[11] = nchim;
tstats[12] = ngapsbp;
tstats[13] = nlowqbp;
tstats[14] = nspanbp;
tstats[15] = nchimbp;
Write_Extra(TR_AFILE,&ex_hgap);
Write_Extra(TR_AFILE,&ex_cest);
Write_Extra(TR_AFILE,&ex_good);
Write_Extra(TR_AFILE,&ex_bad);
Write_Extra(TR_AFILE,&ex_trim);
fclose(TR_AFILE);
fclose(TR_DFILE);
......@@ -2555,7 +2614,8 @@ int main(int argc, char *argv[])
fclose(KP_AFILE);
fclose(KP_DFILE);
#endif
}
fclose(input);
// If verbose output statistics summary to stdout
......@@ -2636,6 +2696,10 @@ int main(int argc, char *argv[])
Print_Number(nlowqbp+nspanbp,12,stdout);
printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen);
}
}
Free_Block_Arg(parse);
}
free(dpwd);
free(root);
......
......@@ -17,6 +17,7 @@
#include <ctype.h>
#include <unistd.h>
#include <dirent.h>
#include <limits.h>
#include "DB.h"
......@@ -599,8 +600,8 @@ error:
// for the retained reads.
void Trim_DB(DAZZ_DB *db)
{ int i, j, r;
int allflag, cutoff;
{ int i, j, r, f;
int allflag, cutoff, css;
int64 totlen;
int maxlen, nreads;
DAZZ_TRACK *record;
......@@ -678,12 +679,20 @@ void Trim_DB(DAZZ_DB *db)
totlen = maxlen = 0;
for (j = i = 0; i < nreads; i++)
{ r = reads[i].rlen;
if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
{ f = reads[i].flags;
if ((f & DB_CSS) == 0)
css = 0;
r = reads[i].rlen;
if ((f & DB_BEST) >= allflag && r >= cutoff)
{ totlen += r;
if (r > maxlen)
maxlen = r;
reads[j++] = reads[i];
reads[j] = reads[i];
if (css)
reads[j++].flags |= DB_CSS;
else
reads[j++].flags &= ~DB_CSS;
css = 1;
}
}
......@@ -1989,3 +1998,175 @@ void Print_Read(char *s, int width)
printf("\n");
}
}
/*******************************************************************************************
*
* COMMAND LINE BLOCK PARSER
* Take a command line argument and interpret the '@' block number ranges.
* Parse_Block_Arg produces an Block_Looper iterator object that can then
* be invoked multiple times to iterate through all the files implied by
* the @ pattern/range.
*
********************************************************************************************/
typedef struct
{ int first, last, next;
char *root, *pwd, *ppnt;
char *slice;
} _Block_Looper;
// Advance the iterator e_parse to the next file, open it, and return the file pointer
// to it. Return NULL if at the end of the list of files.
FILE *Next_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
char *disp;
FILE *input;
parse->next += 1;
if (parse->next > parse->last)
return (NULL);
if (parse->next < 0)
disp = parse->root;
else
disp = Numbered_Suffix(parse->root,parse->next,parse->ppnt);
if ((input = fopen(Catenate(parse->pwd,"/",disp,".las"),"r")) == NULL)
{ if (parse->last != INT_MAX)
{ fprintf(stderr,"%s: %s.las is not present\n",Prog_Name,disp);
exit (1);
}
return (NULL);
}
return (input);
}
// Reset the iterator e_parse to the first file
void Reset_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
parse->next = parse->first - 1;
}
// Return a pointer to the path for the current file
char *Block_Arg_Path(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
return (parse->pwd);
}
// Return a pointer to the root name for the current file
char *Block_Arg_Root(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
if (parse->next < 0)
return (parse->root);
else
return (Numbered_Suffix(parse->root,parse->next,parse->ppnt));
}
// Free the iterator
void Free_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
free(parse->root);
free(parse->pwd);
free(parse->slice);
free(parse);
}
char *Next_Block_Slice(Block_Looper *e_parse, int slice)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
if (parse->slice == NULL)
{ int size = strlen(parse->pwd) + strlen(Block_Arg_Root(parse)) + 30;
parse->slice = (char *) Malloc(size,"Block argument slice");
if (parse->slice == NULL)
exit (1);
}
if (parse->first < 0)
sprintf(parse->slice,"%s/%s",parse->pwd,parse->root);
else
sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1,
parse->next+slice,parse->ppnt);
parse->next += slice;
return (parse->slice);
}
// Parse the command line argument and return an iterator to move through the
// file names, setting it up to report the first file.
Block_Looper *Parse_Block_Arg(char *arg)
{ _Block_Looper *parse;
char *pwd, *root;
char *ppnt, *cpnt;
int first, last;
parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node");
pwd = PathTo(arg);
root = Root(arg,".las");
if (parse == NULL || pwd == NULL || root == NULL)
exit (1);
ppnt = index(root,BLOCK_SYMBOL);
if (ppnt == NULL)
first = last = -1;
else
{ if (index(ppnt+1,BLOCK_SYMBOL) != NULL)
{ fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n",
Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
*ppnt++ = '\0';
first = strtol(ppnt,&cpnt,10);
if (cpnt == ppnt)
{ first = 1;
last = INT_MAX;
}
else
{ if (first < 0)
{ fprintf(stderr,
"%s: Integer following %c-sigan is less than 0 in source name '%s'\n",
Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
if (*cpnt == '-')
{ ppnt = cpnt+1;
last = strtol(ppnt,&cpnt,10);
if (cpnt == ppnt)
{ fprintf(stderr,"%s: Second integer must follow - in source name '%s'\n",
Prog_Name,root);
exit (1);
}
if (last < first)
{ fprintf(stderr,
"%s: 2nd integer is less than 1st integer in source name '%s'\n",
Prog_Name,root);
exit (1);
}
ppnt = cpnt;
}
else
{ last = INT_MAX;
ppnt = cpnt;
}
}
}
parse->pwd = pwd;
parse->root = root;
parse->ppnt = ppnt;
parse->first = first;
parse->last = last;
parse->next = first-1;
parse->slice = NULL;
return ((Block_Looper *) parse);
}
......@@ -59,6 +59,8 @@ typedef signed long long int64;
typedef float float32;
typedef double float64;
#define LAST_READ_SYMBOL '$'
#define BLOCK_SYMBOL '@'
/*******************************************************************************************
*
......@@ -567,4 +569,24 @@ int Read_All_Sequences(DAZZ_DB *db, int ascii);
int List_DB_Files(char *path, void actor(char *path, char *extension));
// Take a command line argument and interpret the '@' block number ranges.
// Parse_Block_Arg produces a Block_Looper iterator object that can then
// be invoked multiple times to iterate through all the files implied by
// the @ pattern/range. Next_Block_Slice returns a string encoing the next
// slice files represented by an @-notation, and advances the iterator by
// that many files.
typedef void Block_Looper;
Block_Looper *Parse_Block_Arg(char *arg);
FILE *Next_Block_Arg(Block_Looper *e_parse);
char *Next_Block_Slice(Block_Looper *e_parse,int slice);
void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file
char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file
char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file
void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator
#endif // _DAZZ_DB
......@@ -2,10 +2,16 @@ DEST_DIR = ~/bin
CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
ALL = DASqv DAStrim DASpatch DASedit DASmap DASrealign REPqv REPtrim
ALL = DAScover DASqv DAStrim DASpatch DASedit DASmap DASrealign REPcover REPqv REPtrim
all: $(ALL)
DAScover: DAScover.c align.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DAScover DAScover.c align.c DB.c QV.c -lm
REPcover: REPcover.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o REPcover REPcover.c DB.c QV.c -lm
DASqv: DASqv.c align.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DASqv DASqv.c align.c DB.c QV.c -lm
......
......@@ -7,7 +7,7 @@
For typeset documentation, examples of use, and design philosophy please go to
my [blog](https://dazzlerblog.wordpress.com/command-guides/dascrubber-command-guide).
This is still an incomplete release.
This is still a preliminary release.
The current set of commands provide a pipeline that one can use to scrub reads and if desired
to scrub the alignment piles (with DASrealign). Ultimately DASpatch/DASedit and DASrealign will
be replaced with more powerful programs that correct reads and not only scrub alignment piles,
......@@ -19,27 +19,69 @@ The goal of scrubbing is to produce a set of edited reads that are guaranteed to
and not chimers), and (b) have no very low quality stretches (i\.e\. the error rate
never exceeds some reasonable maximum, 20% or so in the case of Pacbio data). The
secondary goal of scrubbing is to do so with the minimum removal of data and splitting
of reads. The \"DAS\" suite consists of a pipeline of several programs that in sequence accomplish
the task of scrubbing.
of reads.
Note carefully that the current scrubbing pipeline requires that one has employed
repeat-masking in the daligner run as per the DAMASKER module described in this
[post](https://dazzlerblog.wordpress.com/2016/04/01/detecting-and-soft-masking-repeats).
The current \"DAS\" suite consists of a pipeline of several programs that in sequence accomplish
the task of scrubbing: DAScover &rarr; DASqv &rarr; DAStrim &rarr; DASpatch &rarr; DASedit.
For the commands, the \<source\> argument must always refer to the entire DB, and only
the \<overlaps\> arguments can involve a block number. If \<overlaps\> involves a block
number, e\.g\. Ecoli.2.las, then the .las file is expected to contain all the overlaps
where the A-read is in block 2 of the underlying database. The HPC.daligner scripts in the
DALIGNER module produce such .las files as their final result. Parameters are propoagated
down the pipeline to subsequent phases via the annotation tracks so one need not specify
the same parameter over and over again, i.e, the parameters/flags -H, -c, -g, and -b.
All programs add suffixes (e.g. .db, .las) as needed.
For the commands that take multiple .las block files as arguments, e.g. DAScover, DASqv, ...,
one can place a @-sign in the name, which is then interpreted as the sequence of files
obtained by replacing the @-sign by 1, 2, 3, ... in sequence until a number is reached for
which no file matches. One can also place a @-sign followed by an integer, say, i, in which
case the sequence starts at i. Lastly, one can also place @i-j where i and j are integers, in
which case the sequence is from i to j, inclusive.
```
1. DASqv [-v] [-H<int>] -c<int> <subject:db> <overlaps:las> ...
1. DAScover [-v] [-H<int>] [-m<track>]+ <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments
blocks, \<overlaps\>, produced by an overlap/daligner run for said database. It is recommended
that one employ repeat-masking in the overlap run as per the new DAMASKER module described in
this post. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\>
can involve a block number. If \<overlaps\> involves a block number, e\.g\. Ecoli.2.las, then
the .las file is expected to contain all the overlaps where the A-read is in block 2 of the
underlying database. The HPC.daligner scripts in the DALIGNER module produce such .las files
as their final result.
blocks, \<overlaps\>, produced by an overlap/daligner run for said database.
Note carefully that \<source\> must always refer to the entire DB, only the \<overlaps\> can
involve block numbers.
Using the local
alignment-pile for each A-read, DAScover produces a histogram of the depth of coverage of
each trace point tile that is not within one of the intervals of the optionally specified tracks.
It places this histogram in a .covr track for the bock and these block tracks are merged
later with Catrack. If the -v option is set, the histogram for each block is displayed and an
estimate of the coverage of the underlying target genome is output.
If the overlap file contains a block number
then the track files also contain a block number, e\.g\. \"DAScovr DB OVL.2\" will result in
the track files DB.2.covr.[anno,data]. Furthermore, if DAScovr is run on .las blocks, then
once it has been run on all the blocks of the DB, the block tracks must be concatenated
into a single track for the entire database with Catrack in preparation for the next phase
of scrubbing by DAStrim.
```
2. DASqv [-v] [-c<int>] <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments
blocks, \<overlaps\>, produced by an overlap/daligner run for said database.
A .covr track obtained by running DASqv and Catrack must be present for the entire data base.
Note carefully that \<source\> must always refer to the entire DB, only the \<overlaps\> can
involve a block number.
Using the local alignment-pile for each A-read, DASqv produces a QV value for each complete
segment of TRACE_SPACING bases (e\.g\. 100bp, the -s parameter to daligner). The quality value
of the average percentile of the best 25-50% alignment matches covering it depending on the
coverage estimate -c. One must supply the -c parameter to the expected coverage of the genome
in question. All quality values over 50 are clipped to 50.
of ecah trace tile is the average of the best 25-50% of the estimated coverage alignment matches,
where the estimated coverage is computed from the histogram of the .covr track.
If one supplies the -c parameter, than this value explicitly overrides the estimated coverage
produced by default. All quality values over 50 are clipped to 50.
The -v option prints out a histogram of the segment align matches, and the quality values
produced. This histogram is useful in assessing, for a given data set, what constitutes the
......@@ -52,21 +94,22 @@ reads are used in the overlap piles for H-reads to help assess and scrub the H-r
are themselves not scrubbed.
The quality values are written to a .qual track, that can be viewed by calling DBdump with
the -i option set (\"i\" for \"intrinsic QV\"). If the overlap file contains a block number
then the track files also contain a block number, e\.g\. \"DASqv -c50 DB OVL.2\" will result in
the track files DB.2.qual.[anno,data]. Furthermore, if DASqv is run on .las blocks, then
once it has been run on all the blocks of the DB, the block tracks must be concatenated
into a single track for the entire database with Catrack in preparation for the next phase
of scrubbing by DAStrim.
the -i option set (\"i\" for \"intrinsic QV\").
Like DAScovr and all other
scrubber modules, block tracks are produced in response to block .las files and these must be
concatenated with Catrack into a single .qual track for the entire DB in preparation for the
next phase of scrubbing by DAStrim.
```
2. DAStrim [-v] -g<int> -b<int> <subject:db> <overlaps:las> ...
3. DAStrim [-v] [-g<int>] [-b<int>] <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments
blocks, \<overlaps\>, produced by an overlap/daligner run for said database. A .qual track
A DB-wide .qual track produced by DASqv and Catrack are required as input to this command.
This command further takes as input a database \<source\> and a sequence of sorted local alignments
blocks, \<overlaps\>, produced by an overlap/daligner run for said database.
A .qual track
obtained by running DASqv must be present for the entire data base. Note carefully that
\<source\> must always refer to the entire DB, only \<overlaps\> can involve a block number.
\<source\> must always refer to the entire DB, only \<overlaps\> can involve block numbers.
Using the local alignment-pile for each A-read and the QV\'s for all the reads in the pile,
DAStrim (1) finds and breaks all chimeric reads, (2) finds all missed adaptamers and retains
......@@ -76,11 +119,9 @@ decisions conservatively so that what remains is very highly likely to be free o
adaptamers, and undetected low-quality sequence segments. Some of these artifacts may still
get through, but at very low odds, less than 1 in 10,000 in our experience. The decision
process is guided by the parameters -g and -b which indicate the thresholds for considering
intrinsic QV values good, bad, or unknown. In our experience, -g should be set to the value
for which 80% or more of the QV\'s produced by DASqv are better than this value, and -b should
be set to the value for which 5-7% or more of the QV\'s are worse than this value. Consult
the histogram produced by DASqv to set these parameter. We further recommend doing this
for each project on a case-by-case basis.
intrinsic QV values good, bad, or unknown. By default these parameters are automatically set
to be the 80'th and 93'rd percentiles of the qv-histograms hidden in the .qual track. They
may however be explicitly set at the command line to over-rule this default choice.
The -v option prints out a report of how many chimer and adaptamer breaks were detected, how
much sequence was trimmed, how many low-quality segments were spanned by alignments, and how
......@@ -89,20 +130,21 @@ low-quality region, and so on.
The retained high-quality intervals for each read are written to a .trim track, in
left-to-right order with an indicator of whether the gap between two such intervals is spanned
by local alignments or by span-consistent pairs of local alignments. Like DASqv and all other
by local alignments or by span-consistent pairs of local alignments.
Like DAScovr and all other
scrubber modules, block tracks are produced in response to block .las files and these must be
concatenated with Catrack into a single .trim track for the entire DB in preparation for the
next phase of scrubbing by DASpatch.
```
3. DASpatch [-v] <subject:db> <overlaps:las> ...
4. DASpatch [-v] <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments
blocks, \<overlaps\>, produced by an overlap/daligner run for said database. A .qual track
and a .trim track obtained by running DASqv and DAStrim must be present for the entire data
base. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\> can
involve a block number.
involve block numbers.
Using the local alignment-pile for each A-read, the QV\'s for all the reads in the pile, and
the hiqh-quality segments annotated by DAStrim, DASpatch selects a high-quality B-read segment
......@@ -113,16 +155,17 @@ span/patch can fail. This is very rare but does occur and is the number of such
reported by DASpatch when the -v option is set.
The B-read segments for each patch (or a special \"failure patch\") are written to a .patch
track, in left-to-right order. Like DASqv and all other scrubber modules, block tracks are
track, in left-to-right order. Like DAScovr and all other scrubber modules, block tracks are
produced in response to block .las files and these must be concatenated with Catrack into a
single .patch track for the entire DB in preparation for the next phase of scrubbing by DASedit.
```
4. DASedit [-v] [-x<int>] <source:db> <target:db>
5. DASedit [-v] [-x<int>] <source:db> <target:db>
```
This command takes as input a database \<source\> for which a .trim, and .patch tracks have
produced by DAStrim and DASpatch in sequence (and perforce DASqv initially). Using the
produced by DAStrim and DASpatch in sequence (and perforce DAScover and DASqv before them).
Using the
information in the two tracks, DASedit produces a new database \<target\> whose reads are the
patched, high-quality sub-reads. That is, every low quality segment is patched with the relevant
sequence of another read, and some reads give rise to two or more reads if deemed chimers, or
......@@ -130,7 +173,7 @@ no reads if the entire read was deemed junk. This command can take considerable
access pattern to read sequences (for the patching) is not sequential or localized, implying
poor cache performance.
The new database does not have a .qvs or .arr component, that is it is a a sequence or
The new database does not have a .qvs or .arr component, that is, it is a a sequence or
S-database (see the original Dazzler DB post). Very importantly, the new database has exactly
the same block divisions as the original. That is, all patched subreads in a block of the new
database have been derived from reads of the same block in the original database, and only
......@@ -140,7 +183,7 @@ high-quality (i\.e\. not patched). The program DASmap below can be used to outp
information in either an easy-to-read or an easy-to-parse format.
```
5. DASmap [-p] <path:db> [ <reads:FILE> |<reads:range> ... ]
6. DASmap [-p] <path:db> [ <reads:FILE> |<reads:range> ... ]
```
This command takes as input a database of patched reads \<source\> produced by DASedit, and for
......@@ -173,7 +216,7 @@ some n):
```
```
6. DASrealign [-v] [-l<int:(800)>] <block1> <block2> <source:las> <target:las>
7. DASrealign [-v] [-l<int:(800)>] <block1> <block2> <source:las> <target:las>
```
This command takes as input two blocks a patched database \<source\> created by DASedit, and the
......@@ -193,7 +236,18 @@ is roughly 40X faster than the original daligner run and the collection of overl
output can be feed directly into a string graph construction process.
```
7. REPqv <subject:db> ...
8. REPcover <subject:db> ...
```
This command takes as input a sequence of databases or blocks \<source\> and for each outputs
a histogram of the coverage of the unmasked portions of the reads in the source along with
a recommendation of the -c value with which to run DASqv. The .covr track produced by
DAScover must be present for all sources referred to. The command is a quick way get the
-v output of DAScover at any time after producing the coverage histograms and to get the
histogram and coverage estimate for the entire data base (as opposed to a block of the database.
```
9. REPqv <subject:db> ...
```
This command takes as input a sequence of databases or blocks \<source\> and for each outputs
......@@ -201,10 +255,10 @@ a histogram of the intrinsic quality values of the reads in the source along wit
recommendation of the -g and -b values with which to run DAStrim. The .qual track produced
by DASqv must be present for all sources referred to. The command is a quick way to get
the -v output of DASqv at any time after producing the intrinsic quality values and to get
the histogram for the entire data base (as opposed to a block of the database).
the histograms for the entire data base (as opposed to a block of the database).
```
8. REPtrim <subject:db> ...
10. REPtrim <subject:db> ...
```
This command takes as input a sequence of databases or blocks \<source\> and for each outputs
......
/*******************************************************************************************
*
* Read the .covr track of each db or db block on the command line and output a histogram
* of the coverage of the unmasked portions and an estimate of the coverage of the
* underlying genome
*
* Author: Gene Myers
* Date : January 2017
*
*******************************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "DB.h"
// Command format and global parameter variables
static char *Usage = "<source:db> ...";
int main(int argc, char *argv[])
{ int c;
// Process arguments
Prog_Name = Strdup("REPcover","");
if (argc < 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
// Open trimmed DB and the qual-track
for (c = 1; c < argc; c++)
{ DAZZ_DB _DB, *DB = &_DB;
DAZZ_EXTRA ex_hgap, ex_covr;
// Load DB
{ int status;
status = Open_DB(argv[c],DB);
if (status < 0)
exit (1);
if (status == 1)
{ fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
exit (1);
}
Trim_DB(DB);
}
// Get .covr track extras
{ FILE *afile;
char *aname;
int extra;
afile = NULL;
if (DB->part)
{ aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"covr",".anno"),
"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
{ fprintf(stderr,"%s: Must have a 'covr.%d' track, run DAScover\n",Prog_Name,DB->part);
exit (1);
}
}
else
{ aname = Strdup(Catenate(DB->path,".","covr",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
{ fprintf(stderr,"%s: Must have a 'covr' track, run DAScover\n",Prog_Name);
exit (1);
}
}
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - sizeof(int)*2;
fseeko(afile,-extra,SEEK_END);
ex_covr.nelem = 0;
if (Read_Extra(afile,aname,&ex_covr) != 0)
{ fprintf(stderr,"%s: Histogram extra missing from .covr track?\n",Prog_Name);
exit (1);
}
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .covr track?\n",Prog_Name);
exit (1);
}
fclose(afile);
}
// Generate display
{ char *root;
int i, cmax, hgap_min, cover;
int64 nreads, totlen;
int64 *cgram;
int64 ssum, stotal;
root = Root(argv[c],".db");
nreads = DB->nreads;
totlen = DB->totlen;
hgap_min = (int) ((int64 *) (ex_hgap.value))[0];
cgram = (int64 *) (ex_covr.value);
cmax = ex_covr.nelem - 1;
printf("\nDAScover");
if (hgap_min > 0)
printf(" -H%d",hgap_min);
printf(" %s\n\n",root);
if (hgap_min > 0)
{ for (i = 0; i < DB->nreads; i++)
if (DB->reads[i].rlen < hgap_min)
{ nreads -= 1;
totlen -= DB->reads[i].rlen;
}
}
// Display histogram
printf("\nInput: ");
Print_Number(nreads,7,stdout);
printf("reads, ");
Print_Number(totlen,12,stdout);
printf(" bases");
if (hgap_min > 0)
{ printf(" (another ");
Print_Number(DB->nreads-nreads,0,stdout);
printf(" were < H-length)");
}
printf("\n");
stotal = 0;
for (i = 0; i <= cmax; i++)
stotal += cgram[i];
printf("\nCoverage Histogram\n\n");
ssum = cgram[cmax];
if (ssum > 0)
printf(" %4d: %9lld %5.1f%%\n\n",
cmax,cgram[cmax],(100.*ssum)/stotal);
stotal -= ssum;
ssum = 0;
for (i = cmax-1; i >= 0; i--)
if (cgram[i] > 0)
{ ssum += cgram[i];
printf(" %4d: %9lld %5.1f%%\n",
i,cgram[i],(100.*ssum)/stotal);
}
i = 0;
while (cgram[i+1] < cgram[i])
i += 1;
for (cover = i++; i < cmax; i++)
if (cgram[cover] < cgram[i])
cover = i;
printf("\n Coverage is estimated at %d\n\n",cover);
free(root);
Close_DB(DB);
}
}
free(Prog_Name);
exit (0);
}
......@@ -20,16 +20,6 @@
static char *Usage = "<source:db> ...";
// Global Variables
#define MAXQV 50 // Max QV score is 50
#define MAXQV1 51
static DAZZ_DB _DB, *DB = &_DB; // Data base
static int64 *QV_IDX; // qual track index
static uint8 *QV; // qual track values
int main(int argc, char *argv[])
{ int c;
......@@ -45,13 +35,13 @@ int main(int argc, char *argv[])
// Open trimmed DB and the qual-track
for (c = 1; c < argc; c++)
{ DAZZ_DB _DB, *DB = &_DB;
DAZZ_EXTRA ex_hgap, ex_cest, ex_qvs, ex_dif;
// Load DB
{ int status;
char *root;
int i, bval, gval, cover, hgap_min;
DAZZ_TRACK *track;
int64 nreads, totlen;
int64 qgram[MAXQV1];
int64 qsum, qtotal;
status = Open_DB(argv[c],DB);
if (status < 0)
......@@ -61,76 +51,103 @@ int main(int argc, char *argv[])
exit (1);
}
Trim_DB(DB);
}
track = Load_Track(DB,"qual");
if (track != NULL)
{ FILE *afile;
int size, tracklen, extra;
QV_IDX = (int64 *) track->anno;
QV = (uint8 *) track->data;
// Get .qual track extras
// if newer .qual tracks with -c meta data, grab it
{ FILE *afile;
char *aname;
int extra, tracklen, size;
afile = NULL;
if (DB->part)
{ afile = fopen(Catenate(DB->path,
Numbered_Suffix(".",DB->part,"."),"qual",".anno"),"r");
{ aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"qual",".anno"),
"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
{ fprintf(stderr,"%s: Must have a 'qual.%d' track, run DASqv\n",Prog_Name,DB->part);
exit (1);
}
}
else
afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
{ aname = Strdup(Catenate(DB->path,".","qual",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
{ fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
exit (1);
}
}
fread(&tracklen,sizeof(int),1,afile);
fread(&size,sizeof(int),1,afile);
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
fseeko(afile,-extra,SEEK_END);
if (extra == 2*sizeof(int))
{ fread(&cover,sizeof(int),1,afile);
fread(&hgap_min,sizeof(int),1,afile);
}
else if (extra == sizeof(int))
{ fread(&cover,sizeof(int),1,afile);
hgap_min = 0;
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .qual track?\n",Prog_Name);
exit (1);
}
else
{ cover = -1;
hgap_min = 0;
ex_cest.nelem = 0;
if (Read_Extra(afile,aname,&ex_cest) != 0)
{ fprintf(stderr,"%s: Coverage estimate extra missing from .qual track?\n",Prog_Name);
exit (1);
}
fclose(afile);
ex_qvs.nelem = 0;
if (Read_Extra(afile,aname,&ex_qvs) != 0)
{ fprintf(stderr,"%s: QV histogram extra missing from .qual track?\n",Prog_Name);
exit (1);
}
else
{ fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
ex_dif.nelem = 0;
if (Read_Extra(afile,aname,&ex_dif) != 0)
{ fprintf(stderr,"%s: Differences histogram extra missing from .qual track?\n",Prog_Name);
exit (1);
}
fclose(afile);
}
// Generate display
{ char *root;
int64 nreads, totlen;
int hgap_min, cover;
int64 *qgram, *sgram;
int maxqv;
// Get relevant variables
root = Root(argv[c],".db");
nreads = DB->nreads;
totlen = DB->totlen;
for (i = 0; i <= MAXQV; i++)
qgram[i] = 0;
for (i = 0; i < QV_IDX[nreads]; i++)
qgram[QV[i]] += 1;
hgap_min = (int) ((int64 *) (ex_hgap.value))[0];
cover = (int) ((int64 *) (ex_cest.value))[0];
qgram = (int64 *) (ex_qvs.value);
maxqv = ex_qvs.nelem - 1;
sgram = (int64 *) (ex_dif.value);
printf("\nDASqv");
if (cover >= 0)
printf(" -c%d",cover);
else
printf(" -c??");
if (hgap_min > 0)
printf(" -H%d",hgap_min);
printf(" %s\n\n",root);
printf(" -c%d %s\n\n",cover,root);
if (hgap_min > 0)
{ for (i = 0; i < DB->nreads; i++)
{ int i;
for (i = 0; i < DB->nreads; i++)
if (DB->reads[i].rlen < hgap_min)
{ nreads -= 1;
totlen -= DB->reads[i].rlen;
}
}
printf(" Input: ");
// Display histograms
printf("\n Input: ");
Print_Number(nreads,7,stdout);
printf("reads, ");
Print_Number(totlen,12,stdout);
......@@ -142,8 +159,17 @@ int main(int argc, char *argv[])
}
printf("\n");
if (cover >= 0)
{ int qv_deep;
{ int64 ssum, qsum;
int64 stotal, qtotal;
int qv_deep;
int gval, bval;
int i;
stotal = qtotal = 0;
for (i = 0; i <= maxqv; i++)
{ stotal += sgram[i];
qtotal += qgram[i];
}
if (cover >= 40)
qv_deep = cover/8;
......@@ -152,36 +178,45 @@ int main(int argc, char *argv[])
else
qv_deep = cover/4;
printf("\n Histogram of q-values (average %d best)\n",2*qv_deep);
}
else
printf("\n Histogram of q-values\n");
printf("\n Histogram of q-values (average %d best)\n",qv_deep);
printf("\n Input QV\n");
qsum = qgram[maxqv];
ssum = sgram[maxqv];
printf("\n %2d: %9lld %5.1f%% %9lld %5.1f%%\n\n",
maxqv,sgram[maxqv],(100.*ssum)/stotal,qgram[maxqv],(100.*qsum)/qtotal);
qtotal = 0;
for (i = 0; i <= MAXQV; i++)
qtotal += qgram[i];
qtotal -= qsum;
stotal -= ssum;
ssum = qsum = 0;
for (i = maxqv-1; i >= 0; i--)
if (qgram[i] > 0)
{ ssum += sgram[i];
qsum += qgram[i];
printf(" %2d: %9lld %5.1f%% %9lld %5.1f%%\n",
i,sgram[i],(100.*ssum)/stotal,
qgram[i],(100.*qsum)/qtotal);
}
qsum = qgram[MAXQV];
printf("\n %2d: %9lld %5.1f%%\n\n",MAXQV,qgram[MAXQV],(100.*qsum)/qtotal);
// Estimate -g and -b parameters
bval = gval = -1;
qtotal -= qsum;
qsum = 0;
for (i = MAXQV-1; i >= 0; i--)
for (i = maxqv-1; i >= 0; i--)
if (qgram[i] > 0)
{ qsum += qgram[i];
printf(" %2d: %9lld %5.1f%%\n",i,qgram[i],(100.*qsum)/qtotal);
if ((100.*qsum)/qtotal > 7. && bval < 0)
bval = i;
bval = i+1;
if ((100.*qsum)/qtotal > 20. && gval < 0)
gval = i;
gval = i+1;
}
printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval);
}
free(root);
Close_DB(DB);
}
}
free(Prog_Name);
exit (0);
......
......@@ -21,22 +21,6 @@
static char *Usage = "<source:db> ...";
// Gap states
#define LOWQ 0 // Gap is spanned by many LAs and patchable
#define SPAN 1 // Gap has many paired LAs and patchable
#define SPLIT 2 // Gap is a chimer or an unpatchable gap
#define ADPRE 3 // Gap is due to adaptemer, trim prefix interval to left
#define ADSUF 4 // Gap is due to adaptemer, trim suffix interval to right
// Global Variables (must exist across the processing of each pile)
static DAZZ_DB _DB, *DB = &_DB; // Data base
static int64 *TRIM_IDX; // trim track index
static int *TRIM; // trim track values
int main(int argc, char *argv[])
{ int c;
......@@ -52,22 +36,13 @@ int main(int argc, char *argv[])
// Open trimmed DB and .qual and .trim tracks
for (c = 1; c < argc; c++)
{ DAZZ_DB _DB, *DB = &_DB;
DAZZ_EXTRA ex_hgap, ex_cest, ex_good, ex_bad, ex_trim;
// Load DB
{ int status;
char *root;
int i, a, tb, te;
int alen;
DAZZ_TRACK *track;
int64 nreads, totlen;
int64 nelim, nelimbp;
int64 n5trm, n5trmbp;
int64 n3trm, n3trmbp;
int64 natrm, natrmbp;
int64 ngaps, ngapsbp;
int64 nlowq, nlowqbp;
int64 nspan, nspanbp;
int64 nchim, nchimbp;
int rlog, blog;
int BAD_QV, GOOD_QV, HGAP_MIN;
status = Open_DB(argv[c],DB);
if (status < 0)
......@@ -77,136 +52,135 @@ int main(int argc, char *argv[])
exit (1);
}
Trim_DB(DB);
}
track = Load_Track(DB,"trim");
if (track != NULL)
{ FILE *afile;
int size, tracklen, extra;
// Get .trim track extras
TRIM_IDX = (int64 *) track->anno;
TRIM = (int *) track->data;
for (i = 0; i <= DB->nreads; i++)
TRIM_IDX[i] /= sizeof(int);
{ FILE *afile;
char *aname;
int extra, tracklen, size;
afile = NULL;
if (DB->part)
{ afile = fopen(Catenate(DB->path,
Numbered_Suffix(".",DB->part,"."),"trim",".anno"),"r");
{ aname = Strdup(Catenate(DB->path,Numbered_Suffix(".",DB->part,"."),"trim",".anno"),
"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
{ fprintf(stderr,"%s: Must have a 'trim.%d' track, run DAStrim\n",Prog_Name,DB->part);
exit (1);
}
}
else
afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
{ aname = Strdup(Catenate(DB->path,".","trim",".anno"),"Allocating anno file");
if (aname == NULL)
exit (1);
afile = fopen(aname,"r");
if (afile == NULL)
{ fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
exit (1);
}
}
fread(&tracklen,sizeof(int),1,afile);
fread(&size,sizeof(int),1,afile);
fseeko(afile,0,SEEK_END);
extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
fseeko(afile,-extra,SEEK_END);
if (extra == 4*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
ex_hgap.nelem = 0;
if (Read_Extra(afile,aname,&ex_hgap) != 0)
{ fprintf(stderr,"%s: Hgap threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
else if (extra == 3*sizeof(int) || extra == 2*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
HGAP_MIN = 0;
ex_cest.nelem = 0;
if (Read_Extra(afile,aname,&ex_cest) != 0)
{ fprintf(stderr,"%s: Coverage estimate extra missing from .trim track?\n",Prog_Name);
exit (1);
}
else
{ GOOD_QV = -1;
BAD_QV = -1;
HGAP_MIN = 0;
ex_good.nelem = 0;
if (Read_Extra(afile,aname,&ex_good) != 0)
{ fprintf(stderr,"%s: Good QV threshold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
fclose(afile);
ex_bad.nelem = 0;
if (Read_Extra(afile,aname,&ex_bad) != 0)
{ fprintf(stderr,"%s: Bad QV threshdold extra missing from .trim track?\n",Prog_Name);
exit (1);
}
else
{ fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
ex_trim.nelem = 0;
if (Read_Extra(afile,aname,&ex_trim) != 0)
{ fprintf(stderr,"%s: Trimming statistics extra missing from .trim track?\n",Prog_Name);
exit (1);
}
fclose(afile);
}
// Generate Display
{ char *root;
int64 nreads, totlen;
int64 nelim, nelimbp;
int64 n5trm, n5trmbp;
int64 n3trm, n3trmbp;
int64 natrm, natrmbp;
int64 ngaps, ngapsbp;
int64 nlowq, nlowqbp;
int64 nspan, nspanbp;
int64 nchim, nchimbp;
int rlog, blog;
int cover, hgap_min;
int bad_qv, good_qv;
int64 *tstats;
// Get relevant variables
root = Root(argv[c],".db");
nreads = DB->nreads;
totlen = DB->totlen;
nelim = 0;
n5trm = 0;
n3trm = 0;
natrm = 0;
nelimbp = 0;
n5trmbp = 0;
n3trmbp = 0;
natrmbp = 0;
ngaps = 0;
nlowq = 0;
nspan = 0;
nchim = 0;
ngapsbp = 0;
nlowqbp = 0;
nspanbp = 0;
nchimbp = 0;
for (a = 0; a < DB->nreads; a++)
{ tb = TRIM_IDX[a];
te = TRIM_IDX[a+1];
alen = DB->reads[a].rlen;
if (alen < HGAP_MIN)
hgap_min = (int) ((int64 *) (ex_hgap.value))[0];
cover = (int) ((int64 *) (ex_cest.value))[0];
good_qv = (int) ((int64 *) (ex_good.value))[0];
bad_qv = (int) ((int64 *) (ex_bad.value))[0];
tstats = (int64 *) (ex_trim.value);
nelim = tstats[0];
n5trm = tstats[1];
n3trm = tstats[2];
natrm = tstats[3];
nelimbp = tstats[4];
n5trmbp = tstats[5];
n3trmbp = tstats[6];
natrmbp = tstats[7];
ngaps = tstats[8];
nlowq = tstats[9];
nspan = tstats[10];
nchim = tstats[11];
ngapsbp = tstats[12];
nlowqbp = tstats[13];
nspanbp = tstats[14];
nchimbp = tstats[15];
printf("\nDAStrim");
if (hgap_min > 0)
printf(" [-H%d]",hgap_min);
printf(" -c%d -g%d -b%d %s\n\n",cover,good_qv,bad_qv,root);
// Compensate for HGAP
if (hgap_min > 0)
{ int i;
for (i = 0; i < DB->nreads; i++)
if (DB->reads[i].rlen < hgap_min)
{ nreads -= 1;
totlen -= alen;
}
else if (tb >= te)
{ nelim += 1;
nelimbp += alen;
}
else
{ if (TRIM[tb] > 0)
{ n5trm += 1;
n5trmbp += TRIM[tb];
}
if (TRIM[te-1] < alen)
{ n3trm += 1;
n3trmbp += alen - TRIM[te-1];
}
while (tb + 3 < te)
{ ngaps += 1;
ngapsbp += TRIM[tb+3] - TRIM[tb+1];
if (TRIM[tb+2] == LOWQ)
{ nlowq += 1;
nlowqbp += TRIM[tb+3] - TRIM[tb+1];
}
else if (TRIM[tb+2] == SPAN)
{ nspan += 1;
nspanbp += TRIM[tb+3] - TRIM[tb+1];
}
else if (TRIM[tb+2] == ADPRE)
{ natrm += 1;
natrmbp += TRIM[tb+3] - TRIM[tb];
}
else if (TRIM[tb+2] == ADSUF)
{ natrm += 1;
natrmbp += TRIM[tb+4] - TRIM[tb+1];
}
else
{ nchim += 1;
nchimbp += TRIM[tb+3] - TRIM[tb+1];
}
tb += 3;
}
totlen -= DB->reads[i].rlen;
}
}
printf("\nStatistics for DAStrim");
if (GOOD_QV >= 0)
printf(" -g%d",GOOD_QV);
else
printf(" -g??");
if (BAD_QV >= 0)
printf(" -b%d",BAD_QV);
else
printf(" -b??");
if (HGAP_MIN > 0)
printf(" [-H%d]",HGAP_MIN);
printf(" %s\n\n",root);
// Compute maximum field widths of statistics
{ int64 mult;
......@@ -233,12 +207,14 @@ int main(int argc, char *argv[])
blog += (blog-1)/3;
}
// Display the statistices
printf(" Input: ");
Print_Number((int64) nreads,rlog,stdout);
printf(" (100.0%%) reads ");
Print_Number(totlen,blog,stdout);
printf(" (100.0%%) bases");
if (HGAP_MIN > 0)
if (hgap_min > 0)
{ printf(" (another ");
Print_Number((int64) (DB->nreads-nreads),0,stdout);
printf(" were < H-length)");
......@@ -263,13 +239,11 @@ int main(int argc, char *argv[])
Print_Number(n3trmbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen);
if (natrm > 0)
{ printf(" Adapter: ");
printf(" Adapter: ");
Print_Number(natrm,rlog,stdout);
printf(" (%5.1f%%) reads ",(100.*natrm)/nreads);
Print_Number(natrmbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen);
}
printf("\n");
......@@ -314,6 +288,7 @@ int main(int argc, char *argv[])
free(root);
Close_DB(DB);
}
}
free(Prog_Name);
exit (0);
......
......@@ -34,7 +34,8 @@
#undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap
#undef DEBUG_ALIGN // Show division points of Compute_Trace
#undef DEBUG_SCRIPT // Show trace additions for Compute_Trace
#undef DEBUG_TRACE // Show trace additions for Compute_Trace
#undef DEBUG_SCRIPT // Show script additions for Compute_Trace
#undef DEBUG_AWAVE // Show F/R waves of Compute_Trace
#undef SHOW_TRACE // Show full trace for Print_Alignment
......@@ -3060,13 +3061,24 @@ int Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
return (0);
}
void Compress_TraceTo8(Overlap *ovl)
int Compress_TraceTo8(Overlap *ovl, int check)
{ uint16 *t16 = (uint16 *) ovl->path.trace;
uint8 *t8 = (uint8 *) ovl->path.trace;
int j;
int j, x;
if (check)
for (j = 0; j < ovl->path.tlen; j++)
{ x = t16[j];
if (x > 255)
{ fprintf(stderr,"%s: Compression of trace to bytes fails, value too big\n",Prog_Name);
EXIT(1);
}
t8[j] = (uint8) x;
}
else
for (j = 0; j < ovl->path.tlen; j++)
t8[j] = (uint8) (t16[j]);
return (0);
}
void Decompress_TraceTo16(Overlap *ovl)
......@@ -3909,45 +3921,20 @@ static int depth = 0;
typedef struct
{ int *Stop; // Ongoing stack of alignment indels
uint16 *Trace; // Base of Trace Vector
char *Aabs, *Babs; // Absolute base of A and B sequences
int **PVF, **PHF; // List of waves for iterative np algorithms
int mida, midb; // mid point division for mid-point algorithms
int *VF, *VB; // Forward/Reverse waves for nd algorithms
// (defunct: were used for O(nd) algorithms)
} Trace_Waves;
static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
static int split_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int *px, int *py)
{ int x, y;
int D;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
#endif
if (M <= 0)
{ x = (wave->Aabs-A)-1;
for (y = 1; y <= N; y++)
{ *wave->Stop++ = x;
#ifdef DEBUG_SCRIPT
printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
#endif
}
return (N);
}
if (N <= 0)
{ y = (B-wave->Babs)+1;
for (x = 1; x <= M; x++)
{ *wave->Stop++ = y;
#ifdef DEBUG_SCRIPT
printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
#endif
}
return (M);
}
{ int *VF = wave->VF;
int *VF = wave->VF;
int *VB = wave->VB;
int flow; // fhgh == D !
int blow, bhgh;
......@@ -3961,8 +3948,10 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
{ while (y < M && B[y] == A[y])
y += 1;
if (y >= M && N == M)
{ *px = *py = M;
return (0);
}
}
flow = 0;
VF[0] = y;
......@@ -4021,7 +4010,9 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
else
y = r+1;
x = k+y;
goto OVERLAP2;
*px = x;
*py = y;
return (D);
}
}
......@@ -4076,7 +4067,9 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
else
y = r;
x = k+y;
goto OVERLAP2;
*px = x;
*py = y;
return (D);
}
}
......@@ -4099,36 +4092,215 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
}
}
OVERLAP2:
static int trace_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int tspace)
{ int x, y;
int D, s;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
fflush(stdout);
#endif
if (M <= 0)
{ y = (((A-wave->Aabs)/tspace) << 1);
wave->Trace[y] += N;
wave->Trace[y+1] += N;
#ifdef DEBUG_TRACE
printf("%*s Adding1 (%d,%d) to tp %d(%d,%d)\n",depth,"",N,N,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
return (N);
}
if (N <= 0)
{ x = A - wave->Aabs;
y = x/tspace;
x = (y+1)*tspace - x;
y <<= 1;
for (s = M; s > 0; s -= x, x = tspace)
{ if (x > s)
x = s;
wave->Trace[y] += x;
#ifdef DEBUG_TRACE
printf("%*s Adding2 (0,%d) to tp %d(%d,%d)\n",depth,"",x,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
y += 2;
}
return (M);
}
D = split_nd(A,M,B,N,wave,&x,&y);
if (D > 1)
{
#ifdef DEBUG_ALIGN
printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
fflush(stdout);
depth += 2;
#endif
s = A-wave->Aabs;
if ((s/tspace+1)*tspace - s >= x)
{ s = ((s/tspace)<<1);
wave->Trace[s] += (D+1)/2;
wave->Trace[s+1] += y;
#ifdef DEBUG_TRACE
printf("%*s Adding3 (%d,%d) to tp %d(%d,%d)\n",depth,"",y,(D+1)/2,s>>1,
wave->Trace[s+1],wave->Trace[s]);
fflush(stdout);
#endif
}
else
trace_nd(A,x,B,y,wave,tspace);
s = (A+x)-wave->Aabs;
if ((s/tspace+1)*tspace - s >= M-x)
{ s = ((s/tspace)<<1);
wave->Trace[s] += D/2;
wave->Trace[s+1] += N-y;
#ifdef DEBUG_TRACE
printf("%*s Adding4 (%d,%d)) to tp %d(%d,%d)\n",depth,"",N-y,D/2,s>>1,
wave->Trace[s+1],wave->Trace[s]);
fflush(stdout);
#endif
}
else
trace_nd(A+x,M-x,B+y,N-y,wave,tspace);
#ifdef DEBUG_ALIGN
depth -= 2;
#endif
}
else
{ int u, v;
if (D == 0 || M < N)
s = x;
else
s = x-1;
if (s > 0)
{ u = A - wave->Aabs;
v = u/tspace;
u = (v+1)*tspace - u;
for (v <<= 1; s > 0; s -= u, u = tspace)
{ if (u > s)
u = s;
wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
wave->Trace[v+1],wave->Trace[v]);
fflush(stdout);
#endif
v += 2;
}
}
if (D == 0)
return (D);
if (M < N)
y = ((((A+x)-wave->Aabs)/tspace)<<1);
else
y = ((((A+(x-1))-wave->Aabs)/tspace)<<1);
wave->Trace[y] += 1;
if (M <= N)
wave->Trace[y+1] += 1;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,1)) to tp %d(%d,%d)\n",depth,"",N>=M,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
s = M-x;
if (s > 0)
{ u = (A+x) - wave->Aabs;
v = u/tspace;
u = (v+1)*tspace - u;
for (v <<= 1; s > 0; s -= u, u = tspace)
{ if (u > s)
u = s;
wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
wave->Trace[v+1],wave->Trace[v]);
fflush(stdout);
#endif
v += 2;
}
}
}
return (D);
}
static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
{ int x, y;
int D;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
#endif
if (M <= 0)
{ x = (wave->Aabs-A)-1;
for (y = 1; y <= N; y++)
{ *wave->Stop++ = x;
#ifdef DEBUG_SCRIPT
printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
#endif
}
return (N);
}
if (N <= 0)
{ y = (B-wave->Babs)+1;
for (x = 1; x <= M; x++)
{ *wave->Stop++ = y;
#ifdef DEBUG_SCRIPT
printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
#endif
}
return (M);
}
D = split_nd(A,M,B,N,wave,&x,&y);
if (D > 1)
{
#ifdef DEBUG_ALIGN
printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
fflush(stdout);
depth += 2;
#endif
dandc_nd(A,x,B,y,wave);
dandc_nd(A+x,M-x,B+y,N-y,wave);
#ifdef DEBUG_ALIGN
depth -= 2;
#endif
}
else if (D == 1)
{ if (M > N)
{ *wave->Stop++ = (B-wave->Babs)+y+1;
#ifdef DEBUG_SCRIPT
printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1);
#endif
}
else if (M < N)
{ *wave->Stop++ = (wave->Aabs-A)-x-1;
#ifdef DEBUG_SCRIPT
printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1);
#endif
}
#ifdef DEBUG_SCRIPT
else
printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y);
......@@ -4138,47 +4310,138 @@ OVERLAP2:
return (D);
}
static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework)
int Compute_Alignment(Alignment *align, Work_Data *ework, int task, int tspace)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
int L, D;
int asub, bsub;
char *aseq, *bseq;
Path *path;
int *trace;
uint16 *strace;
path = align->path;
asub = path->aepos-path->abpos;
bsub = path->bepos-path->bbpos;
aseq = align->aseq+path->abpos;
bseq = align->bseq+path->bbpos;
if (asub < bsub)
L = bsub;
if (task != DIFF_ONLY)
{ if (task == DIFF_TRACE || task == PLUS_TRACE)
L = 2*(((path->aepos + (tspace-1))/tspace - path->abpos/tspace) + 1)*sizeof(uint16);
else if (asub < bsub)
L = bsub*sizeof(int);
else
L = asub;
L *= sizeof(int);
L = asub*sizeof(int);
if (L > work->tramax)
if (enlarge_trace(work,L))
EXIT(1);
}
trace = wave.Stop = ((int *) work->trace);
trace = ((int *) work->trace);
strace = ((uint16 *) work->trace);
D = 2*(path->diffs + 4)*sizeof(int);
if (asub > bsub)
D = (4*asub+6)*sizeof(int);
else
D = (4*bsub+6)*sizeof(int);
if (D > work->vecmax)
if (enlarge_vector(work,D))
EXIT(1);
D = (path->diffs+3)/2;
wave.VF = ((int *) work->vector) + (D+1);
wave.VB = wave.VF + (2*D+1);
if (asub > bsub)
{ wave.VF = ((int *) work->vector) + (asub+1);
wave.VB = wave.VF + (2*asub+3);
}
else
{ wave.VF = ((int *) work->vector) + (bsub+1);
wave.VB = wave.VF + (2*bsub+3);
}
wave.Aabs = align->aseq;
wave.Babs = align->bseq;
path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos,
align->bseq+path->bbpos,path->bepos-path->bbpos,&wave);
path->trace = trace;
if (task == DIFF_ONLY)
{ wave.mida = -1;
if (asub <= 0)
path->diffs = bsub;
else if (bsub <= 0)
path->diffs = asub;
else
path->diffs = split_nd(aseq,asub,bseq,bsub,&wave,&wave.mida,&wave.midb);
path->trace = NULL;
path->tlen = -1;
return (0);
}
else if (task < DIFF_ONLY && wave.mida >= 0)
{ int x = wave.mida;
int y = wave.midb;
if (task == PLUS_ALIGN)
{ wave.Stop = trace;
dandc_nd(aseq,x,bseq,y,&wave);
dandc_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave);
path->tlen = wave.Stop - trace;
}
else
{ int i, n;
wave.Trace = strace - 2*(path->abpos/tspace);
n = L/sizeof(uint16);
for (i = 0; i < n; i++)
strace[i] = 0;
trace_nd(aseq,x,bseq,y,&wave,tspace);
trace_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave,tspace);
if (strace[n-1] != 0) // Last element is to capture all inserts on TP boundary
{ strace[n-3] += strace[n-1];
strace[n-4] += strace[n-2];
}
path->tlen = n-2;
#ifdef DEBUG_SCRIPT
printf(" Trace:\n");
for (i = 0; i < path->tlen; i += 2)
printf(" %3d %3d\n",strace[i],strace[i+1]);
fflush(stdout);
#endif
}
}
else
{ if (task == DIFF_ALIGN)
{ wave.Stop = trace;
path->diffs = dandc_nd(aseq,asub,bseq,bsub,&wave);
path->tlen = wave.Stop - trace;
}
else
{ int i, n;
wave.Trace = strace - 2*(path->abpos/tspace);
n = L/sizeof(uint16);
for (i = 0; i < n; i++)
strace[i] = 0;
path->diffs = trace_nd(aseq,asub,bseq,bsub,&wave,tspace);
if (strace[n-1] != 0) // Last element is to capture all inserts on TP boundary
{ strace[n-3] += strace[n-1];
strace[n-4] += strace[n-2];
}
path->tlen = n-2;
#ifdef DEBUG_SCRIPT
printf(" Trace:\n");
for (i = 0; i < path->tlen; i += 2)
printf(" %3d %3d\n",strace[i],strace[i+1]);
fflush(stdout);
#endif
}
}
path->trace = trace;
return (0);
}
......@@ -4825,79 +5088,6 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode
static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect";
int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
Path *path;
char *aseq, *bseq;
int alen, blen;
int M, N, D;
int dmax;
alen = align->alen;
blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
M = path->aepos-path->abpos;
N = path->bepos-path->bbpos;
{ int64 s;
int d;
int **PVF, **PHF;
if (M < N)
s = N;
else
s = M;
s *= sizeof(int);
if (s > work->tramax)
if (enlarge_trace(work,s))
EXIT(1);
dmax = path->diffs - abs(M-N);
s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *));
if (s > 256000000)
return (Compute_Trace_ND_ALL(align,ework));
if (s > work->vecmax)
if (enlarge_vector(work,s))
EXIT(1);
wave.PVF = PVF = ((int **) (work->vector)) + 2;
wave.PHF = PHF = PVF + (dmax+3);
s = M+N+3;
PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1);
for (d = -1; d <= dmax; d++)
PVF[d] = PVF[d-1] + s;
PHF[-2] = PVF[dmax] + s;
for (d = -1; d <= dmax; d++)
PHF[d] = PHF[d-1] + s;
}
wave.Stop = ((int *) work->trace);
wave.Aabs = aseq;
wave.Babs = bseq;
if (path->aepos > alen || path->bepos > blen)
{ EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
EXIT(1);
}
D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST,dmax);
if (D < 0)
EXIT(1);
path->diffs = D;
path->trace = work->trace;
path->tlen = wave.Stop - ((int *) path->trace);
return (0);
}
int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
......
......@@ -235,30 +235,25 @@ void Complement_Seq(char *a, int n);
int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !!
int diag, int anti, int lbord, int hbord, int prefix);
/* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment.
If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points
and diff levels computed by Local_Alignment. In either case 'path.trace' is set
/* Given a legitimate Alignment object and associated trace point vector in 'align->path.trace',
Compute_Trace_X, computes an exact trace for the alignment and resets 'align->path.trace'
to point at an integer array within the storage of the Work_Data packet encoding an
exact optimal trace from the start to end points. If the trace is needed beyond the
next call to a routine that sets it, then it should be copied to an array allocated
and managed by the caller.
Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the
best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the
edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the
trace between successive pass through points. It is much, much faster than Compute_Trace_ALL
but at the tradeoff of not necessarily being optimal as pass-through points are not all
perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points
of alignments between two adjacent pairs of pass through points. It is generally twice as
slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines
return 1 if an error occurred and 0 otherwise.
Compute_Trace_PTS computes a trace by computing the trace between successive trace points.
It is much, much faster than Compute_Alignment below but at the tradeoff of not necessarily
being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace
by computing the trace between the mid-points of alignments between two adjacent pairs of trace
points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal
alignments. Both these routines return 1 if an error occurred and 0 otherwise.
*/
#define LOWERMOST -1 // Possible modes for "mode" parameter below)
#define GREEDIEST 0
#define UPPERMOST 1
int Compute_Trace_ALL(Alignment *align, Work_Data *work);
int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode);
int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode);
......@@ -271,6 +266,25 @@ void Complement_Seq(char *a, int n);
int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !!
/* Compute Alignment determines the best alignment between the substrings specified by align.
If the task is DIFF_ONLY, then only the difference of this alignment is computed and placed
in the "diffs" field of align's path. If the task is PLUS_TRACE or DIFF_TRACE, then
'path.trace' is set to point at an integer array within the storage of the Work_Data packet
encoding a trace point sequence for an optimal alignment, whereas if the task is PLUS_ALIGN
or DIFF_ALIGN, then it points to an optimal trace of an optimatl alignment. The PLUS
tasks can only be called if the immmediately proceeding call was a DIFF_ONLY on the same
alignment record and sequences, in which case a little efficiency is gained by avoiding
the repetition of the top level search for an optimal mid-point.
*/
#define PLUS_ALIGN 0
#define PLUS_TRACE 1
#define DIFF_ONLY 2
#define DIFF_ALIGN 3
#define DIFF_TRACE 4
int Compute_Alignment(Alignment *align, Work_Data *work, int task, int trace_spacing);
/* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the
two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls
the display width of numbers, it must be not less than the width of any number to be
......@@ -339,7 +353,9 @@ typedef struct {
margin by 'indent' spaces.
Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and
Decompress_TraceTo16 does the reverse conversion.
Decompress_TraceTo16 does the reverse conversion. If check is set in a call to Compress
then it checks whether the values fit in 8-bits, and if not returns a non-zero result
in interactive mode, or exits with an error message in batch mode.
Check_Trace_Points checks that the number of trace points is correct and that the sum
of the b-read displacements equals the b-read alignment interval, assuming the trace
......@@ -353,7 +369,7 @@ typedef struct {
int Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
void Compress_TraceTo8(Overlap *ovl);
int Compress_TraceTo8(Overlap *ovl, int check);
void Decompress_TraceTo16(Overlap *ovl);
int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname);
......
dascrubber (1.1-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Add watch file
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.2.1
* Remove trailing whitespace in debian/copyright
* hardening=+all
-- Andreas Tille <tille@debian.org> Sun, 28 Oct 2018 08:56:42 +0100
dascrubber (0~20180108-1) unstable; urgency=medium
* New upstream snapshot (git 3491b14)
......
Source: dascrubber
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Afif Elghraoui <afif@debian.org>
Build-Depends: debhelper (>=10)
Standards-Version: 4.1.3
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~)
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/dascrubber
Vcs-Git: https://salsa.debian.org/med-team/dascrubber.git
Homepage: https://dazzlerblog.wordpress.com/
Vcs-Git: https://anonscm.debian.org/git/debian-med/dascrubber.git
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/dascrubber.git
Package: dascrubber
Architecture: any
Depends:
${shlibs:Depends},
${misc:Depends},
Suggests:
daligner,
dazzdb,
Depends: ${shlibs:Depends},
${misc:Depends}
Suggests: daligner,
dazzdb
Description: alignment-based scrubbing pipeline for DNA sequencing reads
The Dazzler Scrubbing Suite produces a set of edited reads that are guaranteed
to
......