Skip to content
Commits on Source (9)
......@@ -17,6 +17,7 @@
#include <ctype.h>
#include <unistd.h>
#include <dirent.h>
#include <limits.h>
#include "DB.h"
......@@ -599,8 +600,8 @@ error:
// for the retained reads.
void Trim_DB(DAZZ_DB *db)
{ int i, j, r;
int allflag, cutoff;
{ int i, j, r, f;
int allflag, cutoff, css;
int64 totlen;
int maxlen, nreads;
DAZZ_TRACK *record;
......@@ -676,14 +677,23 @@ void Trim_DB(DAZZ_DB *db)
record->anno = Realloc(record->anno,record->size*(j+1),NULL);
}
css = 0;
totlen = maxlen = 0;
for (j = i = 0; i < nreads; i++)
{ r = reads[i].rlen;
if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
{ f = reads[i].flags;
if ((f & DB_CSS) == 0)
css = 0;
r = reads[i].rlen;
if ((f & DB_BEST) >= allflag && r >= cutoff)
{ totlen += r;
if (r > maxlen)
maxlen = r;
reads[j++] = reads[i];
reads[j] = reads[i];
if (css)
reads[j++].flags |= DB_CSS;
else
reads[j++].flags &= ~DB_CSS;
css = 1;
}
}
......@@ -1989,3 +1999,175 @@ void Print_Read(char *s, int width)
printf("\n");
}
}
/*******************************************************************************************
*
* COMMAND LINE BLOCK PARSER
* Take a command line argument and interpret the '@' block number ranges.
* Parse_Block_Arg produces an Block_Looper iterator object that can then
* be invoked multiple times to iterate through all the files implied by
* the @ pattern/range.
*
********************************************************************************************/
typedef struct
{ int first, last, next;
char *root, *pwd, *ppnt;
char *slice;
} _Block_Looper;
// Advance the iterator e_parse to the next file, open it, and return the file pointer
// to it. Return NULL if at the end of the list of files.
FILE *Next_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
char *disp;
FILE *input;
parse->next += 1;
if (parse->next > parse->last)
return (NULL);
if (parse->next < 0)
disp = parse->root;
else
disp = Numbered_Suffix(parse->root,parse->next,parse->ppnt);
if ((input = fopen(Catenate(parse->pwd,"/",disp,".las"),"r")) == NULL)
{ if (parse->last != INT_MAX)
{ fprintf(stderr,"%s: %s.las is not present\n",Prog_Name,disp);
exit (1);
}
return (NULL);
}
return (input);
}
// Reset the iterator e_parse to the first file
void Reset_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
parse->next = parse->first - 1;
}
// Return a pointer to the path for the current file
char *Block_Arg_Path(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
return (parse->pwd);
}
// Return a pointer to the root name for the current file
char *Block_Arg_Root(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
if (parse->next < 0)
return (parse->root);
else
return (Numbered_Suffix(parse->root,parse->next,parse->ppnt));
}
// Free the iterator
void Free_Block_Arg(Block_Looper *e_parse)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
free(parse->root);
free(parse->pwd);
free(parse->slice);
free(parse);
}
char *Next_Block_Slice(Block_Looper *e_parse, int slice)
{ _Block_Looper *parse = (_Block_Looper *) e_parse;
if (parse->slice == NULL)
{ int size = strlen(parse->pwd) + strlen(Block_Arg_Root(parse)) + 30;
parse->slice = (char *) Malloc(size,"Block argument slice");
if (parse->slice == NULL)
exit (1);
}
if (parse->first < 0)
sprintf(parse->slice,"%s/%s",parse->pwd,parse->root);
else
sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1,
parse->next+slice,parse->ppnt);
parse->next += slice;
return (parse->slice);
}
// Parse the command line argument and return an iterator to move through the
// file names, setting it up to report the first file.
Block_Looper *Parse_Block_Arg(char *arg)
{ _Block_Looper *parse;
char *pwd, *root;
char *ppnt, *cpnt;
int first, last;
parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node");
pwd = PathTo(arg);
root = Root(arg,".las");
if (parse == NULL || pwd == NULL || root == NULL)
exit (1);
ppnt = index(root,BLOCK_SYMBOL);
if (ppnt == NULL)
first = last = -1;
else
{ if (index(ppnt+1,BLOCK_SYMBOL) != NULL)
{ fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n",
Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
*ppnt++ = '\0';
first = strtol(ppnt,&cpnt,10);
if (cpnt == ppnt)
{ first = 1;
last = INT_MAX;
}
else
{ if (first < 0)
{ fprintf(stderr,
"%s: Integer following %c-sigan is less than 0 in source name '%s'\n",
Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
if (*cpnt == '-')
{ ppnt = cpnt+1;
last = strtol(ppnt,&cpnt,10);
if (cpnt == ppnt)
{ fprintf(stderr,"%s: Second integer must follow - in source name '%s'\n",
Prog_Name,root);
exit (1);
}
if (last < first)
{ fprintf(stderr,
"%s: 2nd integer is less than 1st integer in source name '%s'\n",
Prog_Name,root);
exit (1);
}
ppnt = cpnt;
}
else
{ last = INT_MAX;
ppnt = cpnt;
}
}
}
parse->pwd = pwd;
parse->root = root;
parse->ppnt = ppnt;
parse->first = first;
parse->last = last;
parse->next = first-1;
parse->slice = NULL;
return ((Block_Looper *) parse);
}
......@@ -59,6 +59,8 @@ typedef signed long long int64;
typedef float float32;
typedef double float64;
#define LAST_READ_SYMBOL '$'
#define BLOCK_SYMBOL '@'
/*******************************************************************************************
*
......@@ -215,12 +217,11 @@ int Count_Args(char *arg);
SYSTEM_READ_ERROR \
}
#define FTELLO(file) \
( { int x = ftello(file); \
if (x < 0) \
#define FTELLO(val,file) \
{ val = ftello(file); \
if (val < 0) \
SYSTEM_READ_ERROR \
; x; \
} )
}
/*******************************************************************************************
*
......@@ -567,4 +568,24 @@ int Read_All_Sequences(DAZZ_DB *db, int ascii);
int List_DB_Files(char *path, void actor(char *path, char *extension));
// Take a command line argument and interpret the '@' block number ranges.
// Parse_Block_Arg produces a Block_Looper iterator object that can then
// be invoked multiple times to iterate through all the files implied by
// the @ pattern/range. Next_Block_Slice returns a string encoing the next
// slice files represented by an @-notation, and advances the iterator by
// that many files.
typedef void Block_Looper;
Block_Looper *Parse_Block_Arg(char *arg);
FILE *Next_Block_Arg(Block_Looper *e_parse);
char *Next_Block_Slice(Block_Looper *e_parse,int slice);
void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file
char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file
char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file
void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator
#endif // _DAZZ_DB
This diff is collapsed.
......@@ -19,7 +19,7 @@
#include "DB.h"
#include "align.h"
static char *Usage = "[-v] <source:las> > <target>.las";
static char *Usage = "[-v] <source:las> ... > <target>.las";
#define MEMORY 1000 // How many megabytes for output buffer
......@@ -28,7 +28,7 @@ int main(int argc, char *argv[])
FILE *input;
int64 novl, bsize, ovlsize, ptrsize;
int tspace, tbytes;
char *pwd, *root, *root2;
int c;
int VERBOSE;
......@@ -51,6 +51,10 @@ int main(int argc, char *argv[])
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," <source>'s may contain a template that is %c-sign optionally\n",
BLOCK_SYMBOL);
fprintf(stderr," followed by an integer or integer range\n");
exit (1);
}
}
......@@ -64,57 +68,49 @@ int main(int argc, char *argv[])
exit (1);
iblock += ptrsize;
pwd = PathTo(argv[1]);
root = Root(argv[1],".las");
novl = 0;
tspace = -1;
for (c = 1; c < argc; c++)
{ Block_Looper *parse;
FILE *input;
root2 = index(root,'#');
if (root2 == NULL)
{ fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root);
exit (1);
}
if (index(root2+1,'#') != NULL)
{ fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root);
exit (1);
}
*root2++ = '\0';
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ int64 povl;
int i, mspace;
novl = 0;
tspace = 0;
mspace = 0;
tbytes = 0;
for (i = 0; 1; i++)
{ char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las");
if ((input = fopen(name,"r")) == NULL) break;
int mspace;
if (fread(&povl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
novl += povl;
if (fread(&mspace,sizeof(int),1,input) != 1)
SYSTEM_READ_ERROR
if (i == 0)
{ tspace = mspace;
if (tspace <= TRACE_XOVR && tspace != 0)
tbytes = sizeof(uint8);
else
tbytes = sizeof(uint16);
}
if (tspace < 0)
tspace = mspace;
else if (tspace != mspace)
{ fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace);
{ fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files",
Prog_Name,Block_Arg_Root(parse));
fprintf(stderr," (%d vs %d)\n",tspace,mspace);
exit (1);
}
fclose(input);
}
Free_Block_Arg(parse);
}
if (tspace <= TRACE_XOVR && tspace != 0)
tbytes = sizeof(uint8);
else
tbytes = sizeof(uint16);
if (fwrite(&novl,sizeof(int64),1,stdout) != 1)
SYSTEM_READ_ERROR
if (fwrite(&tspace,sizeof(int),1,stdout) != 1)
SYSTEM_READ_ERROR
}
{ int i, j;
{ Block_Looper *parse;
int c, j;
Overlap *w;
int64 tsize, povl;
int mspace;
......@@ -124,17 +120,20 @@ int main(int argc, char *argv[])
optr = oblock;
otop = oblock + bsize;
for (i = 0; 1; i++)
{ char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las");
if ((input = fopen(name,"r")) == NULL) break;
for (c = 1; c < argc; c++)
{ parse = Parse_Block_Arg(argv[c]);
if (fread(&povl,sizeof(int64),1,input) != 1)
while ((input = Next_Block_Arg(parse)) != NULL)
{ if (fread(&povl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
if (fread(&mspace,sizeof(int),1,input) != 1)
SYSTEM_READ_ERROR
if (VERBOSE)
fprintf(stderr," Concatenating %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl);
{ fprintf(stderr,
" Concatenating %s: %lld la\'s\n",Block_Arg_Root(parse),povl);
fflush(stderr);
}
iptr = iblock;
itop = iblock + fread(iblock,1,bsize,input);
......@@ -179,6 +178,9 @@ int main(int argc, char *argv[])
fclose(input);
}
Free_Block_Arg(parse);
}
if (optr > oblock)
{ if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock))
SYSTEM_READ_ERROR
......@@ -186,10 +188,10 @@ int main(int argc, char *argv[])
}
if (VERBOSE)
fprintf(stderr," Totalling %lld la\'s\n",novl);
{ fprintf(stderr," Totalling %lld la\'s\n",novl);
fflush(stderr);
}
free(pwd);
free(root);
free(oblock);
free(iblock-ptrsize);
......
......@@ -18,7 +18,7 @@
#include "DB.h"
#include "align.h"
static char *Usage = "[-vS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...";
static char *Usage = "[-vaS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...";
#define MEMORY 1000 // How many megabytes for output buffer
......@@ -26,6 +26,7 @@ int main(int argc, char *argv[])
{ DAZZ_DB _db1, *db1 = &_db1;
DAZZ_DB _db2, *db2 = &_db2;
int VERBOSE;
int MAP_ORDER;
int SORTED;
int ISTWO;
int status;
......@@ -42,7 +43,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
ARG_FLAGS("vS")
ARG_FLAGS("vaS")
break;
}
else
......@@ -50,18 +51,24 @@ int main(int argc, char *argv[])
argc = j;
VERBOSE = flags['v'];
MAP_ORDER = flags['a'];
SORTED = flags['S'];
if (argc <= 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output error messages.\n");
fprintf(stderr," -S: Check that .las is in sorted order.\n");
fprintf(stderr," -a: If -S, then check sorted by A-read, A-position pairs\n");
fprintf(stderr," off => check sorted by A,B-read pairs (LA-piles)\n");
exit (1);
}
}
// Open trimmed DB
{ int status;
char *pwd, *root;
{ Block_Looper *parse;
int status;
FILE *input;
ISTWO = 0;
......@@ -73,14 +80,12 @@ int main(int argc, char *argv[])
exit (1);
}
pwd = PathTo(argv[2]);
root = Root(argv[2],".las");
if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL)
{ ISTWO = 1;
if (argc <= 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
db2 = db1;
else
{ parse = Parse_Block_Arg(argv[2]);
if ((input = Next_Block_Arg(parse)) == NULL)
{ ISTWO = 1;
status = Open_DB(argv[2],db2);
if (status < 0)
exit (1);
......@@ -91,13 +96,12 @@ int main(int argc, char *argv[])
Trim_DB(db2);
}
else
{ fclose(input);
db2 = db1;
{ db2 = db1;
fclose(input);
}
Free_Block_Arg(parse);
}
Trim_DB(db1);
free(root);
free(pwd);
}
{ char *iblock;
......@@ -122,8 +126,9 @@ int main(int argc, char *argv[])
status = 0;
for (i = 2+ISTWO; i < argc; i++)
{ char *pwd, *root;
{ Block_Looper *parse;
FILE *input;
char *disp;
char *iptr, *itop;
Overlap last, prev;
int64 novl;
......@@ -132,10 +137,10 @@ int main(int argc, char *argv[])
// Establish IO and (novl,tspace) header
pwd = PathTo(argv[i]);
root = Root(argv[i],".las");
if ((input = Fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL)
goto error;
parse = Parse_Block_Arg(argv[i]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ disp = Block_Arg_Root(parse);
if (fread(&novl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
......@@ -143,12 +148,12 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
if (novl < 0)
{ if (VERBOSE)
fprintf(stderr," %s: Number of alignments < 0\n",root);
fprintf(stderr," %s: Number of alignments < 0\n",disp);
goto error;
}
if (tspace < 0)
{ if (VERBOSE)
fprintf(stderr," %s: Trace spacing < 0\n",root);
fprintf(stderr," %s: Trace spacing < 0\n",disp);
goto error;
}
......@@ -185,7 +190,7 @@ int main(int argc, char *argv[])
itop += fread(itop,1,bsize-remains,input);
if (iptr + ovlsize > itop)
{ if (VERBOSE)
fprintf(stderr," %s: Too few alignment records\n",root);
fprintf(stderr," %s: Too few alignment records\n",disp);
goto error;
}
}
......@@ -203,7 +208,7 @@ int main(int argc, char *argv[])
itop += fread(itop,1,bsize-remains,input);
if (iptr + tsize > itop)
{ if (VERBOSE)
fprintf(stderr," %s: Too few alignment records\n",root);
fprintf(stderr," %s: Too few alignment records\n",disp);
goto error;
}
}
......@@ -214,12 +219,12 @@ int main(int argc, char *argv[])
if (ovl.aread < 0 || ovl.bread < 0)
{ if (VERBOSE)
fprintf(stderr," %s: Read indices < 0\n",root);
fprintf(stderr," %s: Read indices < 0\n",disp);
goto error;
}
if (ovl.aread >= nreads1 || ovl.bread >= nreads2)
{ if (VERBOSE)
fprintf(stderr," %s: Read indices out of range\n",root);
fprintf(stderr," %s: Read indices out of range\n",disp);
goto error;
}
......@@ -227,38 +232,38 @@ int main(int argc, char *argv[])
ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen ||
ovl.path.abpos < 0 || ovl.path.bbpos < 0 )
{ if (VERBOSE)
fprintf(stderr," %s: Non-sense alignment intervals\n",root);
fprintf(stderr," %s: Non-sense alignment intervals\n",disp);
goto error;
}
if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen ||
ovl.path.diffs > reads2[ovl.bread].rlen)
{ if (VERBOSE)
fprintf(stderr," %s: Non-sense number of differences\n",root);
fprintf(stderr," %s: Non-sense number of differences\n",disp);
goto error;
}
if (Check_Trace_Points(&ovl,tspace,VERBOSE,root))
if (Check_Trace_Points(&ovl,tspace,VERBOSE,disp))
goto error;
if (j == 0)
has_chains = ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0);
if (has_chains)
{ if ((ovl.flags & (START_FLAG | NEXT_FLAG)) == 0)
{ if (CHAIN_START(ovl.flags) && CHAIN_NEXT(ovl.flags))
{ if (VERBOSE)
fprintf(stderr," %s: LA has both start & next flag set\n",root);
fprintf(stderr," %s: LA has both start & next flag set\n",disp);
goto error;
}
if (BEST_CHAIN(ovl.flags) && CHAIN_NEXT(ovl.flags))
{ if (VERBOSE)
fprintf(stderr," %s: LA has both best & next flag set\n",root);
fprintf(stderr," %s: LA has both best & next flag set\n",disp);
goto error;
}
}
else
{ if ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0)
{ if (VERBOSE)
fprintf(stderr," %s: LAs should not have chain flags\n",root);
fprintf(stderr," %s: LAs should not have chain flags\n",disp);
goto error;
}
}
......@@ -267,9 +272,26 @@ int main(int argc, char *argv[])
equal = 0;
if (SORTED)
{ if (CHAIN_NEXT(ovl.flags) || !has_chains)
{ if (CHAIN_NEXT(ovl.flags))
{ if (ovl.aread == last.aread && ovl.bread != last.bread &&
COMP(ovl.flags) != COMP(last.flags) &&
ovl.path.abpos >= last.path.abpos &&
ovl.path.bbpos >= last.path.bbpos)
goto dupcheck;
if (VERBOSE)
fprintf(stderr," %s: Chain is not valid (%d vs %d)\n",
disp,ovl.aread+1,ovl.bread+1);
goto error;
}
else if (!has_chains)
{ if (ovl.aread > last.aread) goto inorder;
if (ovl.aread == last.aread)
{ if (MAP_ORDER)
{ if (ovl.path.abpos > prev.path.abpos) goto inorder;
if (ovl.path.abpos == prev.path.abpos)
goto dupcheck;
}
else
{ if (ovl.bread > last.bread) goto inorder;
if (ovl.bread == last.bread)
{ if (COMP(ovl.flags) > COMP(last.flags)) goto inorder;
......@@ -282,26 +304,37 @@ int main(int argc, char *argv[])
}
}
}
if (VERBOSE)
{ if (CHAIN_NEXT(ovl.flags))
fprintf(stderr," %s: Chain is not valid (%d vs %d)\n",
root,ovl.aread+1,ovl.bread+1);
else
fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n",
root,ovl.aread+1,ovl.bread+1);
}
if (VERBOSE)
fprintf(stderr," %s: LAs are not sorted (%d vs %d)\n",
disp,ovl.aread+1,ovl.bread+1);
goto error;
}
else
else // First element of a chain
{ if (ovl.aread > prev.aread) goto inorder;
if (ovl.aread == prev.aread)
{ if (MAP_ORDER)
{ if (ovl.path.abpos > prev.path.abpos) goto inorder;
if (ovl.path.abpos == prev.path.abpos)
goto dupcheck;
}
else
{ if (ovl.bread > prev.bread) goto inorder;
if (ovl.bread == prev.bread)
{ if (COMP(ovl.flags) > COMP(prev.flags)) goto inorder;
if (COMP(ovl.flags) == COMP(prev.flags))
{ if (ovl.path.abpos > prev.path.abpos) goto inorder;
if (ovl.path.abpos == prev.path.abpos)
{ equal = 1;
goto dupcheck;
}
}
}
}
}
if (VERBOSE)
fprintf(stderr," %s: Chains are not sorted (%d vs %d)\n",
root,ovl.aread+1,ovl.bread+1);
disp,ovl.aread+1,ovl.bread+1);
goto error;
}
}
......@@ -315,8 +348,8 @@ int main(int argc, char *argv[])
ovl.path.bbpos == last.path.bbpos &&
ovl.path.bepos == last.path.bepos)
{ if (VERBOSE)
fprintf(stderr," %s: Duplicate overlap (%d vs %d)\n",
root,ovl.aread+1,ovl.bread+1);
fprintf(stderr," %s: Duplicate LAs (%d vs %d)\n",
disp,ovl.aread+1,ovl.bread+1);
goto error;
}
}
......@@ -330,24 +363,30 @@ int main(int argc, char *argv[])
if (iptr < itop)
{ if (VERBOSE)
fprintf(stderr," %s: Too many alignment records\n",root);
fprintf(stderr," %s: Too many alignment records\n",disp);
goto error;
}
if (VERBOSE)
{ fprintf(stderr," %s: ",root);
Print_Number(novl,0,stderr);
fprintf(stderr," all OK\n");
{ printf(" %s: ",disp);
Print_Number(novl,0,stdout);
printf(" all OK\n");
fflush(stdout);
}
goto cleanup;
error:
status = 1;
if (VERBOSE)
{ printf(" %s: Not OK, see stderr\n",disp);
fflush(stdout);
}
cleanup:
if (input != NULL)
fclose(input);
free(pwd);
free(root);
}
Free_Block_Arg(parse);
}
free(iblock-ptrsize);
......
......@@ -24,8 +24,6 @@
static char *Usage =
"[-cdtlo] <src1:db|dam> [<src2:db|dam>] <align:las> [<reads:FILE> | <reads:range> ...]";
#define LAST_READ_SYMBOL '$'
static int ORDER(const void *l, const void *r)
{ int x = *((int *) l);
int y = *((int *) r);
......
......@@ -27,11 +27,11 @@ static char *Usage = "[-v] <source:las> ...";
#define MEMORY 1000 // How many megabytes for output buffer
int main(int argc, char *argv[])
{ char *iblock;
{ Block_Looper *parse;
char *iblock;
FILE *input, *output;
int64 novl, bsize, ovlsize, ptrsize;
int tspace, tbytes;
char *pwd, *root;
int64 tmax, ttot;
int64 omax, smax;
int64 odeg, sdeg;
......@@ -73,13 +73,10 @@ int main(int argc, char *argv[])
iblock += ptrsize;
for (i = 1; i < argc; i++)
{ pwd = PathTo(argv[i]);
root = Root(argv[i],".las");
input = Fopen(Catenate(pwd,"/",root,".las"),"r");
if (input == NULL)
exit (1);
{ parse = Parse_Block_Arg(argv[i]);
if (fread(&novl,sizeof(int64),1,input) != 1)
while ((input = Next_Block_Arg(parse)) != NULL)
{ if (fread(&novl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
if (fread(&tspace,sizeof(int),1,input) != 1)
SYSTEM_READ_ERROR
......@@ -88,17 +85,14 @@ int main(int argc, char *argv[])
else
tbytes = sizeof(uint16);
output = Fopen(Catenate(pwd,"/.",root,".las.idx"),"w");
output = Fopen(Catenate(Block_Arg_Path(parse),"/.",Block_Arg_Root(parse),".las.idx"),"w");
if (output == NULL)
exit (1);
free(pwd);
free(root);
if (VERBOSE)
{ printf(" Indexing %s: ",root);
{ printf(" Indexing %s: ",Block_Arg_Root(parse));
Print_Number(novl,0,stdout);
printf(" records ... ");
fprintf(stdout," records ... ");
fflush(stdout);
}
......@@ -195,6 +189,9 @@ int main(int argc, char *argv[])
fclose(output);
}
Free_Block_Arg(parse);
}
free(iblock-ptrsize);
exit (0);
......
......@@ -10,19 +10,23 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include "DB.h"
#include "align.h"
static char *Usage = "[-va] <merge:las> <parts:las> ...";
#undef DEBUG
static char *Usage = "[-va] [-P<dir(/tmp)>] <merge:las> <parts:las> ...";
#define MEMORY 4000 // in Mb
#undef DEBUG
#define MAX_FILES 250
// Heap sort of records according to (aread,bread,COMP(flags),abpos) order
......@@ -180,7 +184,7 @@ int main(int argc, char *argv[])
{ IO_block *in;
int64 bsize, osize, psize;
char *block, *oblock;
int i, fway;
int i, c, fway, clen, nfile[argc];
Overlap **heap;
int hsize;
Overlap *ovls;
......@@ -191,18 +195,34 @@ int main(int argc, char *argv[])
int VERBOSE;
int MAP_SORT;
char *TEMP_PATH;
// Process command line
{ int j, k;
int flags[128];
DIR *dirp;
ARG_INIT("LAmerge")
TEMP_PATH = "/tmp";
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
{ ARG_FLAGS("va") }
switch (argv[i][1])
{ default:
ARG_FLAGS("va")
break;
case 'P':
TEMP_PATH = argv[i]+2;
if ((dirp = opendir(TEMP_PATH)) == NULL)
{ fprintf(stderr,"%s: -P option: cannot open directory %s\n",Prog_Name,TEMP_PATH);
exit (1);
}
closedir(dirp);
break;
}
else
argv[j++] = argv[i];
argc = j;
......@@ -212,17 +232,129 @@ int main(int argc, char *argv[])
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n");
fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n");
fprintf(stderr," -P: Do any intermediate merging in directory -P.\n");
exit (1);
}
}
fway = argc-2;
if (fway > 252)
{ fprintf(stderr,"Exceeded maximum # of inputs and outputs (252) of merge\n");
// Determine the number of files and check they are all mergeable
clen = 2*strlen(TEMP_PATH) + 50;
fway = 0;
totl = 0;
tspace = -1;
for (c = 2; c < argc; c++)
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
clen += strlen(Block_Arg_Path(parse)) + strlen(Block_Arg_Root(parse)) + 30;
nfile[c] = 0;
while ((input = Next_Block_Arg(parse)) != NULL)
{ int64 povl;
int mspace;
if (fread(&povl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
totl += povl;
if (fread(&mspace,sizeof(int),1,input) != 1)
SYSTEM_READ_ERROR
if (tspace < 0)
tspace = mspace;
else if (tspace != mspace)
{ fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files",
Prog_Name,Block_Arg_Root(parse));
fprintf(stderr," (%d vs %d)\n",tspace,mspace);
exit (1);
}
fclose(input);
nfile[c] += 1;
}
Free_Block_Arg(parse);
fway += nfile[c];
}
// Open all the input files and initialize their buffers
if (VERBOSE)
{ printf(" Merging %d files totalling ",fway);
Print_Number(totl,0,stdout);
printf(" records\n");
fflush(stdout);
}
// Must recursively merge, emit sub-merges, then merge their results
if (fway > MAX_FILES)
{ Block_Looper *parse;
int mul, dim, fsum, cut;
char command[clen], *com;
int pid;
mul = 1;
for (c = 0; mul < fway; c++)
mul *= MAX_FILES;
dim = pow(1.*fway,1./c)+1;
fsum = 0;
c = 2;
parse = Parse_Block_Arg(argv[c]);
pid = getpid();
for (i = 1; i <= dim; i++)
{ com = command;
com += sprintf(com,"LAmerge");
if (MAP_SORT)
com += sprintf(com," -a");
if (mul > 2)
com += sprintf(com," -P%s",TEMP_PATH);
com += sprintf(com," %s/LM%d.P%d",TEMP_PATH,pid,i);
cut = (fway * i) / dim;
while (fsum + nfile[c] <= cut)
{ com += sprintf(com," %s",Next_Block_Slice(parse,nfile[c]));
fsum += nfile[c];
c += 1;
if (c >= argc)
break;
Free_Block_Arg(parse);
parse = Parse_Block_Arg(argv[c]);
}
if (c < argc && fsum < cut)
{ int n = cut-fsum;
com += sprintf(com," %s",Next_Block_Slice(parse,n));
nfile[c] -= n;
fsum += n;
}
system(command);
}
Free_Block_Arg(parse);
com = command;
com += sprintf(com,"LAmerge");
if (MAP_SORT)
com += sprintf(com," -a");
com += sprintf(com," %s %s/LM%d.P%c",argv[1],TEMP_PATH,pid,BLOCK_SYMBOL);
system(command);
sprintf(command,"rm %s/LM%d.P*.las",TEMP_PATH,pid);
system(command);
exit (0);
}
// Base level merge: Open all the input files and initialize their buffers
psize = sizeof(void *);
osize = sizeof(Overlap) - psize;
......@@ -233,47 +365,37 @@ int main(int argc, char *argv[])
exit (1);
block += psize;
totl = 0;
tbytes = 0;
tspace = 0;
for (i = 0; i < fway; i++)
fway = 0;
for (c = 2; c < argc; c++)
{ Block_Looper *parse;
FILE *input;
parse = Parse_Block_Arg(argv[c]);
while ((input = Next_Block_Arg(parse)) != NULL)
{ int64 novl;
int mspace;
FILE *input;
char *pwd, *root;
char *iblock;
pwd = PathTo(argv[i+2]);
root = Root(argv[i+2],".las");
input = Fopen(Catenate(pwd,"/",root,".las"),"r");
if (input == NULL)
exit (1);
free(pwd);
free(root);
if (fread(&novl,sizeof(int64),1,input) != 1)
SYSTEM_READ_ERROR
totl += novl;
if (fread(&mspace,sizeof(int),1,input) != 1)
SYSTEM_READ_ERROR
if (i == 0)
{ tspace = mspace;
in[fway].stream = input;
in[fway].block = iblock = block+fway*bsize;
in[fway].ptr = iblock;
in[fway].top = iblock + fread(in[fway].block,1,bsize,input);
in[fway].count = 0;
fway += 1;
}
Free_Block_Arg(parse);
}
if (tspace <= TRACE_XOVR && tspace != 0)
tbytes = sizeof(uint8);
else
tbytes = sizeof(uint16);
}
else if (tspace != mspace)
{ fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace);
exit (1);
}
in[i].stream = input;
in[i].block = iblock = block+i*bsize;
in[i].ptr = iblock;
in[i].top = iblock + fread(in[i].block,1,bsize,input);
in[i].count = 0;
}
// Open the output file buffer and write (novl,tspace) header
......@@ -297,12 +419,6 @@ int main(int argc, char *argv[])
otop = oblock + bsize;
}
if (VERBOSE)
{ printf("Merging %d files totalling ",fway);
Print_Number(totl,0,stdout);
printf(" records\n");
}
// Initialize the heap
heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap");
......
......@@ -26,8 +26,6 @@ static char *Usage[] =
" <src1:db|dam> [ <src2:db|dam> ] <align:las> [ <reads:FILE> | <reads:range> ... ]"
};
#define LAST_READ_SYMBOL '$'
static int ORDER(const void *l, const void *r)
{ int x = *((int *) l);
int y = *((int *) r);
......@@ -96,6 +94,17 @@ int main(int argc, char *argv[])
if (argc <= 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
fprintf(stderr,"\n");
fprintf(stderr," -c: Show a cartoon of the LA between reads.\n");
fprintf(stderr," -a: Show the alignment of each LA.\n");
fprintf(stderr," -r: Show the alignment of each LA with -w bp's of A in each row.\n");
fprintf(stderr," -o: Show only proper overlaps.\n");
fprintf(stderr," -F: Switch the roles of A- and B-reads.\n");
fprintf(stderr,"\n");
fprintf(stderr," -U: Show alignments in upper case.\n");
fprintf(stderr," -i: Indent alignments and cartoons by -i.\n");
fprintf(stderr," -w: Width of each row of alignment in symbols (-a) or bps (-r).\n");
fprintf(stderr," -b: # of border bp.s to show on each side of LA.\n");
exit (1);
}
}
......
......@@ -125,6 +125,10 @@ int main(int argc, char *argv[])
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n");
fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n");
exit (1);
}
}
......@@ -142,21 +146,18 @@ int main(int argc, char *argv[])
{ int64 *perm;
FILE *input, *foutput;
int64 novl, sov;
Block_Looper *parse;
parse = Parse_Block_Arg(argv[i]);
while ((input = Next_Block_Arg(parse)) != NULL)
{
// Read in the entire file and output header
{ int64 size;
struct stat info;
char *pwd, *root, *name;
pwd = PathTo(argv[i]);
root = Root(argv[i],".las");
name = Catenate(pwd,"/",root,".las");
input = Fopen(name,"r");
if (input == NULL)
exit (1);
stat(name,&info);
stat(Catenate(Block_Arg_Path(parse),"/",Block_Arg_Root(parse),".las"),&info);
size = info.st_size;
if (fread(&novl,sizeof(int64),1,input) != 1)
......@@ -170,7 +171,7 @@ int main(int argc, char *argv[])
tbytes = sizeof(uint16);
if (VERBOSE)
{ printf(" %s: ",root);
{ printf(" %s: ",Block_Arg_Root(parse));
Print_Number(novl,0,stdout);
printf(" records ");
Print_Number(size-novl*ovlsize,0,stdout);
......@@ -178,7 +179,7 @@ int main(int argc, char *argv[])
fflush(stdout);
}
foutput = Fopen(Catenate(pwd,"/",root,".S.las"),"w");
foutput = Fopen(Catenate(Block_Arg_Path(parse),"/",Block_Arg_Root(parse),".S.las"),"w");
if (foutput == NULL)
exit (1);
......@@ -187,9 +188,6 @@ int main(int argc, char *argv[])
if (fwrite(&tspace,sizeof(int),1,foutput) != 1)
SYSTEM_READ_ERROR
free(pwd);
free(root);
if (size > isize)
{ if (iblock == NULL)
iblock = Malloc(size+ptrsize,"Allocating LAsort input block");
......@@ -277,11 +275,14 @@ int main(int argc, char *argv[])
SYSTEM_READ_ERROR
}
}
}
free(perm);
fclose(foutput);
Free_Block_Arg(parse);
}
if (iblock != NULL)
free(iblock - ptrsize);
free(fblock);
......
......@@ -19,7 +19,7 @@
#include "DB.h"
#include "align.h"
static char *Usage = "-v <align:las> (<parts:int> | <path:db|dam>) < <source>.las";
static char *Usage = "-v <target:las> (<parts:int> | <path:db|dam>) < <source>.las";
#define MEMORY 1000 // How many megabytes for output buffer
......@@ -52,6 +52,10 @@ int main(int argc, char *argv[])
if (argc != 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
fprintf(stderr,"\n");
fprintf(stderr," <target> is a template that must have a single %c-sign in it\n",
BLOCK_SYMBOL);
fprintf(stderr," This symbol is replaced by numbers 1 to n = the number of parts\n");
exit (1);
}
}
......@@ -116,13 +120,14 @@ int main(int argc, char *argv[])
pwd = PathTo(argv[1]);
root = Root(argv[1],".las");
root2 = index(root,'#');
root2 = index(root,BLOCK_SYMBOL);
if (root2 == NULL)
{ fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root);
{ fprintf(stderr,"%s: No %c-sign in source name '%s'\n",Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
if (index(root2+1,'#') != NULL)
{ fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root);
if (index(root2+1,BLOCK_SYMBOL) != NULL)
{ fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n",
Prog_Name,BLOCK_SYMBOL,root);
exit (1);
}
*root2++ = '\0';
......@@ -137,7 +142,9 @@ int main(int argc, char *argv[])
tbytes = sizeof(uint16);
if (VERBOSE)
fprintf(stderr," Distributing %lld la\'s\n",novl);
{ printf(" Distributing %lld la\'s\n",novl);
fflush(stdout);
}
{ int i;
Overlap *w;
......@@ -227,7 +234,9 @@ int main(int argc, char *argv[])
fwrite(&povl,sizeof(int64),1,output);
if (VERBOSE)
fprintf(stderr," Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl);
{ printf(" Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl);
fflush(stdout);
}
fclose(output);
}
......
......@@ -4,7 +4,7 @@
## _First: April 10, 2016_
For typeset documentation, examples of use, and design philosophy please go to
my [blog](https://dazzlerblog.wordpress.com/command-guides/damasker-commands).
my [blog](https://dazzlerblog.wordpress.com/command-guides/daligner-command-reference-guide).
The commands below permit one to find all significant local alignments between reads
encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II
......@@ -18,8 +18,19 @@ in the data set. The alignment records are parsimonious in that they do not rec
alignment but simply a set of trace points, typically every 100bp or so, that allow the
efficient reconstruction of alignments on demand.
All programs add suffixes (e.g. .db, .las) as needed.
For the commands that take multiple .las files as arguments, i.e. LAsort, LAmerge, LAindex, LAcat,
and LAcheck, one can place a @-sign in the name, which is then interpreted as the sequence of files
obtained by replacing the @-sign by 1, 2, 3, ... in sequence until a number is reached for
which no file matches. One can also place a @-sign followed by an integer, say, i, in which
case the sequence starts at i. Lastly, one can also place @i-j where i and j are integers, in
which case the sequence is from i to j, inclusive.
The formal UNIX command line
descriptions and options for the DALIGNER module commands are as follows:
```
1. daligner [-vbAI]
1. daligner [-vabAI]
[-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>] [-P<dir(/tmp)>]
[-e<double(.70)] [-l<int(1000)] [-s<int(100)>] [-H<int>] [-T<int(4)>]
[-m<track>]+ <subject:db|dam> <target:db|dam> ...
......@@ -78,6 +89,8 @@ between different portions of the same read will also be found and reported. In
the command "daligner -A X Y" produces a single file X.Y..las and "daligner X Y" produces
2 files X.Y..las and Y.X.las (unless X=Y in which case only a single file, X.X.las, is
produced). The overlap records in one of these files are sorted as described for LAsort.
The -a option to daligner is passed directly through to LAsort which is actually called
as a sub-process to produce the sorted file.
In order to produce the aforementioned .las file, several temporary .las files, two for
each thread, are produce in the sub-directory /tmp by default. You can overide this
location by specifying the directory you would like this activity to take place in with
......@@ -114,12 +127,15 @@ LAsort can detects that it has been passed such a file and if so treats the chai
a unit and sorts them on the basis of the first LA in the chain.
```
3. LAmerge [-va] <merge:las> <parts:las> ...
3. LAmerge [-va] [-P<dir(/tmp)>] <merge:las> <parts:las> ...
```
Merge the .las files \<parts\> into a singled sorted file \<merge\>, where it is assumed
that the input \<parts\> files are sorted. Due to operating system limits, the number of
\<parts\> files must be &le; 252. With the -v option set the program reports the # of
that the input \<parts\> files are sorted. There are no limits to how many files can be
merged, but if there are more than 252, a typical UNIX OS limit on the number of simultaneously
open files, then the program recursively spawns sub-processes and creates temporary files
in the directory specified by the -P option, /tmp by default.
With the -v option set the program reports the number of
records read and written. The -a option indicates the sort is as describe for LAsort
above.
......@@ -248,7 +264,7 @@ first character of every line is a "1-code" character that tells you what inform
to expect on the line. The rest of the line contains information where each item is
separated by a single blank space. The trace point line gives the number of trace
point intervals in the LA and is immediately followed by that many lines containing
a pair of integers giving the # of differences and b-displacement in each successive
a pair of integers giving the number of differences and b-displacement in each successive
trace point interval.
```
......@@ -290,12 +306,11 @@ they need at any momment int time, as opposed to having to sequentially scan
through the .las file.
```
7. LAcat [-v] <source:las> > <target>.las
7. LAcat [-v] <source:las> ... > <target>.las
```
Given template name \<source\> that contains a single #-sign somewhere within it,
find all files that match it when the # is replace by i for i in 1,2,3,... and
a .las extension is added if not present. Then concatenate these files in order
The sequence of \<source\> files (that can contain @-sign block ranges) are
concatenated in order
into a single .las file and pipe the result to the standard output. The -v
option reports the files concatenated and the number of la's within them to
standard error (as the standard output receives the concatenated file).
......@@ -308,7 +323,7 @@ If the second argument is an integer n, then divide the alignment file \<source\
in through the standard input, as evenly as possible into n alignment files with the
names specified by template \<target\>, subject to the restriction that all alignment
records for a given a-read are in the same file. The name of the n files is the
string \<target\> where the single #-sign that occurs somewhere in it is replaced
string \<target\> where the single @-sign that occurs somewhere in it is replaced
by i for i in [1,n] and a .las extension is added if necessary.
If the second argument refers to a database \<path\>.db that has been partitioned, then
......@@ -317,26 +332,27 @@ in \<path\>.i.db are in the i'th file generated from the template \<target\>. T
option reports the files produced and the number of la's within them to standard error.
```
9. LAcheck [-vS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
9. LAcheck [-vaS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
```
LAcheck checks each .las file for structural integrity, where the a- and b-sequences
come from src1 or from src1 and scr2, respectively. That is, it makes sure each file
makes sense as a plausible .las file, e.g. values are not out of bound, the number of
records is correct, the number of trace points for a record is correct, and so on. If
the -S option is set then it further checks that the alignments are in sorted order.
the -S option is set then it further checks that the alignments are in sorted order,
by default pile order, but if -a is also set, then map order.
If the -v option is set then a line is output for each .las file saying either the
file is OK or reporting the first error. If the -v option is not set then the program
runs silently. The exit status is 0 if every file is deemed good, and 1 if at least
one of the files looks corrupted.
With the introduction of damapper, LAcheck checks to see if a file has chain
information, and if it does, then it checks the validity of chains and assumes that
the chains were sorted with the -a option to LAsort and LAmerge.
information, and if it does, then it checks the validity of chains and checks the
sorting order of chains as a unit according to the -a option.
```
10. HPC.daligner [-vbad] [-t<int>] [-w<int(6)>] [-l<int(1000)] [-s<int(100)] [-P<dir(/tmp)>]
[-M<int>] [-B<int(4)>] [-D<int( 250)>] [-T<int(4)>] [-f<name>]
10. HPC.daligner [-vbad] [-t<int>] [-w<int(6)>] [-l<int(1000)] [-s<int(100)] [-M<int>]
[-P<dir(/tmp)>] [-B<int(4)>] [-T<int(4)>] [-f<name>]
( [-k<int(14)>] [-h<int(35)>] [-e<double(.70)] [-H<int>]
[-k<int(20)>] [-h<int(50)>] [-e<double(.85)] <ref:db|dam> )
[-m<track>]+ <reads:db|dam> [<first:int>[-<last:int>]]
......@@ -364,18 +380,15 @@ these parameters are as for daligner. The -v and -a flags are passed to all call
LAsort and LAmerge. All other options are described later. For a database divided into
N sub-blocks, the calls to daligner will produce in total N<sup>2</sup> .las files,
on per block pair.
These are then merged in ceil(log<sub>D</sub> N) phases where
the number of files decreases geometrically in -D until there is 1 file per row of
the N x N block matrix. So at the end one has N sorted .las files that when
These are then merged so that there is 1 file per row of
the N x N block matrix. So at the end one has N sorted .las files, one per block of
A-reads, that when
concatenated would give a single large sorted overlap file.
The -B option (default 4) gives the desired number of block comparisons per call to
daligner. Some must contain B-1 comparisons, and the first B-2 block comparisons
even less, but the HPCdaligner "planner" does the best it can to give an average load
of dal block comparisons per command. The -D option (default 250) gives the maximum
number of files that will be merged in a single LAmerge command. The planner performs
D-way merges at all of the ceil(log<sub>D</sub> N) levels save the last, so as to minimize the
number of intermediate files.
of -B block comparisons per command.
If the integers \<first\> and \<last\> are missing then the script produced is for every
block in the database. If \<first\> is present then HPCdaligner produces an incremental
......@@ -419,14 +432,13 @@ DB" would produce the files:
JOBS.01.OVL
JOBS.02.CHECK.OPT
JOBS.03.MERGE
JOBS.04.CHECK.OPT
JOBS.05.RM.OPT
JOBS.04.RM.OPT
```
The number of command blocks varies as it depends on the number of merging rounds
required in the external sort of the .las files. The files with the suffix .OPT are
optional and need not be executed albeit we highly recommend that one run all the
CHECK blocks.
There are always 4 command blocks. The files with the suffix .OPT are
optional and need not be executed albeit we highly recommend that one run the
CHECK block. One should *not* run the RM block if one wants to later use
DASrealign after scrubbing.
A new -d option requests scripts that organize files into a collection of
sub-directories so as not to overwhelm the underlying OS for large genomes. Recall
......
......@@ -34,7 +34,8 @@
#undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap
#undef DEBUG_ALIGN // Show division points of Compute_Trace
#undef DEBUG_SCRIPT // Show trace additions for Compute_Trace
#undef DEBUG_TRACE // Show trace additions for Compute_Trace
#undef DEBUG_SCRIPT // Show script additions for Compute_Trace
#undef DEBUG_AWAVE // Show F/R waves of Compute_Trace
#undef SHOW_TRACE // Show full trace for Print_Alignment
......@@ -3060,13 +3061,24 @@ int Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
return (0);
}
void Compress_TraceTo8(Overlap *ovl)
int Compress_TraceTo8(Overlap *ovl, int check)
{ uint16 *t16 = (uint16 *) ovl->path.trace;
uint8 *t8 = (uint8 *) ovl->path.trace;
int j;
int j, x;
if (check)
for (j = 0; j < ovl->path.tlen; j++)
{ x = t16[j];
if (x > 255)
{ fprintf(stderr,"%s: Compression of trace to bytes fails, value too big\n",Prog_Name);
EXIT(1);
}
t8[j] = (uint8) x;
}
else
for (j = 0; j < ovl->path.tlen; j++)
t8[j] = (uint8) (t16[j]);
return (0);
}
void Decompress_TraceTo16(Overlap *ovl)
......@@ -3909,45 +3921,20 @@ static int depth = 0;
typedef struct
{ int *Stop; // Ongoing stack of alignment indels
uint16 *Trace; // Base of Trace Vector
char *Aabs, *Babs; // Absolute base of A and B sequences
int **PVF, **PHF; // List of waves for iterative np algorithms
int mida, midb; // mid point division for mid-point algorithms
int *VF, *VB; // Forward/Reverse waves for nd algorithms
// (defunct: were used for O(nd) algorithms)
} Trace_Waves;
static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
static int split_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int *px, int *py)
{ int x, y;
int D;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
#endif
if (M <= 0)
{ x = (wave->Aabs-A)-1;
for (y = 1; y <= N; y++)
{ *wave->Stop++ = x;
#ifdef DEBUG_SCRIPT
printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
#endif
}
return (N);
}
if (N <= 0)
{ y = (B-wave->Babs)+1;
for (x = 1; x <= M; x++)
{ *wave->Stop++ = y;
#ifdef DEBUG_SCRIPT
printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
#endif
}
return (M);
}
{ int *VF = wave->VF;
int *VF = wave->VF;
int *VB = wave->VB;
int flow; // fhgh == D !
int blow, bhgh;
......@@ -3961,8 +3948,10 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
{ while (y < M && B[y] == A[y])
y += 1;
if (y >= M && N == M)
{ *px = *py = M;
return (0);
}
}
flow = 0;
VF[0] = y;
......@@ -4021,7 +4010,9 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
else
y = r+1;
x = k+y;
goto OVERLAP2;
*px = x;
*py = y;
return (D);
}
}
......@@ -4076,7 +4067,9 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
else
y = r;
x = k+y;
goto OVERLAP2;
*px = x;
*py = y;
return (D);
}
}
......@@ -4099,36 +4092,215 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
}
}
OVERLAP2:
static int trace_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int tspace)
{ int x, y;
int D, s;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
fflush(stdout);
#endif
if (M <= 0)
{ y = (((A-wave->Aabs)/tspace) << 1);
wave->Trace[y] += N;
wave->Trace[y+1] += N;
#ifdef DEBUG_TRACE
printf("%*s Adding1 (%d,%d) to tp %d(%d,%d)\n",depth,"",N,N,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
return (N);
}
if (N <= 0)
{ x = A - wave->Aabs;
y = x/tspace;
x = (y+1)*tspace - x;
y <<= 1;
for (s = M; s > 0; s -= x, x = tspace)
{ if (x > s)
x = s;
wave->Trace[y] += x;
#ifdef DEBUG_TRACE
printf("%*s Adding2 (0,%d) to tp %d(%d,%d)\n",depth,"",x,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
y += 2;
}
return (M);
}
D = split_nd(A,M,B,N,wave,&x,&y);
if (D > 1)
{
#ifdef DEBUG_ALIGN
printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
fflush(stdout);
depth += 2;
#endif
s = A-wave->Aabs;
if ((s/tspace+1)*tspace - s >= x)
{ s = ((s/tspace)<<1);
wave->Trace[s] += (D+1)/2;
wave->Trace[s+1] += y;
#ifdef DEBUG_TRACE
printf("%*s Adding3 (%d,%d) to tp %d(%d,%d)\n",depth,"",y,(D+1)/2,s>>1,
wave->Trace[s+1],wave->Trace[s]);
fflush(stdout);
#endif
}
else
trace_nd(A,x,B,y,wave,tspace);
s = (A+x)-wave->Aabs;
if ((s/tspace+1)*tspace - s >= M-x)
{ s = ((s/tspace)<<1);
wave->Trace[s] += D/2;
wave->Trace[s+1] += N-y;
#ifdef DEBUG_TRACE
printf("%*s Adding4 (%d,%d)) to tp %d(%d,%d)\n",depth,"",N-y,D/2,s>>1,
wave->Trace[s+1],wave->Trace[s]);
fflush(stdout);
#endif
}
else
trace_nd(A+x,M-x,B+y,N-y,wave,tspace);
#ifdef DEBUG_ALIGN
depth -= 2;
#endif
}
else
{ int u, v;
if (D == 0 || M < N)
s = x;
else
s = x-1;
if (s > 0)
{ u = A - wave->Aabs;
v = u/tspace;
u = (v+1)*tspace - u;
for (v <<= 1; s > 0; s -= u, u = tspace)
{ if (u > s)
u = s;
wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
wave->Trace[v+1],wave->Trace[v]);
fflush(stdout);
#endif
v += 2;
}
}
if (D == 0)
return (D);
if (M < N)
y = ((((A+x)-wave->Aabs)/tspace)<<1);
else
y = ((((A+(x-1))-wave->Aabs)/tspace)<<1);
wave->Trace[y] += 1;
if (M <= N)
wave->Trace[y+1] += 1;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,1)) to tp %d(%d,%d)\n",depth,"",N>=M,y>>1,
wave->Trace[y+1],wave->Trace[y]);
fflush(stdout);
#endif
s = M-x;
if (s > 0)
{ u = (A+x) - wave->Aabs;
v = u/tspace;
u = (v+1)*tspace - u;
for (v <<= 1; s > 0; s -= u, u = tspace)
{ if (u > s)
u = s;
wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
wave->Trace[v+1],wave->Trace[v]);
fflush(stdout);
#endif
v += 2;
}
}
}
return (D);
}
static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
{ int x, y;
int D;
#ifdef DEBUG_ALIGN
printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
#endif
if (M <= 0)
{ x = (wave->Aabs-A)-1;
for (y = 1; y <= N; y++)
{ *wave->Stop++ = x;
#ifdef DEBUG_SCRIPT
printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
#endif
}
return (N);
}
if (N <= 0)
{ y = (B-wave->Babs)+1;
for (x = 1; x <= M; x++)
{ *wave->Stop++ = y;
#ifdef DEBUG_SCRIPT
printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
#endif
}
return (M);
}
D = split_nd(A,M,B,N,wave,&x,&y);
if (D > 1)
{
#ifdef DEBUG_ALIGN
printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
fflush(stdout);
depth += 2;
#endif
dandc_nd(A,x,B,y,wave);
dandc_nd(A+x,M-x,B+y,N-y,wave);
#ifdef DEBUG_ALIGN
depth -= 2;
#endif
}
else if (D == 1)
{ if (M > N)
{ *wave->Stop++ = (B-wave->Babs)+y+1;
#ifdef DEBUG_SCRIPT
printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1);
#endif
}
else if (M < N)
{ *wave->Stop++ = (wave->Aabs-A)-x-1;
#ifdef DEBUG_SCRIPT
printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1);
#endif
}
#ifdef DEBUG_SCRIPT
else
printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y);
......@@ -4138,47 +4310,138 @@ OVERLAP2:
return (D);
}
static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework)
int Compute_Alignment(Alignment *align, Work_Data *ework, int task, int tspace)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
int L, D;
int asub, bsub;
char *aseq, *bseq;
Path *path;
int *trace;
uint16 *strace;
path = align->path;
asub = path->aepos-path->abpos;
bsub = path->bepos-path->bbpos;
aseq = align->aseq+path->abpos;
bseq = align->bseq+path->bbpos;
if (asub < bsub)
L = bsub;
if (task != DIFF_ONLY)
{ if (task == DIFF_TRACE || task == PLUS_TRACE)
L = 2*(((path->aepos + (tspace-1))/tspace - path->abpos/tspace) + 1)*sizeof(uint16);
else if (asub < bsub)
L = bsub*sizeof(int);
else
L = asub;
L *= sizeof(int);
L = asub*sizeof(int);
if (L > work->tramax)
if (enlarge_trace(work,L))
EXIT(1);
}
trace = wave.Stop = ((int *) work->trace);
trace = ((int *) work->trace);
strace = ((uint16 *) work->trace);
D = 2*(path->diffs + 4)*sizeof(int);
if (asub > bsub)
D = (4*asub+6)*sizeof(int);
else
D = (4*bsub+6)*sizeof(int);
if (D > work->vecmax)
if (enlarge_vector(work,D))
EXIT(1);
D = (path->diffs+3)/2;
wave.VF = ((int *) work->vector) + (D+1);
wave.VB = wave.VF + (2*D+1);
if (asub > bsub)
{ wave.VF = ((int *) work->vector) + (asub+1);
wave.VB = wave.VF + (2*asub+3);
}
else
{ wave.VF = ((int *) work->vector) + (bsub+1);
wave.VB = wave.VF + (2*bsub+3);
}
wave.Aabs = align->aseq;
wave.Babs = align->bseq;
path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos,
align->bseq+path->bbpos,path->bepos-path->bbpos,&wave);
path->trace = trace;
if (task == DIFF_ONLY)
{ wave.mida = -1;
if (asub <= 0)
path->diffs = bsub;
else if (bsub <= 0)
path->diffs = asub;
else
path->diffs = split_nd(aseq,asub,bseq,bsub,&wave,&wave.mida,&wave.midb);
path->trace = NULL;
path->tlen = -1;
return (0);
}
else if (task < DIFF_ONLY && wave.mida >= 0)
{ int x = wave.mida;
int y = wave.midb;
if (task == PLUS_ALIGN)
{ wave.Stop = trace;
dandc_nd(aseq,x,bseq,y,&wave);
dandc_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave);
path->tlen = wave.Stop - trace;
}
else
{ int i, n;
wave.Trace = strace - 2*(path->abpos/tspace);
n = L/sizeof(uint16);
for (i = 0; i < n; i++)
strace[i] = 0;
trace_nd(aseq,x,bseq,y,&wave,tspace);
trace_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave,tspace);
if (strace[n-1] != 0) // Last element is to capture all inserts on TP boundary
{ strace[n-3] += strace[n-1];
strace[n-4] += strace[n-2];
}
path->tlen = n-2;
#ifdef DEBUG_SCRIPT
printf(" Trace:\n");
for (i = 0; i < path->tlen; i += 2)
printf(" %3d %3d\n",strace[i],strace[i+1]);
fflush(stdout);
#endif
}
}
else
{ if (task == DIFF_ALIGN)
{ wave.Stop = trace;
path->diffs = dandc_nd(aseq,asub,bseq,bsub,&wave);
path->tlen = wave.Stop - trace;
}
else
{ int i, n;
wave.Trace = strace - 2*(path->abpos/tspace);
n = L/sizeof(uint16);
for (i = 0; i < n; i++)
strace[i] = 0;
path->diffs = trace_nd(aseq,asub,bseq,bsub,&wave,tspace);
if (strace[n-1] != 0) // Last element is to capture all inserts on TP boundary
{ strace[n-3] += strace[n-1];
strace[n-4] += strace[n-2];
}
path->tlen = n-2;
#ifdef DEBUG_SCRIPT
printf(" Trace:\n");
for (i = 0; i < path->tlen; i += 2)
printf(" %3d %3d\n",strace[i],strace[i+1]);
fflush(stdout);
#endif
}
}
path->trace = trace;
return (0);
}
......@@ -4825,79 +5088,6 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode
static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect";
int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
Path *path;
char *aseq, *bseq;
int alen, blen;
int M, N, D;
int dmax;
alen = align->alen;
blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
M = path->aepos-path->abpos;
N = path->bepos-path->bbpos;
{ int64 s;
int d;
int **PVF, **PHF;
if (M < N)
s = N;
else
s = M;
s *= sizeof(int);
if (s > work->tramax)
if (enlarge_trace(work,s))
EXIT(1);
dmax = path->diffs - abs(M-N);
s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *));
if (s > 256000000)
return (Compute_Trace_ND_ALL(align,ework));
if (s > work->vecmax)
if (enlarge_vector(work,s))
EXIT(1);
wave.PVF = PVF = ((int **) (work->vector)) + 2;
wave.PHF = PHF = PVF + (dmax+3);
s = M+N+3;
PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1);
for (d = -1; d <= dmax; d++)
PVF[d] = PVF[d-1] + s;
PHF[-2] = PVF[dmax] + s;
for (d = -1; d <= dmax; d++)
PHF[d] = PHF[d-1] + s;
}
wave.Stop = ((int *) work->trace);
wave.Aabs = aseq;
wave.Babs = bseq;
if (path->aepos > alen || path->bepos > blen)
{ EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
EXIT(1);
}
D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST,dmax);
if (D < 0)
EXIT(1);
path->diffs = D;
path->trace = work->trace;
path->tlen = wave.Stop - ((int *) path->trace);
return (0);
}
int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
......
......@@ -235,30 +235,25 @@ void Complement_Seq(char *a, int n);
int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !!
int diag, int anti, int lbord, int hbord, int prefix);
/* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment.
If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points
and diff levels computed by Local_Alignment. In either case 'path.trace' is set
/* Given a legitimate Alignment object and associated trace point vector in 'align->path.trace',
Compute_Trace_X, computes an exact trace for the alignment and resets 'align->path.trace'
to point at an integer array within the storage of the Work_Data packet encoding an
exact optimal trace from the start to end points. If the trace is needed beyond the
next call to a routine that sets it, then it should be copied to an array allocated
and managed by the caller.
Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the
best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the
edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the
trace between successive pass through points. It is much, much faster than Compute_Trace_ALL
but at the tradeoff of not necessarily being optimal as pass-through points are not all
perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points
of alignments between two adjacent pairs of pass through points. It is generally twice as
slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines
return 1 if an error occurred and 0 otherwise.
Compute_Trace_PTS computes a trace by computing the trace between successive trace points.
It is much, much faster than Compute_Alignment below but at the tradeoff of not necessarily
being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace
by computing the trace between the mid-points of alignments between two adjacent pairs of trace
points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal
alignments. Both these routines return 1 if an error occurred and 0 otherwise.
*/
#define LOWERMOST -1 // Possible modes for "mode" parameter below)
#define GREEDIEST 0
#define UPPERMOST 1
int Compute_Trace_ALL(Alignment *align, Work_Data *work);
int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode);
int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode);
......@@ -271,6 +266,25 @@ void Complement_Seq(char *a, int n);
int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !!
/* Compute Alignment determines the best alignment between the substrings specified by align.
If the task is DIFF_ONLY, then only the difference of this alignment is computed and placed
in the "diffs" field of align's path. If the task is PLUS_TRACE or DIFF_TRACE, then
'path.trace' is set to point at an integer array within the storage of the Work_Data packet
encoding a trace point sequence for an optimal alignment, whereas if the task is PLUS_ALIGN
or DIFF_ALIGN, then it points to an optimal trace of an optimatl alignment. The PLUS
tasks can only be called if the immmediately proceeding call was a DIFF_ONLY on the same
alignment record and sequences, in which case a little efficiency is gained by avoiding
the repetition of the top level search for an optimal mid-point.
*/
#define PLUS_ALIGN 0
#define PLUS_TRACE 1
#define DIFF_ONLY 2
#define DIFF_ALIGN 3
#define DIFF_TRACE 4
int Compute_Alignment(Alignment *align, Work_Data *work, int task, int trace_spacing);
/* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the
two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls
the display width of numbers, it must be not less than the width of any number to be
......@@ -339,7 +353,9 @@ typedef struct {
margin by 'indent' spaces.
Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and
Decompress_TraceTo16 does the reverse conversion.
Decompress_TraceTo16 does the reverse conversion. If check is set in a call to Compress
then it checks whether the values fit in 8-bits, and if not returns a non-zero result
in interactive mode, or exits with an error message in batch mode.
Check_Trace_Points checks that the number of trace points is correct and that the sum
of the b-read displacements equals the b-read alignment interval, assuming the trace
......@@ -353,7 +369,7 @@ typedef struct {
int Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
void Compress_TraceTo8(Overlap *ovl);
int Compress_TraceTo8(Overlap *ovl, int check);
void Decompress_TraceTo16(Overlap *ovl);
int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname);
......
......@@ -50,18 +50,19 @@
#include "filter.h"
static char *Usage[] =
{ "[-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>] [-P<dir(/tmp)>]",
{ "[-vabAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>] [-P<dir(/tmp)>]",
" [-e<double(.70)] [-l<int(1000)>] [-s<int(100)>] [-H<int>] [-T<int(4)>]",
" [-m<track>]+ <subject:db|dam> <target:db|dam> ...",
};
int VERBOSE; // Globally visible to filter.c
char *SORT_PATH;
int BIASED;
int MINOVER;
int HGAP_MIN;
int SYMMETRIC;
int IDENTITY;
char *SORT_PATH;
uint64 MEM_LIMIT;
uint64 MEM_PHYSICAL;
......@@ -261,14 +262,14 @@ static DAZZ_TRACK *merge_tracks(DAZZ_DB *block, int mtop, int64 nsize)
ntrack = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating merged track");
if (ntrack == NULL)
exit (1);
Clean_Exit(1);
ntrack->name = Strdup("merge","Allocating merged track");
ntrack->anno = anno = (int64 *) Malloc(sizeof(int64)*(block->nreads+1),"Allocating merged track");
ntrack->data = data = (int *) Malloc(sizeof(int)*nsize,"Allocating merged track");
ntrack->size = sizeof(int);
ntrack->next = NULL;
if (anno == NULL || data == NULL || ntrack->name == NULL)
exit (1);
Clean_Exit(1);
{ DAZZ_TRACK *track;
int i;
......@@ -344,7 +345,7 @@ static int read_DB(DAZZ_DB *block, char *name, char **mask, int *mstat, int mtop
isdam = Open_DB(name,block);
if (isdam < 0)
exit (1);
Clean_Exit(1);
for (i = 0; i < mtop; i++)
{ status = Check_Track(block,mask[i],&kind);
......@@ -400,7 +401,7 @@ static int read_DB(DAZZ_DB *block, char *name, char **mask, int *mstat, int mtop
if (block->reads[i].rlen < kmer)
{ fprintf(stderr,"%s: Block %s contains reads < %dbp long ! Run DBsplit.\n",
Prog_Name,name,kmer);
exit (1);
Clean_Exit(1);
}
}
......@@ -440,7 +441,7 @@ static DAZZ_DB *complement_DB(DAZZ_DB *block, int inplace)
else
{ seq = (char *) Malloc(block->reads[nreads].boff+1,"Allocating dazzler sequence block");
if (seq == NULL)
exit (1);
Clean_Exit(1);
*seq++ = 4;
memmove(seq,block->bases,block->reads[nreads].boff);
*cblock = *block;
......@@ -486,11 +487,11 @@ static DAZZ_DB *complement_DB(DAZZ_DB *block, int inplace)
trg = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),
"Allocating dazzler interval track header");
if (data == NULL || trg == NULL || anno == NULL)
exit (1);
Clean_Exit(1);
trg->name = Strdup(src->name,"Copying track name");
if (trg->name == NULL)
exit (1);
Clean_Exit(1);
trg->size = 4;
trg->anno = (void *) anno;
......@@ -519,22 +520,35 @@ static DAZZ_DB *complement_DB(DAZZ_DB *block, int inplace)
return (cblock);
}
static char *CommandBuffer(char *aname, char *bname)
static char *CommandBuffer(char *aname, char *bname, char *spath)
{ static char *cat = NULL;
static int max = -1;
int len;
len = 2*(strlen(aname) + strlen(bname)) + 200;
len = 2*(strlen(aname) + strlen(bname) + strlen(spath)) + 200;
if (len > max)
{ max = ((int) (1.2*len)) + 100;
if ((cat = (char *) realloc(cat,max+1)) == NULL)
{ fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name);
exit (1);
Clean_Exit(1);
}
}
return (cat);
}
void Clean_Exit(int val)
{ char *command;
command = CommandBuffer("","",SORT_PATH);
sprintf(command,"rm -r %s",SORT_PATH);
if (system(command) != 0)
{ fprintf(stderr,"%s: Command Failed:\n%*s %s\n",
Prog_Name,(int) strlen(Prog_Name),"",command);
exit (1);
}
exit (val);
}
int main(int argc, char *argv[])
{ DAZZ_DB _ablock, _bblock;
DAZZ_DB *ablock = &_ablock, *bblock = &_bblock;
......@@ -554,6 +568,7 @@ int main(int argc, char *argv[])
double AVE_ERROR;
int SPACING;
int NTHREADS;
int MAP_ORDER;
{ int i, j, k;
int flags[128];
......@@ -592,7 +607,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
ARG_FLAGS("vbAI")
ARG_FLAGS("vabAI")
break;
case 'k':
ARG_POSITIVE(KMER_LEN,"K-mer length")
......@@ -664,11 +679,35 @@ int main(int argc, char *argv[])
BIASED = flags['b']; // Globally declared in filter.h
SYMMETRIC = 1-flags['A'];
IDENTITY = flags['I'];
MAP_ORDER = flags['a'];
if (argc <= 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
fprintf(stderr,"\n");
fprintf(stderr," -k: k-mer size (must be <= 32).\n");
fprintf(stderr," -w: Look for k-mers in averlapping bands of size 2^-w.\n");
fprintf(stderr," -h: A seed hit if the k-mers in band cover >= -h bps in the");
fprintf(stderr," targest read.\n");
fprintf(stderr," -t: Ignore k-mers that occur >= -t times in a block.\n");
fprintf(stderr," -M: Use only -M GB of memory by ignoring most frequent k-mers.\n");
fprintf(stderr,"\n");
fprintf(stderr," -e: Look for alignments with -e percent similarity.\n");
fprintf(stderr," -l: Look for alignments of length >= -l.\n");
fprintf(stderr," -s: The trace point spacing for encoding alignments.\n");
fprintf(stderr," -H: HGAP option: align only target reads of length >= -H.\n");
fprintf(stderr,"\n");
fprintf(stderr," -T: Use -T threads.\n");
fprintf(stderr," -P: Do block level sorts and merges in directory -P.\n");
fprintf(stderr," -m: Soft mask the blocks with the specified mask.\n");
fprintf(stderr," -b: For AT/GC biased data, compensate k-mer counts (deprecated).\n");
fprintf(stderr,"\n");
fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n");
fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n");
fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n");
fprintf(stderr," -A: Compare subjet to target, but not vice versa.\n");
fprintf(stderr," -I: Compare reads to themselves\n");
exit (1);
}
......@@ -682,7 +721,22 @@ int main(int argc, char *argv[])
exit (1);
}
/* Read in the reads in A */
// Create directory in SORT_PATH for file operations
{ char *newpath;
newpath = (char *) Malloc(strlen(SORT_PATH)+30,"Allocating sort path");
if (newpath == NULL)
exit (1);
sprintf(newpath,"%s/daligner.%d",SORT_PATH,getpid());
if (mkdir(newpath,S_IRWXU) != 0)
{ fprintf(stderr,"%s: Could not create directory %s\n",Prog_Name,newpath);
exit (1);
}
SORT_PATH = newpath;
}
// Read in the reads in A
afile = argv[1];
isdam = read_DB(ablock,afile,MASK,MSTAT,MTOP,KMER_LEN);
......@@ -693,7 +747,7 @@ int main(int argc, char *argv[])
asettings = New_Align_Spec( AVE_ERROR, SPACING, ablock->freq, 1);
/* Compare against reads in B in both orientations */
// Compare against reads in B in both orientations
{ int i, j;
char *command;
......@@ -754,34 +808,37 @@ int main(int argc, char *argv[])
Close_DB(bblock);
command = CommandBuffer(aroot,broot);
command = CommandBuffer(aroot,broot,SORT_PATH);
#define SYSTEM_CHECK(command) \
if (VERBOSE) \
printf("%s\n",command); \
if (system(command) != 0) \
{ fprintf(stderr,"\n%s: Command Failed:\n%*s %s\n", \
Prog_Name,(int) strlen(Prog_Name),"",command); \
Clean_Exit(1); \
}
sprintf(command,"LAsort %s %s %s/%s.%s.N%c %s/%s.%s.C%c",VERBOSE?"-v":"",
MAP_ORDER?"-a":"",SORT_PATH,aroot,broot,BLOCK_SYMBOL,
SORT_PATH,aroot,broot,BLOCK_SYMBOL);
SYSTEM_CHECK(command)
sprintf(command,"LAmerge %s %s %s.%s.las %s/%s.%s.N%c.S %s/%s.%s.C%c.S",VERBOSE?"-v":"",
MAP_ORDER?"-a":"",aroot,broot,SORT_PATH,aroot,broot,BLOCK_SYMBOL,
SORT_PATH,aroot,broot,BLOCK_SYMBOL);
SYSTEM_CHECK(command)
sprintf(command,"LAsort %s/%s.%s.[CN]*.las",SORT_PATH,aroot,broot);
if (VERBOSE)
printf("\n%s\n",command);
system(command);
sprintf(command,"LAmerge %s.%s.las %s/%s.%s.[CN]*.S.las",aroot,broot,SORT_PATH,aroot,broot);
if (VERBOSE)
printf("%s\n",command);
system(command);
sprintf(command,"rm %s/%s.%s.[CN]*.las",SORT_PATH,aroot,broot);
if (VERBOSE)
printf("%s\n",command);
system(command);
if (aroot != broot && SYMMETRIC)
{ sprintf(command,"LAsort %s/%s.%s.[CN]*.las",SORT_PATH,broot,aroot);
if (VERBOSE)
printf("%s\n",command);
system(command);
sprintf(command,"LAmerge %s.%s.las %s/%s.%s.[CN]*.S.las",broot,aroot,
SORT_PATH,broot,aroot);
if (VERBOSE)
printf("%s\n",command);
system(command);
sprintf(command,"rm %s/%s.%s.[CN]*.las",SORT_PATH,broot,aroot);
if (VERBOSE)
printf("%s\n",command);
system(command);
{ sprintf(command,"LAsort %s %s %s/%s.%s.N%c %s/%s.%s.C%c",VERBOSE?"-v":"",
MAP_ORDER?"-a":"",SORT_PATH,broot,aroot,BLOCK_SYMBOL,
SORT_PATH,broot,aroot,BLOCK_SYMBOL);
SYSTEM_CHECK(command)
sprintf(command,"LAmerge %s %s %s.%s.las %s/%s.%s.N%c.S %s/%s.%s.C%c.S",VERBOSE?"-v":"",
MAP_ORDER?"-a":"",broot,aroot,SORT_PATH,broot,aroot,BLOCK_SYMBOL,
SORT_PATH,broot,aroot,BLOCK_SYMBOL);
SYSTEM_CHECK(command)
}
if (aroot != broot)
......@@ -789,5 +846,5 @@ int main(int argc, char *argv[])
}
}
exit (0);
Clean_Exit(0);
}
daligner (1.0+20180108-2) UNRELEASED; urgency=medium
daligner (1.0+git20180524.fd21879-1) unstable; urgency=medium
* Team upload.
[ Jelmer Vernooij ]
* Use secure copyright file specification URI.
-- Jelmer Vernooij <jelmer@debian.org> Mon, 08 Oct 2018 21:39:17 +0100
[ Andreas Tille ]
* New upstream commit
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.2.1
* Remove trailing whitespace in debian/copyright
* hardening=+all
-- Andreas Tille <tille@debian.org> Sun, 28 Oct 2018 08:34:03 +0100
daligner (1.0+20180108-1) unstable; urgency=medium
......
Source: daligner
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Afif Elghraoui <afif@debian.org>
Build-Depends:
debhelper (>= 10),
Standards-Version: 4.1.3
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~)
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/daligner
Vcs-Git: https://salsa.debian.org/med-team/daligner.git
Homepage: https://dazzlerblog.wordpress.com
Vcs-Git: https://anonscm.debian.org/git/debian-med/daligner.git
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/daligner.git
Package: daligner
Architecture: any
Depends:
${shlibs:Depends},
${misc:Depends},
Suggests:
dazzdb,
dascrubber,
Depends: ${shlibs:Depends},
${misc:Depends}
Suggests: dazzdb,
dascrubber
Description: local alignment discovery between long nucleotide sequencing reads
These tools permit one to find all significant local alignments between
reads encoded in a Dazzler database. The assumption is that the reads are
......
......@@ -3,6 +3,8 @@
#DH_VERBOSE = 1
include /usr/share/dpkg/default.mk
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@
......