Skip to content
Commits on Source (4)
......@@ -28,3 +28,56 @@ script: ./travis.sh
notifications:
email: false
sudo: false
# Copied from https://github.com/nim-lang/Nim/wiki/TravisCI
#language: c
env:
# Build and test against the master and devel branches of Nim
#- BRANCH=master
- BRANCH=devel
#- BRANCH=v0.17.2
compiler:
# Build and test using both gcc and clang
- gcc
#- clang
#matrix:
# allow_failures:
# # Ignore failures when building against the devel Nim branch
# - env: BRANCH=devel
# fast_finish: true
install:
- |
if [ ! -x nim-$BRANCH/bin/nim ]; then
git clone -b $BRANCH --depth 1 git://github.com/nim-lang/nim nim-$BRANCH/
cd nim-$BRANCH
git clone --depth 1 git://github.com/nim-lang/csources csources/
cd csources
sh build.sh
cd ..
rm -rf csources
bin/nim c koch
./koch boot -d:release
else
cd nim-$BRANCH
git fetch origin
if ! git merge FETCH_HEAD | grep "Already up-to-date"; then
bin/nim c koch
./koch boot -d:release
fi
fi
cd ..
before_script:
- export PATH="$(pwd)/nim-$BRANCH/bin${PATH:+:$PATH}"
#script:
# # Replace uppercase strings!
# - nim c --cc:$CC --verbosity:0 -r MYFILE.nim
# # Optional: build docs.
# - nim doc --docSeeSrcUrl:https://github.com/AUTHOR/MYPROJECT/blob/master --project MYFILE.nim
cache:
directories:
#- nim-master
- nim-devel
branches:
except:
- gh-pages
......@@ -62,6 +62,13 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// debugging
#include <time.h>
//#include <locale.h>
#include <stdbool.h>
// end debugging
#define MAX_OVERLAPS 50000
......@@ -552,6 +559,13 @@ int main(int argc, char *argv[])
// For each record do
// debugging
// setlocale(LC_NUMERIC, ""); // for commas in numbers with %'d, but this can cause other problems
fprintf(stderr, "\nabout to go into loop with novl = %lld %s\n", novl, argv[3]);
time_t timeLast;
time(&timeLast);
// end debugging
blast = -1;
match = 0;
seen = 0;
......@@ -559,7 +573,25 @@ int main(int argc, char *argv[])
for (j = 0; j < novl; j++)
// Read it in
{ Read_Overlap(input,ovl);
{
// debugging
time_t timeCurrent;
time(&timeCurrent);
double diff_t = difftime(timeCurrent, timeLast);
if ( ( diff_t > 60.0 ) || ( j < 10) || ( j % 1000000 == 0 ) ) {
time_t mytime = time(NULL);
fprintf(stderr, "before Read_Overlap record j = %d out of %lld at %s %s", j, novl, argv[3], ctime(&mytime));
fflush(stderr);
timeLast = timeCurrent;
}
// end debugging
Read_Overlap(input,ovl);
if (ovl->path.tlen > tmax)
{ tmax = ((int) 1.2*ovl->path.tlen) + 100;
trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector");
......@@ -831,6 +863,12 @@ int main(int argc, char *argv[])
}
}
// debugging
time_t mytime = time(NULL);
fprintf(stderr, "\ncompleted loop record j = %d out of %lld at %s %s\n", j, novl, argv[3], ctime(&mytime));
// end debugging
if (FALCON && hit_count != -1)
{
print_hits(hit_count, dbx2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
......
......@@ -48,6 +48,24 @@ static char *Usage = "[-va] <merge:las> <parts:las> ...";
else \
bigger = 0;
static void Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t rc = fwrite(ptr, size, nmemb, stream);
if (rc != nmemb) {
EPRINTF(EPLACE," Error writing %zu elements (of size %zu)\n", nmemb, size);
exit(1);
}
}
static void Fclose(FILE *stream) {
// An error in fclose() could be caused by an earlier failure in fwrite().
// 'man fclose' for details.
int rc = fclose(stream);
if (rc != 0) {
EPRINTF(EPLACE," Error closing stream.\n");
exit(1);
}
}
static void reheap(int s, Overlap **heap, int hsize)
{ int c, l, r;
int bigger;
......@@ -288,8 +306,8 @@ int main(int argc, char *argv[])
free(pwd);
free(root);
fwrite(&totl,sizeof(int64),1,output);
fwrite(&tspace,sizeof(int),1,output);
Fwrite(&totl,sizeof(int64),1,output);
Fwrite(&tspace,sizeof(int),1,output);
oblock = block+fway*bsize;
optr = oblock;
......@@ -351,7 +369,7 @@ int main(int argc, char *argv[])
if (src->ptr + span > src->top)
ovl_reload(src,bsize);
if (optr + span > otop)
{ fwrite(oblock,1,optr-oblock,output);
{ Fwrite(oblock,1,optr-oblock,output);
optr = oblock;
}
......@@ -375,8 +393,8 @@ int main(int argc, char *argv[])
// Flush output buffer and wind up
if (optr > oblock)
fwrite(oblock,1,optr-oblock,output);
fclose(output);
Fwrite(oblock,1,optr-oblock,output);
Fclose(output);
for (i = 0; i < fway; i++)
fclose(in[i].stream);
......
......@@ -25,6 +25,24 @@ static char *Usage = "[-va] <align:las> ...";
static char *IBLOCK;
static void Fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t rc = fwrite(ptr, size, nmemb, stream);
if (rc != nmemb) {
EPRINTF(EPLACE," Error writing %zu elements (of size %zu)\n", nmemb, size);
exit(1);
}
}
static void Fclose(FILE *stream) {
// An error in fclose() could be caused by an earlier failure in fwrite().
// 'man fclose' for details.
int rc = fclose(stream);
if (rc != 0) {
EPRINTF(EPLACE," Error closing stream.\n");
exit(1);
}
}
static int SORT_OVL(const void *x, const void *y)
{ int64 l = *((int64 *) x);
int64 r = *((int64 *) y);
......@@ -182,8 +200,8 @@ int main(int argc, char *argv[])
if (foutput == NULL)
exit (1);
fwrite(&novl,sizeof(int64),1,foutput);
fwrite(&tspace,sizeof(int),1,foutput);
Fwrite(&novl,sizeof(int64),1,foutput);
Fwrite(&tspace,sizeof(int),1,foutput);
free(pwd);
free(root);
......@@ -258,7 +276,7 @@ int main(int argc, char *argv[])
{ tsize = w->path.tlen*tbytes;
span = ovlsize + tsize;
if (fptr + span > ftop)
{ fwrite(fblock,1,fptr-fblock,foutput);
{ Fwrite(fblock,1,fptr-fblock,foutput);
fptr = fblock;
}
memcpy(fptr,((char *) w)+ptrsize,ovlsize);
......@@ -270,11 +288,11 @@ int main(int argc, char *argv[])
while (wo < iend && CHAIN_NEXT(w->flags));
}
if (fptr > fblock)
fwrite(fblock,1,fptr-fblock,foutput);
Fwrite(fblock,1,fptr-fblock,foutput);
}
free(perm);
fclose(foutput);
Fclose(foutput);
}
if (iblock != NULL)
......
......@@ -519,3 +519,17 @@ G: 62,654 records
1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts)
...
/*****************************************************************************\
PacBio Disclaimer
THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE
PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES
OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A
PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF
THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR
APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY
OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO
NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES.
\******************************************************************************/
......@@ -2,11 +2,11 @@
type module >& /dev/null || source /mnt/software/Modules/current/init/bash
set -vex
module load gcc/4.9.2
module load gcc/6.4.0
module load git/2.8.3
module load ccache
NEXUS_BASEURL=http://ossnexus.pacificbiosciences.com/repository
NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-4.9.2
NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-6.4.0
rm -rf prebuilt build
mkdir -p prebuilt/DAZZ_DB build/bin
......@@ -23,6 +23,8 @@ make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
PATH=.:${PATH} make -C DALIGNER -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile
make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
if [[ $bamboo_planRepository_branchName == "develop" ]]; then
cd build
tar zcf DALIGNER-SNAPSHOT.tgz bin
curl -v -n --upload-file DALIGNER-SNAPSHOT.tgz $NEXUS_URL/DALIGNER-SNAPSHOT.tgz
fi
......@@ -684,11 +684,11 @@ int main(int argc, char *argv[])
if (i == 2)
{ for (j = 0; j < MTOP; j++)
{ if (MSTAT[j] == -2)
printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[j]);
else if (MSTAT[j] == -1)
printf("%s: Warning: %s track not sync'd with relevant db.\n",Prog_Name,MASK[i]);
printf("%s: Warning: %s track not sync'd with relevant db.\n",Prog_Name,MASK[j]);
else if (MSTAT[j] == -3)
printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]);
printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[j]);
}
if (VERBOSE)
......
......@@ -314,6 +314,14 @@ void Upper_Read(char *s)
*s = '\0';
}
void Letter_Arrow(char *s)
{ static char letter[4] = { '1', '2', '3', '4' };
for ( ; *s != 4; s++)
*s = letter[(int) *s];
*s = '\0';
}
// Convert read in ascii representation to [0-3] representation (end with 4)
void Number_Read(char *s)
......@@ -341,6 +349,31 @@ void Number_Read(char *s)
*s = 4;
}
void Number_Arrow(char *s)
{ static char arrow[128] =
{ 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 0, 1, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 2,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3,
};
for ( ; *s != '\0'; s++)
*s = arrow[(int) *s];
*s = 4;
}
/*******************************************************************************************
*
......@@ -426,7 +459,7 @@ int Open_DB(char* path, HITS_DB *db)
if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1)
if (part == 0)
{ cutoff = 0;
all = 1;
all = DB_ALL;
}
else
{ EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
......@@ -466,7 +499,7 @@ int Open_DB(char* path, HITS_DB *db)
db->tracks = NULL;
db->part = part;
db->cutoff = cutoff;
db->all = all;
db->allarr |= all;
db->ufirst = ufirst;
db->tfirst = tfirst;
......@@ -478,7 +511,7 @@ int Open_DB(char* path, HITS_DB *db)
db->reads += 1;
if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
free(db->reads);
free(db->reads-1);
goto error2;
}
}
......@@ -495,7 +528,7 @@ int Open_DB(char* path, HITS_DB *db)
fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
free(reads);
free(reads-1);
goto error2;
}
......@@ -557,10 +590,10 @@ void Trim_DB(HITS_DB *db)
if (db->trimmed) return;
if (db->cutoff <= 0 && db->all) return;
if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return;
cutoff = db->cutoff;
if (db->all)
if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
......@@ -660,10 +693,10 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
if (!db->trimmed) return (0);
if (db->cutoff <= 0 && db->all) return (0);
if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0);
cutoff = db->cutoff;
if (db->all)
if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
......@@ -1435,6 +1468,57 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii)
return (0);
}
// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1,
// and as a numeric string otherwise.
//
HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow"
FILE *Arrow_File = NULL; // Becomes invalid after closing
int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
{ FILE *arrow;
int64 off;
int len, clen;
HITS_READ *r = db->reads;
if (i >= db->nreads)
{ EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name);
EXIT(1);
}
if (Arrow_DB != db)
{ if (Arrow_File != NULL)
fclose(Arrow_File);
arrow = Fopen(Catenate(db->path,"","",".arw"),"r");
if (arrow == NULL)
EXIT(1);
Arrow_File = arrow;
Arrow_DB = db;
}
else
arrow = Arrow_File;
off = r[i].boff;
len = r[i].rlen;
if (ftello(arrow) != off)
fseeko(arrow,off,SEEK_SET);
clen = COMPRESSED_LEN(len);
if (clen > 0)
{ if (fread(read,clen,1,arrow) != 1)
{ EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name);
EXIT(1);
}
}
Uncompress_Read(len,read);
if (ascii == 1)
{ Letter_Arrow(read);
read[-1] = '\0';
}
else
read[-1] = 4;
return (0);
}
char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE *bases = (FILE *) db->bases;
int64 off;
......
......@@ -164,6 +164,9 @@ void Lower_Read(char *s); // Convert read from numbers to lowercase letters
void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
void Number_Read(char *s); // Convert read from letters to numbers
void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234)
void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers
/*******************************************************************************************
*
......@@ -175,6 +178,9 @@ void Number_Read(char *s); // Convert read from letters to numbers
#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
#define DB_ARROW 0x2 // DB is an arrow DB
#define DB_ALL 0x1 // all wells are in the trimmed DB
// Fields have different interpretations if a .db versus a .dam
typedef struct
......@@ -185,6 +191,7 @@ typedef struct
// uncompressed bases in memory block
int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
// Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
// 4 compressed shorts containing snr info if an arrow DB.
int flags; // QV of read + flags above (DB only)
} HITS_READ;
......@@ -226,7 +233,7 @@ typedef struct
{ int ureads; // Total number of reads in untrimmed DB
int treads; // Total number of reads in trimmed DB
int cutoff; // Minimum read length in block (-1 if not yet set)
int all; // Consider multiple reads from a given well
int allarr; // DB_ALL | DB_ARROW
float freq[4]; // frequency of A, C, G, T, respectively
// Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
......@@ -368,6 +375,11 @@ char *New_Read_Buffer(HITS_DB *db);
int Load_Read(HITS_DB *db, int i, char *read, int ascii);
// Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
// and there is only a choice between numeric (0) or ascii (1);
int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
// string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
......
......@@ -21,7 +21,7 @@
#undef LSF // define if want a directly executable LSF script
static char *Usage[] =
{ "[-vbd] [-t<int>] [-w<int(6)>] [-l<int(1000)>] [-s<int(100)>]",
{ "[-vbd] [-t<int>] [-w<int(6)>] [-l<int(1000)>] [-s<int(100)>] [-P<dir(/tmp)>]",
" [-M<int>] [-B<int(4)>] [-D<int( 250)>] [T<int(4)>] [-f<name>]",
" [-k<int(14)>] [-h<int(35)>] [-e<double(.70)>] [-m<track>]+",
" -g<int> -c<int> <reads:db|dam> [<block:int>[-<range:int>]"
......@@ -29,8 +29,6 @@ static char *Usage[] =
#define LSF_ALIGN \
"bsub -q medium -n 4 -o DAL.REP%d.out -e DAL.REP%d.err -R span[hosts=1] -J align#%d"
#define LSF_SORT \
"bsub -q short -n 12 -o SORT.REP%d.out -e SORT.REP%d.err -R span[hosts=1] -J sort#%d"
#define LSF_MERGE \
"bsub -q short -n 12 -o MERGE%d.REP%d.out -e MERGE%d.REP%d.err -R span[hosts=1] -J merge#%d"
#define LSF_CHECK \
......@@ -61,6 +59,7 @@ int main(int argc, char *argv[])
int MMAX, MTOP;
char **MASK;
char *ONAME;
char *PDIR;
{ int i, j, k; // Process options
int flags[128];
......@@ -78,6 +77,7 @@ int main(int argc, char *argv[])
LINT = 1000;
SINT = 100;
MINT = -1;
PDIR = NULL;
MTOP = 0;
MMAX = 10;
......@@ -120,6 +120,10 @@ int main(int argc, char *argv[])
break;
case 'k':
ARG_POSITIVE(KINT,"K-mer length")
if (KINT > 32)
{ fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name);
exit (1);
}
break;
case 'l':
ARG_POSITIVE(LINT,"Minimum ovlerap length")
......@@ -156,6 +160,9 @@ int main(int argc, char *argv[])
case 'M':
ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)")
break;
case 'P':
PDIR = argv[i]+2;
break;
case 'T':
ARG_POSITIVE(NTHREADS,"Number of threads")
break;
......@@ -305,11 +312,11 @@ int main(int argc, char *argv[])
exit (1);
}
DON = (DON && useblock);
DON = (DON && (SPAN > 1));
}
{ int level, njobs;
int i, j, k, p;
int i, j, k;
if (DON)
{ if (ONAME != NULL)
......@@ -320,6 +327,9 @@ int main(int argc, char *argv[])
fprintf(out,"# Create work subdirectories\n");
for (i = fblock; i <= lblock; i++)
fprintf(out,"mkdir temp%d\n",i);
if (ONAME != NULL)
fclose(out);
}
// Produce all necessary daligner jobs ...
......@@ -380,6 +390,8 @@ int main(int argc, char *argv[])
fprintf(out," -s%d",SINT);
if (MINT >= 0)
fprintf(out," -M%d",MINT);
if (PDIR != NULL)
fprintf(out," -P%s",PDIR);
if (NTHREADS != 4)
fprintf(out," -T%d",NTHREADS);
for (k = 0; k < MTOP; k++)
......@@ -407,22 +419,19 @@ int main(int argc, char *argv[])
else
fprintf(out," %s",root);
if (DON)
for (k = low; k < hgh; k++)
if (SPAN == 1) // ==> [low,hgh) = [i,i+1)
if (useblock)
fprintf(out," && mv %s.%d.%s.%d.las %s.R1.%d.las",root,i,root,i,root,i);
else
fprintf(out," && mv %s.%s.las %s.R1.%d.las",root,root,root,i);
else if (DON)
{ fprintf(out," && mv");
for (p = 0; p < NTHREADS; p++)
{ fprintf(out," %s.%d.%s.%d.C%d.las",root,i,root,k,p);
fprintf(out," %s.%d.%s.%d.N%d.las",root,i,root,k,p);
}
for (k = low; k < hgh; k++)
fprintf(out," %s.%d.%s.%d.las",root,i,root,k);
fprintf(out," temp%d",i);
for (k = low; k < hgh; k++)
if (k != i)
{ fprintf(out," && mv");
for (p = 0; p < NTHREADS; p++)
{ fprintf(out," %s.%d.%s.%d.C%d.las",root,k,root,i,p);
fprintf(out," %s.%d.%s.%d.N%d.las",root,k,root,i,p);
}
fprintf(out," temp%d",k);
}
fprintf(out," && mv %s.%d.%s.%d.las temp%d",root,k,root,i,k);
}
#ifdef LSF
......@@ -433,90 +442,11 @@ int main(int argc, char *argv[])
}
}
// ... and then all the initial sort & merge jobs for each block pair
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.02.SORT",ONAME);
out = fopen(name,"w");
}
fprintf(out,"# Initial sort jobs (%d)\n",((lblock-fblock)+1)*SPAN);
#ifdef LSF
jobid = 1;
#endif
for (i = fblock; i <= lblock; i++)
{ int base;
base = fblock + ((i-fblock)/SPAN)*SPAN;
if (base + SPAN > lblock+1)
base = (lblock+1) - SPAN;
for (j = base; j < base+SPAN; j++)
{
#ifdef LSF
fprintf(out,LSF_SORT,SPAN,SPAN,jobid++);
fprintf(out," \"");
#endif
fprintf(out,"LAsort");
if (VON)
fprintf(out," -v");
for (k = 0; k < NTHREADS; k++)
if (useblock)
if (DON)
{ fprintf(out," temp%d/%s.%d.%s.%d.C%d",i,root,i,root,j,k);
fprintf(out," temp%d/%s.%d.%s.%d.N%d",i,root,i,root,j,k);
}
else
{ fprintf(out," %s.%d.%s.%d.C%d",root,i,root,j,k);
fprintf(out," %s.%d.%s.%d.N%d",root,i,root,j,k);
}
else
{ fprintf(out," %s.%s.C%d",root,root,k);
fprintf(out," %s.%s.N%d",root,root,k);
}
fprintf(out," && LAmerge");
if (VON)
fprintf(out," -v");
if (useblock)
if (DON)
if (SPAN == 1)
fprintf(out," temp%d/%s.R%d.%d",i,root,SPAN,j);
else
fprintf(out," temp%d/L1.%d.%d",i,i,(j-base)+1);
else
if (SPAN == 1)
fprintf(out," %s.R%d.%d",root,SPAN,j);
else
fprintf(out," L1.%d.%d",i,(j-base)+1);
else
fprintf(out," %s.R%d",root,SPAN);
for (k = 0; k < NTHREADS; k++)
if (useblock)
if (DON)
{ fprintf(out," temp%d/%s.%d.%s.%d.C%d.S",i,root,i,root,j,k);
fprintf(out," temp%d/%s.%d.%s.%d.N%d.S",i,root,i,root,j,k);
}
else
{ fprintf(out," %s.%d.%s.%d.C%d.S",root,i,root,j,k);
fprintf(out," %s.%d.%s.%d.N%d.S",root,i,root,j,k);
}
else
{ fprintf(out," %s.%s.C%d.S",root,root,k);
fprintf(out," %s.%s.N%d.S",root,root,k);
}
#ifdef LSF
fprintf(out,"\"");
#endif
fprintf(out,"\n");
}
}
// Check .las files (optional)
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.03.CHECK.OPT",ONAME);
sprintf(name,"%s.02.CHECK.OPT",ONAME);
out = fopen(name,"w");
}
......@@ -547,19 +477,12 @@ int main(int argc, char *argv[])
else
fprintf(out," %s",root);
while (j <= k)
{ if (useblock)
if (DON)
if (SPAN == 1)
fprintf(out," temp%d/%s.R%d.%d",i,root,SPAN,j);
else
fprintf(out," temp%d/L1.%d.%d",i,i,(j-base)+1);
else
if (SPAN == 1)
fprintf(out," %s.R%d.%d",root,SPAN,j);
else
fprintf(out," L1.%d.%d",i,(j-base)+1);
{ if (SPAN == 1)
fprintf(out," %s.R1.%d",root,i);
else if (DON)
fprintf(out," temp%d/%s.%d.%s.%d",i,root,i,root,j);
else
fprintf(out," %s.R%d",root,SPAN);
fprintf(out," %s.%d.%s.%d",root,i,root,j);
j += 1;
}
#ifdef LSF
......@@ -569,64 +492,13 @@ int main(int argc, char *argv[])
}
}
// Clean up
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.04.RM",ONAME);
out = fopen(name,"w");
}
fprintf(out,"# Remove initial .las files\n");
for (i = fblock; i <= lblock; i++)
{ int base, span;
if (DON)
fprintf(out,"cd temp%d\n",i);
span = SPAN;
base = fblock + ((i-fblock)/SPAN)*SPAN;
if (base + SPAN > lblock+1)
base = (lblock+1) - SPAN;
else if (i + SPAN > lblock)
span = (lblock-base)+1;
for (j = base; j < base+span; j++)
{ fprintf(out,"rm");
for (k = 0; k < NTHREADS; k++)
if (useblock)
{ fprintf(out," %s.%d.%s.%d.C%d.las",root,i,root,j,k);
fprintf(out," %s.%d.%s.%d.N%d.las",root,i,root,j,k);
}
else
{ fprintf(out," %s.%s.C%d.las",root,root,k);
fprintf(out," %s.%s.N%d.las",root,root,k);
}
fprintf(out,"\n");
if (j < base+SPAN)
{ fprintf(out,"rm");
for (k = 0; k < NTHREADS; k++)
if (useblock)
{ fprintf(out," %s.%d.%s.%d.C%d.S.las",root,i,root,j,k);
fprintf(out," %s.%d.%s.%d.N%d.S.las",root,i,root,j,k);
}
else
{ fprintf(out," %s.%s.C%d.S.las",root,root,k);
fprintf(out," %s.%s.N%d.S.las",root,root,k);
}
fprintf(out,"\n");
}
}
if (DON)
fprintf(out,"cd ..\n");
}
if (ONAME != NULL)
fclose(out);
stage = 5;
stage = 3;
// Higher level merges (if lblock > 1)
// Higher level merges (if SPAN > 1)
if (lblock > 1)
if (SPAN > 1)
{ int pow;
// Determine numbe of merge levels
......@@ -663,7 +535,14 @@ int main(int argc, char *argv[])
// New block merges
for (j = fblock; j <= lblock; j++)
{ low = 1;
{ int base;
base = fblock + ((j-fblock)/SPAN)*SPAN;
if (base + SPAN > lblock+1)
base = (lblock+1) - SPAN;
base -= 1;
low = 1;
for (p = 1; p <= cits; p++)
{ hgh = (cnt*p)/cits;
#ifdef LSF
......@@ -684,6 +563,12 @@ int main(int argc, char *argv[])
else
fprintf(out," L%d.%d.%d",i+1,j,p);
for (k = low; k <= hgh; k++)
if (i == 1)
if (DON)
fprintf(out," temp%d/%s.%d.%s.%d",j,root,j,root,base+k);
else
fprintf(out," %s.%d.%s.%d",root,j,root,base+k);
else
if (DON)
fprintf(out," temp%d/L%d.%d.%d",j,i,j,k);
else
......@@ -754,11 +639,47 @@ int main(int argc, char *argv[])
fprintf(out,"# Remove level %d .las files)\n",i);
for (j = fblock; j <= lblock; j++)
{ low = 1;
{ int base;
base = fblock + ((j-fblock)/SPAN)*SPAN;
if (base + SPAN > lblock+1)
{ int n, u;
int mnt, mits;
n = (lblock+1) - SPAN;
u = j - (base-n);
if (i == 1 && u < base)
{ mnt = (lblock+1)-base;
mits = (mnt-1)/DUNIT+1;
low = 1;
base -= 1;
for (p = 1; p <= mits; p++)
{ hgh = (mnt*p)/mits;
fprintf(out,"rm");
for (k = low; k <= hgh; k++)
if (DON)
fprintf(out," temp%d/%s.%d.%s.%d.las",u,root,u,root,base+k);
else
fprintf(out," %s.%d.%s.%d.las",root,u,root,base+k);
fprintf(out,"\n");
low = hgh+1;
}
}
base = n;
}
base -= 1;
low = 1;
for (p = 1; p <= cits; p++)
{ hgh = (cnt*p)/cits;
fprintf(out,"rm");
for (k = low; k <= hgh; k++)
if (i == 1)
if (DON)
fprintf(out," temp%d/%s.%d.%s.%d.las",j,root,j,root,base+k);
else
fprintf(out," %s.%d.%s.%d.las",root,j,root,base+k);
else
if (DON)
fprintf(out," temp%d/L%d.%d.%d.las",j,i,j,k);
else
......@@ -816,13 +737,10 @@ int main(int argc, char *argv[])
if (j > lblock)
j = lblock+1;
for (k = h; k < j; k++)
if (useblock)
if (DON)
fprintf(out," temp%d/%s.R%d.%d",k,root,SPAN,k);
else
fprintf(out," %s.R%d.%d",root,SPAN,k);
else
fprintf(out," %s.R%d",root,SPAN);
#ifdef LSF
fprintf(out,"\"");
#endif
......@@ -854,10 +772,7 @@ int main(int argc, char *argv[])
if (j > lblock)
j = lblock+1;
for (k = h; k < j; k++)
if (useblock)
fprintf(out," %s.R%d.%d.las",root,SPAN,k);
else
fprintf(out," %s.R%d.las",root,SPAN);
fprintf(out,"\n");
}
......@@ -868,7 +783,16 @@ int main(int argc, char *argv[])
printf("# Once all the .rep masks have been computed for every block\n");
printf("# you should call 'Catrack' to merge them, and then you should\n");
printf("# remove the block tracks\n");
printf("# remove the individual block tracks, e.g.:\n");
if (usepath)
{ printf("# Catrack -v %s/%s rep%d\n",pwd,root,SPAN);
printf("# rm %s/.%s.*.rep%d.*\n",pwd,root,SPAN);
}
else
{ printf("# Catrack -v %s rep%d\n",root,SPAN);
printf("# rm .%s.*.rep%d.*\n",root,SPAN);
}
free(root);
free(pwd);
......
......@@ -21,15 +21,12 @@
#undef LSF // define if want a directly executable LSF script
static char *Usage[] =
{ "[-vd] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>]",
{ "[-v] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>] [-P<dir(/tmp)>]",
" [-e<double(.70)] [-l<int(500)>] [-s<int(100)] [-f<name>]",
" <reads:db|dam> [<first:int>[-<last:int>]"
};
#define LSF_TAND "bsub -q medium -n 4 -o TANDEM.out -e TANDEM.err -R span[hosts=1] -J tandem#%d"
#define LSF_SORT "bsub -q short -n 12 -o SORT.TAN.out -e SORT.TAN.err -R span[hosts=1] -J sort#%d"
#define LSF_MERGE \
"bsub -q short -n 12 -o MERGE%d.REP%d.out -e MERGE%d.REP%d.err -R span[hosts=1] -J merge#%d"
#define LSF_CHECK \
"bsub -q short -n 12 -o CHECK%d.DAL.out -e CHECK%d.DAL.err -R span[hosts=1] -J check#%d"
#define LSF_MASK \
......@@ -50,11 +47,12 @@ int main(int argc, char *argv[])
#define BUNIT 4
int VON, DON;
int VON;
int WINT, HINT, KINT, SINT, LINT;
int NTHREADS;
double EREL;
char *ONAME;
char *PDIR;
{ int i, j, k; // Process options
int flags[128];
......@@ -69,6 +67,7 @@ int main(int argc, char *argv[])
LINT = 500;
SINT = 100;
ONAME = NULL;
PDIR = NULL;
out = stdout;
NTHREADS = 4;
......@@ -78,7 +77,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
ARG_FLAGS("vd");
ARG_FLAGS("v");
break;
case 'e':
ARG_REAL(EREL)
......@@ -95,6 +94,10 @@ int main(int argc, char *argv[])
break;
case 'k':
ARG_POSITIVE(KINT,"K-mer length")
if (KINT > 32)
{ fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name);
exit (1);
}
break;
case 'l':
ARG_POSITIVE(LINT,"Minimum ovlerap length")
......@@ -105,6 +108,9 @@ int main(int argc, char *argv[])
case 'w':
ARG_POSITIVE(WINT,"Log of bin width")
break;
case 'P':
PDIR = argv[i]+2;
break;
case 'T':
ARG_POSITIVE(NTHREADS,"Number of threads")
break;
......@@ -114,7 +120,6 @@ int main(int argc, char *argv[])
argc = j;
VON = flags['v'];
DON = flags['d'];
if (argc < 2 || argc > 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
......@@ -233,25 +238,10 @@ int main(int argc, char *argv[])
exit (1);
}
}
DON = (DON && useblock);
}
{ int njobs;
int i, j, k, p;
// Create all work subdirectories if -d is set
if (DON)
{ if (ONAME != NULL)
{ sprintf(name,"%s.00.MKDIR",ONAME);
out = fopen(name,"w");
}
fprintf(out,"# Create work subdirectories\n");
for (i = fblock; i <= lblock; i++)
fprintf(out,"mkdir temp%d\n",i);
}
int i, j, k;
// Produce all necessary datandem jobs ...
......@@ -288,6 +278,8 @@ int main(int argc, char *argv[])
fprintf(out," -l%d",LINT);
if (SINT != 100)
fprintf(out," -s%d",SINT);
if (PDIR != NULL)
fprintf(out," -P%s",PDIR);
if (NTHREADS != 4)
fprintf(out," -T%d",NTHREADS);
j = i+BUNIT;
......@@ -305,68 +297,6 @@ int main(int argc, char *argv[])
else
fprintf(out," %s",root);
if (DON)
for (k = i; k < j; k++)
{ fprintf(out," && mv");
for (p = 0; p < NTHREADS; p++)
fprintf(out," %s.%d.T%d.las",root,k,p);
fprintf(out," temp%d",k);
}
#ifdef LSF
fprintf(out,"\"");
#endif
fprintf(out,"\n");
}
// ... and then all the sort & merge jobs for each block
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.02.SORT",ONAME);
out = fopen(name,"w");
}
fprintf(out,"# Sort & merge jobs (%d)\n", lblock - fblock + 1);
#ifdef LSF
jobid = 1;
#endif
for (i = fblock; i <= lblock; i++)
{
#ifdef LSF
fprintf(out,LSF_SORT,jobid++);
fprintf(out," \"");
#endif
fprintf(out,"LAsort");
if (VON)
fprintf(out," -v");
for (k = 0; k < NTHREADS; k++)
if (useblock)
if (DON)
fprintf(out," temp%d/%s.%d.T%d",i,root,i,k);
else
fprintf(out," %s.%d.T%d",root,i,k);
else
fprintf(out," %s.T%d",root,k);
fprintf(out," && LAmerge");
if (VON)
fprintf(out," -v");
if (useblock)
if (DON)
fprintf(out," temp%d/%s.T.%d",i,root,i);
else
fprintf(out," %s.T.%d",root,i);
else
fprintf(out," %s.T",root);
for (k = 0; k < NTHREADS; k++)
if (useblock)
if (DON)
fprintf(out," temp%d/%s.%d.T%d.S",i,root,i,k);
else
fprintf(out," %s.%d.T%d.S",root,i,k);
else
fprintf(out," %s.T%d.S",root,k);
#ifdef LSF
fprintf(out,"\"");
#endif
......@@ -377,7 +307,7 @@ int main(int argc, char *argv[])
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.03.CHECK.OPT",ONAME);
sprintf(name,"%s.02.CHECK.OPT",ONAME);
out = fopen(name,"w");
}
......@@ -402,12 +332,9 @@ int main(int argc, char *argv[])
fprintf(out," %s",root);
while (i <= k)
{ if (useblock)
if (DON)
fprintf(out," temp%d/%s.T.%d",i,root,i);
else
fprintf(out," %s.T.%d",root,i);
fprintf(out," TAN.%s.%d",root,i);
else
fprintf(out," %s.T",root);
fprintf(out," TAN.%s",root);
i += 1;
}
#ifdef LSF
......@@ -416,35 +343,11 @@ int main(int argc, char *argv[])
fprintf(out,"\n");
}
// Clean up (optional)
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.04.RM",ONAME);
out = fopen(name,"w");
}
fprintf(out,"# Cleanup all intermediate .las files\n");
for (i = fblock; i <= lblock; i++)
{ if (DON)
fprintf(out,"cd temp%d\n",i);
fprintf(out,"rm");
for (k = 0; k < NTHREADS; k++)
if (useblock)
fprintf(out," %s.%d.T%d.S.las %s.%d.T%d.las",root,i,k,root,i,k);
else
fprintf(out," %s.T%d.S.las %s.T%d.las",root,k,root,k);
fprintf(out,"\n");
if (DON)
fprintf(out,"cd ..\n");
}
// Finish with MASKtan
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.05.MASK",ONAME);
sprintf(name,"%s.03.MASK",ONAME);
out = fopen(name,"w");
}
......@@ -473,12 +376,9 @@ int main(int argc, char *argv[])
j = lblock+1;
for (k = i; k < j; k++)
if (useblock)
if (DON)
fprintf(out," temp%d/%s.T.%d",k,root,k);
fprintf(out," TAN.%s.%d",root,k);
else
fprintf(out," %s.T.%d",root,k);
else
fprintf(out," %s.T",root);
fprintf(out," TAN.%s",root);
#ifdef LSF
fprintf(out,"\"");
#endif
......@@ -489,19 +389,12 @@ int main(int argc, char *argv[])
if (ONAME != NULL)
{ fclose(out);
sprintf(name,"%s.06.RM",ONAME);
sprintf(name,"%s.04.RM",ONAME);
out = fopen(name,"w");
}
if (DON)
fprintf(out,"# Cleanup all temp directories\n");
else
fprintf(out,"# Cleanup all T.las files\n");
if (DON)
for (i = fblock; i <= lblock; i++)
fprintf(out,"rm -r temp%d\n",i);
else
for (i = fblock; i <= lblock; i += BUNIT)
{ fprintf(out,"rm");
j = i+BUNIT;
......@@ -509,9 +402,9 @@ int main(int argc, char *argv[])
j = lblock+1;
for (k = i; k < j; k++)
if (useblock)
fprintf(out," %s.T.%d.las",root,k);
fprintf(out," TAN.%s.%d.las",root,k);
else
fprintf(out," %s.T.las",root);
fprintf(out," TAN.%s.las",root);
fprintf(out,"\n");
}
......@@ -521,7 +414,15 @@ int main(int argc, char *argv[])
printf("# Once all the .tan masks have been computed for every block\n");
printf("# you should call 'Catrack' to merge them, and then you should\n");
printf("# remove the individual block tracks\n");
printf("# remove the individual block tracks, e.g.:\n");
if (usepath)
{ printf("# Catrack -v %s/%s tan\n",pwd,root);
printf("# rm %s/.%s.*.tan.*\n",pwd,root);
}
else
{ printf("# Catrack -v %s tan\n",root);
printf("# rm .%s.*.tan.*\n",root);
}
free(root);
free(pwd);
......
......@@ -31,4 +31,4 @@ install:
package:
make clean
tar -zcf damasker.tar.gz README Makefile *.h *.c
tar -zcf damasker.tar.gz README.md Makefile *.h *.c
......@@ -863,6 +863,62 @@ static int delChar, subChar;
// Referred by: QVcoding_Scan, Create_QVcoding
void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV)
{
if (rlen == 0) // Initialization call
{ int i;
// Zero histograms
bzero(delHist,sizeof(uint64)*256);
bzero(mrgHist,sizeof(uint64)*256);
bzero(insHist,sizeof(uint64)*256);
bzero(subHist,sizeof(uint64)*256);
for (i = 0; i < 256; i++)
delRun[i] = subRun[i] = 1;
totChar = 0;
delChar = -1;
subChar = -1;
return;
}
// Add streams to accumulating histograms and figure out the run chars
// for the deletion and substition streams
Histogram_Seqs(delHist,(uint8 *) delQV,rlen);
Histogram_Seqs(insHist,(uint8 *) insQV,rlen);
Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen);
Histogram_Seqs(subHist,(uint8 *) subQV,rlen);
if (delChar < 0)
{ int k;
for (k = 0; k < rlen; k++)
if (delTag[k] == 'n' || delTag[k] == 'N')
{ delChar = delQV[k];
break;
}
}
if (delChar >= 0)
Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar);
totChar += rlen;
if (subChar < 0)
{ if (totChar >= 100000)
{ int k;
subChar = 0;
for (k = 1; k < 256; k++)
if (subHist[k] > subHist[subChar])
subChar = k;
}
}
if (subChar >= 0)
Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar);
return;
}
int QVcoding_Scan(FILE *input, int num, FILE *temp)
{ char *slash;
int rlen;
......@@ -1284,6 +1340,44 @@ void Free_QVcoding(QVcoding *coding)
*
********************************************************************************************/
void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
FILE *output, QVcoding *coding, int lossy)
{ int clen;
if (coding->delChar < 0)
{ Encode(coding->delScheme, output, (uint8 *) del, rlen);
clen = rlen;
}
else
{ Encode_Run(coding->delScheme, coding->dRunScheme, output,
(uint8 *) del, rlen, coding->delChar);
clen = Pack_Tag(tag,del,rlen,coding->delChar);
}
Number_Read(tag);
Compress_Read(clen,tag);
fwrite(tag,1,COMPRESSED_LEN(clen),output);
if (lossy)
{ uint8 *insert = (uint8 *) ins;
uint8 *merge = (uint8 *) mrg;
int k;
for (k = 0; k < rlen; k++)
{ insert[k] = (uint8) ((insert[k] >> 1) << 1);
merge[k] = (uint8) (( merge[k] >> 2) << 2);
}
}
Encode(coding->insScheme, output, (uint8 *) ins, rlen);
Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen);
if (coding->subChar < 0)
Encode(coding->subScheme, output, (uint8 *) sub, rlen);
else
Encode_Run(coding->subScheme, coding->sRunScheme, output,
(uint8 *) sub, rlen, coding->subChar);
return;
}
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy)
{ int rlen, clen;
......
......@@ -58,6 +58,7 @@ int Get_QV_Line();
// If there is an error then -1 is returned, otherwise the number of entries read.
int QVcoding_Scan(FILE *input, int num, FILE *temp);
void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub);
// Given QVcoding_Scan has been called at least once, create an encoding scheme based on
// the accumulated statistics and return a pointer to it. The returned encoding object
......@@ -83,6 +84,8 @@ void Free_QVcoding(QVcoding *coding);
// occurred, and the sequence length otherwise.
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
FILE *output, QVcoding *coding, int lossy);
// Assuming the input is position just beyond the compressed encoding of an entry header,
// read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
......
*** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET ***
DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY.
The Dazzler Repeat Masking Suite: DA MASKER
Author: Gene Myers
First: April 10, 2016
Scrubbing is complicated by the presence of repeats. We currently handle this by
soft-masking all tandem and interspersed repeats in the input data when computing
overlaps. This implies that reads that are completely repetitive sequence are not
scrubbed. This is typically a small part of the data set and a portion thereof that
is currently not correctly assembled by any assembly system at the current time, and
therefore the masking is of minor consequence. Eventually, these completely masked
reads will be analyzed in downstream processes that will attempt to resolve ultra-long
(15Kbp or more) repeats.
The masking suite therefore consists of several programs that in combination can be
used to produce repeat masks for a data set as follows:
1. REPmask [-v] [-m<track(rep)>] -c<int> <subject:db> <overlaps:las> ...
This command takes as input a database <source> and a sequence of sorted local
alignments blocks, <overlaps>, produced by a daligner run for said database. Note
carefully that <source> must always refer to the entire DB, only <overlaps> can
involve a block number.
REPmask examines each pile for an A-read and determines the intervals that are covered
-c or more times by LAs. This set of intervals is output as a repeat mask for A in an
interval track with default name .rep, that can be overridden with the -m option. If
the -v option is set, then the number of intervals and total base pairs in intervals
is printed.
2. tander [-v] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>]
[-e<double(.70)] [-l<int(1000)] [-s<int(100)]
<path:db|dam> ...
This program is a variation of daligner tailored to the task of comparing each read
against itself (and only those comparisons). As such each block or DB serves as both
the source and target, and the -b, -A, -I, -t, -M, -H, and -m options are irrelevant.
The remaining options are exactly as for daligner (see here).
For each subject block, say X, this program produces just 4NTHREAD files X.T#.las
where -T is the number of threads (by default 4) and where all the alignments do not
involve complementing the B-read (which is also the A-read). These should then be
sorted and merged with LAsort and LAmerge as for example performed by the script
generator HPC.TANmask described below.
3. TANmask [-v] [-l<int(500)>] [-m<track(tan)>] <subject:db> <overlaps:las> ...
This command takes as input a database <source> and a sequence of sorted local
alignments blocks, <overlaps>, produced by a datander run for said database. Note
carefully that <source> must always refer to the entire DB, only <overlaps> can
involve a block number.
TANmask examines each pile for an A-read and finds those self-LAs whose two alignment
intervals overlap and for which the union of these two intervals is -l bases or
longer. Each of these regions signals a tandem element in A of length -l or greater,
and a disjoint list of these is built. This set of intervals is output as a tandem
mask for A in an interval track with default name .tan, that can be overridden with
the -m option. If the -v option is set, then the number of intervals and total base
pairs in intervals is printed.
4. HPC.REPmask [-vbd]
[-t<int>] [-w<int(6)>] [-l<int(1000)>] [-s<int(100)>]
[-M<int>] [-B<int(4)>] [-D<int( 250)>] [-T<int(4)>] [-f<name>]
[-k<int(14)>] [-h<int(35)>] [-e<double(.70)>] [-m<track>]+
-g<int> -c<int> <reads:db|dam> [<first:int>[-<last:int>]]
HPC.REPmask writes a UNIX shell script to the standard output that consists of a
sequence of commands that effectively runs daligner, LAsort, and LAmerge to compare
all consecutive groups of -g blocks against each other and then applies REPmask to the
result in order to generate a repeat mask for the database <path>. For example if -g
is 3, then it will compare blocks 1-3 against each other, blocks 4-6 against each
other, and so on with daligner, and then sort and merge so that all the alignments
with an A-read in block i are in the file <path>.i.las (e.g. <path>.2.las will contain
all alignments where the A-read is in block 2 and the B-read is in blocks 1, 2, or 3).
Thereafter "REPmask -c -mrep<-g> <path> <path>.i.las" is run for every block i,
resulting in a .rep<-g> block for each mask that can then be combined with Catrack
into a single track for the entire DB.
The data base must have been previously split by DBsplit and all options, except -B,
-D, -d, and -f are passed through to the calls to daligner. The defaults for these
parameters are as for daligner. The -v flag, for verbose-mode, is also passed to all
calls to LAsort and LAmerge. The -d and -f parameters are explained later. The -B
and -D options control the form of the script generated by HPC.REPmask as follows.
The -B option determines the number of block comparisons per daligner job and the
-D option determines the fan-in for the hierarchical LAmerge process exactly as
for HPC.daligner. For a database divided into N sub-blocks, the calls to daligner
will produce a total of 2g^2(N/g)T .las files when daligner is run with T threads
as specified by the -T option. These will then be sorted and merged into Ng sorted
.las files, g^2 for each of the N/g block groups. These are then merged in
ceil(log_D g) phases where the number of files decreases geometrically in -D until
there is 1 file per block. So at the end one has N sorted .las files, one per block.
If the integers <first> and <last> are missing then the script produced is for every
block in the database. If <first> is present then HPC.REPmask produces an incremental
script that performs as much of the task as possible for blocks <first> through <last>
(<last> = <first> if not present) where it is assumed that the script has been called
previously up to block <first>-1. If <last> is not evenly divisible by -g then the
script performs the block comparisons and merges for the last incomplete group but
does not yet invoke REPmask as all the necessary comparisons have not been made.
Symmetrically, if the initial part of the range completes a previously incomplete
block group, then the script generates calls to complete the comparisons for the
initial group and produces the repeat-masks for that group.
The command script output by HPC.REPmask and other HPC.<x> programs consists of
command blocks each of which begins with a comment line (begins with #) followed by a
potentially long list of lines each containing a shell command. Command blocks whose
comment mentions "jobs" and gives the number of said in parenthesis, we call parallel
blocks because each command line in the block can be sent to a node in a cluster for
independent execution, i.e. none of the commands in a block depend on another in the
block. The remaining command blocks we call house-keeping blocks because they can be
executed by the shell on the launch/server node and the commands are either checking
the integrity of .las files with LAcheck, or removing intermediate files with rm. Each
block should be performed in the order given and should complete before the next block
is performed.
If the -f option is set, then each command block is written to a file with a name of
the form <name>.#.<description> where <name> is specified by the user in the -f
option argument, # gives the order in which the command block in the given file is
to be performed in relation to other command block files, and <description> is a
(very) short symbolic reminder of what the block is doing. For example,
"HPC.REPmask -g3 -c10 -fJOBS DB" would produce the files:
JOBS.01.OVL
JOBS.02.SORT
JOBS.03.CHECK.OPT
JOBS.04.RM
JOBS.MERGE
JOBS.06.CHECK.OPT
JOBS.07.RM
JOBS.08.MASK
JOBS.09.RM
The number of command blocks varies as it depends on the number of merging rounds
required in the external sort of the .las files. The files with the suffix .OPT
are optional and need not be executed albeit we highly recommend that one run
all the CHECK blocks.
The -d option requests scripts that organize files into a collection of
sub-directories so as not to overwhelm the underlying OS for large genomes. For
a DB divided into N blocks and the daligner calls in the script will produce 2gNT
.las-files where T is the number of threads specified by the -T option passed to
daligner (default is 4). With the -d option set, N sub-directories (with respect
to the directory HPC.daligner is called in) of the form "temp<i>" for i from 1 to
N are created in an initial command block, and then all intermediate files are
placed in those sub-directories, with a maximum of g(2T+1) files appearing in any
sub-directory at any given point in the process.
5. HPC.TANmask [-vd] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>]
[-e<double(.70)] [-l<int(1000)] [-s<int(100)] [-f<name>]
<reads:db|dam> [<first:int>[-<last:int>]]
HPC.TANmask writes a UNIX shell script to the standard output that runs datander on
all relevant blocks of the supplied DB, then sorts and merges the resulting alignments
into a single .las for each block, and finally calls TANmask on each LA block to
produce a .tan mask for each block that can be merge into a single track for the
entire DB with Catrack.
All option arguments are passed through to datander except for the -d and -f options
which serve the same role as for HPC.REPmask above. The -v option is passed to all
programs in the script, and the -l option is also passed to TANmask. If the integers
<first> and <last> are missing then the script produced is for every block in the
database <reads>. If <first> is present then HPC.TANmask produces a script that
produces .tan tracks for blocks <first> through <last> (<last> = <first> if
not present).
# Damasker: The Dazzler Repeat Masking Suite
## _Author: Gene Myers_
## _First: April 10, 2016_
For typeset documentation, examples of use, and design philosophy please go to
my [blog](https://dazzlerblog.wordpress.com/command-guides/damasker-commands).
Scrubbing is complicated by the presence of repeats. We currently handle this by soft-masking all
tandem and interspersed repeats in the input data when computing overlaps. This implies that reads
that are completely repetitive sequence are not scrubbed. This is typically a small part of the
data set and a portion thereof that is currently not correctly assembled by any assembly system at
the current time, and therefore the masking is of minor consequence. Eventually, these completely
masked reads will be analyzed in downstream processes that will attempt to resolve ultra-long
(15Kbp or more) repeats.
The masking suite therefore consists of several programs that in combination can be used to
produce repeat masks for a data set as follows:
```
1. REPmask [-v] [-m<track(rep)>] -c<int> <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments blocks, \<overlaps\>, produced by a daligner run for said database. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\> can involve a block number.
REPmask examines each pile for an A-read and determines the intervals that are covered -c or more times by LAs. This set of intervals is output as a repeat mask for A in an interval track with default name .rep, that can be overridden with the -m option. If the -v option is set, then the number of intervals and total base pairs in intervals is printed.
```
2. datander [-v] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>]
[-e<double(.70)>] [-l<int(1000)>] [-s<int(100)>] [-P<dir(/tmp)>]
<path:db|dam> ...
```
This program is a variation of daligner tailored to the task of comparing each read against itself (and only those comparisons). As such each block or DB serves as both the source and target, and the -b, -A, -I, -t, -M, -H, and -m options are irrelevant. The remaining options are exactly as for daligner (see here). For each subject block, say X, this program produces a single file TAN.X.las where all the alignments do not involve complementing the B-read (which is also the A-read).
```
3. TANmask [-v] [-l<int(500)>] [-m<track(tan)>] <subject:db> <overlaps:las> ...
```
This command takes as input a database \<source\> and a sequence of sorted local alignments blocks, \<overlaps\>, produced by a datander run for said database. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\> can involve a block number.
TANmask examines each pile for an A-read and finds those self-LAs whose two alignment intervals overlap and for which the union of these two intervals is -l bases or longer. Each of these regions signals a tandem element in A of length -l or greater, and a disjoint list of these is built. This set of intervals is output as a tandem mask for A in an interval track with default name .tan, that can be overridden with the -m option. If the -v option is set, then the number of intervals and total base pairs in intervals is printed.
```
4. HPC.REPmask [-vbd]
[-t<int>] [-w<int(6)>] [-l<int(1000)>] [-s<int(100)>] [-P<dir(/tmp)>]
[-M<int>] [-B<int(4)>] [-D<int( 250)>] [-T<int(4)>] [-f<name>]
[-k<int(14)>] [-h<int(35)>] [-e<double(.70)>] [-m<track>]+
-g<int> -c<int> <reads:db|dam> [<first:int>[-<last:int>]]
```
HPC.REPmask writes a UNIX shell script to the standard output that consists of a sequence of commands that effectively runs daligner, LAsort, and LAmerge to compare all consecutive groups of -g blocks against each other and then applies REPmask to the result in order to generate a repeat mask for the database \<path\>. For example if -g is 3, then it will compare blocks 1-3 against each other, blocks 4-6 against each other, and so on with daligner, and then sort and merge so that all the alignments with an A-read in block i are in the file \<path\>.i.las (e.g. \<path\>.2.las will contain all alignments where the A-read is in block 2 and the B-read is in blocks 1, 2, or 3). Thereafter "REPmask -c -mrep\<-g\> \<path\> \<path\>.i.las" is run for every block i, resulting in a .rep\<-g\> block for each mask that can then be combined with Catrack into a single track for the entire DB.
The data base must have been previously split by DBsplit and all options, except -B, -D, -d, and -f are passed through to the calls to daligner. The defaults for these parameters are as for daligner. The -v flag, for verbose-mode, is also passed to all calls to LAsort and LAmerge. The -d and -f parameters are explained later. The -B and -D options control the form of the script generated by HPC.REPmask as follows. The -B option determines the number of block comparisons per daligner job and the -D option determines the fan-in for the hierarchical LAmerge process exactly as for HPC.daligner. For a database divided into N sub-blocks, the calls to daligner will produce a total of gN .las files, g<sup>2</sup> for each of the N/g block groups. These are then merged in ceil(log<sub>D</sub> g) phases where the number of files decreases geometrically in -D until there is 1 file per block. So at the end one has N sorted .las files, one per block.
If the integers \<first\> and \<last\> are missing then the script produced is for every block in the database. If \<first\> is present then HPC.REPmask produces an incremental script that performs as much of the task as possible for blocks \<first\> through \<last\> (\<last\> = \<first\> if not present) where it is assumed that the script has been called previously up to block \<first\>-1. If \<last\> is not evenly divisible by -g then the script performs the block comparisons and merges for the last incomplete group but does not yet invoke REPmask as all the necessary comparisons have not been made. Symmetrically, if the initial part of the range completes a previously incomplete block group, then the script generates calls to complete the comparisons for the initial group and produces the repeat-masks for that group.
The command script output by HPC.REPmask and other HPC.\<x\> programs consists of command blocks each of which begins with a comment line (begins with #) followed by a potentially long list of lines each containing a shell command. Command blocks whose comment mentions "jobs" and gives the number of said in parenthesis, we call parallel blocks because each command line in the block can be sent to a node in a cluster for independent execution, i.e. none of the commands in a block depend on another in the block. The remaining command blocks we call house-keeping blocks because they can be executed by the shell on the launch/server node and the commands are either checking the integrity of .las files with LAcheck, or removing intermediate files with rm. Each block should be performed in the order given and should complete before the next block is performed.
If the -f option is set, then each command block is written to a file with a name of the form \<name\>.#.\<description\> where \<name\> is specified by the user in the -f option argument, # gives the order in which the command block in the given file is to be performed in relation to other command block files, and \<description\> is a (very) short symbolic reminder of what the block is doing. For example, "HPC.REPmask -g3 -c10 -fJOBS DB" would produce the files:
```
JOBS.01.OVL
JOBS.02.CHECK.OPT
JOBS.03.MERGE
JOBS.04.CHECK.OPT
JOBS.05.RM
JOBS.06.MASK
JOBS.07.RM
```
The number of command blocks varies as it depends on the number of merging rounds required in the external sort of the .las files. The files with the suffix .OPT are optional and need not be executed albeit we highly recommend that one run all the CHECK blocks.
The -d option requests scripts that organize files into a collection of sub-directories so as not to overwhelm the underlying OS for large genomes. For a DB divided into N blocks and the daligner calls in the script will produce 2gNT .las-files where T is the number of threads specified by the -T option passed to daligner (default is 4). With the -d option set, N sub-directories (with respect to the directory HPC.daligner is called in) of the form "temp\<i\>" for i from 1 to N are created in an initial command block, and then all intermediate files are placed in those sub-directories, with a maximum of g(2T+1) files appearing in any sub-directory at any given point in the process.
```
5. HPC.TANmask [-v] [-k<int(12)>] [-w<int(4)>] [-h<int(35)>] [-T<int(4)>] [-P<dir(/tmp)>]
[-e<double(.70)>] [-l<int(1000)>] [-s<int(100)>] [-f<name>]
<reads:db|dam> [<first:int>[-<last:int>]]
```
HPC.TANmask writes a UNIX shell script to the standard output that runs datander on all relevant blocks of the supplied DB, then sorts and merges the resulting alignments into a single .las for each block, and finally calls TANmask on each LA block to produce a .tan mask for each block that can be merge into a single track for the entire DB with Catrack.
All option arguments are passed through to datander except for the -f option which serves the same role as for HPC.REPmask above. The -v option is passed to all programs in the script, and the -l option is also passed to TANmask. If the integers \<first\> and \<last\> are missing then the script produced is for every block in the database \<reads\>. If \<first\> is present then HPC.TANmask produces a script that produces .tan tracks for blocks \<first\> through \<last\> (\<last\> = \<first\> if not present).
PacBio Disclaimer
-----------------
THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE
PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES
OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A
PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF
THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR
APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY
OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO
NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES.
......@@ -108,15 +108,16 @@ static int *blocks(Overlap *ovls, int novl, int *ptrim)
{ ab = ovls[i].path.abpos+PEEL_BACK;
ae = ovls[i].path.aepos-PEEL_BACK;
if (ae >= ab)
{ ev[ecnt].add = 1;
if (ae < ab)
ab = ae = (ovls[i].path.abpos + ovls[i].path.aepos)/2;
ev[ecnt].add = 1;
ev[ecnt].pos = ab;
ecnt += 1;
ev[ecnt].add = 0;
ev[ecnt].pos = ae;
ecnt += 1;
}
}
qsort(ev,ecnt,sizeof(Event),EVENT_SORT);
}
......@@ -348,6 +349,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
TBYTES = sizeof(uint16);
if (novl <= 0)
return (0);
Read_Overlap(input,ovls);
if (trace)
{ if (ovls[0].path.tlen > pmax)
......@@ -379,7 +383,7 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
n = 0;
else
{ if (trace)
memcpy(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
n = 1;
pcur = ovls[0].path.tlen;
while (1)
......
......@@ -188,6 +188,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
TBYTES = sizeof(uint16);
if (novl == 0)
return (0);
Read_Overlap(input,ovls);
if (trace)
{ if (ovls[0].path.tlen > pmax)
......@@ -219,7 +222,7 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
n = 0;
else
{ if (trace)
memcpy(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
n = 1;
pcur = ovls[0].path.tlen;
while (1)
......
......@@ -2983,8 +2983,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec,
if (prefix)
{ if (reverse_extend(work,spec,align,diag,anti,minp,maxp))
EXIT(1);
apath->aepos = (anti-diag)/2;
apath->bepos = (anti+diag)/2;
apath->aepos = (anti+diag)/2;
apath->bepos = (anti-diag)/2;
#ifdef DEBUG_PASSES
printf("E1 (%d,%d) => (%d,%d) %d\n",
(anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs);
......@@ -2993,8 +2993,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec,
else
{ if (forward_extend(work,spec,align,diag,anti,minp,maxp))
EXIT(1);
apath->abpos = (anti-diag)/2;
apath->bbpos = (anti+diag)/2;
apath->abpos = (anti+diag)/2;
apath->bbpos = (anti-diag)/2;
#ifdef DEBUG_PASSES
printf("F1 (%d,%d) => (%d,%d) %d\n",
(anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs);
......@@ -3044,10 +3044,13 @@ int Read_Trace(FILE *input, Overlap *ovl, int tbytes)
return (0);
}
void Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
{ fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output);
int Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
{ if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1)
return (1);
if (ovl->path.trace != NULL)
fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output);
if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen)
return (1);
return (0);
}
void Compress_TraceTo8(Overlap *ovl)
......@@ -3110,9 +3113,10 @@ void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent)
}
int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname)
{ int i, p;
{ int i, p, q;
if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2)
if (tspace != 0)
{ if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2)
{ if (verbose)
EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname);
return (1);
......@@ -3133,6 +3137,22 @@ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname)
EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
return (1);
}
}
else
{ uint16 *trace16 = (uint16 *) ovl->path.trace;
p = ovl->path.bbpos;
q = ovl->path.abpos;
for (i = 1; i < ovl->path.tlen; i += 2)
{ p += trace16[i];
q += trace16[i-1];
}
if (p != ovl->path.bepos || q != ovl->path.aepos)
{ if (verbose)
EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
return (1);
}
}
return (0);
}
......
......@@ -325,7 +325,7 @@ typedef struct {
accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16).
Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that
occupies 'tbytes' bytes per value.
occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing.
Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output'
where the trace occupes 'tbytes' per value and the print out is indented from the left
......@@ -343,7 +343,7 @@ typedef struct {
int Read_Overlap(FILE *input, Overlap *ovl);
int Read_Trace(FILE *innput, Overlap *ovl, int tbytes);
void Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
int Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
void Compress_TraceTo8(Overlap *ovl);
......