Commit 73190a14 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 0.0.20121010

parent 0d517c8f
10-10-2012 Changed dna_complement[] in encoding.c to avoid 257 chars array.
This diff is collapsed.
This package implements the Sim4 algorithm for aligning expressed DNA
with genomic sequences, described in the paper:
L. Florea, G. Hartzell, Z. Zhang, G. Rubin, and W. Miller (1998)
"A computer program for aligning a cDNA sequence with a genomic DNA sequence."
Genome Research 8, 967-974.
Portions copyright by:
Copyright (C) 1998-2012 Liliana Florea
Copyright (C) 1998-2012 Scott Schwartz
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# The following files were written by Liliana Florea:
Xtend1.c
Xtend1.h
align.c
align.h
sim4.h
sim4.init.c
sim4b1.c
sim4b1.h
splice.c
splice.h
# The following files were written by Scott Schwartz:
args.c
args.h
charvec.c
charvec.h
discrim.c
discrim.h
dna.c
dna.h
encoding.c
encoding.h
libc.h
misc.c
misc.h
prnt.c
prnt.h
psublast.h
seq.c
seq.h
seq_read.c
types.h
Each distributed version of sim4 will be unpacked into
a directory whose name contains that version string.
For example, sim4.2000-03-13. Make a note of this
version when you fetch the tar file.
To compile:
gunzip < sim4.[version].tar.gz | tar -xf -
cd sim4.[version]
make
# For better performance, replace ``-O'' with whatever
# the best optimization flag is for your computer.
# For Sun's compilers under Solaris, ``-fast'' works well.
# For gcc, ``-O2'' works well.
CC=cc
CFLAGS=-O
LDLIBS=-lm
sim4:
$(CC) -o sim4 -I. $(CFLAGS) *.c $(LDLIBS)
clean:
rm -f sim4 *.o
psublast -- support library for alignment programs
Copyright (C) 1998-2012 Scott Schwartz
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
sim4 -- a program to align cDNA and genomic DNA
Copyright (C) 1998-2012 Liliana Florea
psublast -- a library for sequence alignment
Copyright (C) 1998-2012 Scott Schwartz
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
This diff is collapsed.
#ifndef XTEND1_H
#define XTEND1_H
/* $Id: Xtend1.h,v 1.3 1998/05/12 19:58:50 schwartz Exp $ */
extern int Xextend_bw(uchar *,uchar *,int,int,int,int,int *,int *);
extern int Xextend_fw(uchar *,uchar *,int,int,int,int,int *,int *);
#endif /* XTEND1_H */
This diff is collapsed.
#ifndef SCRIPTLIB_H
#define SCRIPTLIB_H
/* $Id: align.h,v 1.9 2000/06/06 15:03:02 florea Exp $ */
extern void align_path(int,int,int,int,int,edit_script**,edit_script**);
extern int align_get_dist(int, int, int, int, int);
extern void Condense_script(edit_script *);
extern void Condense_both_Ends(edit_script **, edit_script **, edit_script **);
extern void S2A(edit_script *, int *, int);
extern void align_reverse(int *);
extern void IDISPLAY(uchar *, uchar *, int, int, int *, int, int,int, Exon *);
extern void Free_script(edit_script *);
extern void Flip_script(struct edit_script **);
#ifdef AUXUTILS
extern void Reverse_script(edit_script *);
extern void Print_script(edit_script *head, int M, int N);
#endif
#endif /* SCRIPTLIB_H */
#include "libc.h"
#include "types.h"
#include "misc.h"
#include "args.h"
#ifndef __lint
static const char rcsid[] =
"$Id: args.c,v 1.3 2000/09/15 17:57:02 florea Exp $";
#endif
/* Command line captured by ckargs(); read back by the get_*argval() lookups
 * and by fprintf_argv(). argc stays 0 until ckargs() has been called, which
 * is what ck_argc() tests for. */
static int argc;
static char **argv;
/* Program name: ckargs() sets it to argv[0] unless it is already non-null. */
char *argv0;
/* ckargs -- check that only certain parameters are set on the command line */
/*
 * ckargs -- record the command line and check that only permitted options
 * appear on it.
 *
 *   options      string holding every option letter that is accepted
 *   argcx, argvx argument count and vector to record
 *   non_options  number of leading arguments after argv[0] that are exempt
 *                from the "<letter>=<value>" check
 *
 * Side effects: stores argcx/argvx in the file-scope argc/argv used by the
 * get_*argval() routines and sets argv0 to argv[0] unless argv0 is already
 * non-null.  Every checked argument must have '=' as its second character
 * and a first character found in options; otherwise fatalf() is called.
 */
void ckargs(const char *options, int argcx, char **argvx, int non_options)
{
    int i;

    argc = argcx;
    argv = argvx;
    argv0 = argv0 ? argv0 : argv[0];

    for (i = non_options+1; i < argc; ++i) {
        /* An empty argument has no byte at index 1: test index 0 first so
         * the check never reads past the terminating NUL. */
        if (argv[i][0] == '\0' || argv[i][1] != '=')
            fatalf("Improper command option: '%s'.", argv[i]);
        else if (!strchr(options, argv[i][0]))
            fatalf("Available options: %s\n", options);
    }
}
/* get_argval --------------------- get the value of a command-line argument */
bool get_argval(int c, int *val_ptr)
{
int i;
ck_argc("get_argval");
for (i = 0; i < argc; ++i)
if (argv[i][0] == c && argv[i][1] == '=') {
*val_ptr = atoi(argv[i]+2);
return 1;
}
return 0;
}
/* get_fargval --------------- get the float value of a command-line argument */
bool get_fargval(int c, double *val_ptr)
{
int i;
ck_argc("get_fargval");
for (i = 0; i < argc; ++i)
if (argv[i][0] == c && argv[i][1] == '=') {
*val_ptr = atof(argv[i]+2);
return 1;
}
return 0;
}
/* get_strargval ---------- get the string value of a command-line argument */
bool get_strargval(int c, char **val_ptr)
{
int i;
ck_argc("get_strargval");
for (i = 0; i < argc; ++i)
if (argv[i][0] == c && argv[i][1] == '=') {
*val_ptr = (char *) ckalloc(strlen(argv[i]+2)+1);
strcpy(*val_ptr, argv[i]+2);
return 1;
}
return 0;
}
bool get_cargval(int c, char **valp)
{
int i;
ck_argc("get_cargval");
for (i = 0; i < argc; ++i)
if (argv[i][0] == c && argv[i][1] == '=') {
*valp = argv[i]+2;
return 1;
}
return 0;
}
/*
 * fprintf_argv -- write the recorded command line to fp: argv0 first, then
 * each of argv[1..argc-1] preceded by a single space.  No newline is added.
 */
void fprintf_argv(FILE* fp)
{
    int k = 1;

    fprintf(fp, "%s", argv0);
    while (k < argc) {
        (void)fprintf(fp, " %s", argv[k]);
        ++k;
    }
}
/* ck_argc - die if argc is unknown */
/*
 * ck_argc -- guard used by the lookup routines: if the recorded argc is
 * still 0, report through fatalf() that ckargs() must be called before
 * proc_name.
 */
void ck_argc(const char *proc_name)
{
    if (argc != 0)
        return;
    fatalf("Call ckargs() before %s.\n", proc_name);
}
#ifndef SIM_ARGS_H
#define SIM_ARGS_H
/* $Id: args.h,v 1.2 2000/09/15 17:57:02 florea Exp $ */
/* Bundle of one floating-point and four integer score parameters.
 * NOTE(review): the meaning of each field is not visible in this header;
 * presumably each mirrors the single-letter command-line option of the same
 * name -- confirm against the code that fills this struct. */
typedef struct argv_scores {
double E;
int I;
int M;
int O;
int V;
} argv_scores_t;
bool get_argval(int, int *);
bool get_fargval(int, double *);
bool get_strargval(int, char **);
bool get_cargval(int, char **);
void ckargs(const char *, int , char **, int );
void fprintf_argv(FILE* fp);
void ck_argc(const char *);
extern char *argv0;
#endif
/* genvec charvec char ; 1999-10-05 22:59:09 */
#include "charvec.h"
/*
 * charvec_new -- create a vector object using the supplied callbacks.
 * The object itself is obtained with ra(0, size) and then set up by
 * charvec_init().  Returns the new vector, or 0 when either step fails;
 * if only the init step fails, the object is handed back to fr() first.
 */
charvec_t* charvec_new(void* ((*ra)(void*,size_t)), void (*fr)(void*))
{
    charvec_t *vec = ra(0, sizeof(*vec));

    if (!vec)
        return 0;
    if (!charvec_init(vec, ra, fr)) {
        fr(vec);
        return 0;
    }
    return vec;
}
/*
 * charvec_free -- tear down a vector: charvec_fini() releases its storage,
 * then the object itself is passed to its own free callback.
 * Always returns 0.
 */
charvec_t* charvec_free(charvec_t *t)
{
    void (*release)(void*);

    charvec_fini(t);
    release = t->free;      /* read the callback before t is released */
    release(t);
    return 0;
}
/*
 * charvec_init -- initialise a caller-provided vector object.
 * Installs the allocation callbacks a and f, clears the buffer pointer,
 * length and capacity, then asks charvec_need() for a capacity of 0.
 * Returns whatever that charvec_need() call returns.
 */
int charvec_init(charvec_t *t, void* ((*a)(void*,size_t)), void (*f)(void*))
{
    assert(t);

    t->alloc = a;
    t->free = f;

    t->a = 0;
    t->max = 0;
    t->len = 0;

    return charvec_need(t, 0);
}
/*
 * charvec_fini -- release a vector's storage but not the object itself.
 * When both a buffer and a free callback are present, the buffer is freed
 * and the pointer and capacity are cleared.  The length is reset in every
 * case.  Always returns 1.
 */
int charvec_fini(charvec_t *t)
{
    assert(t);

    if (t->a != 0 && t->free != 0) {
        t->free(t->a);
        t->a = 0;
        t->max = 0;
    }
    t->len = 0;

    return 1;
}
/* BASE_ALLOC may be defined before this point to override the default of 30. */
#ifndef BASE_ALLOC
#define BASE_ALLOC 30
#endif
/* BASE: constant slack that charvec_need() adds whenever it enlarges an
 * existing buffer. */
enum { BASE = BASE_ALLOC };
/*
 * charvec_need -- make sure the vector can hold at least n elements.
 *
 * If no buffer exists yet, one is obtained with alloc(0, size) and the
 * length is reset to 0.  Otherwise, when n exceeds the recorded capacity,
 * the buffer is resized through alloc(old, size) to BASE + n + n/8 elements
 * so repeated small growths do not each trigger a resize.
 *
 * Returns 1 on success and 0 on failure.  On a failed resize the existing
 * buffer, length and capacity are left as they were.
 */
int charvec_need(charvec_t *t, unsigned int n)
{
    assert(t);

    if (t->a == 0) {
        /* Never ask for 0 bytes: a realloc-style allocator is allowed to
         * return a null pointer for a zero-size request, which would make
         * charvec_init() report failure with memory still available. */
        unsigned int want = n ? n : 1;

        assert(t->alloc);
        t->len = 0;
        t->a = t->alloc(0, want * sizeof(char));
        t->max = t->a ? want : 0;
        return t->a != 0;
    }

    if (n > t->max) {
        unsigned int i = BASE + n + (n >> 3);
        void *p;

        /* The sum is unsigned and can wrap; a wrapped value is always
         * smaller than n and would yield a buffer that is too small. */
        if (i < n)
            return 0;
        p = t->alloc(t->a, i * sizeof(char));
        if (!p)
            return 0;
        t->max = i;
        t->a = p;
    }
    return 1;
}
/*
 * charvec_more -- make room for n elements beyond the current length.
 * Returns the result of charvec_need() for len + n, or 0 when that sum
 * does not fit in an unsigned int.
 */
int charvec_more(charvec_t *t, unsigned int n)
{
    unsigned int total;

    assert(t);
    total = n + t->len;
    /* Unsigned addition wraps silently; a wrapped total would request a
     * tiny capacity and report success without providing the room. */
    if (total < n)
        return 0;
    return charvec_need(t, total);
}
/*
 * charvec_append -- add element e at the end of the vector.
 * Room for one more element is requested first; on failure 0 is returned
 * and nothing is stored.  On success e is written at index len, len is
 * incremented, and 1 is returned.
 */
int charvec_append(charvec_t *t, char e)
{
    assert(t);

    if (charvec_more(t, 1)) {
        t->a[t->len] = e;
        t->len += 1;
        return 1;
    }
    return 0;
}
/*
 * charvec_fit -- shrink (or set) the buffer to match the current length.
 * The buffer is resized through the alloc callback and the recorded
 * capacity is updated.  Returns 1 on success; returns 0 and leaves the
 * vector unchanged when the callback returns a null pointer.
 */
int charvec_fit(charvec_t *t)
{
    assert(t);
    assert(t->alloc);
    {
        /* Keep at least one byte.  With a realloc-style callback a
         * zero-size resize may free the old buffer and return null; that
         * would be reported as failure here while t->a still pointed at
         * the released memory. */
        unsigned int i = t->len ? t->len : 1;
        void *p = t->alloc(t->a, i * sizeof(char));

        if (!p)
            return 0;
        t->max = i;
        t->a = p;
        return 1;
    }
}
/* genvec charvec char ; 1999-10-05 22:59:09 */
#ifndef HAS_GEN_charvec_H
#define HAS_GEN_charvec_H
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/* Growable array of char whose memory is managed through caller-supplied
 * callbacks. */
typedef struct charvec {
/* element storage; 0 while no buffer has been allocated */
char *a;
/* number of elements currently stored */
unsigned int len;
/* capacity recorded for the buffer at a, in elements */
unsigned int max;
/* allocator: called as alloc(0, size) for a new buffer and as
 * alloc(old, size) to resize an existing one */
void *((*alloc)(void*, size_t));
/* releases memory previously returned by alloc */
void (*free)(void*);
} charvec_t;
#define charvec_INIT(a,f) {0, 0, 0, a, f}
charvec_t* charvec_new(void* ((*)(void*, size_t)), void ((*)(void*)));
charvec_t* charvec_free(charvec_t *t);
int charvec_init(charvec_t *t, void* ((*)(void*,size_t)), void((*)(void*)));
int charvec_fini(charvec_t *t);
int charvec_need(charvec_t *t, unsigned int n);
int charvec_more(charvec_t *t, unsigned int n);
int charvec_append(charvec_t *t, char e);
int charvec_fit(charvec_t *t);
/* GENVEC_INBOUNDS(t,n): non-zero when 0 <= n < t->len.  Only defined if no
 * earlier definition exists. */
#ifndef GENVEC_INBOUNDS
#define GENVEC_INBOUNDS(t,n) ((0<=(n))&&((n)<(t)->len))
#endif
/* GENVEC_GET(t,n): element n of t, preceded by an assert() that n is in
 * bounds.  Both t and n are evaluated more than once, so pass expressions
 * without side effects. */
#ifndef GENVEC_GET
#define GENVEC_GET(t,n) (assert(GENVEC_INBOUNDS(t,n)) , (t)->a[n])
#endif
#endif
sim4 (0.0.20121010-2) unstable; urgency=medium
* Moved debian/upstream to debian/upstream/metadata
* More precise homepage
* cme fix dpkg-control
* DEP5 names
-- Andreas Tille <tille@debian.org> Thu, 21 Jan 2016 09:55:40 +0100
sim4 (0.0.20121010-1) unstable; urgency=low
* New upstream version
* debian/upstream: add citation information
* debian/source/format: 3.0 (quilt)
* debian/control:
- cme fix dpkg-control
- debhelper 9
- drop cdbs build-depends
- anonscm in Vcs fields
* debian/rules: short dh
* debian/patches/hardening.patch: propagate hardening options
* debian/copyright: DEP5
-- Andreas Tille <tille@debian.org> Fri, 02 Aug 2013 13:24:50 +0200
sim4 (0.0.20030921-3) unstable; urgency=low
[ Charles Plessy ]
* Moved the Homepage: field out from the package's description.
[ David Paleino ]
* Updated to Standards-Version 3.7.3 (no changes needed)
[ Andreas Tille ]
* Added myself to Uploaders
-- Andreas Tille <tille@debian.org> Tue, 18 Mar 2008 13:27:59 +0100
sim4 (0.0.20030921-2) unstable; urgency=low
* Updated patches/Makefile.diff, so sim4 doesn't get stripped when
using DEB_BUILD_OPTIONS=nostrip (Closes: #438021);
* Changed maintainer to Debian-Med Packaging Team
<debian-med-packaging@lists.alioth.debian.org>;
* Added SVN repository URL in debian/control;
* Updated my email address;
* Updated Standards-Version to 3.7.2;
* Updated debhelper compatibility level to 5;
* Updated FSF address in copyright file;
* Updated watch file.
-- Nelson A. de Oliveira <naoliv@debian.org> Wed, 15 Aug 2007 11:53:53 -0300
sim4 (0.0.20030921-1) unstable; urgency=low
* Initial release. (Closes: #321180)
-- Nelson A. de Oliveira <naoliv@gmail.com> Wed, 3 Aug 2005 18:21:06 -0300
Source: sim4
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Nelson A. de Oliveira <naoliv@debian.org>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9)
Standards-Version: 3.9.6
Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/sim4/trunk/
Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/sim4/trunk/
Homepage: http://globin.bx.psu.edu/html/docs/sim4.html
Package: sim4
Architecture: any
Depends: ${shlibs:Depends},
${misc:Depends}
Description: tool for aligning cDNA and genomic DNA
sim4 is a similarity-based tool for aligning an expressed DNA sequence
(EST, cDNA, mRNA) with a genomic sequence for the gene. It also detects end
matches when the two input sequences overlap at one end (i.e., the start of
one sequence overlaps the end of the other).
.
sim4 employs a blast-based technique to first determine the basic matching
blocks representing the "exon cores". In this first stage, it detects all
possible exact matches of W-mers (i.e., DNA words of size W) between the two
sequences and extends them to maximal scoring gap-free segments. In the
second stage, the exon cores are extended into the adjacent as-yet-unmatched
fragments using greedy alignment algorithms, and heuristics are used to favor
configurations that conform to the splice-site recognition signals (GT-AG,
CT-AC). If necessary, the process is repeated with less stringent parameters
on the unmatched fragments.
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Contact: Liliana Florea <florea@gwu.edu>
Upstream-Name: Sim4
Source: http://globin.cse.psu.edu/ftp/dist/sim4/
Files: *
Copyright: © 1998-2012 Liliana Florea <florea@gwu.edu>,
Scott Schwartz
License: GPL-2+
Files: debian/*
Copyright: © 2005-2007 Nelson A. de Oliveira <naoliv@gmail.com>
© 2008-2016 Andreas Tille <tille@debian.org>
License: GPL-2+
License: GPL-2+
This package is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
.
This package is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
.
You should have received a copy of the GNU General Public License
along with this package; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1301, USA.
.
On Debian systems, the complete text of the GNU General
Public License can be found in `/usr/share/common-licenses/GPL-2'.
Author: Andreas Tille <tille@debian.org>
LastChanged: Fri, 02 Aug 2013 13:24:50 +0200
Description: Propagate hardening options
--- a/Makefile
+++ b/Makefile
@@ -4,10 +4,10 @@
# For Sun's compilers under Solaris, ``-fast'' works well.
# For gcc, ``-O2'' works well.
CC=cc
-CFLAGS=-O
+CFLAGS+=-g -O2 -Wall
LDLIBS=-lm
sim4:
- $(CC) -o sim4 -I. $(CFLAGS) *.c $(LDLIBS)
+ $(CC) -o sim4 -I. $(CFLAGS) *.c $(LDLIBS) $(LDFLAGS)
clean:
rm -f sim4 *.o
#!/usr/bin/make -f
%:
dh $@
.TH SIM4 1 "Wed, 03 Aug 2005 18:40:58 -0300"
.SH NAME
sim4 \- align an expressed DNA sequence with a genomic sequence
.SH SYNOPSIS
.B sim4
\fIseqfile1\fR \fIseqfile2\fR {[WXKCRDAPHNBS]=\fIvalue\fR}
.SH DESCRIPTION
\fBsim4\fP is a similarity-based tool for aligning an expressed DNA sequence (EST, cDNA, mRNA) with a genomic sequence for the gene. It also detects end matches when the two input sequences overlap at one end (i.e., the start of one sequence overlaps the end of the other). If \fIseqfile2\fR is a database of sequences, the sequence in \fIseqfile1\fR will be aligned with each of the sequences in \fIseqfile2\fR.
\fBsim4\fP employs a blast-based technique to first determine the basic matching blocks representing the "exon cores". In this first stage, it detects all possible exact matches of W-mers (i.e., DNA words of size W) between the two sequences and extends them to maximal scoring gap-free segments. In the second stage, the exon cores are extended into the adjacent as-yet-unmatched fragments using greedy alignment algorithms, and heuristics are used to favor configurations that conform to the splice-site recognition signals (GT-AG, CT-AC). If necessary, the process is repeated with less stringent parameters on the unmatched fragments.
By default, \fBsim4\fP searches both strands and reports the best match, measured by the number of matching nucleotides found in the alignment. The R command line option can be used to restrict the search to one orientation (strand) only.
Currently, five major alignment display options are supported, controlled by the A option. By default (A=0), only the endpoints, overall similarity, and orientation of the introns are reported. An arrow sign (`->' or `<-') indicates the orientation of the intron (`+' or `-' strand), when the signals flanking the intron have three or more position matches with either the GT-AG or the CT-AC splice recognition signals. When the same number of matches is found for both orientations, the intron is reported as ambiguous, and represented by `--'. The sign `==' marks the absence from the alignment of a cDNA fragment starting at that position. Alternative formats (lav-block format, text, PipMaker-type `exons file', or certain combinations of these options) can be requested by specifying a different value for A.
If the P option is specified with a non-zero value, \fBsim4\fP will remove any 3'-end poly-A tails that it detects in the alignment.
Occasionally, \fBsim4\fP may miss an internal exon when surrounded by very large introns, typically longer than 100 Kb. When this is suspected, the H option can be used to reset the exons' weight to compensate for the intron gap penalty.
Ambiguity codes are by default allowed in sequence data, but \fBsim4\fP treats them non-differentially. If desired, the B command option can restrict the set of acceptable characters to A,C,G,T,N and X only.
\fBsim4\fP compares the lengths of the input sequences to distinguish between the cDNA (`short') and the genomic (`long') components in the comparison. When \fIseqfile2\fR contains a collection of sequences, the first entry in the file will be used to determine the type of this and all subsequent comparisons.
In the description below, the term MSP denotes a \fIM\fRaximal \fIS\fRegment \fIP\fRair, that is, a pair of highly similar fragments in the two sequences, obtained during the blast-like procedure by extending a W-mer hit by matches and perhaps a few mismatches.
.PP
.SH OPTIONS
The algorithm parameters (included in the first two sections below) have already been tuned and do not normally require adjustment by the user.
Parameters internal to the blast-like procedure:
.TP
.B W
Sets the word size for blast hits in the first stage of the algorithm. The default value is 12, but it can be increased for a more stringent search or decreased to find weaker matches.
.TP
.B X
Controls the limits for terminating word extensions in the blast-like stage of the algorithm. The default value is 12.
.TP
.B K
Sets the threshold for the MSP scores when determining the basic `exon cores', during the first stage of the algorithm. (If this option is not specified, the threshold is computed from the lengths of the sequences, using statistical criteria.) For example, a good value for genomic sequences in the range of a few hundred Kb is 16. To avoid spurious matches, however, a larger value may be needed for longer sequences.
.TP
.B C
Sets the threshold for the MSP scores when aligning the as-yet-unmatched fragments, during the second stage of the algorithm. By default, the smaller of the constant 12 and a statistics-based threshold is chosen.
.PP
Additional algorithm parameters:
.TP
.B D
Sets the bound for the "diagonal" distance within consecutive MSPs in an exon. The default value is 10.
.PP
Context parameters:
.TP
.B R
Specifies the direction of the search. If R=0, only the "+" (direct) strand is searched. If R=1, only the "-" (reverse complement) matches are sought. By default (R=2), sim4 searches both strands and reports the best match, measured by the number of matching pairs in the alignment.
.TP
.B A
Specifies the format of the output: exon endpoints only (A=0), exon endpoints and boundaries of the coding region (CDS) in the genomic sequence, when specified for the input mRNA (A=5), alignment text (A=1), alignment in lav-block format (A=2), or both exon endpoints and alignment text (A=3 or A=4). If a reverse complement match is found, A=0,1,2,3,5 will give its position in the "+" strand of the longer sequence and the "-" strand of the shorter sequence. A=4 will give its position in the "+" strand of the first sequence (seqfile1) and the "-" strand of the second sequence (seqfile2), regardless of which sequence is longer. The A=5 option can be used with the S command line option to specify the endpoints of the CDS in the mRNA, and produces output in the `exons file' format required by PipMaker.
.TP
.B P
Specifies whether or not the program should report the fragment of the alignment containing the poly-A tail (if found). By default (P=0) the alignment is displayed as computed, but specifying a non-zero value will request sim4 to remove the poly-A tails. When this feature is enabled, all display options produce additional lav alignment headers.
.TP
.B H
Resets the MSPs' weight to compensate for very large introns. The default value is H=500, but some introns larger than 100 Kb may require higher values, typically between 1000 and 2500. This option should be used cautiously, generally in cases where an unmatched internal portion of the cDNA may disguise a missed exon within a very large intron. It is not recommended for ESTs, where it may produce spurious exons.
.TP
.B N
Requests an additional search for small marginal exons (N=1) guided by the splice-site recognition signals. This option can be used when a high accuracy match is expected. The default value is N=0, specifying no additional search.
.TP
.B B
Controls the set of characters allowed in the input sequences. By default (B=1), ambiguity characters (ABCDGHKMNRSTVWXY) are allowed. By specifying B=0, the set of acceptable characters is restricted to A,C,G,T,N and X only.
.TP
.B S
Allows the user to specify the endpoints of the CDS in the input mRNA, with the syntax: S=n1..n2. This option is only available with the A=5 flag, which produces output in the format required by PipMaker. Alternatively, the CDS coordinates could appear in a construct CDS=n1..n2 in the FastA header of the mRNA sequence. When the second file is an mRNA database, the command line specification for the CDS will apply to the first sequence in the file only.
.SH EXAMPLES
sim4 est genomic
sim4 genomic estdb
sim4 est genomic A=1 P=1
sim4 est1 est2 R=1
sim4 mRNA genomic A=5 S=123..1020
sim4 mouse_cDNA human_genomic K=15 C=11 A=3 W=10
.SH AUTHORS
sim4 was written by Liliana Florea <florea@gwu.edu> and Scott Schwartz.
.PP
This manual page was written by Nelson A. de Oliveira <naoliv@gmail.com>, based on the online documentation at http://globin.cse.psu.edu/html/docs/sim4.html,
for the Debian project (but may be used by others).
Reference:
Author: Liliana Florea and George Hartzell and Zheng Zhang and Gerald M. Rubin and Webb Miller
Title: A Computer Program for Aligning a cDNA Sequence with a Genomic DNA Sequence
Journal: Genome Research
Year: 1998
Volume: 8
Pages: 967-974
DOI: 10.1101/gr.8.9.967
PMID: 9750195
URL: http://genome.cshlp.org/content/8/9/967.full
eprint: http://genome.cshlp.org/content/8/9/967.full.pdf+html
version=3
opts="uversionmangle=s/^/0.0./;s/-//g" \
http://globin.cse.psu.edu/ftp/dist/sim4/ sim4.(.*)\.tar\.gz
#include "libc.h"
#include "types.h"
#include "misc.h"
#include "args.h"
#include "seq.h"
#include "dna.h"
#include "discrim.h"
#ifndef __lint
static const char rcsid[] =
"$Id: discrim.c,v 1.2 2000/06/05 22:48:19 florea Exp $";
#endif
/* DNA characters: the upper-case letters accepted in a sequence.  is_dchar()
 * upper-cases its argument before looking it up in this string. */
const uchar dchars[] = "ABCDGHKMNRSTVWXY";
/* Defined after is_DNA(), which calls it. */
static int is_dchar(int ch);
/*
 * is_DNA -- decide whether the len characters at s look like DNA.
 *
 * Returns 0 when fewer than 90% of the characters are one of A, C, G, T, N
 * or X (either case).  Otherwise every character is checked with
 * is_dchar(); the first one that is rejected is reported through fatalf().
 * Returns 1 when all characters pass.
 */
bool is_DNA(uchar *s, int len)
{
    int ACGT, i;

    /* strchr() also matches the terminating NUL of its first argument, so
     * a zero byte must be excluded explicitly or it is counted as a base. */
    for (ACGT = i = 0; i < len; ++i)
        if (s[i] != '\0' && strchr("ACGTNXacgtnx", s[i]))
            ++ACGT;

    /* Same test as 10*ACGT < 9*len (ACGT < 90% of len), rearranged so that
     * no product is formed: the multiplications overflow int once len
     * exceeds INT_MAX/10, which chromosome-sized sequences can reach. */
    if (len - ACGT > len / 10)
        return 0;

    for (i = 0; i < len; ++i)
        if (!is_dchar(s[i])) {
            fatalf("Illegal character '%c' in sequence file.\n", s[i]);
            exit(1);
        }
    return 1;
}
/*
 * is_dchar -- return 1 if ch, after conversion to upper case, is one of the
 * characters listed in dchars, and 0 otherwise.
 */
static int is_dchar(int ch)
{
    /* strchr() treats the terminating NUL as part of the string, so a zero
     * byte would otherwise be reported as a legal sequence character. */
    if (ch == '\0')
        return 0;
    return strchr((const char*)dchars, toupper(ch)) != 0;
}
#ifndef SIM_DISCRIM_H
#define SIM_DISCRIM_H
/* $Id: discrim.h,v 1.1 2000/06/05 22:23:15 florea Exp $ */
bool is_DNA(uchar *s, int len);
#endif
#include "libc.h"
#include "types.h"
#include "seq.h"
#include "misc.h"