New upstream version 1.5.2+dfsg

parent e3249b5c
......@@ -14,15 +14,11 @@ For FreeBSD OS, use command:
gmake -f Makefile.FreeBSD
For OpenSolaris/Oracle Solaris OS, use command:
gmake -f Makefile.SunOS
If the build is successful, a new directory called 'bin' will be created under the home directory of the package (i.e. one level up from the 'src' directory). The 'bin' directory contains all the generated executables. To enable easy access to these executables, you may copy the executables to a system directory such as '/usr/bin' or add the path to the executables to your search path (add the path to your environment variable `PATH').
Content
--------------
annotation Directory including NCBI RefSeq gene annotations for genomes 'hg19', 'mm10' and 'mm9'.
annotation Directory including NCBI RefSeq gene annotations for genomes 'hg19', 'hg38', 'mm10' and 'mm9'.
Each row is an exon. Entrez gene identifiers and chromosomal coordinates are provided for each exon.
bin Directory including executables after compilation (or directly available from a binary release).
doc Directory including the users manual.
......@@ -33,36 +29,41 @@ test Directory including test data and scripts.
A Quick Start
--------------
An index should be built before carrying out read alignments:
Build index for a reference genome:
subread-buildindex -o my_index chr1.fa chr2.fa ...
(You may provide a single FASTA file including all chromosomal sequences).
subread-buildindex -o my_index chr1.fa chr2.fa ...
(You may provide a single FASTA file including all chromosomal sequences).
Align a single-end RNA-seq dataset to the reference genome:
With built index, you can now align reads to the reference genome. Align single-end reads:
subread-align -i my_index -r reads.txt -t 0 -o subread_results.bam
subread-align -i my_index -r reads.txt -o subread_results.sam
Align a paired-end genomic DNA-seq dataset to the reference genome:
Align paired-end reads:
subread-align -i my_index -r reads1.txt -R reads2.txt -t 1 -o subread_results_PE.bam
subread-align -i my_index -r reads1.txt -R reads2.txt -o subread_results_PE.sam
Detect exon-exon junctions from a paired-end RNA-seq dataset (read mapping results are also produced):
Detect exon-exon junctions from RNA-seq data (read mapping results are also generated):
subjunc -i my_index -r reads1.txt -R reads2.txt -o subjunc_results.bam
subjunc -i my_index -r reads1.txt -R reads2.txt -o subjunc_results.sam
Assign mapped RNA-seq reads to mm10 genes using inbuilt annotation:
Assign mapped reads to genomic features (eg. genes):
featureCounts -a ../annotation/mm10_RefSeq_exon.txt -F 'SAF' -o counts.txt subread_results.bam
featureCounts -a annotation.gtf -o counts.txt subread_results.sam
Assign mapped RNA-seq reads to hg38 genes using a public GTF annotation:
featureCounts -a hg38_annotation.gtf -o counts.txt subread_results.bam
Tutorials
-------------------
A short tutorial for Subread - http://bioinf.wehi.edu.au/subread
A short tutorial for Subjunc - http://bioinf.wehi.edu.au/subjunc
A short tutorial for featureCounts - http://bioinf.wehi.edu.au/featureCounts
A short tutorial for exactSNP - http://bioinf.wehi.edu.au/exactSNP
Users Guide
--------------
The Users Guide can be found in the 'doc' subdirectory. It provides comprehensive descriptions of the programs included in this package.
Users Guide can be found in the 'doc' subdirectory of this software package or via URL (http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf).
Citation
--------------
......@@ -72,8 +73,9 @@ Liao Y, Smyth GK and Shi W. The Subread aligner: fast, accurate and scalable rea
If you use the featureCounts program, please cite:
Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. Bioinformatics, 2013. doi: 10.1093/bioinformatics/btt656
Liao Y, Smyth GK and Shi W. featureCounts: an efficient general purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30, 2014
Get help
Mailing lists
--------------
You may subscribe to the SEQanswers forum (http://www.seqanswers.com) or the Bioconductor mailing list (http://bioconductor.org/) to get help. Alternatively, you may directly contact Wei Shi (shi at wehi dot edu dot au) or Yang Liao (liao at wehi dot edu dot au) for help.
Please post your questions/suggestions to the Bioconductor support site (https://support.bioconductor.org/) or the Subread Google group (https://groups.google.com/forum/#!forum/subread).
This diff is collapsed.
......@@ -44,6 +44,7 @@
#include "subread.h"
#include "input-files.h"
#include "gene-algorithms.h"
#include "HelperFunctions.h"
......@@ -979,3 +980,171 @@ double fast_fisher_test_one_side(unsigned int a, unsigned int b, unsigned int c,
}
/* Outcome codes of lfa_parse_coordinates(). */
enum { LFA_COORD_OK = 0, LFA_COORD_MALFORMED = 1, LFA_COORD_TOO_LARGE = 2 };

/*
 * Validate and convert the textual start/end columns of one annotation line.
 *
 * start_ptr, end_ptr : column texts as returned by strtok_r (may be NULL when
 *                      the line has too few columns).
 * start, end         : receive the parsed coordinates on success only.
 *
 * Returns LFA_COORD_OK, LFA_COORD_MALFORMED (a column is missing or does not
 * begin with a digit) or LFA_COORD_TOO_LARGE (more than 10 characters, or a
 * value above 0x7fffffff).
 */
static int lfa_parse_coordinates(const char * start_ptr, const char * end_ptr, unsigned int * start, unsigned int * end){
	long long int tv1, tv2;

	if(start_ptr == NULL || end_ptr == NULL) return LFA_COORD_MALFORMED;
	/* <ctype.h> functions require a value representable as unsigned char. */
	if(!isdigit((unsigned char)start_ptr[0]) || !isdigit((unsigned char)end_ptr[0])) return LFA_COORD_MALFORMED;

	if(strlen(start_ptr) > 10 || strlen(end_ptr) > 10) return LFA_COORD_TOO_LARGE;
	tv1 = atoll(start_ptr);
	tv2 = atoll(end_ptr);
	if(tv1 > 0x7fffffff || tv2 > 0x7fffffff) return LFA_COORD_TOO_LARGE;

	*start = (unsigned int)tv1;
	*end = (unsigned int)tv2;
	return LFA_COORD_OK;
}

/*
 * Read an annotation file line by line and pass every usable feature to
 * do_add_feature().
 *
 * file_name           : path of the annotation file, opened in text mode.
 * file_type           : FILE_TYPE_RSUBREAD (tab-separated SAF: name, chromosome,
 *                       start, end, optional strand) or FILE_TYPE_GTF. Lines of
 *                       any other type are read but produce no features.
 * gene_id_column      : GTF only; attribute name looked up in the 9th column
 *                       through GTF_extra_column_value().
 * feature_name_column : GTF only; only lines whose 3rd column equals this
 *                       string are considered.
 * context             : opaque pointer forwarded unchanged to the callback.
 * do_add_feature      : called once per accepted feature; its return value is
 *                       ignored.
 *
 * Returns the number of features handed to the callback, -1 if the file cannot
 * be opened (or the line buffer cannot be allocated), or -2 on a malformed
 * line or a coordinate above 2^31. The file handle and the line buffer are
 * released on every path.
 */
int load_features_annotation(char * file_name, int file_type, char * gene_id_column, char * feature_name_column,
	void * context, int do_add_feature(char * gene_name, char * chro_name, unsigned int start, unsigned int end, int is_negative_strand, void * context) ){
	int lineno = 0, is_GFF_warned = 0, loaded_features = 0;
	int ret = 0;
	const char * format_name = (file_type == FILE_TYPE_RSUBREAD) ? "SAF" : "GTF/GFF";
	char * file_line = NULL;
	FILE * fp = fopen(file_name, "r");

	if(NULL == fp){
		SUBREADprintf("Error: unable to open the annotation file : %s\n", file_name);
		return -1;
	}

	file_line = malloc(MAX_LINE_LENGTH+1);
	if(NULL == file_line){
		SUBREADprintf("Error: out of memory when loading the annotation file : %s\n", file_name);
		fclose(fp);
		return -1;
	}

	while(1){
		int is_gene_id_found = 0, is_negative_strand = -1;
		int coord_state;
		char * token_temp = NULL, * feature_name, * chro_name = NULL;
		char feature_name_tmp[FEATURE_NAME_LENGTH];
		unsigned int start = 0, end = 0;

		feature_name = feature_name_tmp;

		if(fgets(file_line, MAX_LINE_LENGTH, fp) == NULL) break;
		lineno++;
		if(is_comment_line(file_line, file_type, lineno-1))continue;

		if(file_type == FILE_TYPE_RSUBREAD)
		{
			feature_name = strtok_r(file_line,"\t",&token_temp);
			chro_name = strtok_r(NULL,"\t", &token_temp);
			/* A line without at least a name and a chromosome cannot be SAF. */
			if(feature_name == NULL || chro_name == NULL) goto format_error;

			/* Cap both names so that they fit, with their terminator, into
			 * buffers of FEATURE_NAME_LENGTH / MAX_CHROMOSOME_NAME_LEN bytes. */
			if(strlen(feature_name) >= FEATURE_NAME_LENGTH) feature_name[FEATURE_NAME_LENGTH -1 ] = 0;
			if(strlen(chro_name) >= MAX_CHROMOSOME_NAME_LEN) chro_name[MAX_CHROMOSOME_NAME_LEN -1 ] = 0;

			char * start_ptr = strtok_r(NULL,"\t", &token_temp);
			char * end_ptr = strtok_r(NULL,"\t", &token_temp);
			coord_state = lfa_parse_coordinates(start_ptr, end_ptr, &start, &end);
			if(coord_state == LFA_COORD_MALFORMED) goto format_error;
			if(coord_state == LFA_COORD_TOO_LARGE) goto range_error;

			/* The strand column is optional; a missing one means the positive strand. */
			char * strand_str = strtok_r(NULL,"\t", &token_temp);
			if(strand_str == NULL)
				is_negative_strand = 0;
			else
				is_negative_strand = ('-' ==strand_str[0]);

			is_gene_id_found = 1;
		}
		else if(file_type == FILE_TYPE_GTF)
		{
			chro_name = strtok_r(file_line,"\t",&token_temp);
			strtok_r(NULL,"\t", &token_temp);// source
			char * feature_type = strtok_r(NULL,"\t", &token_temp);// feature_type
			if(feature_type == NULL) goto format_error;

			if(strcmp(feature_type, feature_name_column)==0){
				char * start_ptr = strtok_r(NULL,"\t", &token_temp);
				char * end_ptr = strtok_r(NULL,"\t", &token_temp);
				coord_state = lfa_parse_coordinates(start_ptr, end_ptr, &start, &end);
				if(coord_state == LFA_COORD_MALFORMED) goto format_error;
				if(coord_state == LFA_COORD_TOO_LARGE) goto range_error;

				if(start < 1 || end<1 || start > 0x7fffffff || end > 0x7fffffff || start > end)
					SUBREADprintf("\nWarning: the feature on the %d-th line has zero coordinate or zero lengths\n\n", lineno);

				strtok_r(NULL,"\t", &token_temp);// score
				char * strand_str = strtok_r(NULL,"\t", &token_temp);// strand
				if(strand_str == NULL) goto format_error;
				is_negative_strand = ('-' == strand_str[0]);
				strtok_r(NULL,"\t",&token_temp); // "frame"

				char * extra_attrs = strtok_r(NULL,"\t",&token_temp); // name_1 "val1"; name_2 "val2"; ...
				if(extra_attrs && (strlen(extra_attrs)>2)){
					int attr_val_len = GTF_extra_column_value(extra_attrs , gene_id_column , feature_name_tmp, FEATURE_NAME_LENGTH);
					if(attr_val_len>0) is_gene_id_found=1;
				}

				if(!is_gene_id_found){
					/* Only the first offending line is reported. */
					if(!is_GFF_warned)
					{
						const char * shown_attrs = "";
						if(extra_attrs){
							size_t ext_att_len = strlen(extra_attrs);
							if(ext_att_len > 0 && extra_attrs[ext_att_len-1] == '\n') extra_attrs[ext_att_len-1] =0;
							shown_attrs = extra_attrs;
						}
						SUBREADprintf("\nWarning: failed to find the gene identifier attribute in the 9th column of the provided GTF file.\nThe specified gene identifier attribute is '%s' \nThe attributes included in your GTF annotation are '%s' \n\n", gene_id_column, shown_attrs);
					}
					is_GFF_warned++;
				}
			}
		}

		if(is_gene_id_found){
			do_add_feature(feature_name, chro_name, start, end, is_negative_strand, context);
			loaded_features++;
		}
	}

	ret = loaded_features;
	goto cleanup;

format_error:
	SUBREADprintf("\nError: Line %d contains a format error. The expected annotation format is %s.\n", lineno, format_name);
	ret = -2;
	goto cleanup;

range_error:
	SUBREADprintf("\nError: Line %d contains a coordinate greater than 2^31!\n", lineno);
	ret = -2;

cleanup:
	fclose(fp);
	free(file_line);
	return ret;
}
/*
 * Load a chromosome-name alias file into a hash table.
 *
 * Each usable line has the form "<annotation name>,<SAM name>". Lines starting
 * with '#' are skipped, as are lines that have no text after the first comma.
 * Trailing '\n' / '\r' characters are removed from the SAM name only when they
 * are actually present, so a final line without a newline keeps its last
 * character.
 *
 * The table maps the SAM name (key) to the annotation name (value); both are
 * heap copies released with free() by the table's deallocation functions.
 *
 * Returns the new table, or NULL when the file cannot be opened or the line
 * buffer cannot be allocated.
 */
HashTable * load_alias_table(char * fname) {
	FILE * fp = f_subr_open(fname, "r");
	if(!fp)
	{
		print_in_box(80,0,0,"WARNING unable to open alias file '%s'", fname);
		return NULL;
	}

	char * fl = malloc(2000);
	if(!fl)
	{
		fclose(fp);
		return NULL;
	}

	HashTable * ret = HashTableCreate(1013);
	HashTableSetDeallocationFunctions(ret, free, free);
	HashTableSetKeyComparisonFunction(ret, fc_strcmp_chro);
	HashTableSetHashFunction(ret, fc_chro_hash);

	while (fgets(fl, 1999, fp))
	{
		if(fl[0]=='#') continue;

		char * sam_chr = NULL;
		char * anno_chr = strtok_r(fl, ",", &sam_chr);
		if((!sam_chr)||(!anno_chr)) continue;

		/* Strip the line terminator, if any, instead of blindly dropping
		 * the last character. */
		size_t sam_len = strlen(sam_chr);
		while(sam_len > 0 && (sam_chr[sam_len-1] == '\n' || sam_chr[sam_len-1] == '\r'))
			sam_chr[--sam_len] = 0;

		/* No comma on the line, or nothing after it: there is no key to store. */
		if(sam_len == 0) continue;

		char * anno_chr_buf = malloc(strlen(anno_chr)+1);
		char * sam_chr_buf = malloc(sam_len+1);
		if(!anno_chr_buf || !sam_chr_buf)
		{
			/* Out of memory: keep what has been loaded so far. */
			free(anno_chr_buf);
			free(sam_chr_buf);
			break;
		}
		strcpy(anno_chr_buf, anno_chr);
		strcpy(sam_chr_buf, sam_chr);
		HashTablePut(ret, sam_chr_buf, anno_chr_buf);
	}

	fclose(fp);
	free(fl);
	return ret;
}
......@@ -71,4 +71,8 @@ unsigned int find_left_end_cigar(unsigned int right_pos, char * cigar);
int mac_or_rand_str(char * char_14);
double fast_fisher_test_one_side(unsigned int a, unsigned int b, unsigned int c, unsigned int d, long double * frac_buffer, int buffer_size);
/*
 * Read the annotation file `file_name` (SAF when file_type is
 * FILE_TYPE_RSUBREAD, GTF when it is FILE_TYPE_GTF) and call do_add_feature()
 * once for every accepted feature, forwarding `context` unchanged.
 * For GTF input only lines whose feature type equals `feature_name_column` are
 * used, and the gene name is taken from the `gene_id_column` attribute.
 * Returns the number of features passed to the callback, -1 if the file cannot
 * be opened, or -2 on a malformed or out-of-range coordinate.
 */
int load_features_annotation(char * file_name, int file_type, char * gene_id_column, char * feature_name_column,
void * context, int do_add_feature(char * gene_name, char * chro_name, unsigned int start, unsigned int end, int is_negative_strand, void * context) );
/*
 * Load a comma-separated alias file ("<annotation name>,<SAM name>" per line,
 * lines starting with '#' skipped) into a new hash table keyed by the SAM name.
 * Returns NULL when the file cannot be opened.
 */
HashTable * load_alias_table(char * fname) ;
#endif
......@@ -2,10 +2,11 @@
include makefile.version
OPT_LEVEL = 9
OPT_LEVEL = 3
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -Wall -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64
LDFLAGS = ${STATIC_MAKE} -lpthread -lz -lm ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
CC = gcc ${CCFLAGS} -ggdb -fomit-frame-pointer -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0
CC_EXEC = gcc
CC = ${CC_EXEC} ${CCFLAGS} -fmessage-length=0 -ggdb # -fomit-frame-pointer -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0
ALL_LIBS= core core-junction core-indel sambam-file sublog gene-algorithms hashtable input-files sorted-hashtable gene-value-index exon-algorithms HelperFunctions interval_merge long-hashtable core-bigtable seek-zlib
......
......@@ -79,6 +79,7 @@ struct SNP_Calling_Parameters{
char pile_file_name[300];
int delete_piles;
int disk_is_full;
char background_input_file[300];
char subread_index[300];
......@@ -294,7 +295,11 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
fread(&read_len, sizeof(short), 1, tmp_fp);
fread(read, sizeof(char), read_len, tmp_fp);
fread(qual, sizeof(char), read_len, tmp_fp);
int rlen = fread(qual, sizeof(char), read_len, tmp_fp);
if(rlen < read_len){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
first_base_pos = read_rec.pos - block_no * BASE_BLOCK_LENGTH;
parameters->is_paired_end_data = read_rec.flags & 1;
......@@ -581,7 +586,7 @@ void fishers_test_on_block(struct SNP_Calling_Parameters * parameters, float * s
int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference_len, char * referenced_genome, char * chro_name , char * temp_prefix, struct SNP_Calling_Parameters * parameters)
{
int block_no = (offset -1) / BASE_BLOCK_LENGTH, i;
int block_no = (offset -1) / BASE_BLOCK_LENGTH, i, disk_is_full = 0;
char temp_file_name[300];
FILE *tmp_fp;
unsigned int * snp_voting_piles, *snp_BGC_piles = NULL; // offset * 4 + "A/C/G/T"[0,1,2,3]
......@@ -632,7 +637,7 @@ int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference
pcutoff_list[i]=-1.;
}
read_tmp_block(parameters, tmp_fp,&SNP_bitmap_recorder,snp_voting_piles,block_no, reference_len, referenced_genome);
int read_is_error = read_tmp_block(parameters, tmp_fp,&SNP_bitmap_recorder,snp_voting_piles,block_no, reference_len, referenced_genome);
fclose(tmp_fp);
if (parameters -> delete_piles)
......@@ -891,7 +896,12 @@ int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference
snprintf(sprint_line,999, "%s\t%u\t.\t%c\t%s\t%.4f\t.\tDP=%d;MM=%s;BGTOTAL=%d;BGMM=%d%s\n", chro_name, BASE_BLOCK_LENGTH*block_no +1 + i, true_value,base_list, Qvalue, all_reads, supporting_list , snp_filter_background_matched[i]+snp_filter_background_unmatched[i], snp_filter_background_unmatched[i], BGC_Qvalue_str);
if(parameters->output_fp_lock)
subread_lock_occupy(parameters->output_fp_lock);
fwrite(sprint_line, 1, strlen(sprint_line),out_fp);
int sprint_line_len = strlen(sprint_line);
int wlen = fwrite(sprint_line, 1, sprint_line_len,out_fp);
if(wlen < sprint_line_len){
disk_is_full=1;
break;
}
parameters->reported_SNPs++;
if(parameters->output_fp_lock)
subread_lock_release(parameters->output_fp_lock);
......@@ -932,8 +942,12 @@ int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference
fwrite(referenced_genome + i, 1, 1, out_fp);
fwrite(referenced_genome + 1 + i + max(0,indels), 1, 1, out_fp);
unsigned short * indel_sups = parameters -> cigar_event_table-> appendix2;
fprintf(out_fp, "\t1.0\t.\tINDEL;DP=%d;SR=%d\n",all_reads,indel_sups[event_id]);
int wlen = fprintf(out_fp, "\t1.0\t.\tINDEL;DP=%d;SR=%d\n",all_reads,indel_sups[event_id]);
if(wlen < 10){
disk_is_full=1;
break;
}
parameters->reported_indels++;
if(parameters->output_fp_lock)
subread_lock_release(parameters->output_fp_lock);
......@@ -956,7 +970,7 @@ int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference
free(pcutoff_list);
free(sprint_line);
//SUBREADprintf("OVERLAPPED=%llu; MISMA=%llu; ALL_BASES=%llu\n",OVERLAPPED_BASES, OVER_MISMA_BASES, ALL_BASES);
return 0;
return read_is_error || disk_is_full;
}
......@@ -1033,9 +1047,10 @@ int run_chromosome_search(FILE *in_fp, FILE * out_fp, char * chro_name , char *
//#warning "=== ONLY TEST ONE BLOCK , USE 'if(1)' IN RELEASE ==="
//if(strcmp(chro_name,"chr7")==0 && all_offset == 60000000){
if(1){
process_snp_votes(out_fp, all_offset, offset, referenced_base, chro_name , temp_prefix, parameters);
parameters -> disk_is_full |= process_snp_votes(out_fp, all_offset, offset, referenced_base, chro_name , temp_prefix, parameters);
print_in_box(89,0,0,"processed block %c[36m%s@%d%c[0m by thread %d/%d [block number=%d/%d]", CHAR_ESC, chro_name, all_offset, CHAR_ESC , thread_no+1, all_threads, 1+(*task_no)-parameters->empty_blocks, parameters->all_blocks);
}
if(parameters -> disk_is_full)break;
}
else if((*task_no) % all_threads == thread_no)
{
......@@ -1204,6 +1219,11 @@ int parse_read_lists_maybe_threads(char * in_FASTA_file, char * out_BED_file, ch
}
//fprintf(out_fp, "## Fisher_Test_Size=%u\n",fisher_test_size);
fclose(out_fp);
if(parameters -> disk_is_full){
unlink(out_BED_file);
SUBREADputs("ERROR: cannot write into the output VCF file. Please check the disk space in the output directory.");
ret = 1;
}
return ret;
}
......@@ -1405,14 +1425,11 @@ int SNP_calling(char * in_SAM_file, char * out_BED_file, char * in_FASTA_file, c
HashTableSetKeyComparisonFunction(parameters-> cigar_event_table, my_strcmp);
memcpy(rand48_seed, &start_time, 6);
if(temp_location)
strcpy(temp_file_prefix, temp_location);
else{
char mac_rand[13];
mac_or_rand_str(mac_rand);
sprintf(temp_file_prefix, "./temp-snps-%06u-%s-", getpid(), mac_rand);
}
sprintf(temp_file_prefix, "%s/temp-snps-%06u-%s-", temp_location, getpid(), mac_rand);
_EXSNP_SNP_delete_temp_prefix = temp_file_prefix;
print_in_box(89,0,0,"Split %s file into %c[36m%s*%c[0m ..." , parameters -> is_BAM_file_input?"BAM":"SAM" , CHAR_ESC, temp_file_prefix, CHAR_ESC);
......@@ -1578,7 +1595,7 @@ int main_snp_calling_test(int argc,char ** argv)
optopt = 63;
memset(&parameters, 0, sizeof(struct SNP_Calling_Parameters));
parameters.start_time = miltime();
parameters.empty_blocks = 0;
parameters.reported_SNPs = 0;
......@@ -1688,10 +1705,6 @@ int main_snp_calling_test(int argc,char ** argv)
strncpy(out_BED_file, optarg,299);
break;
case '9': // UNUSED
strncpy(temp_path, optarg,299);
break;
case 'T':
threads = atoi(optarg);
if(!threads)threads=1;
......@@ -1814,7 +1827,16 @@ int main_snp_calling_test(int argc,char ** argv)
warning_file_type(in_SAM_file, parameters.is_BAM_file_input?FILE_TYPE_BAM:FILE_TYPE_SAM);
warning_file_type(in_FASTA_file, FILE_TYPE_FASTA);
warning_file_limit();
ret = SNP_calling(in_SAM_file, out_BED_file, in_FASTA_file, temp_path[0]?temp_path:NULL, read_count, threads, &parameters);
int x1;
for(x1 = strlen(out_BED_file); x1 >= 0; x1--){
if(out_BED_file[x1]=='/'){
memcpy(temp_path, out_BED_file, x1);
temp_path[x1]=0;
break;
}
}
if(temp_path[0]==0)strcpy(temp_path, "./");
ret = SNP_calling(in_SAM_file, out_BED_file, in_FASTA_file, temp_path, read_count, threads, &parameters);
if(ret != -1)
{
print_in_box(80,0,1,"");
......
......@@ -430,7 +430,7 @@ void bktable_append(bucketed_table_t * tab, char * chro, unsigned int pos, void
}
void bktable_free_ptrs(void * buckv, HashTable * tab){
void bktable_free_ptrs(void * bukey, void * buckv, HashTable * tab){
int x1;
bucketed_table_bucket_t * buck = buckv;
for(x1 = 0; x1 < buck -> items; x1++)
......
......@@ -36,7 +36,7 @@ void bktable_init(bucketed_table_t * tab, unsigned int maximum_interval_length,
void bktable_destroy(bucketed_table_t * tab);
void bktable_free_ptrs(void * buckv, HashTable * tab);
void bktable_free_ptrs(void * bkey, void * buckv, HashTable * tab);
void fraglist_init(fragment_list_t * list);
......
This diff is collapsed.
......@@ -30,8 +30,8 @@
//#define MAX_EVENT_ENTRIES_PER_SITE 5
//#define MAX_EVENT_ENTRIES_PER_SITE 12
//
#define EVENT_ENTRIES_INIT_SIZE 9
#define MAX_EVENT_ENTRIES_PER_SITE 9
#define EVENT_ENTRIES_INIT_SIZE (9)
#define MAX_EVENT_ENTRIES_PER_SITE (9)
#define CHRO_EVENT_TYPE_REMOVED 0
#define CHRO_EVENT_TYPE_INDEL 8
#define CHRO_EVENT_TYPE_LONG_INDEL 16
......@@ -132,6 +132,8 @@ typedef struct{
unsigned int block_start_linear_pos;
} reassembly_block_context_t;
#define EVENT_BODY_LOCK_BUCKETS 14929
typedef struct{
HashTable * event_entry_table;
......@@ -139,6 +141,7 @@ typedef struct{
unsigned int current_max_event_number;
chromosome_event_t * event_space_dynamic;
HashTable * local_reassembly_pileup_files;
subread_lock_t event_body_locks[EVENT_BODY_LOCK_BUCKETS];
short ** dynamic_align_table;
char ** dynamic_align_table_mask;
......@@ -159,7 +162,9 @@ typedef struct{
int init_indel_tables(global_context_t * context);
int destroy_indel_module(global_context_t * context);
int init_indel_thread_contexts(global_context_t * global_context, thread_context_t * thread_context, int task);
int finalise_indel_thread(global_context_t * global_context, thread_context_t * thread_context, int task);
int sort_global_event_table(global_context_t * global_context);
int load_known_junctions(global_context_t * global_context);
int finalise_indel_and_junction_thread(global_context_t * global_context, thread_context_t * thread_contexts, int task);
int find_new_indels(global_context_t * global_context, thread_context_t * thread_context, int pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id);
int write_indel_final_results(global_context_t * context);
int search_event(global_context_t * global_context,HashTable * event_table, chromosome_event_t * event_space, unsigned int pos, int search_type, char event_type, chromosome_event_t ** return_buffer);
......@@ -188,5 +193,11 @@ int anti_supporting_read_scan(global_context_t * global_context);
int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name);
chromosome_event_t * local_add_indel_event(global_context_t * global_context, thread_context_t * thread_context, HashTable * event_table, char * read_text, unsigned int left_edge, int indels, int score_supporting_read_added, int is_ambiguous, int mismatched_bases);
void init_core_temp_path(global_context_t * context);
chromosome_event_t * local_add_indel_event(global_context_t * global_context, thread_context_t * thread_context, HashTable * event_table, char * read_text, unsigned int left_edge, int indels, int score_supporting_read_added, int is_ambiguous, int mismatched_bases,int * old_event_id);
void print_indel_table(global_context_t * global_context);
int sort_junction_entry_table(global_context_t * global_context);
void mark_event_bitmap(unsigned char * bitmap, unsigned int pos);
#endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -66,6 +66,7 @@ typedef struct{
int result_front_junction_numbers[MAX_ALIGNMENT_PER_ANCHOR];
int all_back_alignments;
int all_front_alignments;
int known_junctions;
// unsigned int tmp_jump_length;
// unsigned int best_jump_length;
......@@ -153,5 +154,6 @@ int is_funky_fragment(global_context_t * global_context, char * rname1, char * c
void finalise_structural_variances(global_context_t * global_context);
void debug_show_event(global_context_t* global_context, chromosome_event_t * event);
void get_event_two_coordinates(global_context_t * global_context, unsigned int event_no, char ** small_chro, int * small_pos, unsigned int * small_abs, char ** large_chro, int * large_pos, unsigned int * large_abs);
#endif
This diff is collapsed.
......@@ -139,7 +139,11 @@ typedef struct{
// input_scheme
char first_read_file[MAX_FILE_NAME_LENGTH];
char second_read_file[MAX_FILE_NAME_LENGTH];
char medium_result_prefix[MAX_FILE_NAME_LENGTH];
char exon_annotation_file[MAX_FILE_NAME_LENGTH];
char exon_annotation_alias_file[MAX_FILE_NAME_LENGTH];
int exon_annotation_file_type;
char exon_annotation_gene_id_column[MAX_READ_NAME_LEN];
char exon_annotation_feature_name_column[MAX_READ_NAME_LEN];
short read_trim_5;
......@@ -266,6 +270,7 @@ typedef struct{
#define CORE_EXPERIMENT_DNASEQ 1000
#define CORE_EXPERIMENT_RNASEQ 2000
#define PRINT_BOX_WRAPPED 4
#define PRINT_BOX_NOCOLOR_FOR_COLON 2
#define PRINT_BOX_CENTER 1
......@@ -305,11 +310,10 @@ typedef struct{
short junction_flanking_left;
short junction_flanking_right;
unsigned char event_type;
char indel_at_junction;
char is_negative_strand; // this only works to junction detection, according to 'GT/AG' or 'CT/AC' donors. This only applys to junctions.
char is_strand_jumped; // "strand jumped" means that the left and right sides are on different strands. This only applys to fusions.
char is_donor_found; // only for junctions: GT/AG is found at the location.
char is_donor_found_or_annotation; // only for junctions: GT/AG is found at the location. 1: found, 0:not found: 64: from annotation (thus unknown)
// Also, if "is_strand_jumped" is true, all coordinates (e.g., splicing points, cover_start, cover_end, etc) are on "reversed read" view.
char small_side_increasing_coordinate;
......@@ -325,6 +329,7 @@ typedef struct{
unsigned short anti_supporting_reads;
unsigned short final_counted_reads;
unsigned short final_reads_mismatches;
unsigned char event_type;
unsigned int global_event_id;
float event_quality;
......@@ -373,6 +378,7 @@ typedef struct{
short final_quality;
short chromosomal_length;
short MAPQ_adjustment;
int known_junction_supp;
} realignment_result_t;
#define BUCKETED_TABLE_INIT_ITEMS 3
......@@ -402,6 +408,7 @@ typedef struct {
int item_index_j;
unsigned int mapping_position;
int major_half_votes;
unsigned short read_start_base;
}simple_mapping_t;
typedef struct{
......@@ -439,6 +446,7 @@ typedef struct {
typedef struct{
unsigned long long all_correct_PE_reads;
int thread_id;
pthread_t thread;
......@@ -478,6 +486,7 @@ typedef struct{
FILE * output_sam_fp;
FILE * long_insertion_FASTA_fp;
char * output_sam_inner_buffer;
int output_sam_is_full;
// running contexts
void * module_contexts[5];
......@@ -536,7 +545,8 @@ typedef struct{
// per chunk parameters
subread_read_number_t read_block_start;
char * exonic_region_bitmap;
HashTable * sam_chro_to_anno_chr_alias;
} global_context_t;
......@@ -639,4 +649,6 @@ int is_valid_digit(char * optarg, char * optname);
int is_valid_digit_range(char * optarg, char * optname, int min, int max_inc);
int is_valid_float(char * optarg, char * optname);
int exec_cmd(char * cmd, char * outstr, int out_limit);
int is_pos_in_annotated_exon_regions(global_context_t * global_context, unsigned int pos);
char * get_sam_chro_name_from_alias(HashTable * tab, char * anno_chro);
#endif
......@@ -2,6 +2,7 @@
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <assert.h>
#include "subread.h"
#include "core.h"
#include "HelperFunctions.h"
......@@ -11,10 +12,12 @@
#include "hashtable.h"
#define COVERAGE_MAX_INT 0x7ffffff0
#define MAX_FRAGMENT_LENGTH 3000
unsigned long long all_counted;
typedef unsigned int coverage_bin_entry_t;
int is_BAM_input = 0;
int max_M = 10;
int paired_end = 0;
char input_file_name[300];
char output_file_name[300];
HashTable * cov_bin_table;
......@@ -48,6 +51,8 @@ void calcCount_usage()
SUBREADputs("");
SUBREADputs("Optional arguments:");
SUBREADputs("");
SUBREADputs(" -p The input file contains paired-end reads.");
SUBREADputs("");
SUBREADputs(" --maxMOp <int> Maximum number of 'M' operations allowed in a CIGAR string.");
SUBREADputs(" 10 by default. Both 'X' and '=' are treated as 'M' and adjacent");
SUBREADputs(" 'M' operations are merged in the CIGAR string.");
......@@ -133,7 +138,9 @@ int covCalc()
HashTableSetKeyComparisonFunction(cov_bin_table , fc_strcmp_chro);
HashTableSetDeallocationFunctions(cov_bin_table , free, free);
unsigned int hit_locs[2][MAX_FRAGMENT_LENGTH], reads = 0;
coverage_bin_entry_t * chrbin12[2];
int hits1,hits2;
SamBam_FILE * in_fp = SamBam_fopen(input_file_name, is_BAM_input?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);
char * line_buffer = malloc(3000);
......@@ -147,10 +154,10 @@ int covCalc()
}
else
{
char * Chros[FC_CIGAR_PARSER_ITEMS];
unsigned int Staring_Points[FC_CIGAR_PARSER_ITEMS];
unsigned short Staring_Read_Points[FC_CIGAR_PARSER_ITEMS];
unsigned short Section_Lengths[FC_CIGAR_PARSER_ITEMS];
char * Chros[ max_M ];
unsigned int Staring_Points[max_M];
unsigned short Staring_Read_Points[max_M];
unsigned short Section_Lengths[max_M];
int flags=0, x1, is_junc = 0;
char cigar_str[200];
......@@ -159,6 +166,7 @@ int covCalc()
cigar_str[0]=0;
chro[0]=0;
if(reads % 2 == 0) hits1= hits2 = 0;
get_read_info(line_buffer, chro, &pos, cigar_str, &flags);
if(flags & 4) continue;
......@@ -172,8 +180,35 @@ int covCalc()
coverage_bin_entry_t * chrbin = (coverage_bin_entry_t*) bin_entry[0];
unsigned int chrlen = (void *)( bin_entry[1]) - NULL;
int cigar_sections = RSubread_parse_CIGAR_string(chro, pos, cigar_str, max_M, Chros, Staring_Points, Staring_Read_Points, Section_Lengths, &is_junc);
for(x1 = 0; x1 < cigar_sections; x1++)
{
if(paired_end) {
int * this_hits = (reads%2)?&hits2:&hits1;
unsigned int * this_hit_locs = &(hit_locs[reads%2][0]);
for(x1 = 0; x1 < cigar_sections; x1++){
unsigned int x2,x3;
//if(strcmp( cigar_str, "8S2M1D14M4D5M1D9M1I14M1D9M4D10M1D23M1I4M1D9M1I9M4D5M4D2M" )==0){
// SUBREADprintf("Cigar [%d] = %u ~ + %u ; this_hits=%d\n", x1, Staring_Points[x1], Section_Lengths[x1], *this_hits);
//}
for(x2 = Staring_Points[x1]; x2<Staring_Points[x1]+Section_Lengths[x1]; x2++){
int found = 0;
for(x3=0; x3<*this_hits; x3++){
if(this_hit_locs[x3] == x2){
found =1;
break;
}
}
if(!found) this_hit_locs[(*this_hits)++] = x2;
if( *this_hits >= MAX_FRAGMENT_LENGTH){
SUBREADprintf("ERROR: read is too long : %s!\n", cigar_str);
return -1;
}
}
}
if(* this_hits > 0) assert(chrbin);
chrbin12[reads%2] = chrbin;
} else for(x1 = 0; x1 < cigar_sections; x1++) {
unsigned int x2;
for(x2 = Staring_Points[x1]; x2<Staring_Points[x1]+Section_Lengths[x1]; x2++)
{
......@@ -189,6 +224,19 @@ int covCalc()
}
}
}
if(reads % 2 == 1){
int r,x1;
for(r = 0; r < 2; r++){
int hits = r?hits2:hits1;