New upstream version 1.6.2+dfsg

parent 9c61fb61
This diff is collapsed.
......@@ -52,6 +52,20 @@
#include "HelperFunctions.h"
/*
 * Return the file-name part of a path: the text that follows the last '/'.
 *
 * lname   : NUL-terminated path; may be NULL.
 * returns : a pointer INTO lname (nothing is allocated or copied):
 *             - just past the last '/' when the path contains one
 *               (an empty string if the path ends with '/'),
 *             - lname itself when the path has no '/',
 *             - NULL when lname is NULL.
 */
char * get_short_fname(char * lname){
	char * last_slash;

	/* A missing path has no short name; avoid passing NULL to the string functions. */
	if(NULL == lname) return NULL;

	/* strrchr() scans with full size_t range, so very long paths are not truncated to int. */
	last_slash = strrchr(lname, '/');
	if(NULL == last_slash) return lname;
	return last_slash + 1;
}
// This assumes the first part of Cigar has different strandness to the main part of the cigar.
// Pos is the LAST WANTED BASE location before the first strand jump (split by 'b' or 'n').
......
......@@ -75,4 +75,6 @@ int load_features_annotation(char * file_name, int file_type, char * gene_id_col
void * context, int do_add_feature(char * gene_name, char * transcript_id, char * chrome_name, unsigned int start, unsigned int end, int is_negative_strand, void * context) );
HashTable * load_alias_table(char * fname) ;
char * get_short_fname(char * lname);
#endif
#MACOS = -D MACOS
include makefile.version
CC_EXEC = gcc
OPT_LEVEL = 3
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 # -w
include makefile.version
-include ~/.R/DBPZ_debug_makefile
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 ${WARNING_LEVEL}
LDFLAGS = ${STATIC_MAKE} -pthread -lz -lm ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE
CC_EXEC = gcc
CC = ${CC_EXEC} ${CCFLAGS} -fmessage-length=0 -ggdb
......@@ -14,11 +17,11 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
all: detectionCall sublong repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # samMappedBases mergeVCF testZlib
all: detectionCall sublong repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount flattenGTF # samMappedBases mergeVCF testZlib
mkdir -p ../bin/utilities
mv longread-one/LRM longread-one/sublong
mv longread-one/sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv detectionCall repair coverageCount propmapped qualityScores removeDup subread-fullscan txUnique ../bin/utilities
mv detectionCall repair coverageCount propmapped qualityScores removeDup subread-fullscan txUnique flattenGTF ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
......@@ -34,6 +37,9 @@ sublong: longread-one/longread-mapping.c ${ALL_OBJECTS}
rm -f longread-one/*.o
cd longread-one && $(MAKE)
flattenGTF: flattenAnnotations.c ${ALL_OBJECTS}
${CC} -o flattenGTF flattenAnnotations.c ${ALL_OBJECTS} ${LDFLAGS}
detectionCall: detection-calls.c ${ALL_OBJECTS}
${CC} -o detectionCall detection-calls.c ${ALL_OBJECTS} ${LDFLAGS}
......
MACOS = -D MACOS
include makefile.version
CCFLAGS = -mtune=core2 ${MACOS} -O9 -w -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64
CCFLAGS = -mtune=core2 ${MACOS} -O9 -w -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 ${WARNING_LEVEL}
LDFLAGS = -pthread -lz -lm ${MACOS} -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
CC = gcc ${CCFLAGS} ${STATIC_MAKE} -ggdb -fomit-frame-pointer -O3 -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0
......@@ -11,11 +11,11 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # globalReassembly testZlib
all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount flattenGTF # globalReassembly testZlib
mkdir -p ../bin/utilities
mv longread-one/LRM longread-one/sublong
mv longread-one/sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv repair coverageCount subread-fullscan qualityScores removeDup propmapped ../bin/utilities
mv repair coverageCount subread-fullscan qualityScores removeDup propmapped flattenGTF ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
......@@ -31,6 +31,9 @@ sublong: longread-one/longread-mapping.c ${ALL_OBJECTS}
rm -f longread-one/*.o
cd longread-one && $(MAKE)
flattenGTF: flattenAnnotations.c ${ALL_OBJECTS}
${CC} -o flattenGTF flattenAnnotations.c ${ALL_OBJECTS} ${LDFLAGS}
repair: read-repair.c ${ALL_OBJECTS}
${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
......
......@@ -264,7 +264,7 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
while(!feof(tmp_fp))
{
int type_char = fgetc(tmp_fp);
int type_char = fgetc(tmp_fp), rlen=-1;
if(type_char == EOF ) break;
fseek(tmp_fp, -1 , SEEK_CUR);
......@@ -273,7 +273,11 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
{
VCF_temp_read_t SNP_rec;
fread(&SNP_rec, sizeof(SNP_rec),1 , tmp_fp);
rlen = fread(&SNP_rec, sizeof(SNP_rec),1 , tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
if(!(*SNP_bitmap_recorder))
{
(*SNP_bitmap_recorder)=malloc((reference_len/8)+2);
......@@ -292,10 +296,25 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
char read[MAX_READ_LENGTH];
char qual[MAX_READ_LENGTH];
fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
fread(&read_len, sizeof(short), 1, tmp_fp);
fread(read, sizeof(char), read_len, tmp_fp);
int rlen = fread(qual, sizeof(char), read_len, tmp_fp);
int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(read, sizeof(char), read_len, tmp_fp);
if(rlen < read_len){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(qual, sizeof(char), read_len, tmp_fp);
if(rlen < read_len){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
......@@ -1803,9 +1822,9 @@ int main_snp_calling_test(int argc,char ** argv)
print_in_box(80,1,1,"exactSNP setting");
print_in_box(80,0,1,"");
print_in_box(80,0,0," Input file : %s (%s)", in_SAM_file, parameters.is_BAM_file_input?"BAM":"SAM");
print_in_box(80,0,0," Output file : %s", out_BED_file);
print_in_box(80,0,0," Reference genome : %s", in_FASTA_file);
print_in_box(80,0,0," Input file : %s (%s)", get_short_fname(in_SAM_file), parameters.is_BAM_file_input?"BAM":"SAM");
print_in_box(80,0,0," Output file : %s", get_short_fname(out_BED_file));
print_in_box(80,0,0," Reference genome : %s", get_short_fname(in_FASTA_file));
print_in_box(80,0,1,"");
print_in_box(80,0,0," Threads : %d", threads);
print_in_box(80,0,0," Min supporting reads : %d", parameters.min_supporting_read_number);
......@@ -1817,7 +1836,7 @@ int main_snp_calling_test(int argc,char ** argv)
print_in_box(80,0,0," P value upper bound : %.5f", parameters.cutoff_upper_bound);
print_in_box(80,0,0," Flanking windows size : %d", parameters.fisher_exact_testlen);
if(parameters.known_SNP_vcf[0])
print_in_box(80,0,0," Known SNP annotations : %s", parameters.known_SNP_vcf);
print_in_box(80,0,0," Known SNP annotations : %s", get_short_fname(parameters.known_SNP_vcf));
print_in_box(80,0,1,"");
print_in_box(80,2,1,"http://subread.sourceforge.net/");
......
This diff is collapsed.
......@@ -43,7 +43,7 @@ unsigned long long get_inner_pair(global_context_t * global_context , subread_re
bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_context , thread_context_t * thread_context , subread_read_number_t pair_number, int is_second_read, int load_more);
void bigtable_readonly_result(global_context_t * global_context , thread_context_t * thread_context , subread_read_number_t pair_number, int result_number, int is_second_read, mapping_result_t * return_ptr, subjunc_result_t * return_junction_ptr){
int rlen = -1;
if(global_context -> bigtable_cache_file_fp){
int loadjunc;
......@@ -67,7 +67,9 @@ void bigtable_readonly_result(global_context_t * global_context , thread_context
void * write_ptr = return_ptr;
if(loadjunc) write_ptr = return_junction_ptr;
fread(write_ptr, loadjunc?sizeof(subjunc_result_t):sizeof(mapping_result_t), 1, global_context -> bigtable_cache_file_fp);
rlen = fread(write_ptr, loadjunc?sizeof(subjunc_result_t):sizeof(mapping_result_t), 1, global_context -> bigtable_cache_file_fp);
if(rlen <1)
SUBREADprintf("UNABLE TO READ RESULT\n");
}
}else{
bigtable_cached_result_t * rett = bigtable_retrieve_cache(global_context , thread_context , pair_number, is_second_read,0);
......@@ -123,7 +125,7 @@ int init_bigtable_results(global_context_t * global_context, int is_rewinding)
if(global_context -> config.use_memory_buffer)
global_context -> bigtable_cache_file_fp = NULL;
else {
char tmpfname[MAX_FILE_NAME_LENGTH];
char tmpfname[MAX_FILE_NAME_LENGTH+33];
sprintf(tmpfname, "%s-%02d-align.bin", global_context -> config.temp_file_prefix, 0);
//if(is_rewinding) unlink(tmpfname);
......@@ -195,9 +197,6 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
if(global_context -> bigtable_cache_file_fp){
//SUBREADprintf("MARK_OCCPY=%lld BY THREAD %d\n", pair_number, thread_context ? thread_context -> thread_id : -1);
if(global_context -> bigtable_cache_file_loaded_fragments_begin == -1 || inner_pair_number >= global_context -> bigtable_cache_file_loaded_fragments_begin + global_context -> bigtable_chunked_fragments || inner_pair_number < global_context -> bigtable_cache_file_loaded_fragments_begin)
{
SUBREADprintf("THREAD # %d WAITING FOR %llu for RETRIEVE %llu\n", thread_context? thread_context -> thread_id:-1, global_context -> bigtable_cache_file_loaded_fragments_begin, pair_number);
......@@ -205,10 +204,11 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
}
bigtable_lock(global_context);
//SUBREADprintf("inner_pair_number=%lld, fragments_begin=%lld\n", inner_pair_number, global_context -> bigtable_cache_file_loaded_fragments_begn[thread_no]);
if(global_context -> bigtable_cache_file_loaded_fragments_begin == -1 || inner_pair_number >= global_context -> bigtable_cache_file_loaded_fragments_begin + global_context -> bigtable_chunked_fragments || inner_pair_number < global_context -> bigtable_cache_file_loaded_fragments_begin)
{
long long load_end_pair_no = load_start_pair_no + global_context -> bigtable_chunked_fragments;
int rlen = -1;
// this function will see if there is data to write or not.
bigtable_write_thread_cache(global_context);
......@@ -224,17 +224,35 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
{
for(xk2 = 0; xk2 < 1 + global_context -> input_reads.is_paired_end_reads; xk2++){
bigtable_cached_result_t * current_cache = global_context -> bigtable_cache + xk1* (1+global_context -> input_reads.is_paired_end_reads) + xk2;
fread( current_cache -> big_margin_data , sizeof(short) * 3 * global_context -> config.big_margin_record_size , 1, global_context -> bigtable_cache_file_fp );
fread( current_cache -> alignment_res , sizeof(mapping_result_t) * global_context -> config.multi_best_reads , 1, global_context -> bigtable_cache_file_fp );
rlen = fread( current_cache -> big_margin_data , sizeof(short) * 3 * global_context -> config.big_margin_record_size , 1, global_context -> bigtable_cache_file_fp );
if(rlen < 1){
SUBREADprintf("ERROR: cannot read margin\n");
return NULL;
}
if(global_context -> config.do_breakpoint_detection)
fread( current_cache -> subjunc_res , sizeof(subjunc_result_t) * global_context -> config.multi_best_reads , 1, global_context -> bigtable_cache_file_fp);
rlen = fread( current_cache -> alignment_res , sizeof(mapping_result_t) * global_context -> config.multi_best_reads , 1, global_context -> bigtable_cache_file_fp );
if(rlen < 1){
SUBREADprintf("ERROR: cannot read margin\n");
return NULL;
}
if(global_context -> config.do_breakpoint_detection){
rlen = fread( current_cache -> subjunc_res , sizeof(subjunc_result_t) * global_context -> config.multi_best_reads , 1, global_context -> bigtable_cache_file_fp);
if(rlen < 1){
SUBREADprintf("ERROR: cannot read margin\n");
return NULL;
}
}
}
}
}else{
long long new_file_size = calc_file_location(load_end_pair_no);
//SUBREADprintf("FILE_TRUNCATE %lld\n", load_start_pair_no);
ftruncate(fileno(global_context -> bigtable_cache_file_fp), new_file_size);
rlen = ftruncate(fileno(global_context -> bigtable_cache_file_fp), new_file_size);
if(rlen != 0){
SUBREADprintf("ERROR: cannot truncate file\n");
return NULL;
}
global_context -> bigtable_cache_file_fragments = load_end_pair_no;
int xk1, xk2;
for(xk1 = 0; xk1 < global_context -> bigtable_chunked_fragments; xk1++)
......@@ -313,7 +331,7 @@ int finalise_bigtable_results(global_context_t * global_context){
if(global_context -> bigtable_cache_file_fp){
fclose(global_context -> bigtable_cache_file_fp);
char tmpfname[MAX_FILE_NAME_LENGTH];
char tmpfname[MAX_FILE_NAME_LENGTH+33];
sprintf(tmpfname, "%s-%02d-align.bin", global_context -> config.temp_file_prefix, 0);
unlink(tmpfname);
}
......
This diff is collapsed.
......@@ -167,7 +167,7 @@ int load_known_junctions(global_context_t * global_context);
int finalise_indel_and_junction_thread(global_context_t * global_context, thread_context_t * thread_contexts, int task);
int find_new_indels(global_context_t * global_context, thread_context_t * thread_context, int pair_number, char * read_name, char * read_text, char * qual_text, int read_len, int is_second_read, int best_read_id);
int write_indel_final_results(global_context_t * context);
int search_event(global_context_t * global_context,HashTable * event_table, chromosome_event_t * event_space, unsigned int pos, int search_type, char event_type, chromosome_event_t ** return_buffer);
int search_event(global_context_t * global_context,HashTable * event_table, chromosome_event_t * event_space, unsigned int pos, int search_type, unsigned char event_type, chromosome_event_t ** return_buffer);
void set_alignment_result(global_context_t * global_context, int pair_number, int is_second_read, int best_read_id, unsigned int position, int votes, gene_vote_number_t * indel_record, short best_cover_start, short best_cover_end, int is_negative_strand, int is_PE, unsigned int minor_position, unsigned int minor_votes, unsigned int minor_coverage_start, unsigned int minor_coverage_end, unsigned int split_point, int inserted_bases, int is_strand_jumped, int is_GT_AG_donors, int used_subreads_in_vote, int noninformative_subreads_in_vote, int major_indel_offset, int minor_indel_offset, int main_hamming, int minor_hamming, int main_quality, int minor_quality);
......
......@@ -61,6 +61,8 @@ static struct option long_options[] =
{"minVoteCutoff", required_argument, 0, 0},
{"maxRealignLocations", required_argument, 0, 0},
{"multiMapping", no_argument, 0, 0},
{"keepReadOrder", no_argument, 0, 0},
{"sortReadsByCoordinates", no_argument, 0, 0},
{0, 0, 0, 0}
};
......@@ -169,6 +171,17 @@ void print_usage_core_aligner()
SUBREADputs("");
SUBREADputs(" --rg <string> Add <tag:value> to the read group (RG) header in the output.");
SUBREADputs("");
SUBREADputs("# read order");
SUBREADputs("");
SUBREADputs(" --keepReadOrder Keep order of reads in BAM output the same as that in the");
SUBREADputs(" input file. Reads from the same pair are always placed next");
SUBREADputs(" to each other no matter this option is specified or not.");
SUBREADputs("");
SUBREADputs(" --sortReadsByCoordinates Output location-sorted reads. This option is");
SUBREADputs(" applicable for BAM output only. A BAI index file is also");
SUBREADputs(" generated for each BAM file so the BAM files can be directly");
SUBREADputs(" loaded into a genome browser.");
SUBREADputs("");
SUBREADputs("# color space reads");
SUBREADputs("");
SUBREADputs(" -b Convert color-space read bases to base-space read bases in");
......@@ -241,6 +254,8 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
opterr = 1;
optopt = 63;
subread_rebuild_cmd(argc, argv, global_context);
global_context->config.entry_program_name = CORE_PROGRAM_SUBREAD;
global_context->config.max_mismatch_exonic_reads = 3;
global_context->config.max_mismatch_junction_reads = 3;
......@@ -467,6 +482,10 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
{
global_context->config.report_multi_mapping_reads = 1;
}
else if(strcmp("ignoreUnmapped", long_options[option_index].name)==0)
{
global_context->config.ignore_unmapped_reads = 1;
}
else if(strcmp("memoryMultiplex", long_options[option_index].name)==0)
{
global_context->config.memory_use_multiplex = atof(optarg);
......@@ -494,6 +513,14 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
global_context->config.is_BAM_input = 1;
global_context->config.is_SAM_file_input = 1;
}
else if(strcmp("keepReadOrder", long_options[option_index].name)==0)
{
global_context->config.is_input_read_order_required=1;
}
else if(strcmp("sortReadsByCoordinates", long_options[option_index].name)==0)
{
global_context->config.sort_reads_by_coordinates=1;
}
else if(strcmp("extraColumns", long_options[option_index].name)==0)
{
global_context->config.SAM_extra_columns=1;
......@@ -516,10 +543,6 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
{
global_context -> config.fast_run = 1;
}
else if(strcmp("ignoreUnmapped", long_options[option_index].name)==0)
{
global_context->config.ignore_unmapped_reads = 1;
}
else if(strcmp("sv", long_options[option_index].name)==0)
{
global_context->config.do_breakpoint_detection = 1;
......
......@@ -67,6 +67,8 @@ static struct option long_options[] =
{"minMappedFraction", required_argument, 0, 0},
{"complexIndels", no_argument, 0, 0},
{"multiMapping", no_argument, 0, 0},
{"keepReadOrder", no_argument, 0, 0},
{"sortReadsByCoordinates", no_argument, 0, 0},
{0, 0, 0, 0}
};
......@@ -171,6 +173,17 @@ void print_usage_core_subjunc()
SUBREADputs("");
SUBREADputs(" --rg <string> Add <tag:value> to the read group (RG) header in the output.");
SUBREADputs("");
SUBREADputs("# read order");
SUBREADputs("");
SUBREADputs(" --keepReadOrder Keep order of reads in BAM output the same as that in the");
SUBREADputs(" input file. Reads from the same pair are always placed next");
SUBREADputs(" to each other no matter this option is specified or not.");
SUBREADputs("");
SUBREADputs(" --sortReadsByCoordinates Output location-sorted reads. This option is");
SUBREADputs(" applicable for BAM output only. A BAI index file is also");
SUBREADputs(" generated for each BAM file so the BAM files can be directly");
SUBREADputs(" loaded into a genome browser.");
SUBREADputs("");
SUBREADputs("# color space reads");
SUBREADputs("");
SUBREADputs(" -b Convert color-space read bases to base-space read bases in");
......@@ -242,6 +255,8 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
opterr = 1;
optopt = 63;
subread_rebuild_cmd(argc, argv, global_context);
global_context->config.entry_program_name = CORE_PROGRAM_SUBJUNC;
global_context->config.max_mismatch_exonic_reads = 3;
global_context->config.max_mismatch_junction_reads = 3;
......@@ -523,6 +538,14 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
global_context->config.limited_tree_scan = 0;
global_context->config.max_insertion_at_junctions = atoi(optarg);
}
else if(strcmp("keepReadOrder", long_options[option_index].name)==0)
{
global_context->config.is_input_read_order_required=1;
}
else if(strcmp("sortReadsByCoordinates", long_options[option_index].name)==0)
{
global_context->config.sort_reads_by_coordinates=1;
}
else if(strcmp("extraColumns", long_options[option_index].name)==0)
{
global_context->config.SAM_extra_columns=1;
......
This diff is collapsed.
This diff is collapsed.
......@@ -172,6 +172,8 @@ typedef struct{
int downscale_mapping_quality;
int is_BAM_input;
int is_BAM_output;
int is_input_read_order_required;
int sort_reads_by_coordinates;
int convert_color_to_base;
int SAM_extra_columns;
int report_multiple_best_in_pairs;
......@@ -456,12 +458,7 @@ typedef struct{
void * module_thread_contexts[5];
gene_value_index_t * current_value_index;
output_fragment_buffer_t * output_buffer;
int output_buffer_item;
int output_buffer_pointer;
int is_finished;
subread_lock_t output_lock;
unsigned int all_mapped_reads;
unsigned int not_properly_pairs_wrong_arrangement;
unsigned int not_properly_pairs_different_chro;
......@@ -501,11 +498,12 @@ typedef struct{
// running contexts
void * module_contexts[5];
thread_context_t * all_thread_contexts;
int last_written_fragment_number;
subread_read_number_t last_written_fragment_number;
int need_merge_buffer_now;
read_input_t input_reads;
bigtable_t bigtable;
int rebuilt_command_line_size;
char * rebuilt_command_line;
subread_lock_t bigtable_lock;
subread_lock_t output_lock;
......@@ -671,4 +669,5 @@ int is_valid_float(char * optarg, char * optname);
int exec_cmd(char * cmd, char * outstr, int out_limit);
int is_pos_in_annotated_exon_regions(global_context_t * global_context, unsigned int pos);
char * get_sam_chro_name_from_alias(HashTable * tab, char * anno_chro);
void subread_rebuild_cmd(int argc, char ** argv, global_context_t * global_context);
#endif
......@@ -157,7 +157,7 @@ int DTCwrite_annotations(char * gene_name, char * transcript_name, char * chro_n
}
// Callback with the (key, value, table) signature used for hash-table iteration.
// Prints one entry as "<key> => <number>", where the key is printed with %s and
// the number is the stored value pointer expressed as its distance from NULL.
// The table argument is not used.
void DTCprint_lentab_items(void * key, void * hashed_obj, HashTable * tab){
// NOTE(review): the two statements below look like the before/after pair of this diff hunk --
// the second one only adds the (char*) cast so the argument type matches %s.
// Confirm that only the second statement remains in the real file.
SUBREADprintf("%s => %ld\n", key, hashed_obj - NULL);
SUBREADprintf("%s => %ld\n", (char*)key, hashed_obj - NULL);
}
// start and end are 1-based
......@@ -319,11 +319,11 @@ int DTCparse_GTF_and_Genome(DTCcontext_t * context){
}
}
for(ii = 0; ii < in_chro_exons -> numOfElements; ii++){
/*for(ii = 0; ii < in_chro_exons -> numOfElements; ii++){
DTCexon_t * tmpexon = ArrayListGet(in_chro_exons , ii);
unsigned int * tmp_gene_info = HashTableGet(genename_to_3int_table , tmpexon -> gene_name);
//DTCadd_annotation(context, tmpexon -> gene_name, current_chro, tmpexon -> start, tmpexon -> end, tmpexon -> is_negative_strand, tmp_gene_info[3], tmp_gene_info[4]);
}
DTCadd_annotation(context, tmpexon -> gene_name, current_chro, tmpexon -> start, tmpexon -> end, tmpexon -> is_negative_strand, tmp_gene_info[3], tmp_gene_info[4]);
}*/
if(context -> use_intron_bins){
DTCexon_t * tmpexon = ArrayListGet(in_chro_exons , 0);
......@@ -393,7 +393,7 @@ int DTCparse_GTF_and_Genome(DTCcontext_t * context){
HashTableIteration(genename_to_3int_table ,DTCprocess_gene_tab);
ArrayListSort(merged_gene_list, DTCcompare_merge_genes);
SUBREADprintf("%s has %d merged genes from %ld item hashtable\n", current_chro, merged_gene_list -> numOfElements, genename_to_3int_table -> numOfElements);
SUBREADprintf("%s has %ld merged genes from %ld item hashtable\n", current_chro, merged_gene_list -> numOfElements, genename_to_3int_table -> numOfElements);
assert(merged_gene_list -> numOfElements>0);
unsigned int * geneints = ArrayListGet(merged_gene_list, 0);
......@@ -493,7 +493,7 @@ int DTCinit_context(DTCcontext_t ** context, int argc, char ** argv){
else ret -> sam_chro_to_anno_chr_alias = NULL;
ret -> out_FP_genes = fopen(ret -> out_file_name,"w");
char binfn[MAX_FILE_NAME_LENGTH];
char binfn[MAX_FILE_NAME_LENGTH+12];
sprintf(binfn,"%s-bins", ret -> out_file_name);
ret -> out_FP_bins = fopen(binfn,"w");
......
/***************************************************************
The Subread and Rsubread software packages are free
software packages:
you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
Subread is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
Authors: Drs Yang Liao and Wei Shi
***************************************************************/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#ifndef MAKE_STANDALONE
#include <R.h>
#endif
#include <zlib.h>
#include <math.h>
#include <pthread.h>
#include <getopt.h>
#include "subread.h"
#include "interval_merge.h"
#include "core.h"
#include "gene-algorithms.h"
#include "sambam-file.h"
#include "input-files.h"
#include "hashtable.h"
#include "seek-zlib.h"
#include "HelperFunctions.h"
// Working state shared by the flattenGTF routines in this file.
typedef struct{
// Attribute name passed to load_features_annotation() as the gene-id column;
// presumably filled from the -g option ('gene_id' by default per the usage text) -- TODO confirm.
char GTF_gene_id_column[MAX_READ_NAME_LEN];
// Feature type passed to load_features_annotation() to select which features are loaded;
// presumably filled from the -t option ('exon' by default per the usage text) -- TODO confirm.
char GTF_wanted_feature_type[MAX_READ_NAME_LEN];
// Path of the annotation file that is loaded as FILE_TYPE_GTF.
char GTF_file_name[MAX_FILE_NAME_LENGTH];
// Presumably the path given with -o; it is not read by the functions visible here -- TODO confirm.
char output_file_name[MAX_FILE_NAME_LENGTH];
// Stream that receives the "GeneID/Chr/Start/End/Strand" table; closed by flatAnno_finalise().
FILE * output_FP;
// gene name => ArrayList of "gene<TAB>chromosome<TAB>strand" strings seen for that gene.
HashTable * gene_to_chro_strand_table;
// "gene<TAB>chromosome<TAB>strand" => ArrayList of int[2] {start, end} intervals.
HashTable * gene_chro_strand_to_exons_table;
} flatAnno_context_t;
/*
 * Long-option table (presumably handed to getopt_long() by the option parser,
 * which is outside this view). flattenGTF defines no long options, so the
 * table holds only the all-zero terminator entry.
 */
static struct option long_options[] =
{
	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }
};
/*
 * Print the flattenGTF help text: a version banner followed by the fixed
 * usage lines, each emitted through SUBREADputs() in order.
 */
void flatAnno_print_usage(){
	/* Body of the help text, one entry per output line. */
	static const char * const usage_body[] = {
		" Flatten features included in a GTF annotation and save the modified annotation",
		" to a SAF format file.",
		"",
		"Usage:",
		" ./flattenGTF [options] -a <input_file> -o <output_file>",
		"",
		"## Mandatory arguments: ",
		"",
		" -a <file> Name of an annotation file in GTF/GFF format.",
		"",
		" -o <file> Name of output file.",
		"",
		"## Optional arguments: ",
		"",
		" -t <string> Specify feature type in a GTF annotation. 'exon' by default.",
		" Features with the specified feature type are extracted from the",
		" annotation for processing.",
		"",
		" -g <string> Specify attribute type in GTF annotation. 'gene_id' by default.",
		" This attribute type is used to group features into meta-",
		" features.",
		""
	};
	const size_t n_lines = sizeof usage_body / sizeof usage_body[0];
	size_t line_no;

	SUBREADprintf("flattenGTF Version %s\n\n", SUBREAD_VERSION);
	for(line_no = 0; line_no < n_lines; line_no++){
		SUBREADputs(usage_body[line_no]);
	}
}
/*
 * Release the resources held by the context: destroy both hash tables and
 * close the output stream.
 *
 * returns : 0 when the output stream was closed successfully;
 *           -1 when fclose() reported an error. fclose() performs the final
 *           flush of buffered output, so a failure here means the output file
 *           may be incomplete and must not be reported as "Finished.".
 */
int flatAnno_finalise(flatAnno_context_t * context){
	int close_failed;

	HashTableDestroy(context -> gene_to_chro_strand_table);
	HashTableDestroy(context -> gene_chro_strand_to_exons_table);

	close_failed = (fclose(context -> output_FP) != 0);
	if(close_failed){
		SUBREADputs("ERROR: unable to finish writing the output file.");
		return -1;
	}

	SUBREADputs("Finished.\n");
	return 0;
}
int flatAnno_do_anno_1R(char * gene_name, char * transcript_name, char * chro_name, unsigned int start, unsigned int end, int is_negative_strand, void * Vcontext){
flatAnno_context_t * context = Vcontext;
ArrayList * chro_strand_list_for_gene = HashTableGet(context -> gene_to_chro_strand_table, gene_name);
if(NULL == chro_strand_list_for_gene){
char * mem_gene = malloc(strlen(gene_name)+1);
strcpy(mem_gene, gene_name);
chro_strand_list_for_gene = ArrayListCreate(3);
ArrayListSetDeallocationFunction(chro_strand_list_for_gene, free);
HashTablePut(context -> gene_to_chro_strand_table, mem_gene, chro_strand_list_for_gene);
}
int i, found = 0;
char chro_strand[MAX_CHROMOSOME_NAME_LEN+10+FEATURE_NAME_LENGTH];
sprintf( chro_strand, "%s\t%s\t%c", gene_name,chro_name,is_negative_strand?'-':'+');
for(i=0; i<chro_strand_list_for_gene->numOfElements; i++){
char * old_ch_st = ArrayListGet(chro_strand_list_for_gene,i);
if(strcmp(old_ch_st, chro_strand) == 0){
found=1;
break;
}
}
if(!found){
char * mem_ch_st = strdup(chro_strand);
ArrayListPush(chro_strand_list_for_gene, mem_ch_st);
}
ArrayList * ge_ch_st_exon_list = HashTableGet(context -> gene_chro_strand_to_exons_table, chro_strand);
if(NULL == ge_ch_st_exon_list){
ge_ch_st_exon_list = ArrayListCreate(3);
ArrayListSetDeallocationFunction(ge_ch_st_exon_list,free);
HashTablePut(context -> gene_chro_strand_to_exons_table, strdup(chro_strand), ge_ch_st_exon_list);
}
int * mem_start_end = malloc(sizeof(int)*2);
mem_start_end[0] = (int)start;
mem_start_end[1] = (int)end;
ArrayListPush(ge_ch_st_exon_list, mem_start_end);
return 0;
}
/*
 * Sort comparator for {start, end} int pairs: orders two pairs by their first
 * int only. Returns 1, -1 or 0 when the left value is greater than, less than
 * or equal to the right value.
 */
int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR){
	const int left_start = *(int *)vL;
	const int right_start = *(int *)vR;

	/* (a > b) - (a < b) yields exactly 1, -1 or 0 without risking overflow. */
	return (left_start > right_start) - (left_start < right_start);
}
/*
 * Hash-table iteration callback: sorts the list of {start, end} int pairs
 * stored in hashed_obj by start and merges, in place, every pair that is
 * contained in, overlaps, or directly abuts the previously kept pair.
 * Surplus pairs are freed and the list length is reduced to the number of
 * merged intervals. The key and tab arguments are not used.
 *
 * An empty list is left untouched; without this guard its length would be
 * set to 1 although no element exists.
 */
void flatAnno_do_anno_merge_one_array(void * key, void * hashed_obj, HashTable * tab){
	ArrayList * this_list = hashed_obj;
	int i, last_kept = 0;

	if(this_list -> numOfElements < 1) return;

	ArrayListSort(this_list, flatAnno_do_anno_merge_one_array_compare);

	for(i=1; i<this_list -> numOfElements; i++){
		int * kept_2i = this_list -> elementList[ last_kept ];
		int * curr_2i = this_list -> elementList[ i ];

		/* Current interval ends inside the kept one: nothing to add. */
		if(kept_2i[1] >= curr_2i[1]) continue;

		/* Overlapping or adjacent (gap of zero bases): extend the kept interval. */
		if(kept_2i[1] >= curr_2i[0] -1){
			kept_2i[1]=curr_2i[1];
			continue;
		}

		/* A gap: start a new kept interval in the next slot. The values are
		 * copied rather than the pointers swapped, so every slot keeps its
		 * own allocation and the tail can be freed below. */
		last_kept++;
		if(last_kept< i){
			kept_2i = this_list -> elementList[ last_kept ];
			kept_2i[0] = curr_2i[0];
			kept_2i[1] = curr_2i[1];
		}
	}

	/* Drop the slots that are no longer part of the merged result. */
	for(i=last_kept+1; i<this_list -> numOfElements; i++)free(this_list -> elementList[i]);
	this_list -> numOfElements = last_kept+1;
}
/* Orders two "gene<TAB>chromosome<TAB>strand" key strings with strcmp().
 * A real function of the comparator type is used because calling strcmp()
 * through a cast to an incompatible function-pointer type is undefined. */
static int flatAnno_compare_key_strings(void * vL, void * vR){
	return strcmp((const char *)vL, (const char *)vR);
}

/*
 * Merge the intervals of every gene/chromosome/strand list, then write the
 * result to context->output_FP as a tab-delimited table with the header
 * "GeneID Chr Start End Strand", one line per merged interval, with the keys
 * in strcmp() order.
 *
 * returns : 0 on success, -1 when memory for a key copy could not be allocated.
 */
int flatAnno_do_anno_merge_and_write(flatAnno_context_t * context){
	HashTable * exon_table = context -> gene_chro_strand_to_exons_table;
	int i, j, ret = 0;

	exon_table -> appendix1 = context;
	HashTableIteration(exon_table, flatAnno_do_anno_merge_one_array);

	ArrayList * all_chro_st_list = HashTableKeyArray(exon_table);
	ArrayListSort(all_chro_st_list, flatAnno_compare_key_strings);

	fprintf( context -> output_FP , "GeneID\tChr\tStart\tEnd\tStrand\n");
	for(i = 0; i< all_chro_st_list -> numOfElements; i++){
		char * ge_chro_strand = ArrayListGet(all_chro_st_list,i);

		/* Work on a private copy: the key itself must stay intact for the lookup below. */
		char * local_ge_chro_strand = strdup(ge_chro_strand);
		if(NULL == local_ge_chro_strand){
			SUBREADputs("ERROR: out of memory while writing the output file.");
			ret = -1;
			break;
		}

		/* The strand is the text after the last tab; cutting the copy at that
		 * tab leaves "gene<TAB>chromosome" as the first two output columns. */
		char * last_tab = strrchr(local_ge_chro_strand, '\t');
		if(NULL == last_tab){
			/* Not a key in the expected form; nothing sensible can be written. */
			free(local_ge_chro_strand);
			continue;
		}
		*last_tab = 0;
		char * strand_ptr = last_tab + 1;

		ArrayList * exon_in_chro_strand = HashTableGet(exon_table, ge_chro_strand);
		for(j=0; j< exon_in_chro_strand -> numOfElements; j++){
			int * start_end_2i = ArrayListGet(exon_in_chro_strand,j);
			fprintf( context -> output_FP ,"%s\t%d\t%d\t%s\n", local_ge_chro_strand, start_end_2i[0], start_end_2i[1], strand_ptr);
		}
		free(local_ge_chro_strand);
	}

	ArrayListDestroy(all_chro_st_list );
	return ret;
}
/*
 * Load the wanted features from the GTF file into the context (through the
 * flatAnno_do_anno_1R callback), then merge them and write the output table.
 *
 * returns : -1 when the loader reports a negative count or no feature at all
 *           (an error message is printed in both cases); otherwise the result
 *           of flatAnno_do_anno_merge_and_write().
 */
int flatAnno_do_anno(flatAnno_context_t * context){
	const int n_loaded = load_features_annotation(context -> GTF_file_name, FILE_TYPE_GTF, context -> GTF_gene_id_column, NULL, context -> GTF_wanted_feature_type, context, flatAnno_do_anno_1R);

	if(n_loaded < 0){
		SUBREADputs("ERROR: Unable to open the GTF file.");
		return -1;
	}

	if(n_loaded == 0){
		SUBREADprintf("ERROR: No '%s' feature was found in the GTF file. (the '%s' attribute is required)\n", context -> GTF_wanted_feature_type, context -> GTF_gene_id_column);
		return -1;
	}

	return flatAnno_do_anno_merge_and_write(context);
}
int flatAnno_start(flatAnno_context_t * context){
SUBREADputs("");