Commit 0d7e546a authored by Alexandre Mestiashvili's avatar Alexandre Mestiashvili

Merge tag 'upstream/1.5.0-p2+dfsg'

Upstream version 1.5.0-p2+dfsg
parents 5e128f57 8b7078e3
......@@ -35,9 +35,9 @@
\begin{center}
{\Huge\bf Subread/Rsubread Users Guide}\\
\vspace{1 cm}
{\centering\large Subread v1.5.0/Rsubread v1.20.2\\}
{\centering\large Subread v1.5.0-p2/Rsubread v1.20.5\\}
\vspace{1 cm}
\centering 16 December 2015\\
\centering 11 April 2016\\
\vspace{5 cm}
\Large Wei Shi and Yang Liao\\
\vspace{1 cm}
......@@ -47,7 +47,7 @@ The Walter and Eliza Hall Institute of Medical Research\\
The University of Melbourne\\
Melbourne, Australia\\}
\vspace{7 cm}
\centering Copyright \small{\copyright} 2011 - 2015\\
\centering Copyright \small{\copyright} 2011 - 2016\\
\end{center}
\end{titlepage}
......@@ -419,7 +419,7 @@ Arguments & Description \\
\hline
-b \newline (\code{color2base=TRUE}) & Output base-space reads instead of color-space reads in mapping output for color space data (eg. LifTech SOLiD data). Note that the mapping itself will still be performed at color-space.\\
\hline
-B $<int>$ \newline (\code{nBestLocations}) & Specify the maximal number of equally-best mapping locations allowed to be reported for each read. 1 by default. Allowed values are between 1 to 16 (inclusive). `NH' tag is used to indicate how many alignments are reported for the read and `HI' tag is used for numbering the alignments reported for the same read, in the output. Note that \code{-u} option takes precedence over \code{-B}.\\
-B $<int>$ \newline (\code{nBestLocations}) & Specify the maximal number of equally-best mapping locations allowed to be reported for each read. 1 by default. `NH' tag is used to indicate how many alignments are reported for the read and `HI' tag is used for numbering the alignments reported for the same read, in the output. Note that \code{-u} option takes precedence over \code{-B}.\\
\hline
-d $<int>$ \newline (\code{minFragLength}) & Specify the minimum fragment/template length, 50 by default. Note that if the two reads from the same pair do not satisfy the fragment length criteria, they will be mapped individually as if they were single-end reads.\\
\hline
......@@ -447,7 +447,7 @@ Arguments & Description \\
\hline
-S $<ff:fr:rf>$ \newline (\code{PE\_orientation}) & Specify the orientation of the two reads from the same pair. It has three possible values including `fr', `ff' and `'rf. Letter `f' denotes the forward strand and letter `r' the reverse strand. `fr' by default (ie. the first read in the pair is on the forward strand and the second read on the reverse strand).\\
\hline
$^*$ -t $<int>$ \newline (\code{type}) & Specify the type of input sequencing data. Possible values include \code{0}, denoting RNA-seq data, or \code{1}, denoting genomic DNA-seq data. For genomic DNA-seq data, the aligner takes into account both the number of matched bases and the number of mis-matched bases to determine the the best mapping location after applying the `seed-and-vote' approach for read mapping. For RNA-seq data, only the number of mis-matched bases is considered for determining the best mapping location.\\
$^*$ -t $<int>$ \newline (\code{type}) & Specify the type of input sequencing data. Possible values include \code{0}, denoting RNA-seq data, or \code{1}, denoting genomic DNA-seq data. Character values including `rna' and `dna' can also be used in the {\R} function. For genomic DNA-seq data, the aligner takes into account both the number of matched bases and the number of mis-matched bases to determine the the best mapping location after applying the `seed-and-vote' approach for read mapping. For RNA-seq data, only the number of mis-matched bases is considered for determining the best mapping location. \\
\hline
-T $<int>$ \newline (\code{nthreads}) & Specify the number of threads/CPUs used for mapping. The value should be between 1 and 32. 1 by default.\\
\hline
......@@ -841,7 +841,7 @@ input\_files \newline (\code{files}) & Give the names of input read files that i
\hline
-M \newline (\code{countMultiMappingReads}) & If specified, multi-mapping reads/fragments will be counted. A multi-mapping read will be counted up to N times if it has N reported mapping locations. The program uses the `NH' tag to find multi-mapping reads.\\
\hline
-o $<input>$ & Give the name of the output file. The output file contains the number of reads assigned to each meta-feature (or each feature if \code{-f} is specified). Note that the {\featureCounts} function in {\Rsubread} does not use this parameter. It returns a \code{list} object including read summarization results and other data. \\
-o $<output>$ & Give the name of the output file. The output file contains the number of reads assigned to each meta-feature (or each feature if \code{-f} is specified). Note that the {\featureCounts} function in {\Rsubread} does not use this parameter. It returns a \code{list} object including read summarization results and other data. \\
\hline
-O \newline (\code{allowMultiOverlap}) & If specified, reads (or fragments if \code{-p} is specified) will be allowed to be assigned to more than one matched meta-feature (or feature if \code{-f} is specified). Reads/fragments overlapping with more than one meta-feature/feature will be counted more than once. Note that when performing meta-feature level summarization, a read (or fragment) will still be counted once if it overlaps with multiple features belonging to the same meta-feature but does not overlap with other meta-features. \\
\hline
......@@ -855,15 +855,15 @@ input\_files \newline (\code{files}) & Give the names of input read files that i
\hline
-s $<int>$ \newline (\code{isStrandSpecific}) & Indicate if strand-specific read counting should be performed. It has three possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default. For paired-end reads, strand of the first read is taken as the strand of the whole fragment and FLAG field of the current read is used to tell if it is the first read in the fragment.\\
\hline
-S $<ff:fr:rf>$ \newline (\code{PE\_orientation}) & Specify the orientation of the two reads from the same pair. It has three possible values including `fr', `ff' and `'rf. Letter `f' denotes the forward strand and letter `r' the reverse strand. `fr' by default (ie. the first read in the pair is on the forward strand and the second read on the reverse strand).\\
\hline
-t $<input>$ \newline (\code{GTF.featureType}) & Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
\hline
-T $<int>$ \newline (\code{nthreads}) & Number of the threads. The value should be between 1 and 32. 1 by default.\\
\hline
-v & Output version of the program. \\
\hline
$--$countSplitAlignmentsOnly \newline (\code{countSplitAlignmentsOnly}) & If specified, only split alignments (CIGAR strings containing letter `N') will be counted. All the other alignments will be ignored. An example of split alignments is the exon-spanning reads in RNA-seq data. If exon-spanning reads need to be assigned to all their overlapping exons, `-f' and `-O' options should be provided as well.\\
$--$countSplitAlignmentsOnly \newline (\code{splitOnly}) & If specified, only split alignments (CIGAR strings contain letter `N') will be counted. All the other alignments will be ignored. An example of split alignments is the exon-spanning reads in RNA-seq data. If exon-spanning reads need to be assigned to all their overlapping exons, `-f' and `-O' options should be provided as well.\\
\hline
$--$countNonSplitAlignmentsOnly \newline (\code{nonSplitOnly}) & If specified, only non-split alignments (CIGAR strings do not contain letter `N') will be counted. All the other alignments will be ignored.\\
\hline
$--$donotsort \newline (\code{autosort}) & If specified, paired end reads will not be re-ordered even if reads from the same pair were found not to be next to each other in the input.\\
\hline
......@@ -877,7 +877,7 @@ $--$maxMOp $<int>$ \newline (\code{maxMOp}) & Specify the maximum number of `M'
\hline
$--$minOverlap $<int>$ \newline (\code{minOverlap}) & Specify the minimum required number of overlapping bases between a read (or a fragment) and an overlapping feature. 1 by default. If a negative value is provided, the read will be extended from both ends.\\
\hline
$--$primary \newline (\code{countPrimaryAlignmentsOnly}) & If specified, only primary alignments will be counted. Primary and secondary alignments are identified using bit 0x100 in the Flag field of SAM/BAM files. All primary alignments in a dataset will be counted no matter they are from multi-mapping reads or not (ie. `-M' is ignored).\\
$--$primary \newline (\code{primaryOnly}) & If specified, only primary alignments will be counted. Primary and secondary alignments are identified using bit 0x100 in the Flag field of SAM/BAM files. All primary alignments in a dataset will be counted no matter they are from multi-mapping reads or not (ie. `-M' is ignored).\\
\hline
$--$read2pos $<int>$ \newline (\code{read2pos}) & The read is reduced to its 5' most base or 3' most base. Read summarization is then performed based on the single base position to which the read is reduced. By default, no read reduction will be performed.\\
\hline
......
......@@ -21,6 +21,27 @@
#include <string.h>
#include <assert.h>
#ifdef MACOS
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#else
#include <sys/ioctl.h>
#include <net/if.h>
#include <unistd.h>
#include <netinet/in.h>
#endif
#include "subread.h"
#include "gene-algorithms.h"
#include "HelperFunctions.h"
......@@ -740,3 +761,130 @@ int strcmp_number(char * s1, char * s2)
return ret;
}
}
int mac_str(char * str_buff)
{
#ifdef FREEBSD
return 1;
#else
#ifdef MACOS
int mib[6], x1, ret = 1;
size_t len;
char *buf;
unsigned char *ptr;
struct if_msghdr *ifm;
struct sockaddr_dl *sdl;
for(x1 = 0 ; x1 < 40; x1++)
{
mib[0] = CTL_NET;
mib[1] = AF_ROUTE;
mib[2] = 0;
mib[3] = AF_LINK;
mib[4] = NET_RT_IFLIST;
mib[5] = x1;
if (sysctl(mib, 6, NULL, &len, NULL, 0) >=0) {
buf = malloc(len);
if (sysctl(mib, 6, buf, &len, NULL, 0) >=0) {
ifm = (struct if_msghdr *)buf;
sdl = (struct sockaddr_dl *)(ifm + 1);
ptr = (unsigned char *)LLADDR(sdl);
if(sdl -> sdl_nlen < 1) continue;
char * ifname = malloc(sdl -> sdl_nlen + 1);
memcpy(ifname, sdl -> sdl_data, sdl -> sdl_nlen);
ifname[sdl -> sdl_nlen] = 0;
if(ifname[0]!='e'){
free(ifname);
continue;
}
free(ifname);
sprintf(str_buff,"%02X%02X%02X%02X%02X%02X", *ptr, *(ptr+1), *(ptr+2),
*(ptr+3), *(ptr+4), *(ptr+5));
ret = 0;
break;
}
free(buf);
}
}
return ret;
#else
struct ifreq ifr;
struct ifconf ifc;
char buf[1024];
int success = 0;
int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
if (sock == -1) { /* handle error*/ };
ifc.ifc_len = sizeof(buf);
ifc.ifc_buf = buf;
if (ioctl(sock, SIOCGIFCONF, &ifc) == -1) { /* handle error */ }
struct ifreq* it = ifc.ifc_req;
const struct ifreq* const end = it + (ifc.ifc_len / sizeof(struct ifreq));
for (; it != end; ++it) {
strcpy(ifr.ifr_name, it->ifr_name);
if (ioctl(sock, SIOCGIFFLAGS, &ifr) == 0) {
if (! (ifr.ifr_flags & IFF_LOOPBACK)) { // don't count loopback
if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0) {
success = 1;
break;
}
}
}
}
unsigned char mac_address[6];
if (success){
memcpy(mac_address, ifr.ifr_hwaddr.sa_data, 6);
int x1;
for(x1 = 0; x1 < 6; x1++){
sprintf(str_buff+2*x1, "%02X",mac_address[x1]);
}
return 0;
}
return 1;
#endif
#endif
}
int rand_str(char * str_buff){
int ret = 1;
FILE * fp = fopen("/dev/urandom","r");
if(fp){
int x1;
for(x1=0; x1<6; x1++){
sprintf(str_buff + 2*x1 , "%02X", fgetc(fp));
}
fclose(fp);
ret = 0;
}
return ret;
}
int mathrand_str(char * str_buff){
srand((int)(miltime()*100));
int x1;
for(x1 = 0; x1 < 6; x1++){
sprintf(str_buff+2*x1, "%02X", rand() & 0xff );
}
return 0;
}
int mac_or_rand_str(char * str_buff){
return mac_str(str_buff) && rand_str(str_buff) && mathrand_str(str_buff);
}
......@@ -68,4 +68,5 @@ int strcmp_number(char * s1, char * s2);
unsigned int reverse_cigar(unsigned int pos, char * cigar, char * new_cigar);
unsigned int find_left_end_cigar(unsigned int right_pos, char * cigar);
int mac_or_rand_str(char * char_14);
#endif
......@@ -8,7 +8,7 @@ all:
@echo " For building subread in FreeBSD, please run ' gmake -f Makefile.FreeBSD '."
@echo " For building subread in Oracle Solaris or OpenSolaris, please run ' gmake -f Makefile.SunOS '."
@echo
@echo " The default compiler is gcc; you may change the compailer by editing the makefile."
@echo " The default compiler is gcc; you may change it by editing the Makefiles for platforms."
@echo
@echo " The generated executables are saved to directory (PACKAGE_DIR)/bin"
@echo
MACOS = -D MACOS
#MACOS = -D MACOS
include makefile.version
CCFLAGS = -mtune=core2 ${MACOS} -O9 -Wall -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64
#CCFLAGS = -D_FORTIFY_SOURCE=2 -mtune=core2 ${MACOS} -O2 -Wall -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" # -DREPORT_ALL_THE_BEST
LDFLAGS = ${STATIC_MAKE} -lpthread -lz -lm ${MACOS} -O9 -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
OPT_LEVEL = 9
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -Wall -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64
LDFLAGS = ${STATIC_MAKE} -lpthread -lz -lm ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
CC = gcc ${CCFLAGS} -ggdb -fomit-frame-pointer -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0
......
......@@ -32,6 +32,7 @@
#include <getopt.h>
#include "subread.h"
#include "hashtable.h"
#include "HelperFunctions.h"
#include "gene-algorithms.h"
#include "SNPCalling.h"
#include "input-files.h"
......@@ -506,8 +507,8 @@ void fishers_test_on_block(struct SNP_Calling_Parameters * parameters, float * s
flanking_matched = d;
}
//SUBREADprintf("TEST: %u a,b,c,d=%d %d %d %d; FU=%d FM=%d; Goahead=%d; Tailleft=%d\n", i,a,b,c,d, flanking_unmatched, flanking_matched, go_ahead, left_tail);
float p_middle = fisher_exact_test(a, flanking_unmatched, c, flanking_matched);
//SUBREADprintf("TEST: %u a,b,c,d=%d %d %d %d; FU=%d FM=%d; Goahead=%d; Tailleft=%d; p=%G; p-cut=%G\n", i,a,b,c,d, flanking_unmatched, flanking_matched, go_ahead, left_tail, p_middle, p_cutoff);
if(all_result_needed || ( p_middle < p_cutoff && flanking_matched*20>(flanking_matched+ flanking_unmatched )*16))
snp_fisher_raw [i] = p_middle;
else snp_fisher_raw [i] = -999;
......@@ -842,6 +843,8 @@ int process_snp_votes(FILE *out_fp, unsigned int offset , unsigned int reference
}
}
}
// SUBREADprintf("OUTTEXT: i=%d, all_reads=%d, Fisher-p=%f, SNPs=%d, POI-MM=%d\n", i, all_reads, snp_fisher_raw[i], snps, POI_MM);
if(snps && POI_MM *1. / all_reads >= parameters->supporting_read_rate )
{
if(snp_fisher_raw[i] >= 0. || parameters -> fisher_exact_testlen < 1)
......@@ -1008,8 +1011,8 @@ int run_chromosome_search(FILE *in_fp, FILE * out_fp, char * chro_name , char *
if( (*task_no) % all_threads == thread_no && all_offset <= chro_len)
{
//#warning "=== ONLY TEST ONE BLOCK ==="
//if(strcmp(chro_name,"chr7")==0 && all_offset == 45000000){
//#warning "=== ONLY TEST ONE BLOCK , USE 'if(1)' IN RELEASE ==="
//if(strcmp(chro_name,"chr7")==0 && all_offset == 60000000){
if(1){
process_snp_votes(out_fp, all_offset, offset, referenced_base, chro_name , temp_prefix, parameters);
print_in_box(89,0,0,"processed block %c[36m%s@%d%c[0m by thread %d/%d [block number=%d/%d]", CHAR_ESC, chro_name, all_offset, CHAR_ESC , thread_no+1, all_threads, 1+(*task_no)-parameters->empty_blocks, parameters->all_blocks);
......@@ -1018,7 +1021,7 @@ int run_chromosome_search(FILE *in_fp, FILE * out_fp, char * chro_name , char *
else if((*task_no) % all_threads == thread_no)
{
print_in_box(80,0,0,"Ignored in: %s@%d by thr %d/%d [tid=%d]\n", chro_name, all_offset, thread_no, all_threads, *task_no);
SUBREADprintf("LEN %u > %u\n", all_offset, chro_len);
// SUBREADprintf("LEN %u > %u\n", all_offset, chro_len);
}
offset = 0;
(*task_no)++;
......@@ -1382,11 +1385,14 @@ int SNP_calling(char * in_SAM_file, char * out_BED_file, char * in_FASTA_file, c
HashTableSetKeyComparisonFunction(parameters-> cigar_event_table, my_strcmp);
memcpy(rand48_seed, &start_time, 6);
srand(time(NULL));
if(temp_location)
strcpy(temp_file_prefix, temp_location);
else
sprintf(temp_file_prefix, "./temp-snps-%06u-%08X-", getpid(), rand());
else{
char mac_rand[13];
mac_or_rand_str(mac_rand);
sprintf(temp_file_prefix, "./temp-snps-%06u-%s-", getpid(), mac_rand);
}
_EXSNP_SNP_delete_temp_prefix = temp_file_prefix;
print_in_box(89,0,0,"Split %s file into %c[36m%s*%c[0m ..." , parameters -> is_BAM_file_input?"BAM":"SAM" , CHAR_ESC, temp_file_prefix, CHAR_ESC);
......
......@@ -86,21 +86,23 @@ int init_bigtable_results(global_context_t * global_context, int is_rewinding)
global_context -> bigtable_chunked_fragments = global_context -> config.reads_per_chunk+1;
global_context -> bigtable_cache_size = global_context -> bigtable_chunked_fragments * (1+global_context -> input_reads.is_paired_end_reads);
} else {
global_context -> bigtable_chunked_fragments = 300000 - 260000;
global_context -> bigtable_cache_size = global_context -> config.all_threads * global_context -> bigtable_chunked_fragments * (1+global_context -> input_reads.is_paired_end_reads);
global_context -> bigtable_chunked_fragments = 110llu*1024*1024;
global_context -> bigtable_cache_size = (1+0*global_context -> config.all_threads) * global_context -> bigtable_chunked_fragments * (1+global_context -> input_reads.is_paired_end_reads);
}
//SUBREADprintf("reads_per_chunk = %u ; cached_single_reads = %u ; size of each read = %d + %d\n", global_context -> config.reads_per_chunk, global_context -> bigtable_cache_size, sizeof(mapping_result_t) , sizeof(subjunc_result_t));
if(!is_rewinding)
if(!is_rewinding){
global_context -> bigtable_cache = malloc(sizeof(bigtable_cached_result_t) * global_context -> bigtable_cache_size);
}
int xk1;
for(xk1 = 0; xk1 < global_context -> bigtable_cache_size; xk1++){
if(!is_rewinding)
if(!is_rewinding){
global_context -> bigtable_cache [xk1].alignment_res = malloc(sizeof(mapping_result_t) * global_context -> config.multi_best_reads);
assert(global_context -> bigtable_cache [xk1].alignment_res!=NULL);
}
if(global_context -> config.use_memory_buffer)
{
......@@ -134,6 +136,7 @@ int init_bigtable_results(global_context_t * global_context, int is_rewinding)
return 0;
}
mapping_result_t * _global_retrieve_alignment_ptr(global_context_t * global_context, subread_read_number_t pair_number, int is_second_read, int best_read_id){
mapping_result_t * ret;
bigtable_retrieve_result(global_context, NULL, pair_number, best_read_id, is_second_read, &ret, NULL);
......@@ -191,8 +194,13 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
long long load_start_pair_no = inner_pair_number - inner_pair_number % global_context -> bigtable_chunked_fragments;
if(global_context -> bigtable_cache_file_fp){
//SUBREADprintf("MARK_OCCPY=%lld BY THREAD %d\n", pair_number, thread_context ? thread_context -> thread_id : -1);
if(global_context -> bigtable_cache_file_loaded_fragments_begin == -1 || inner_pair_number >= global_context -> bigtable_cache_file_loaded_fragments_begin + global_context -> bigtable_chunked_fragments || inner_pair_number < global_context -> bigtable_cache_file_loaded_fragments_begin)
{
SUBREADprintf("THREAD # %d WAITING FOR %llu for RETRIEVE %llu\n", thread_context? thread_context -> thread_id:-1, global_context -> bigtable_cache_file_loaded_fragments_begin, pair_number);
wait_occupied(global_context, global_context -> bigtable_cache_file_loaded_fragments_begin);
}
......@@ -238,7 +246,7 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
if(0 && xk1 < 10)
{
//SUBREADprintf("CACHEP_211: %p (%d from %llu)\n", current_cache, xk1, pair_number);
SUBREADprintf("NENSET_211: %p\n", current_cache -> alignment_res);
SUBREADprintf("NENSET_211: %p + %d\n", current_cache -> alignment_res, xk1 * (1+global_context -> input_reads.is_paired_end_reads) + xk2);
}
memset( current_cache -> alignment_res , 0, sizeof(mapping_result_t) * global_context -> config.multi_best_reads);
......@@ -259,8 +267,9 @@ bigtable_cached_result_t * bigtable_retrieve_cache(global_context_t * global_con
bigtable_unlock(global_context);
}
if(global_context -> bigtable_cache_file_fp)
if(global_context -> bigtable_cache_file_fp){
global_context -> bigtable_cache[inner_pair_number - load_start_pair_no].status = CACHE_STATUS_OCCUPIED;
}
bigtable_cached_result_t * ret_cache = global_context -> bigtable_cache + (inner_pair_number - load_start_pair_no)* (1+global_context -> input_reads.is_paired_end_reads) + is_second_read;
return ret_cache;
......@@ -285,15 +294,17 @@ unsigned short * _global_retrieve_big_margin_ptr(global_context_t * global_conte
}
// do nothing : the data is automatically saved into the temporary file when using mmap.
void bigtable_release_result(global_context_t * global_context , thread_context_t * thread_context , subread_read_number_t pair_number, int commit_change){
long long inner_pair_number = get_inner_pair(global_context, pair_number);
long long load_start_pair_no = inner_pair_number - inner_pair_number % global_context -> bigtable_chunked_fragments;
if(global_context -> bigtable_cache_file_fp)
if(global_context -> bigtable_cache_file_fp){
long long inner_pair_number = get_inner_pair(global_context, pair_number);
long long load_start_pair_no = inner_pair_number - inner_pair_number % global_context -> bigtable_chunked_fragments;
//SUBREADprintf("REAL RELEASE:%lld\n", inner_pair_number - load_start_pair_no);
global_context -> bigtable_cache[inner_pair_number - load_start_pair_no].status = CACHE_STATUS_RELEASED;
//SUBREADprintf("OCCUPYXX0 = %d\n", global_context -> bigtable_cache[0].status);
if(commit_change){
global_context -> bigtable_dirty_data=1;
if(commit_change){
global_context -> bigtable_dirty_data=1;
}
}
}
......@@ -479,8 +490,11 @@ void fraglist_destroy(fragment_list_t * list){
void fraglist_append(fragment_list_t * list, subread_read_number_t fragment_number){
if(list -> fragments >= list -> capacity){
//#warning "============== COMMENT DEBUG INFO ====================="
//SUBREADprintf("REALLOC_PRT_IN = %d , %p\n",list -> capacity , list -> fragment_numbers );
list -> capacity = max(list -> capacity + 5, list -> capacity * 1.3);
list -> fragment_numbers = realloc(list -> fragment_numbers, sizeof(subread_read_number_t) * list -> capacity);
// SUBREADprintf("REALLOC_PRT_OUT = %d , %p\n",list -> capacity , list -> fragment_numbers );
}
list -> fragment_numbers[ list -> fragments ++ ] = fragment_number;
......
......@@ -25,6 +25,7 @@
#include <assert.h>
#include <unistd.h>
#include "hashtable.h"
#include "HelperFunctions.h"
#include "gene-value-index.h"
#include "gene-algorithms.h"
#include "input-files.h"
......@@ -1981,7 +1982,7 @@ int write_local_reassembly(global_context_t *global_context, HashTable *pileup_f
if(0 == locate_gene_position_max(anchor_pos, &global_context -> chromosome_table, &chro_name, &chro_offset, read_len))
if(0 == locate_gene_position_max(anchor_pos, &global_context -> chromosome_table, &chro_name, &chro_offset, NULL, NULL, read_len))
{
char temp_file_name[MAX_FILE_NAME_LENGTH];
int close_now = 0;
......@@ -3654,8 +3655,8 @@ int finalise_pileup_file_by_voting(global_context_t * global_context , char * te
char * chro_begin;
int chro_offset_start = 0;
int chro_offset_end = 0;
locate_gene_position_max(contig_start_pos + head_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_start, 0);
locate_gene_position_max(contig_end_pos - tail_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_end, 0);
locate_gene_position_max(contig_start_pos + head_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_start, NULL, NULL, 0);
locate_gene_position_max(contig_end_pos - tail_removed_bases ,& global_context -> chromosome_table, &chro_begin, &chro_offset_end, NULL, NULL, 0);
if(full_rebuilt_window_size - read_position_cursor - tail_removed_bases)
fprintf(global_context -> long_insertion_FASTA_fp, ">%s-%u-%u-%s%dM\n", chro_name, chro_offset_start, chro_offset_end - 1, contig_CIGAR, full_rebuilt_window_size - read_position_cursor - tail_removed_bases);
else
......@@ -4003,9 +4004,6 @@ void init_global_context(global_context_t * context)
context->config.show_soft_cliping = 1 ;
context->config.big_margin_record_size = 9;
//#warning "====== FOR HIGH JUNCTION ACCURACT ==========="
context->config.big_margin_record_size = 15;
context->config.read_group_id[0] = 0;
context->config.read_group_txt[0] = 0;
context->config.first_read_file[0] = 0;
......@@ -4033,7 +4031,10 @@ void init_global_context(global_context_t * context)
memcpy(seed_rand, &double_time, 2*sizeof(int));
srand(seed_rand[0]^seed_rand[1]);
sprintf(context->config.temp_file_prefix, "./core-temp-sum-%06u-%06u", getpid(),rand());
char mac_rand[13];
mac_or_rand_str(mac_rand);
sprintf(context->config.temp_file_prefix, "./core-temp-sum-%06u-%s", getpid(), mac_rand );
_COREMAIN_delete_temp_prefix = context->config.temp_file_prefix;
......@@ -4060,6 +4061,7 @@ void init_global_context(global_context_t * context)
int CORE_DPALIGN_MATCH_SCORE = 2;
int CORE_DPALIGN_MISMATCH_PENALTY = 0;
int core_dynamic_align(global_context_t * global_context, thread_context_t * thread_context, char * read, int read_len, unsigned int begin_position, char * movement_buffer, int expected_offset, char * read_name)
// read must be converted to the positive strand.
// movement buffer: 0:match, 1: read-insert, 2: gene-insert, 3:mismatch
......@@ -4081,6 +4083,9 @@ int core_dynamic_align(global_context_t * global_context, thread_context_t * thr
gene_value_index_t * current_value_index = thread_context?thread_context->current_value_index:global_context->current_value_index;
indel_context_t * indel_context = (indel_context_t*)global_context -> module_contexts[MODULE_INDEL_ID];
//unsigned long long table_ptr = (unsigned long long) indel_context -> dynamic_align_table;
short ** table = indel_context -> dynamic_align_table;
char ** table_mask = indel_context -> dynamic_align_table_mask;
if(thread_context)
......
......@@ -467,6 +467,8 @@ int parse_opts_aligner(int argc , char ** argv, global_context_t * global_contex
else if(strcmp("SVdetection", long_options[option_index].name)==0)
{
global_context -> config.do_structural_variance_detection = 1;
global_context -> config.use_memory_buffer = 1;
global_context -> config.reads_per_chunk = 600llu*1024*1024;
}
else if(strcmp("maxRealignLocations", long_options[option_index].name)==0)
{
......
......@@ -60,7 +60,6 @@ static struct option long_options[] =
{"maxRealignLocations", required_argument, 0, 0},
{"minVoteCutoff", required_argument, 0, 0},
{"minMappedFraction", required_argument, 0, 0},
{"disableBigMargin", no_argument, 0, 0},
{"complexIndels", no_argument, 0, 0},
{0, 0, 0, 0}
};
......@@ -482,6 +481,8 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
else if(strcmp("SVdetection", long_options[option_index].name)==0)
{
global_context -> config.do_structural_variance_detection = 1;
global_context -> config.use_memory_buffer = 1;
global_context -> config.reads_per_chunk = 300llu*1024*1024;
}
else if(strcmp("minDistanceBetweenVariants", long_options[option_index].name)==0)
{
......@@ -540,6 +541,23 @@ int parse_opts_subjunc(int argc , char ** argv, global_context_t * global_contex
if(global_context->config.is_SAM_file_input) global_context->config.phred_score_format = FASTQ_PHRED33;
global_context->config.more_accurate_fusions = global_context->config.more_accurate_fusions && global_context->config.do_fusion_detection;
if(global_context->config.more_accurate_fusions)
{
global_context->config.high_quality_base_threshold = 999999;
//#warning "============ REMOVE THE NEXT LINE ======================"
global_context->config.show_soft_cliping = 1;
//#warning "============ REMOVE ' + 3' FROM NEXT LINE =============="
global_context->config.max_mismatch_junction_reads = 0 + 3;
//#warning "============ REMOVE ' - 1' FROM NEXT LINE =============="
global_context->config.do_big_margin_filtering_for_junctions = 1 - 1;
global_context->config.total_subreads = 28;
//#warning "============ REMOVE THE NEXT LINE BEFORE RELEASE ==============="
//global_context->config.multi_best_reads = 1;
}
return 0;
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -371,7 +371,7 @@ typedef struct{
short best_second_diff_bases;
short realign_flags;
short final_quality;
short hamming_matched;
short chromosomal_length;
} realignment_result_t;
......@@ -623,6 +623,7 @@ int chimeric_cigar_parts(global_context_t * global_context , unsigned int sel_po
void warning_file_limit();
void quick_sort(void * arr,int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
void basic_sort(void * arr,int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
// L_Minus_R should return -1, 0 or 1 when L<R, L==R or L>R.
// The result is from Small to Large.
......
......@@ -185,7 +185,7 @@ int covCalc()
} else {
SUBREADprintf("%s:%s %u [%s] :: %u-%u\n", line_buffer , chro, pos, cigar_str, Staring_Points[x1], Staring_Points[x1]+Section_Lengths[x1]);
SUBREADprintf("Read %s overhangs the boundary of chromosome %s (%u >= %u)\n", line_buffer, chro, x2, chrlen);
exit(-1);
//exit(-1);
}
}
}
......@@ -279,7 +279,7 @@ int cov_calc_main(int argc, char ** argv)
return 0;
}
int is_bam = is_certainly_bam_file(input_file_name, NULL);
int is_bam = is_certainly_bam_file(input_file_name, NULL, NULL);
if(1==is_bam) is_BAM_input = 1;
else if(is_bam < 0)
......
......@@ -438,7 +438,7 @@ unsigned int get_gene_linear(int chrono, int offset, const unsigned int offsets
}
int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int rl)
int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int * head_cut_length, int * tail_cut_length, int rl)
{
int n = 0;
......@@ -462,13 +462,27 @@ int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets ,
//SUBREADprintf("max=%u <= lim=%u : ACCEPTED.\n", rl + linear , offsets->read_offsets[n] + 16);
// the end of the read should not excess the end of the chromosome
if(rl + linear > offsets->read_offsets[n] + 16) return 1;
if(tail_cut_length == NULL){
if(rl + linear > offsets->read_offsets[n] + 15 - offsets -> padding) return 1;
} else {
(*tail_cut_length) = linear + rl - ( offsets->read_offsets[n] + 15 - offsets -> padding);
if( (*tail_cut_length) >= rl )return 1;
}
if (n==0)
*pos = linear;
else
*pos = linear - offsets->read_offsets[n-1];
if( (*pos) < offsets -> padding ) {
if(head_cut_length == NULL || (*pos) + rl <= offsets -> padding){
return 1;
}else{
(*head_cut_length) = offsets -> padding - (*pos);
(*pos) = offsets -> padding;
}
}
(*pos) -= offsets -> padding;
*chro_name = (char *)offsets->read_names+n*MAX_CHROMOSOME_NAME_LEN;
......@@ -483,7 +497,7 @@ int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets ,
int locate_gene_position(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos)
{
return locate_gene_position_max(linear, offsets, chro_name, pos, 0);
return locate_gene_position_max(linear, offsets, chro_name, pos, NULL, NULL, 0);
}
#define _index_vote(key) (((unsigned int)key)%GENE_VOTE_TABLE_SIZE)
......
......@@ -33,7 +33,7 @@ void destroy_offsets(gene_offset_t* offsets);
// Return 0 if the linear position is in a reasonable range or -1 if it is out of range.
// The pointer to the name of the chromosome is put into chro_name, and the position in this chromosome is in pos.
int locate_gene_position(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos);
int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int rl);
int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int * head_cut, int * tail_cut, int rl);
unsigned int linear_gene_position(const gene_offset_t* offsets , char *chro_name, unsigned int chro_pos);
......
......@@ -59,10 +59,13 @@ int gvindex_get(gene_value_index_t * index, gehash_data_t offset)
unsigned int offset_byte, offset_bit;
gvindex_baseno2offset_m(offset, index , offset_byte, offset_bit);
if(offset_byte >= index-> values_bytes)return 'N';
if(offset_byte >= index-> values_bytes -1)return 'N';
unsigned int one_base_value = (index->values [offset_byte]) >> (offset_bit);
//SUBREADprintf("RECV_BASE=%d (%d - %d)\n",one_base_value & 3, offset_byte , offset_bit);
return int2base(one_base_value & 3);
}
......@@ -92,7 +95,7 @@ int gvindex_match(gene_value_index_t * index, gehash_data_t offset, gehash_key_t
}
void gvindex_set (gene_value_index_t * index, gehash_data_t offset, gehash_key_t base_values, int padding)
void gvindex_set(gene_value_index_t * index, gehash_data_t offset, gehash_key_t base_values, int padding)
{
unsigned int offset_byte, offset_bit;
gvindex_baseno2offset(offset, index , &offset_byte, &offset_bit);
......@@ -112,6 +115,7 @@ void gvindex_set (gene_value_index_t * index, gehash_data_t offset, gehash_key_t
index->values [offset_byte] &= mask;
index->values [offset_byte] |= ((base_values >> (30 - i*2))&0x03) << (offset_bit);
//SUBREADprintf("PUT_BASE=%d (%d - %d)\n", (base_values >> (30 - i*2))&0x03, offset_byte, offset_bit);
offset_bit +=2;
if(offset_bit >=8)
{
......@@ -133,7 +137,7 @@ void gvindex_dump(gene_value_index_t * index, const char filename [])
unsigned int useful_bytes, useful_bits;
gvindex_baseno2offset (index -> length+ index -> start_point, index,&useful_bytes,&useful_bits);
fwrite(index->values, 1, useful_bytes, fp);
fwrite(index->values, 1, useful_bytes+1, fp);
fclose(fp);
}
......@@ -153,8 +157,8 @@ int gvindex_load(gene_value_index_t * index, const char filename [])
unsigned int useful_bytes, useful_bits;
index -> start_base_offset = index -> start_point - index -> start_point%4;
gvindex_baseno2offset (index -> length+ index -> start_point, index ,&useful_bytes,&useful_bits);
index -> values = malloc(useful_bytes);
index -> values_bytes = useful_bytes;
index -> values = malloc(useful_bytes+1);
index -> values_bytes = useful_bytes+1;
if(!index->values)
{
SUBREADputs(MESSAGE_OUT_OF_MEMORY);
......@@ -162,7 +166,7 @@ int gvindex_load(gene_value_index_t * index, const char filename [])
}
read_length =fread(index->values, 1, useful_bytes, fp);
read_length =fread(index->values, 1, useful_bytes+1, fp);
assert(read_length>0);
fclose(fp);
......
......@@ -261,6 +261,7 @@ int GRA_check_config (GRA_global_context_t * global_context)
void GRA_init_context(GRA_global_context_t * global_context)
{
char mac_rand[13];
FILE * system_random_fp;
memset(global_context, 0 , sizeof(GRA_global_context_t));
......@@ -269,7 +270,9 @@ void GRA_init_context(GRA_global_context_t * global_context)