Commits on Source (4)
language: python
python:
- "3.8"
- "3.7"
- "3.6"
- "3.5"
install:
- pip install --upgrade pip
- pip install --upgrade -r requirements.txt
......
2019-12-12 Tao Liu <vladimir.liu@gmail.com>
MACS version 2.2.6
* New Features
1) Speed up MACS2 through programming tricks and code cleanup. The
filter_dup function replaces separate_dups; the latter was
implemented so that duplicate reads could potentially be put back
in certain downstream analyses, but no such analysis was ever
implemented. Optimize the speed of writing bedGraph
files. Optimize BAM and BAMPE parsing with pointer casting instead
of Python unpack (see the illustrative sketch after this entry).
2) Comment lines in the headers of BED or SAM files are now
correctly skipped. However, MACS2 won't check for comment lines in
the middle of a file.
* Bugs fixed
1) Cutoff-analysis in callpeak command. #341
2) Issues related to SAMParser and three ELAND Parsers are
fixed. #347
* Other
1) The cmdlinetest script in the test/ folder has been updated to:
1. test cutoff-analysis with the callpeak cmd; 2. output the 2
lines before and after each error or warning message during tests;
3. output only the first 10 lines of any difference found between
a test result and the standard result; 4. have prockreport monitor
CPU time and memory usage at 1-second intervals -- a bit more
accurate.
2) Python3.5 support is removed. Now MACS2 requires Python>=3.6.
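As a rough illustration of the parsing speed-up described in item 1) above: unpacking fields one record at a time through Python's struct module builds a tuple per call, while reinterpreting the whole buffer at once avoids that per-record overhead. MACS2's BAM/BAMPE parsers do this with C pointer casts in Cython; the numpy.frombuffer analogue below is only a hypothetical pure-Python sketch of the idea, not the actual parser code.

```python
import struct
import numpy as np

# a hypothetical record buffer of eight little-endian int32 fields
raw = struct.pack("<8i", 1, 2, 3, 4, 5, 6, 7, 8)

# old style: python-level unpack, building a tuple of Python ints
vals_unpack = struct.unpack("<8i", raw)

# new style: reinterpret the bytes in bulk, no per-field Python objects
vals_cast = np.frombuffer(raw, dtype="<i4")

assert list(vals_unpack) == vals_cast.tolist()
```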
2019-10-31 Tao Liu <vladimir.liu@gmail.com>
MACS version 2.2.5 (Py3 speed up)
......
# Official MACS2 v2.2.5 docker
# Official MACS2 v2.2.6 docker
MACS2 is a bioinformatics algorithm to analyze ChIP-seq datasets.
......
# INSTALL Guide For MACS
Time-stamp: <2019-10-03 11:56:03 taoliu>
Time-stamp: <2019-12-12 13:26:11 taoliu>
Please check the following instructions to complete your installation.
## Prerequisites
MACS v2.2.x requires Python3. We have tested MACS in Python3.5, 3.6
and 3.7. MACS runs slower under Python3.5, so Python3.6 or Python3.7
is recommended.
MACS v2.2.x requires Python3. We have tested MACS in Python3.6, 3.7
and 3.8.
MACS also requires [Numpy](http://www.scipy.org/Download) (>=1.17).
......
MACS_VERSION = "2.2.5"
MACS_VERSION = "2.2.6"
FILTERDUP_VERSION = "1.0.0 20140616"
RANDSAMPLE_VERSION = "1.0.0 20120703"
MAX_PAIRNUM = 1000
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:33:01 taoliu>
# cython: linetrace=True
# Time-stamp: <2019-11-06 13:00:01 taoliu>
"""Module for Calculate Scores.
......@@ -40,28 +41,53 @@ from MACS2.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname
from MACS2.IO.FixWidthTrack import FWTrack
from MACS2.IO.PairedEndTrack import PETrackI
from MACS2.Statistics import P_Score_Upper_Tail, LogLR_Asym # pure C code for calculating p-value scores/logLR of Poisson
#
pscore_table = P_Score_Upper_Tail() # this table will cache pscore being calculated.
get_pscore = pscore_table.get_pscore
logLR_table = LogLR_Asym() # this table will cache logLR being calculated.
get_logLR_asym = logLR_table.get_logLR_asym
# --------------------------------------------
# cached pscore function and LR_asym functions
# --------------------------------------------
from MACS2.Prob import poisson_cdf
pscore_dict = dict()
logLR_dict = dict()
#cdef float get_pscore ( int x, float l ):
cdef float get_pscore ( tuple x ):
cdef:
float val
if x in pscore_dict:
return pscore_dict [ x ]
else:
# calculate and cache
val = -1 * poisson_cdf ( x[0], x[1], False, True )
pscore_dict[ x ] = val
return val
cdef float get_logLR_asym ( float x, float y ):
cdef:
float val
if ( x, y ) in logLR_dict:
return logLR_dict[ ( x, y ) ]
else:
# calculate and cache
if x > y:
val = (x*(log10(x)-log10(y))+y-x)
elif x < y:
val = (x*(-log10(x)+log10(y))-y+x)
else:
val = 0
logLR_dict[ ( x, y ) ] = val
return val
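The module-level dicts above memoize each score on its argument tuple, so every distinct (treatment, control) pair is computed once per run; get_logLR_asym caches the same way on (x, y). A minimal pure-Python sketch of the same pattern, using scipy.stats.poisson as a stand-in for MACS2's own MACS2.Prob.poisson_cdf:

```python
import math
from scipy.stats import poisson  # stand-in for MACS2.Prob.poisson_cdf

pscore_cache = {}

def get_pscore(key):
    """Cached -log10 Poisson upper-tail p-value for key = (observed, lam),
    mirroring the tuple-keyed dict caching above."""
    if key in pscore_cache:
        return pscore_cache[key]
    x, lam = key
    # P(X >= x) for X ~ Poisson(lam); logsf is the log survival function
    val = -poisson.logsf(x - 1, lam) / math.log(10)
    pscore_cache[key] = val
    return val

print(get_pscore((10, 2.0)))  # computed and cached
print(get_pscore((10, 2.0)))  # served from the cache
```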
# ------------------------------------
# constants
# ------------------------------------
__version__ = "scoreCalculate $Revision$"
__version__ = "CallPeakUnit $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "scoreTrackI classes"
__doc__ = "CallPeakUnit"
# ------------------------------------
# Misc functions
# ------------------------------------
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline int int_min(int a, int b): return a if a <= b else b
def do_nothing(*args, **kwargs):
pass
LOG10_E = 0.43429448190325176
cdef void clean_up_ndarray ( np.ndarray x ):
......@@ -243,7 +269,7 @@ cdef tuple find_optimal_cutoff( list x, list y ):
for i in range( 10, l ):
# at least the largest 10 points
m, c = np.linalg.lstsq( npA[:i], npy[:i] )[ 0 ]
m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ]
sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 )
sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 )
rsq = 1 - sse/sst
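For context, np.linalg.lstsq historically warned about its default rcond; passing rcond=None opts into the new default and silences the FutureWarning without changing the fit. A standalone sketch of the straight-line fit and R-squared computation used here, with made-up points:

```python
import numpy as np

# made-up cutoff (x) vs total-length (y) points, for illustration only
npx = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
npy = np.array([10.2, 8.1, 6.0, 3.9, 2.1])

# design matrix [x, 1] for the straight-line model y = m*x + c
npA = np.vstack([npx, np.ones(len(npx))]).T

# rcond=None silences the FutureWarning about the default cutoff
m, c = np.linalg.lstsq(npA, npy, rcond=None)[0]

sst = ((npy - npy.mean()) ** 2).sum()   # total sum of squares
sse = ((npy - m * npx - c) ** 2).sum()  # residual sum of squares
rsq = 1 - sse / sst                     # coefficient of determination
print(m, c, rsq)
```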
......@@ -283,7 +309,7 @@ cdef class CallerFromAlignments:
bool no_lambda_flag # whether ignore local bias, and to use global bias instead
bool PE_mode # whether it's in PE mode, will be detected during initiation
# temporary data buffer
bytes chrom # name of current chromosome
#bytes chrom # name of current chromosome
list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
bytes bedGraph_treat_filename
bytes bedGraph_control_filename
......@@ -292,7 +318,7 @@ cdef class CallerFromAlignments:
#object bedGraph_treat # file handler to write ChIP pileup
#object bedGraph_ctrl # file handler to write Control pileup
# data needed to be pre-computed before peak calling
object pqtable # remember pvalue->qvalue conversion
dict pqtable # remember pvalue->qvalue conversion
bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value.
dict pvalue_npeaks # record for each pvalue cutoff, how many peaks can be called
......@@ -469,12 +495,6 @@ cdef class CallerFromAlignments:
clean_up_ndarray( self.chr_pos_treat_ctrl[0] )
clean_up_ndarray( self.chr_pos_treat_ctrl[1] )
clean_up_ndarray( self.chr_pos_treat_ctrl[2] )
#self.chr_pos_treat_ctrl[0].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[1].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[2].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[0].resize(0,refcheck=False)
#self.chr_pos_treat_ctrl[1].resize(0,refcheck=False)
#self.chr_pos_treat_ctrl[2].resize(0,refcheck=False)
if self.PE_mode:
treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 )
......@@ -623,7 +643,7 @@ cdef class CallerFromAlignments:
#ptr += 1
return s
cdef object __cal_pvalue_qvalue_table ( self ):
cdef void __cal_pvalue_qvalue_table ( self ):
"""After this function is called, self.pqtable is built. All
chromosomes will be iterated. So it will take some time.
......@@ -631,7 +651,7 @@ cdef class CallerFromAlignments:
cdef:
bytes chrom
np.ndarray pos_array, treat_array, ctrl_array, score_array
dict pvalue_stat = {}
dict pvalue_stat
long n, pre_p, length, j, pre_l, l, i
float this_v, pre_v, v, q, pre_q
long N, k, this_l
......@@ -646,6 +666,7 @@ cdef class CallerFromAlignments:
logging.debug ( "Start to calculate pvalue stat..." )
pvalue_stat = dict()
for i in range( len( self.chromosomes ) ):
chrom = self.chromosomes[ i ]
pre_p = 0
......@@ -658,7 +679,8 @@ cdef class CallerFromAlignments:
ctrl_value_ptr = <float32_t *> ctrl_array.data
for j in range(pos_array.shape[0]):
this_v = get_pscore( int(treat_value_ptr[0]), ctrl_value_ptr[0] )
#this_v = get_pscore( int(treat_value_ptr[0]), ctrl_value_ptr[0] )
this_v = get_pscore( (int(treat_value_ptr[0]), ctrl_value_ptr[0] ) )
this_l = pos_ptr[0] - pre_p
if this_v in pvalue_stat:
......@@ -669,7 +691,7 @@ cdef class CallerFromAlignments:
pos_ptr += 1
treat_value_ptr += 1
ctrl_value_ptr += 1
nhcal += pos_array.shape[0]
#logging.debug ( "make pvalue_stat cost %.5f seconds" % t )
......@@ -685,7 +707,6 @@ cdef class CallerFromAlignments:
pre_q = 2147483647 # save the previous q-value
self.pqtable = {}
#self.pqtable = Float64HashTable()
unique_values = sorted(list(pvalue_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
for i in range(len(unique_values)):
v = unique_values[i]
......@@ -695,15 +716,15 @@ cdef class CallerFromAlignments:
self.pqtable[ v ] = q
pre_v = v
pre_q = q
k+=l
k += l
nhcal += 1
logging.debug( "access pq hash for %d times" % nhcal )
return self.pqtable
return
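This table converts each p-score into a q-score with the Benjamini-Hochberg procedure, weighting every score by the number of basepairs that carry it; the 2147483647 sentinel above seeds the monotonicity check. A pure-Python sketch of the conversion, assuming pvalue_stat maps -log10 p-scores to basepair counts:

```python
from math import log10

def make_pq_table(pvalue_stat):
    """Benjamini-Hochberg conversion of p-scores to q-scores, both on
    the -log10 scale, weighted by basepair counts."""
    N = sum(pvalue_stat.values())  # total basepairs scored
    k = 1                          # BH rank, in basepairs
    pre_q = 2147483647             # previous q-value, for monotonicity
    pqtable = {}
    for v in sorted(pvalue_stat.keys(), reverse=True):
        q = v + (log10(k) - log10(N))  # q = p * N / k in -log10 space
        q = max(0, min(pre_q, q))      # q-scores must not increase
        pqtable[v] = q
        pre_q = q
        k += pvalue_stat[v]
    return pqtable

# e.g. three p-scores covering 100, 1000 and 10000 basepairs
print(make_pq_table({8.0: 100, 5.0: 1000, 2.0: 10000}))
```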
cdef object __pre_computes ( self, int max_gap = 50, int min_length = 200 ):
cdef void __pre_computes ( self, int max_gap = 50, int min_length = 200 ):
"""After this function is called, self.pqtable and self.pvalue_length is built. All
chromosomes will be iterated. So it will take some time.
......@@ -711,7 +732,7 @@ cdef class CallerFromAlignments:
cdef:
bytes chrom
np.ndarray pos_array, treat_array, ctrl_array, score_array
dict pvalue_stat = {}
dict pvalue_stat
long n, pre_p, this_p, length, j, pre_l, l, i
float this_v, pre_v, v, q, pre_q, this_t, this_c
long N, k, this_l
......@@ -736,7 +757,8 @@ cdef class CallerFromAlignments:
logging.debug ( "Start to calculate pvalue stat..." )
tmplist = sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )
pvalue_stat = dict()
for i in range( len( self.chromosomes ) ):
chrom = self.chromosomes[ i ]
self.__pileup_treat_ctrl_a_chromosome( chrom )
......@@ -824,7 +846,6 @@ cdef class CallerFromAlignments:
pre_q = 2147483647 # save the previous q-value
self.pqtable = {}
#self.pqtable = Float64HashTable()
unique_values = sorted(list(pvalue_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
for i in range(len(unique_values)):
v = unique_values[i]
......@@ -850,14 +871,13 @@ cdef class CallerFromAlignments:
y.append( self.pvalue_length[ cutoff ] )
fhd.close()
logging.info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename )
logging.info( "#3 Suggest a cutoff..." )
optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
logging.info( "#3 -10log10pvalue cutoff %.2f will call approximately %d bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
return self.pqtable
#logging.info( "#3 Suggest a cutoff..." )
#optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
#logging.info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
return
cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int min_length = 200,
int max_gap = 50, bool call_summits = False, bool auto_cutoff = False ):
int max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ):
"""Call peaks for all chromosomes. Return a PeakIO object.
scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. For example: ['p', 'q']
......@@ -876,8 +896,8 @@ cdef class CallerFromAlignments:
# prepare p-q table
if not self.pqtable:
logging.info("#3 Pre-compute pvalue-qvalue table...")
if auto_cutoff:
logging.info("#3 Cutoff will be automatically decided!")
if cutoff_analysis:
logging.info("#3 Cutoff vs peaks called will be analyzed!")
self.__pre_computes( max_gap = max_gap, min_length = min_length )
else:
self.__cal_pvalue_qvalue_table()
......@@ -922,7 +942,7 @@ cdef class CallerFromAlignments:
return peaks
cdef __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int min_length,
cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int min_length,
int max_gap, bool call_summits, bool save_bedGraph ):
""" Call peaks for a chromosome.
......@@ -997,7 +1017,7 @@ cdef class CallerFromAlignments:
if above_cutoff.size == 0:
# nothing above cutoff
return peaks
return
if above_cutoff[0] == 0:
# first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
......@@ -1050,7 +1070,7 @@ cdef class CallerFromAlignments:
lastp = te #above_cutoff_endpos[i]
# save the last peak
if not peak_content:
return peaks
return
else:
if call_summits:
self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
......@@ -1058,7 +1078,7 @@ cdef class CallerFromAlignments:
self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
#print "close peaks -- chrom:",chrom," time:", ttime() - t0
return peaks
return
cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int min_length,
bytes chrom, int smoothlen, list score_array_s, list score_cutoff_s=[]):
......@@ -1104,7 +1124,8 @@ cdef class CallerFromAlignments:
if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
return False # not passed, then discard this peak.
summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
#summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
summit_p_score = get_pscore(( int(summit_treat), summit_ctrl ) )
summit_q_score = self.pqtable[ summit_p_score ]
peaks.add( chrom, # chromosome
......@@ -1187,7 +1208,8 @@ cdef class CallerFromAlignments:
summit_treat = peak_content[ summit_index ][ 2 ]
summit_ctrl = peak_content[ summit_index ][ 3 ]
summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
#summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
summit_p_score = get_pscore(( int(summit_treat), summit_ctrl ) )
summit_q_score = self.pqtable[ summit_p_score ]
for i in range(len(score_cutoff_s)):
......@@ -1225,7 +1247,8 @@ cdef class CallerFromAlignments:
array1_size = array1.shape[0]
for i in range(array1_size):
s_ptr[0] = get_pscore( int(a1_ptr[0]), a2_ptr[0] )
#s_ptr[0] = get_pscore( int(a1_ptr[0]), a2_ptr[0] )
s_ptr[0] = get_pscore(( int(a1_ptr[0]), a2_ptr[0] ))
s_ptr += 1
a1_ptr += 1
a2_ptr += 1
......@@ -1247,7 +1270,8 @@ cdef class CallerFromAlignments:
s_ptr = <float32_t *> s.data
for i in range(array1.shape[0]):
s_ptr[0] = self.pqtable[ get_pscore( int(a1_ptr[0]), a2_ptr[0] ) ]
#s_ptr[0] = self.pqtable[ get_pscore( int(a1_ptr[0]), a2_ptr[0] ) ]
s_ptr[0] = self.pqtable[ get_pscore(( int(a1_ptr[0]), a2_ptr[0] )) ]
s_ptr += 1
a1_ptr += 1
a2_ptr += 1
......@@ -1402,29 +1426,23 @@ cdef class CallerFromAlignments:
ctrl_array_ptr += 1
if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t )
#t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) )
fprintf( ft, tmp_bytes )
fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
pre_v_t = v_t
pre_p_t = p
if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c )
fprintf( fc, tmp_bytes )
#c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) )
fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
pre_v_c = v_c
pre_p_c = p
p = pos_array_ptr[ 0 ]
# last one
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t )
fprintf( ft, tmp_bytes )
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c )
fprintf( fc, tmp_bytes )
fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
return True
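This hunk replaces per-line Python bytes formatting followed by fprintf with a single C-level fprintf call; either way, a bedGraph line is written only when the pileup value changes, so runs of equal values collapse into one interval. A hypothetical pure-Python sketch of that run-merging logic:

```python
import sys

def write_bedgraph(fh, chrom, ends, values, eps=1e-5):
    """ends[i] closes the interval carrying values[i]; consecutive
    intervals with (nearly) equal values merge into one line."""
    pre_p = 0           # start of the current run
    pre_v = values[0]   # value of the current run
    for i in range(1, len(ends)):
        if abs(values[i] - pre_v) > eps:  # value changed: flush the run
            fh.write(f"{chrom}\t{pre_p}\t{ends[i - 1]}\t{pre_v:.5f}\n")
            pre_p, pre_v = ends[i - 1], values[i]
    fh.write(f"{chrom}\t{pre_p}\t{ends[-1]}\t{pre_v:.5f}\n")  # last run

write_bedgraph(sys.stdout, "chr1", [100, 200, 300, 400], [1.0, 1.0, 2.5, 2.5])
# chr1    0       200     1.00000
# chr1    200     400     2.50000
```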
cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400, bool auto_cutoff = False):
cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400, bool cutoff_analysis = False):
"""This function try to find enriched regions within which,
scores are continuously higher than a given cutoff for level
1, and link them using the gap above level 2 cutoff with a
......@@ -1456,8 +1474,8 @@ cdef class CallerFromAlignments:
# prepare p-q table
if not self.pqtable:
logging.info("#3 Pre-compute pvalue-qvalue table...")
if auto_cutoff:
logging.info("#3 Cutoff for broad region will be automatically decided!")
if cutoff_analysis:
logging.info("#3 Cutoff value vs broad region calls will be analyzed!")
self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length )
else:
self.__cal_pvalue_qvalue_table()
......@@ -1527,7 +1545,7 @@ cdef class CallerFromAlignments:
return broadpeaks
cdef __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
cdef void __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
int min_length, int lvl1_max_gap, int lvl2_max_gap, bool save_bedGraph):
""" Call peaks for a chromosome.
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:32:50 taoliu>
# Time-stamp: <2019-11-04 11:47:48 taoliu>
"""Module for FWTrack classes.
......@@ -96,7 +96,7 @@ cdef class FWTrack:
self.length = 0
self.__destroyed = False
cpdef destroy ( self ):
cpdef void destroy ( self ):
"""Destroy this object and release mem.
"""
cdef:
......@@ -120,10 +120,10 @@ cdef class FWTrack:
self.__dup_locations[chromosome] = [None, None]
self.__dup_locations.pop(chromosome)
self.__destroyed = True
return True
return
cpdef add_loc ( self, bytes chromosome, int32_t fiveendpos, int strand ):
cpdef void add_loc ( self, bytes chromosome, int32_t fiveendpos, int strand ):
"""Add a location to the list according to the sequence name.
chromosome -- mostly the chromosome name
......@@ -139,24 +139,18 @@ cdef class FWTrack:
self.__locations[chromosome][strand][0] = fiveendpos
self.__pointer[chromosome][strand] = 1
else:
try:
i = self.__pointer[chromosome][strand]
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome][strand] )
self.__locations[chromosome][strand][i]= fiveendpos
self.__pointer[chromosome][strand] += 1
except:
print ("i:",i)
print ("self.buffer_size:", self.buffer_size)
print ("strand:", strand)
print ("loc len:", len( self.__locations[chromosome][strand] ))
raise Exception("!!")
cdef np.ndarray[np.int32_t, ndim=1] __expand__ ( self, np.ndarray[np.int32_t, ndim=1] arr ):
i = self.__pointer[chromosome][strand]
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome][strand] )
self.__locations[chromosome][strand][i]= fiveendpos
self.__pointer[chromosome][strand] += 1
return
cdef void __expand__ ( self, np.ndarray[np.int32_t, ndim=1] arr ):
arr.resize( arr.shape[0] + self.buffer_size, refcheck = False )
return arr
return
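For reference, add_loc appends into a pre-allocated numpy buffer and calls __expand__ to grow it by one chunk whenever the write pointer reaches a multiple of buffer_size; the refactor above simply drops the old try/except debugging scaffold. A self-contained sketch of that growable-buffer pattern:

```python
import numpy as np

class GrowableInt32Array:
    """Append int32 values into a numpy buffer that grows in fixed-size
    chunks, as in FWTrack.add_loc/__expand__."""
    def __init__(self, buffer_size=100000):
        self.buffer_size = buffer_size
        self.data = np.zeros(buffer_size, dtype=np.int32)
        self.pointer = 0

    def append(self, value):
        if self.pointer > 0 and self.pointer % self.buffer_size == 0:
            # grow in place by one chunk; refcheck=False permits resizing
            # even if other references to the array exist
            self.data.resize(self.data.shape[0] + self.buffer_size,
                             refcheck=False)
        self.data[self.pointer] = value
        self.pointer += 1

    def finalize(self):
        # shrink to the used size, as FWTrack.finalize does
        self.data.resize(self.pointer, refcheck=False)
        return self.data

buf = GrowableInt32Array(buffer_size=4)
for v in range(10):
    buf.append(v)
print(buf.finalize())  # [0 1 2 3 4 5 6 7 8 9]
```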
cpdef finalize ( self ):
cpdef void finalize ( self ):
""" Resize np arrays for 5' positions and sort them in place
Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read!
......@@ -233,7 +227,7 @@ cdef class FWTrack:
# """
# return self.total*self.fw
cpdef sort ( self ):
cpdef void sort ( self ):
"""Naive sorting for locations.
"""
......@@ -249,9 +243,10 @@ cdef class FWTrack:
self.__locations[c][1].sort()
self.__sorted = True
return
@cython.boundscheck(False)
cpdef separate_dups( self, maxint = 1 ):
cpdef void separate_dups( self, int maxnum = 1 ):
"""Separate the duplicated reads into a different track
stored at self.dup
"""
......@@ -298,7 +293,7 @@ cdef class FWTrack:
else:
current_loc = p
n = 1
if n > maxint:
if n > maxnum:
dup_plus [ i_dup ] = p
i_dup += 1
else:
......@@ -344,7 +339,7 @@ cdef class FWTrack:
else:
current_loc = p
n = 1
if n > maxint:
if n > maxnum:
dup_minus [ i_dup ] = p
i_dup += 1
else:
......@@ -439,7 +434,7 @@ cdef class FWTrack:
return 0
@cython.boundscheck(False) # do not check that np indices are valid
cpdef unsigned long filter_dup ( self, int32_t maxnum = -1):
cpdef unsigned long filter_dup ( self, int maxnum = -1):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
......@@ -486,16 +481,12 @@ cdef class FWTrack:
p = plus[ i_old ]
if p == current_loc:
n += 1
if n <= maxnum:
new_plus[ i_new ] = p
i_new += 1
else:
logging.debug("Duplicate reads found at %s:%d at + strand" % (k,p) )
else:
current_loc = p
new_plus[ i_new ] = p
i_new += 1
n = 1
if n <= maxnum:
new_plus[ i_new ] = p
i_new += 1
new_plus.resize( i_new, refcheck=False )
self.total += i_new
self.__pointer[k][0] = i_new
......@@ -525,16 +516,12 @@ cdef class FWTrack:
p = minus[ i_old ]
if p == current_loc:
n += 1
if n <= maxnum:
new_minus[ i_new ] = p
i_new += 1
else:
logging.debug("Duplicate reads found at %s:%d at + strand" % (k,p) )
else:
current_loc = p
new_minus[ i_new ] = p
i_new += 1
n = 1
if n <= maxnum:
new_minus[ i_new ] = p
i_new += 1
new_minus.resize( i_new, refcheck=False )
self.total += i_new
self.__pointer[k][1] = i_new
......@@ -553,7 +540,7 @@ cdef class FWTrack:
self.length = self.fw * self.total
return self.total
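The refactored filter_dup hoists the `if n <= maxnum` test out of the if/else, since the first read at a new location always passes it. A pure-Python sketch of the per-strand logic, assuming a sorted array of 5' positions:

```python
def filter_dup_positions(positions, maxnum=1):
    """Keep at most maxnum reads per 5' position; positions must be
    sorted, mirroring the per-strand loops above."""
    kept = []
    current, n = None, 0
    for p in positions:
        if p == current:
            n += 1          # another read at the same location
        else:
            current, n = p, 1
        if n <= maxnum:     # keep up to maxnum copies, discard the rest
            kept.append(p)
    return kept

print(filter_dup_positions([5, 5, 5, 9, 12, 12], maxnum=2))
# [5, 5, 9, 12, 12]
```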
cpdef sample_percent (self, float percent, int seed = -1 ):
cpdef void sample_percent (self, float percent, int seed = -1 ):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -592,7 +579,7 @@ cdef class FWTrack:
self.length = self.fw * self.total
return
cpdef sample_num (self, uint64_t samplesize, int seed = -1):
cpdef void sample_num (self, uint64_t samplesize, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -604,7 +591,7 @@ cdef class FWTrack:
self.sample_percent ( percent, seed )
return
cpdef print_to_bed (self, fhd=None):
cpdef void print_to_bed (self, fhd=None):
"""Output FWTrack to BED format files. If fhd is given,
write to a file, otherwise, output to standard output.
......@@ -675,7 +662,7 @@ cdef class FWTrack:
rt_minus = np.array(temp)
return (rt_plus, rt_minus)
cpdef compute_region_tags_from_peaks ( self, peaks, func, int window_size = 100, float cutoff = 5 ):
cpdef list compute_region_tags_from_peaks ( self, peaks, func, int window_size = 100, float cutoff = 5 ):
"""Extract tags in peak, then apply func on extracted tags.
peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
......@@ -759,7 +746,7 @@ cdef class FWTrack:
return retval
cpdef refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ):
cpdef object refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ):
"""Extract tags in peak, then apply func on extracted tags.
peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
......@@ -781,6 +768,7 @@ cdef class FWTrack:
set pchrnames, chrnames
list temp, retval, cpeaks
np.ndarray[np.int32_t, ndim=1] adjusted_summits, passflags
object ret_peaks
pchrnames = peaks.get_chr_names()
retval = []
......@@ -864,7 +852,7 @@ cdef class FWTrack:
# end of a loop
return ret_peaks
cpdef pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0, bint directional = True, int end_shift = 0 ):
cpdef list pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0, bint directional = True, int end_shift = 0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
ds : tag will be extended to this value to 3' direction,
......@@ -932,7 +920,7 @@ cdef inline int32_t left_forward ( data, int pos, int window_size ):
cdef inline int32_t right_forward ( data, int pos, int window_size ):
return data.get(pos + window_size, 0) - data.get(pos, 0)
cdef wtd_find_summit(chrom, np.ndarray[np.int32_t, ndim=1] plus, np.ndarray[np.int32_t, ndim=1] minus, int32_t search_start, int32_t search_end, int32_t window_size, float cutoff):
cdef tuple wtd_find_summit(chrom, np.ndarray[np.int32_t, ndim=1] plus, np.ndarray[np.int32_t, ndim=1] minus, int32_t search_start, int32_t search_end, int32_t window_size, float cutoff):
"""internal function to be called by refine_peak_from_tags_distribution()
"""
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 17:49:19 taoliu>
# Time-stamp: <2019-11-04 13:23:29 taoliu>
"""Module for filter duplicate tags from paired-end data
......@@ -26,7 +26,10 @@ from MACS2.Pileup import quick_pileup, max_over_two_pv_array, se_all_in_one_pile
cdef INT_MAX = <int>((<unsigned int>-1)>>1)
cdef packed struct peLoc:
np.int32_t l
np.int32_t r
# Let numpy enforce the PE structure using a structured ndarray; gives a bonus speedup when sorting
# PE data doesn't have strandedness
cdef class PETrackI:
......@@ -72,14 +75,14 @@ cdef class PETrackI:
self.length = 0
self.average_template_length = 0.0
cpdef add_loc ( self, bytes chromosome, int start, int end):
cpdef void add_loc ( self, bytes chromosome, int start, int end):
"""Add a location to the list according to the sequence name.
chromosome -- mostly the chromosome name
start -- leftmost position of the fragment
end -- rightmost position of the fragment
"""
cdef:
long i
int i
if chromosome not in self.__locations:
self.__locations[chromosome] = np.zeros(shape=self.buffer_size, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
......@@ -90,10 +93,11 @@ cdef class PETrackI:
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome] )
self.__locations[chromosome][ i ] = ( start, end )
self.__pointer[chromosome] += 1
self.__pointer[chromosome] = i + 1
self.length += end - start
return
cpdef destroy ( self ):
cpdef void destroy ( self ):
"""Destroy this object and release mem.
"""
cdef:
......@@ -114,9 +118,9 @@ cdef class PETrackI:
self.__dup_locations.pop(chromosome)
self.__destroyed = True
return True
return
cpdef __expand__ ( self, np.ndarray arr ):
cpdef void __expand__ ( self, np.ndarray arr ):
arr.resize((arr.shape[0] + self.buffer_size), refcheck = False )
return
......@@ -151,8 +155,7 @@ cdef class PETrackI:
self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()])
return self.rlengths
def finalize ( self ):
cpdef void finalize ( self ):
""" Resize np arrays for 5' positions and sort them in place
Note: If this function is called, it's impossible to append more files to this PETrackI object. So remember to call it after all the files are read!
......@@ -177,7 +180,7 @@ cdef class PETrackI:
self.average_template_length = float( self.length ) / self.total
return
def get_locations_by_chr ( self, bytes chromosome ):
cpdef get_locations_by_chr ( self, bytes chromosome ):
"""Return a tuple of two lists of locations for certain chromosome.
"""
......@@ -203,7 +206,7 @@ cdef class PETrackI:
# l += (v[:,1] - v[:,0]).sum()
# return l
cpdef sort ( self ):
cpdef void sort ( self ):
"""Naive sorting for locations.
"""
......@@ -219,6 +222,7 @@ cdef class PETrackI:
self.__locations[c].sort( order=['l', 'r'] ) # sort by the leftmost location
#print "before", self.__locations[c][0:100]
self.__sorted = True
return
# def centered_fake_fragments(track, int d):
# """Return a copy of the PETrackI object so that its locations are all
......@@ -270,7 +274,7 @@ cdef class PETrackI:
return pmf
@cython.boundscheck(False) # do not check that np indices are valid
def separate_dups ( self , int maxint = 1 ):
cpdef void separate_dups ( self , int maxnum = 1 ):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
......@@ -285,6 +289,8 @@ cdef class PETrackI:
set chrnames
bytes k
if maxnum < 0: return # condition to return if not filtering
chrnames = self.get_chr_names()
if not self.__sorted: self.sort()
......@@ -296,86 +302,82 @@ cdef class PETrackI:
self.average_template_length = 0.0
for k in chrnames: # for each chromosome
# dups.__locations[k] = self.__locations[k].copy()
i_new = 0
i_dup = 0
locs = self.__locations[k]
size = locs.shape[0]
if size <= 1:
new_locs = locs
else:
new_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
dup_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
n = 1
current_loc_start = locs[0][0] # same as locs[0]['l']
current_loc_end = locs[0][1]# same as locs[0]['r']
new_locs[i_new][0] = current_loc_start
new_locs[i_new][1] = current_loc_end
i_new += 1
self.length += current_loc_end - current_loc_start
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
all_same = ((loc_start == current_loc_start) and
(loc_end == current_loc_end))
if all_same:
n += 1
else:
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n > maxint:
dup_locs[i_dup][0] = loc_start
dup_locs[i_dup][1] = loc_end
i_dup += 1
else:
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new , refcheck = False)
dup_locs.resize( i_dup , refcheck = False)
self.total += i_new
self.dup_total += i_dup
self.__pointer[k] = i_new
self.__dup_pointer[k] = i_dup
# unnecessary
# new_size = new_locs.shape[0]
# dup_size = dup_locs.shape[0]
# self.__pointer[k] = new_size
# dups.__pointer[k] = dup_size
# free memory?
if size == 1:
# do nothing and continue
self.total += size
self.__pointer[k] = size
self.length += locs[0][1] - locs[0][0]
self.__dup_locations[k] = []
continue
# save duplicate reads to dup_locations[k] and update locations[k]
new_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
dup_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
current_loc_start = locs[0][0] # same as locs[0]['l']
current_loc_end = locs[0][1]# same as locs[0]['r']
new_locs[0][0] = current_loc_start
new_locs[0][1] = current_loc_end
self.length += current_loc_end - current_loc_start
i_new = 1 # index of new_locs
i_dup = 0 # index of dup_locs
n = 1
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
if (loc_start == current_loc_start) and (loc_end == current_loc_end) :
# both ends are the same
n += 1
else:
# not the same, update current_loc and reset n
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n > maxnum:
# more than maxnum duplicates, put the duplicates in dup_locs
dup_locs[i_dup][0] = loc_start
dup_locs[i_dup][1] = loc_end
i_dup += 1
else:
# otherwise, put the fragment in new_locs
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new , refcheck = False)
dup_locs.resize( i_dup , refcheck = False)
self.total += i_new
self.dup_total += i_dup
self.__pointer[k] = i_new
self.__dup_pointer[k] = i_dup
# free memory?
# I know I should shrink it to 0 size directly,
# however, on Mac OSX, it seems directly assigning 0
# doesn't do a thing.
locs.resize( self.buffer_size, refcheck=False )
locs.resize( 0, refcheck=False )
# hope there would be no mem leak...
self.__locations[k] = new_locs
if size > 1:
self.__dup_locations[k] = dup_locs
self.average_template_length = float( self.length ) / self.total
self.__dup_locations[k] = dup_locs
self.average_template_length = self.length / self.total
return
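Unlike filter_dup, separate_dups keeps the overflow: once a (start, end) pair appears more than maxnum times, the extra fragments go into a duplicate track instead of being discarded. A pure-Python sketch of the split, assuming a sorted list of (start, end) tuples:

```python
def separate_dup_fragments(fragments, maxnum=1):
    """Split sorted (start, end) fragments into kept fragments (at most
    maxnum copies per location) and the overflow duplicates."""
    kept, dups = [], []
    current, n = None, 0
    for frag in fragments:
        if frag == current:
            n += 1                  # same fragment seen again
        else:
            current, n = frag, 1    # new location, reset the counter
        if n > maxnum:
            dups.append(frag)       # overflow goes to the dup track
        else:
            kept.append(frag)
    return kept, dups

frags = [(10, 60), (10, 60), (10, 60), (25, 80)]
print(separate_dup_fragments(frags, maxnum=1))
# ([(10, 60), (25, 80)], [(10, 60), (10, 60)])
```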
@cython.boundscheck(False) # do not check that np indices are valid
cpdef unsigned long filter_dup ( self, int maxnum=-1):
cpdef void filter_dup ( self, int maxnum=-1):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
"""
cdef:
int i_chrom, n, start, end
# np.ndarray[np.int32_t, ndim=1] loc #= np.zeros([1,2], np.int32)
# np.ndarray[np.int32_t, ndim=1] current_loc #= np.zeros([1,2], np.int32)
int loc_start, loc_end, current_loc_start, current_loc_end
unsigned long i_old, i_new, size, new_size
bytes k
np.ndarray locs, new_locs
set chrnames
#peLoc current_range, this_range
if maxnum < 0: return self.total # condition to return if not filtering
if maxnum < 0: return # condition to return if not filtering
if not self.__sorted: self.sort()
......@@ -386,58 +388,62 @@ cdef class PETrackI:
chrnames = self.get_chr_names()
for k in chrnames: # for each chromosome
i_new = 0
locs = self.__locations[k]
size = locs.shape[0]
if size <= 1:
new_locs = locs
else:
new_locs = np.zeros( self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
n = 1 # the number of tags in the current location
current_loc_start = locs[0][0]
current_loc_end = locs[0][1]
new_locs[i_new][0] = current_loc_start
new_locs[i_new][1] = current_loc_end
i_new += 1
self.length += current_loc_end - current_loc_start
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
all_same = ((loc_start == current_loc_start) and
(loc_end == current_loc_end))
if all_same:
n += 1
if n <= maxnum:
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
else:
current_loc_start = loc_start
current_loc_end = loc_end
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
n = 1
new_locs.resize( i_new, refcheck = False )
new_size = new_locs.shape[0]
self.__pointer[k] = new_size
self.total += new_size
# free memory?
size = locs.shape[0]
if size == 1:
# do nothing and continue
self.total += size
self.__pointer[k] = size
self.length += locs[0][1] - locs[0][0]
continue
# discard duplicate reads and make a new __locations[k]
new_locs = np.zeros( self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
# get the first loc
#current_loc_start = locs[0][0]
#current_loc_end = locs[0][1]
( current_loc_start, current_loc_end ) = locs[0]
#new_locs[0][0] = current_loc_start
#new_locs[0][1] = current_loc_end
new_locs[0] = ( current_loc_start, current_loc_end )
self.length += current_loc_end - current_loc_start
i_new = 1 # index of new_locs
n = 1 # the number of tags in the current genomic location
for i_old in range(1, size):
#loc_start = locs[i_old][0]
#loc_end = locs[i_old][1]
( loc_start, loc_end ) = locs[i_old]
if loc_start == current_loc_start and loc_end == current_loc_end:
# both ends are the same, add 1 to duplicate number n
n += 1
else:
# not the same, update current_loc, reset n
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n <= maxnum:
# no more than maxnum copies so far: add to new_locs;
# otherwise, discard
#new_locs[i_new][0] = loc_start
#new_locs[i_new][1] = loc_end
new_locs[i_new] = (loc_start, loc_end)
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new, refcheck = False )
new_size = new_locs.shape[0]
self.__pointer[k] = new_size
self.total += new_size
# free memory?
# I know I should shrink it to 0 size directly,
# however, on Mac OSX, it seems directly assigning 0
# doesn't do a thing.
locs.resize( self.buffer_size, refcheck=False )
locs.resize( 0, refcheck=False )
# hope there would be no mem leak...
self.__locations[k] = new_locs
self.average_template_length = float( self.length ) / self.total
return self.total
self.average_template_length = self.length / self.total
return
def sample_percent (self, float percent, int seed = -1):
cpdef void sample_percent (self, float percent, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -470,7 +476,7 @@ cdef class PETrackI:
self.average_template_length = float( self.length )/ self.total
return
def sample_num (self, uint64_t samplesize, int seed = -1):
cpdef void sample_num (self, uint64_t samplesize, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -481,7 +487,7 @@ cdef class PETrackI:
self.sample_percent ( percent, seed )
return
def print_to_bed (self, fhd=None):
cpdef void print_to_bed (self, fhd=None):
"""Output to BEDPE format files. If fhd is given, write to a
file, otherwise, output to standard output.
......@@ -510,7 +516,7 @@ cdef class PETrackI:
return
cpdef pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float baseline_value = 0.0 ):
cpdef list pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float baseline_value = 0.0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds.
......@@ -537,7 +543,7 @@ cdef class PETrackI:
return prev_pileup
cpdef pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0 ):
cpdef list pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
This function is for control track. Basically, here is a
......
......@@ -744,15 +744,6 @@ cdef class scoreTrackII:
pqtable = self.make_pq_table()
# convert p to q
# convert pvalue2qvalue to a simple dict based on khash
# khash has big advantage while checking keys for millions of times.
#s_p2q = Float64HashTable()
#for k in pqtable.keys():
# s_p2q.set_item(k,pqtable[k])
#g = s_p2q.get_item
for chrom in self.data.keys():
v = self.data[chrom][3]
l = self.datalength[chrom]
......
......@@ -251,14 +251,14 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
elif self.log_qvalue != None:
if self.opt.broad:
self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
......@@ -268,13 +268,13 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
scorecalculator.destroy()
return peaks
......@@ -362,14 +362,14 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
elif self.log_qvalue != None:
if self.opt.broad:
self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
......@@ -379,13 +379,13 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
scorecalculator.destroy()
return peaks
......
......@@ -39,9 +39,10 @@ from time import time as ttime
# functions
# ------------------------------------
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline long long_max(long a, long b): return a if a >= b else b
cdef inline float float_max(float a, float b): return a if a >= b else b
# use python3 max function instead
#cdef inline int int_max(int a, int b): return a if a > b else b
#cdef inline long long_max(long a, long b): return a if a > b else b
#cdef inline float float_max(float a, float b): return a if a > b else b
cdef void clean_up_ndarray ( np.ndarray x ):
# clean numpy ndarray in two steps
......@@ -122,6 +123,7 @@ cpdef pileup_and_write( trackI,
# function to pileup BAMPE/BEDPE stored in PETrackI object and write to a BEDGraph file
# this function uses a C-level function
cpdef pileup_and_write_pe( petrackI,
bytes output_filename,
float scale_factor = 1,
......@@ -572,7 +574,7 @@ cdef int * fix_coordinates_2 ( int * poss, int l_of_poss, int rlength) nogil:
return poss
# general pileup function
# general pileup function implemented in cython
cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarray[np.int32_t, ndim=1] minus_tags, long five_shift, long three_shift, int rlength, float scale_factor, float baseline_value ):
"""Return pileup given 5' end of fragment at plus or minus strand
separately, and given shift at both direction to recover a
......@@ -647,7 +649,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
if pre_p != 0:
# the first chunk of 0
ret_p_ptr[0] = pre_p
ret_v_ptr[0] = float_max(0,baseline_value)
ret_v_ptr[0] = max(0,baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -662,7 +664,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = start_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -674,7 +676,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -694,7 +696,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -720,6 +722,7 @@ cdef int compare(const void * a, const void * b) nogil:
if a - b > 0: return 1
return 0
# quick pileup implemented in cython
cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.int32_t, ndim=1] end_poss, float scale_factor, float baseline_value ):
"""Return pileup given plus strand and minus strand positions of fragments.
......@@ -775,7 +778,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
if pre_p != 0:
# the first chunk of 0
ret_p_ptr[0] = pre_p
ret_v_ptr[0] = float_max(0,baseline_value)
ret_v_ptr[0] = max(0,baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -787,7 +790,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
p = start_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -801,7 +804,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -822,7 +825,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
#for p in minus_tags[i_e:]:
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -835,8 +838,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
return tmp
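quick_pileup sweeps two sorted arrays of fragment start and end positions, raising or lowering the running depth at each event and emitting a (position, value) pair whenever the position advances; the diff also swaps the removed float_max helper for Python's built-in max. A pure-Python sketch of the sweep (the leading baseline chunk before the first position is omitted for brevity):

```python
def quick_pileup(starts, ends, scale=1.0, baseline=0.0):
    """Return [positions, values]; values[i] covers the interval ending
    at positions[i]. Both inputs must be sorted."""
    ret_p, ret_v = [], []
    i_s = i_e = 0
    pileup, pre_p = 0, 0
    while i_s < len(starts) and i_e < len(ends):
        if starts[i_s] < ends[i_e]:
            p = starts[i_s]
            if p != pre_p:
                ret_p.append(p)
                ret_v.append(max(pileup * scale, baseline))
                pre_p = p
            pileup += 1      # a fragment starts: depth rises
            i_s += 1
        else:
            p = ends[i_e]
            if p != pre_p:
                ret_p.append(p)
                ret_v.append(max(pileup * scale, baseline))
                pre_p = p
            pileup -= 1      # a fragment ends: depth falls
            i_e += 1
    for p in ends[i_e:]:     # remaining ends close the open fragments
        if p != pre_p:
            ret_p.append(p)
            ret_v.append(max(pileup * scale, baseline))
            pre_p = p
        pileup -= 1
    return [ret_p, ret_v]

print(quick_pileup([0, 5], [10, 15]))  # [[5, 10, 15], [1.0, 2.0, 1.0]]
```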
# general function to calculate maximum between two arrays.
# general function to calculate maximum between two arrays in cython.
cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
"""Merge two position-value arrays. For intersection regions, take
the maximum value within region.
......@@ -884,7 +886,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
if a1_pos_ptr[0] < a2_pos_ptr[0]:
# clip a region from pre_p to p1, then set pre_p as p1.
ret_pos_ptr[0] = a1_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -896,7 +898,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
elif a1_pos_ptr[0] > a2_pos_ptr[0]:
# clip a region from pre_p to p2, then set pre_p as p2.
ret_pos_ptr[0] = a2_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -908,7 +910,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
else:
# from pre_p to p1 or p2, then set pre_p as p1 or p2.
ret_pos_ptr[0] = a1_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......
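max_over_two_pv_array walks two such position-value lists with two pointers, clipping at whichever end position comes first and keeping the larger of the two current values; here too the float_max helper gives way to the built-in max. A pure-Python sketch of the merge, assuming each input is a [positions, values] pair with sorted end positions covering the same span:

```python
def max_over_two_pv_arrays(pv1, pv2):
    """Merge two [positions, values] pairs, taking the maximum value
    over every intersected region."""
    p1, v1 = pv1
    p2, v2 = pv2
    ret_p, ret_v = [], []
    i1 = i2 = 0
    while i1 < len(p1) and i2 < len(p2):
        ret_v.append(max(v1[i1], v2[i2]))  # max of the two current values
        if p1[i1] < p2[i2]:
            ret_p.append(p1[i1])  # clip at the nearer end position
            i1 += 1
        elif p1[i1] > p2[i2]:
            ret_p.append(p2[i2])
            i2 += 1
        else:
            ret_p.append(p1[i1])  # both end here: advance both pointers
            i1 += 1
            i2 += 1
    return [ret_p, ret_v]

a = [[100, 200, 300], [1.0, 3.0, 2.0]]
b = [[150, 300], [2.0, 1.0]]
print(max_over_two_pv_arrays(a, b))
# [[100, 150, 200, 300], [2.0, 3.0, 3.0, 2.0]]
```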