Commits on Source (4)
language: python
python:
- "3.8"
- "3.7"
- "3.6"
- "3.5"
install:
- pip install --upgrade pip
- pip install --upgrade -r requirements.txt
......
2019-12-12 Tao Liu <vladimir.liu@gmail.com>
MACS version 2.2.6
* New Features
1) Speed up MACS2 through programming tricks and code cleanup. The
filter_dup function replaces separate_dups; the latter was
implemented so that duplicate reads could potentially be put back
in certain downstream analyses, but no such analysis was ever
implemented. Optimize the speed of writing bedGraph
files. Optimize BAM and BAMPE parsing with pointer casting instead
of Python unpack (see the illustrative sketch after this entry).
2) Comment lines in the headers of BED or SAM files are now
correctly skipped. However, MACS2 won't check for comment lines in
the middle of a file.
* Bugs fixed
1) Cutoff-analysis in callpeak command. #341
2) Issues related to SAMParser and three ELAND Parsers are
fixed. #347
* Other
1) The cmdlinetest script in the test/ folder has been updated to:
1. test cutoff-analysis with the callpeak cmd; 2. output the 2
lines before and after each error or warning message during tests;
3. output only the first 10 lines of any difference found between
a test result and the standard result; 4. have prockreport monitor
CPU time and memory usage at 1-second intervals -- a bit more
accurate.
2) Python3.5 support is removed. Now MACS2 requires Python>=3.6.
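As a rough illustration of the parsing speed-up described in item 1) above: unpacking fields one record at a time through Python's struct module builds a tuple per call, while reinterpreting the whole buffer at once avoids that per-record overhead. MACS2's BAM/BAMPE parsers do this with C pointer casts in Cython; the numpy.frombuffer analogue below is only a hypothetical pure-Python sketch of the idea, not the actual parser code.

```python
import struct
import numpy as np

# a hypothetical record buffer of eight little-endian int32 fields
raw = struct.pack("<8i", 1, 2, 3, 4, 5, 6, 7, 8)

# old style: python-level unpack, building a tuple of Python ints
vals_unpack = struct.unpack("<8i", raw)

# new style: reinterpret the bytes in bulk, no per-field Python objects
vals_cast = np.frombuffer(raw, dtype="<i4")

assert list(vals_unpack) == vals_cast.tolist()
```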
2019-10-31 Tao Liu <vladimir.liu@gmail.com>
MACS version 2.2.5 (Py3 speed up)
......
# Official MACS2 v2.2.5 docker
# Official MACS2 v2.2.6 docker
MACS2 is a bioinformatics algorithm to analyze ChIP-seq datasets.
......
# INSTALL Guide For MACS
Time-stamp: <2019-10-03 11:56:03 taoliu>
Time-stamp: <2019-12-12 13:26:11 taoliu>
Please check the following instructions to complete your installation.
## Prerequisites
MACS v2.2.x requires Python3. We have tested MACS in Python3.5, 3.6
and 3.7. MACS runs slower under Python3.5, so Python3.6 or Python3.7
is recommended.
MACS v2.2.x requires Python3. We have tested MACS in Python3.6, 3.7
and 3.8.
MACS also requires [Numpy](http://www.scipy.org/Download) (>=1.17).
......
MACS_VERSION = "2.2.5"
MACS_VERSION = "2.2.6"
FILTERDUP_VERSION = "1.0.0 20140616"
RANDSAMPLE_VERSION = "1.0.0 20120703"
MAX_PAIRNUM = 1000
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:33:01 taoliu>
# cython: linetrace=True
# Time-stamp: <2019-11-06 13:00:01 taoliu>
"""Module for Calculate Scores.
......@@ -40,28 +41,53 @@ from MACS2.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname
from MACS2.IO.FixWidthTrack import FWTrack
from MACS2.IO.PairedEndTrack import PETrackI
from MACS2.Statistics import P_Score_Upper_Tail, LogLR_Asym # pure C code for calculating p-value scores/logLR of Poisson
#
pscore_table = P_Score_Upper_Tail() # this table will cache pscore being calculated.
get_pscore = pscore_table.get_pscore
logLR_table = LogLR_Asym() # this table will cache logLR being calculated.
get_logLR_asym = logLR_table.get_logLR_asym
# --------------------------------------------
# cached pscore function and LR_asym functions
# --------------------------------------------
from MACS2.Prob import poisson_cdf
pscore_dict = dict()
logLR_dict = dict()
#cdef float get_pscore ( int x, float l ):
cdef float get_pscore ( tuple x ):
cdef:
float val
if x in pscore_dict:
return pscore_dict [ x ]
else:
# calculate and cache
val = -1 * poisson_cdf ( x[0], x[1], False, True )
pscore_dict[ x ] = val
return val
cdef float get_logLR_asym ( float x, float y ):
cdef:
float val
if ( x, y ) in logLR_dict:
return logLR_dict[ ( x, y ) ]
else:
# calculate and cache
if x > y:
val = (x*(log10(x)-log10(y))+y-x)
elif x < y:
val = (x*(-log10(x)+log10(y))-y+x)
else:
val = 0
logLR_dict[ ( x, y ) ] = val
return val
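The module-level dicts above memoize each score on its argument tuple, so every distinct (treatment, control) pair is computed once per run; get_logLR_asym caches the same way on (x, y). A minimal pure-Python sketch of the same pattern, using scipy.stats.poisson as a stand-in for MACS2's own MACS2.Prob.poisson_cdf:

```python
import math
from scipy.stats import poisson  # stand-in for MACS2.Prob.poisson_cdf

pscore_cache = {}

def get_pscore(key):
    """Cached -log10 Poisson upper-tail p-value for key = (observed, lam),
    mirroring the tuple-keyed dict caching above."""
    if key in pscore_cache:
        return pscore_cache[key]
    x, lam = key
    # P(X >= x) for X ~ Poisson(lam); logsf is the log survival function
    val = -poisson.logsf(x - 1, lam) / math.log(10)
    pscore_cache[key] = val
    return val

print(get_pscore((10, 2.0)))  # computed and cached
print(get_pscore((10, 2.0)))  # served from the cache
```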
# ------------------------------------
# constants
# ------------------------------------
__version__ = "scoreCalculate $Revision$"
__version__ = "CallPeakUnit $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "scoreTrackI classes"
__doc__ = "CallPeakUnit"
# ------------------------------------
# Misc functions
# ------------------------------------
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline int int_min(int a, int b): return a if a <= b else b
def do_nothing(*args, **kwargs):
pass
LOG10_E = 0.43429448190325176
cdef void clean_up_ndarray ( np.ndarray x ):
......@@ -243,7 +269,7 @@ cdef tuple find_optimal_cutoff( list x, list y ):
for i in range( 10, l ):
# at least the largest 10 points
m, c = np.linalg.lstsq( npA[:i], npy[:i] )[ 0 ]
m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ]
sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 )
sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 )
rsq = 1 - sse/sst
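For context, np.linalg.lstsq historically warned about its default rcond; passing rcond=None opts into the new default and silences the FutureWarning without changing the fit. A standalone sketch of the straight-line fit and R-squared computation used here, with made-up points:

```python
import numpy as np

# made-up cutoff (x) vs total-length (y) points, for illustration only
npx = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
npy = np.array([10.2, 8.1, 6.0, 3.9, 2.1])

# design matrix [x, 1] for the straight-line model y = m*x + c
npA = np.vstack([npx, np.ones(len(npx))]).T

# rcond=None silences the FutureWarning about the default cutoff
m, c = np.linalg.lstsq(npA, npy, rcond=None)[0]

sst = ((npy - npy.mean()) ** 2).sum()   # total sum of squares
sse = ((npy - m * npx - c) ** 2).sum()  # residual sum of squares
rsq = 1 - sse / sst                     # coefficient of determination
print(m, c, rsq)
```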
......@@ -283,7 +309,7 @@ cdef class CallerFromAlignments:
bool no_lambda_flag # whether ignore local bias, and to use global bias instead
bool PE_mode # whether it's in PE mode, will be detected during initiation
# temporary data buffer
bytes chrom # name of current chromosome
#bytes chrom # name of current chromosome
list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
bytes bedGraph_treat_filename
bytes bedGraph_control_filename
......@@ -292,7 +318,7 @@ cdef class CallerFromAlignments:
#object bedGraph_treat # file handler to write ChIP pileup
#object bedGraph_ctrl # file handler to write Control pileup
# data needed to be pre-computed before peak calling
object pqtable # remember pvalue->qvalue conversion
dict pqtable # remember pvalue->qvalue conversion
bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value.
dict pvalue_npeaks # record for each pvalue cutoff, how many peaks can be called
......@@ -469,12 +495,6 @@ cdef class CallerFromAlignments:
clean_up_ndarray( self.chr_pos_treat_ctrl[0] )
clean_up_ndarray( self.chr_pos_treat_ctrl[1] )
clean_up_ndarray( self.chr_pos_treat_ctrl[2] )
#self.chr_pos_treat_ctrl[0].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[1].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[2].resize(10000,refcheck=False)
#self.chr_pos_treat_ctrl[0].resize(0,refcheck=False)
#self.chr_pos_treat_ctrl[1].resize(0,refcheck=False)
#self.chr_pos_treat_ctrl[2].resize(0,refcheck=False)
if self.PE_mode:
treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 )
......@@ -623,7 +643,7 @@ cdef class CallerFromAlignments:
#ptr += 1
return s
cdef object __cal_pvalue_qvalue_table ( self ):
cdef void __cal_pvalue_qvalue_table ( self ):
"""After this function is called, self.pqtable is built. All
chromosomes will be iterated. So it will take some time.
......@@ -631,7 +651,7 @@ cdef class CallerFromAlignments:
cdef:
bytes chrom
np.ndarray pos_array, treat_array, ctrl_array, score_array
dict pvalue_stat = {}
dict pvalue_stat
long n, pre_p, length, j, pre_l, l, i
float this_v, pre_v, v, q, pre_q
long N, k, this_l
......@@ -646,6 +666,7 @@ cdef class CallerFromAlignments:
logging.debug ( "Start to calculate pvalue stat..." )
pvalue_stat = dict()
for i in range( len( self.chromosomes ) ):
chrom = self.chromosomes[ i ]
pre_p = 0
......@@ -658,7 +679,8 @@ cdef class CallerFromAlignments:
ctrl_value_ptr = <float32_t *> ctrl_array.data
for j in range(pos_array.shape[0]):
this_v = get_pscore( int(treat_value_ptr[0]), ctrl_value_ptr[0] )
#this_v = get_pscore( int(treat_value_ptr[0]), ctrl_value_ptr[0] )
this_v = get_pscore( (int(treat_value_ptr[0]), ctrl_value_ptr[0] ) )
this_l = pos_ptr[0] - pre_p
if this_v in pvalue_stat:
......@@ -669,7 +691,7 @@ cdef class CallerFromAlignments:
pos_ptr += 1
treat_value_ptr += 1
ctrl_value_ptr += 1
nhcal += pos_array.shape[0]
#logging.debug ( "make pvalue_stat cost %.5f seconds" % t )
......@@ -685,7 +707,6 @@ cdef class CallerFromAlignments:
pre_q = 2147483647 # save the previous q-value
self.pqtable = {}
#self.pqtable = Float64HashTable()
unique_values = sorted(list(pvalue_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
for i in range(len(unique_values)):
v = unique_values[i]
......@@ -695,15 +716,15 @@ cdef class CallerFromAlignments:
self.pqtable[ v ] = q
pre_v = v
pre_q = q
k+=l
k += l
nhcal += 1
logging.debug( "access pq hash for %d times" % nhcal )
return self.pqtable
return
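This table converts each p-score into a q-score with the Benjamini-Hochberg procedure, weighting every score by the number of basepairs that carry it; the 2147483647 sentinel above seeds the monotonicity check. A pure-Python sketch of the conversion, assuming pvalue_stat maps -log10 p-scores to basepair counts:

```python
from math import log10

def make_pq_table(pvalue_stat):
    """Benjamini-Hochberg conversion of p-scores to q-scores, both on
    the -log10 scale, weighted by basepair counts."""
    N = sum(pvalue_stat.values())  # total basepairs scored
    k = 1                          # BH rank, in basepairs
    pre_q = 2147483647             # previous q-value, for monotonicity
    pqtable = {}
    for v in sorted(pvalue_stat.keys(), reverse=True):
        q = v + (log10(k) - log10(N))  # q = p * N / k in -log10 space
        q = max(0, min(pre_q, q))      # q-scores must not increase
        pqtable[v] = q
        pre_q = q
        k += pvalue_stat[v]
    return pqtable

# e.g. three p-scores covering 100, 1000 and 10000 basepairs
print(make_pq_table({8.0: 100, 5.0: 1000, 2.0: 10000}))
```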
cdef object __pre_computes ( self, int max_gap = 50, int min_length = 200 ):
cdef void __pre_computes ( self, int max_gap = 50, int min_length = 200 ):
"""After this function is called, self.pqtable and self.pvalue_length is built. All
chromosomes will be iterated. So it will take some time.
......@@ -711,7 +732,7 @@ cdef class CallerFromAlignments:
cdef:
bytes chrom
np.ndarray pos_array, treat_array, ctrl_array, score_array
dict pvalue_stat = {}
dict pvalue_stat
long n, pre_p, this_p, length, j, pre_l, l, i
float this_v, pre_v, v, q, pre_q, this_t, this_c
long N, k, this_l
......@@ -736,7 +757,8 @@ cdef class CallerFromAlignments:
logging.debug ( "Start to calculate pvalue stat..." )
tmplist = sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )
pvalue_stat = dict()
for i in range( len( self.chromosomes ) ):
chrom = self.chromosomes[ i ]
self.__pileup_treat_ctrl_a_chromosome( chrom )
......@@ -824,7 +846,6 @@ cdef class CallerFromAlignments:
pre_q = 2147483647 # save the previous q-value
self.pqtable = {}
#self.pqtable = Float64HashTable()
unique_values = sorted(list(pvalue_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
for i in range(len(unique_values)):
v = unique_values[i]
......@@ -850,14 +871,13 @@ cdef class CallerFromAlignments:
y.append( self.pvalue_length[ cutoff ] )
fhd.close()
logging.info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename )
logging.info( "#3 Suggest a cutoff..." )
optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
logging.info( "#3 -10log10pvalue cutoff %.2f will call approximately %d bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
return self.pqtable
#logging.info( "#3 Suggest a cutoff..." )
#optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
#logging.info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
return
cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int min_length = 200,
int max_gap = 50, bool call_summits = False, bool auto_cutoff = False ):
int max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ):
"""Call peaks for all chromosomes. Return a PeakIO object.
scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. For example: ['p', 'q']
......@@ -876,8 +896,8 @@ cdef class CallerFromAlignments:
# prepare p-q table
if not self.pqtable:
logging.info("#3 Pre-compute pvalue-qvalue table...")
if auto_cutoff:
logging.info("#3 Cutoff will be automatically decided!")
if cutoff_analysis:
logging.info("#3 Cutoff vs peaks called will be analyzed!")
self.__pre_computes( max_gap = max_gap, min_length = min_length )
else:
self.__cal_pvalue_qvalue_table()
......@@ -922,7 +942,7 @@ cdef class CallerFromAlignments:
return peaks
cdef __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int min_length,
cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int min_length,
int max_gap, bool call_summits, bool save_bedGraph ):
""" Call peaks for a chromosome.
......@@ -997,7 +1017,7 @@ cdef class CallerFromAlignments:
if above_cutoff.size == 0:
# nothing above cutoff
return peaks
return
if above_cutoff[0] == 0:
# first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
......@@ -1050,7 +1070,7 @@ cdef class CallerFromAlignments:
lastp = te #above_cutoff_endpos[i]
# save the last peak
if not peak_content:
return peaks
return
else:
if call_summits:
self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
......@@ -1058,7 +1078,7 @@ cdef class CallerFromAlignments:
self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
#print "close peaks -- chrom:",chrom," time:", ttime() - t0
return peaks
return
cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int min_length,
bytes chrom, int smoothlen, list score_array_s, list score_cutoff_s=[]):
......@@ -1104,7 +1124,8 @@ cdef class CallerFromAlignments:
if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
return False # not passed, then discard this peak.
summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
#summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
summit_p_score = get_pscore(( int(summit_treat), summit_ctrl ) )
summit_q_score = self.pqtable[ summit_p_score ]
peaks.add( chrom, # chromosome
......@@ -1187,7 +1208,8 @@ cdef class CallerFromAlignments:
summit_treat = peak_content[ summit_index ][ 2 ]
summit_ctrl = peak_content[ summit_index ][ 3 ]
summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
#summit_p_score = get_pscore( int(summit_treat), summit_ctrl )
summit_p_score = get_pscore(( int(summit_treat), summit_ctrl ) )
summit_q_score = self.pqtable[ summit_p_score ]
for i in range(len(score_cutoff_s)):
......@@ -1225,7 +1247,8 @@ cdef class CallerFromAlignments:
array1_size = array1.shape[0]
for i in range(array1_size):
s_ptr[0] = get_pscore( int(a1_ptr[0]), a2_ptr[0] )
#s_ptr[0] = get_pscore( int(a1_ptr[0]), a2_ptr[0] )
s_ptr[0] = get_pscore(( int(a1_ptr[0]), a2_ptr[0] ))
s_ptr += 1
a1_ptr += 1
a2_ptr += 1
......@@ -1247,7 +1270,8 @@ cdef class CallerFromAlignments:
s_ptr = <float32_t *> s.data
for i in range(array1.shape[0]):
s_ptr[0] = self.pqtable[ get_pscore( int(a1_ptr[0]), a2_ptr[0] ) ]
#s_ptr[0] = self.pqtable[ get_pscore( int(a1_ptr[0]), a2_ptr[0] ) ]
s_ptr[0] = self.pqtable[ get_pscore(( int(a1_ptr[0]), a2_ptr[0] )) ]
s_ptr += 1
a1_ptr += 1
a2_ptr += 1
......@@ -1402,29 +1426,23 @@ cdef class CallerFromAlignments:
ctrl_array_ptr += 1
if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t )
#t_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t ) )
fprintf( ft, tmp_bytes )
fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
pre_v_t = v_t
pre_p_t = p
if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c )
fprintf( fc, tmp_bytes )
#c_write_func( "%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c ) )
fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
pre_v_c = v_c
pre_p_c = p
p = pos_array_ptr[ 0 ]
# last one
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_t, p, pre_v_t )
fprintf( ft, tmp_bytes )
tmp_bytes = b"%s\t%d\t%d\t%.5f\n" % ( chrom, pre_p_c, p, pre_v_c )
fprintf( fc, tmp_bytes )
fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
return True
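This hunk replaces per-line Python bytes formatting followed by fprintf with a single C-level fprintf call; either way, a bedGraph line is written only when the pileup value changes, so runs of equal values collapse into one interval. A hypothetical pure-Python sketch of that run-merging logic:

```python
import sys

def write_bedgraph(fh, chrom, ends, values, eps=1e-5):
    """ends[i] closes the interval carrying values[i]; consecutive
    intervals with (nearly) equal values merge into one line."""
    pre_p = 0           # start of the current run
    pre_v = values[0]   # value of the current run
    for i in range(1, len(ends)):
        if abs(values[i] - pre_v) > eps:  # value changed: flush the run
            fh.write(f"{chrom}\t{pre_p}\t{ends[i - 1]}\t{pre_v:.5f}\n")
            pre_p, pre_v = ends[i - 1], values[i]
    fh.write(f"{chrom}\t{pre_p}\t{ends[-1]}\t{pre_v:.5f}\n")  # last run

write_bedgraph(sys.stdout, "chr1", [100, 200, 300, 400], [1.0, 1.0, 2.5, 2.5])
# chr1    0       200     1.00000
# chr1    200     400     2.50000
```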
cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400, bool auto_cutoff = False):
cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int min_length=200, int lvl1_max_gap=50, int lvl2_max_gap=400, bool cutoff_analysis = False):
"""This function try to find enriched regions within which,
scores are continuously higher than a given cutoff for level
1, and link them using the gap above level 2 cutoff with a
......@@ -1456,8 +1474,8 @@ cdef class CallerFromAlignments:
# prepare p-q table
if not self.pqtable:
logging.info("#3 Pre-compute pvalue-qvalue table...")
if auto_cutoff:
logging.info("#3 Cutoff for broad region will be automatically decided!")
if cutoff_analysis:
logging.info("#3 Cutoff value vs broad region calls will be analyzed!")
self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length )
else:
self.__cal_pvalue_qvalue_table()
......@@ -1527,7 +1545,7 @@ cdef class CallerFromAlignments:
return broadpeaks
cdef __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
cdef void __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
int min_length, int lvl1_max_gap, int lvl2_max_gap, bool save_bedGraph):
""" Call peaks for a chromosome.
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:32:50 taoliu>
# Time-stamp: <2019-11-04 11:47:48 taoliu>
"""Module for FWTrack classes.
......@@ -96,7 +96,7 @@ cdef class FWTrack:
self.length = 0
self.__destroyed = False
cpdef destroy ( self ):
cpdef void destroy ( self ):
"""Destroy this object and release mem.
"""
cdef:
......@@ -120,10 +120,10 @@ cdef class FWTrack:
self.__dup_locations[chromosome] = [None, None]
self.__dup_locations.pop(chromosome)
self.__destroyed = True
return True
return
cpdef add_loc ( self, bytes chromosome, int32_t fiveendpos, int strand ):
cpdef void add_loc ( self, bytes chromosome, int32_t fiveendpos, int strand ):
"""Add a location to the list according to the sequence name.
chromosome -- mostly the chromosome name
......@@ -139,24 +139,18 @@ cdef class FWTrack:
self.__locations[chromosome][strand][0] = fiveendpos
self.__pointer[chromosome][strand] = 1
else:
try:
i = self.__pointer[chromosome][strand]
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome][strand] )
self.__locations[chromosome][strand][i]= fiveendpos
self.__pointer[chromosome][strand] += 1
except:
print ("i:",i)
print ("self.buffer_size:", self.buffer_size)
print ("strand:", strand)
print ("loc len:", len( self.__locations[chromosome][strand] ))
raise Exception("!!")
cdef np.ndarray[np.int32_t, ndim=1] __expand__ ( self, np.ndarray[np.int32_t, ndim=1] arr ):
i = self.__pointer[chromosome][strand]
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome][strand] )
self.__locations[chromosome][strand][i]= fiveendpos
self.__pointer[chromosome][strand] += 1
return
cdef void __expand__ ( self, np.ndarray[np.int32_t, ndim=1] arr ):
arr.resize( arr.shape[0] + self.buffer_size, refcheck = False )
return arr
return
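For reference, add_loc appends into a pre-allocated numpy buffer and calls __expand__ to grow it by one chunk whenever the write pointer reaches a multiple of buffer_size; the refactor above simply drops the old try/except debugging scaffold. A self-contained sketch of that growable-buffer pattern:

```python
import numpy as np

class GrowableInt32Array:
    """Append int32 values into a numpy buffer that grows in fixed-size
    chunks, as in FWTrack.add_loc/__expand__."""
    def __init__(self, buffer_size=100000):
        self.buffer_size = buffer_size
        self.data = np.zeros(buffer_size, dtype=np.int32)
        self.pointer = 0

    def append(self, value):
        if self.pointer > 0 and self.pointer % self.buffer_size == 0:
            # grow in place by one chunk; refcheck=False permits resizing
            # even if other references to the array exist
            self.data.resize(self.data.shape[0] + self.buffer_size,
                             refcheck=False)
        self.data[self.pointer] = value
        self.pointer += 1

    def finalize(self):
        # shrink to the used size, as FWTrack.finalize does
        self.data.resize(self.pointer, refcheck=False)
        return self.data

buf = GrowableInt32Array(buffer_size=4)
for v in range(10):
    buf.append(v)
print(buf.finalize())  # [0 1 2 3 4 5 6 7 8 9]
```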
cpdef finalize ( self ):
cpdef void finalize ( self ):
""" Resize np arrays for 5' positions and sort them in place
Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read!
......@@ -233,7 +227,7 @@ cdef class FWTrack:
# """
# return self.total*self.fw
cpdef sort ( self ):
cpdef void sort ( self ):
"""Naive sorting for locations.
"""
......@@ -249,9 +243,10 @@ cdef class FWTrack:
self.__locations[c][1].sort()
self.__sorted = True
return
@cython.boundscheck(False)
cpdef separate_dups( self, maxint = 1 ):
cpdef void separate_dups( self, int maxnum = 1 ):
"""Separate the duplicated reads into a different track
stored at self.dup
"""
......@@ -298,7 +293,7 @@ cdef class FWTrack:
else:
current_loc = p
n = 1
if n > maxint:
if n > maxnum:
dup_plus [ i_dup ] = p
i_dup += 1
else:
......@@ -344,7 +339,7 @@ cdef class FWTrack:
else:
current_loc = p
n = 1
if n > maxint:
if n > maxnum:
dup_minus [ i_dup ] = p
i_dup += 1
else:
......@@ -439,7 +434,7 @@ cdef class FWTrack:
return 0
@cython.boundscheck(False) # do not check that np indices are valid
cpdef unsigned long filter_dup ( self, int32_t maxnum = -1):
cpdef unsigned long filter_dup ( self, int maxnum = -1):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
......@@ -486,16 +481,12 @@ cdef class FWTrack:
p = plus[ i_old ]
if p == current_loc:
n += 1
if n <= maxnum:
new_plus[ i_new ] = p
i_new += 1
else:
logging.debug("Duplicate reads found at %s:%d at + strand" % (k,p) )
else:
current_loc = p
new_plus[ i_new ] = p
i_new += 1
n = 1
if n <= maxnum:
new_plus[ i_new ] = p
i_new += 1
new_plus.resize( i_new, refcheck=False )
self.total += i_new
self.__pointer[k][0] = i_new
......@@ -525,16 +516,12 @@ cdef class FWTrack:
p = minus[ i_old ]
if p == current_loc:
n += 1
if n <= maxnum:
new_minus[ i_new ] = p
i_new += 1
else:
logging.debug("Duplicate reads found at %s:%d at + strand" % (k,p) )
else:
current_loc = p
new_minus[ i_new ] = p
i_new += 1
n = 1
if n <= maxnum:
new_minus[ i_new ] = p
i_new += 1
new_minus.resize( i_new, refcheck=False )
self.total += i_new
self.__pointer[k][1] = i_new
......@@ -553,7 +540,7 @@ cdef class FWTrack:
self.length = self.fw * self.total
return self.total
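The refactored filter_dup hoists the `if n <= maxnum` test out of the if/else, since the first read at a new location always passes it. A pure-Python sketch of the per-strand logic, assuming a sorted array of 5' positions:

```python
def filter_dup_positions(positions, maxnum=1):
    """Keep at most maxnum reads per 5' position; positions must be
    sorted, mirroring the per-strand loops above."""
    kept = []
    current, n = None, 0
    for p in positions:
        if p == current:
            n += 1          # another read at the same location
        else:
            current, n = p, 1
        if n <= maxnum:     # keep up to maxnum copies, discard the rest
            kept.append(p)
    return kept

print(filter_dup_positions([5, 5, 5, 9, 12, 12], maxnum=2))
# [5, 5, 9, 12, 12]
```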
cpdef sample_percent (self, float percent, int seed = -1 ):
cpdef void sample_percent (self, float percent, int seed = -1 ):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -592,7 +579,7 @@ cdef class FWTrack:
self.length = self.fw * self.total
return
cpdef sample_num (self, uint64_t samplesize, int seed = -1):
cpdef void sample_num (self, uint64_t samplesize, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -604,7 +591,7 @@ cdef class FWTrack:
self.sample_percent ( percent, seed )
return
cpdef print_to_bed (self, fhd=None):
cpdef void print_to_bed (self, fhd=None):
"""Output FWTrack to BED format files. If fhd is given,
write to a file, otherwise, output to standard output.
......@@ -675,7 +662,7 @@ cdef class FWTrack:
rt_minus = np.array(temp)
return (rt_plus, rt_minus)
cpdef compute_region_tags_from_peaks ( self, peaks, func, int window_size = 100, float cutoff = 5 ):
cpdef list compute_region_tags_from_peaks ( self, peaks, func, int window_size = 100, float cutoff = 5 ):
"""Extract tags in peak, then apply func on extracted tags.
peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
......@@ -759,7 +746,7 @@ cdef class FWTrack:
return retval
cpdef refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ):
cpdef object refine_peak_from_tags_distribution ( self, peaks, int window_size = 100, float cutoff = 5 ):
"""Extract tags in peak, then apply func on extracted tags.
peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
......@@ -781,6 +768,7 @@ cdef class FWTrack:
set pchrnames, chrnames
list temp, retval, cpeaks
np.ndarray[np.int32_t, ndim=1] adjusted_summits, passflags
object ret_peaks
pchrnames = peaks.get_chr_names()
retval = []
......@@ -864,7 +852,7 @@ cdef class FWTrack:
# end of a loop
return ret_peaks
cpdef pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0, bint directional = True, int end_shift = 0 ):
cpdef list pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0, bint directional = True, int end_shift = 0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
ds : tag will be extended to this value to 3' direction,
......@@ -932,7 +920,7 @@ cdef inline int32_t left_forward ( data, int pos, int window_size ):
cdef inline int32_t right_forward ( data, int pos, int window_size ):
return data.get(pos + window_size, 0) - data.get(pos, 0)
cdef wtd_find_summit(chrom, np.ndarray[np.int32_t, ndim=1] plus, np.ndarray[np.int32_t, ndim=1] minus, int32_t search_start, int32_t search_end, int32_t window_size, float cutoff):
cdef tuple wtd_find_summit(chrom, np.ndarray[np.int32_t, ndim=1] plus, np.ndarray[np.int32_t, ndim=1] minus, int32_t search_start, int32_t search_end, int32_t window_size, float cutoff):
"""internal function to be called by refine_peak_from_tags_distribution()
"""
......
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 17:49:19 taoliu>
# Time-stamp: <2019-11-04 13:23:29 taoliu>
"""Module for filter duplicate tags from paired-end data
......@@ -26,7 +26,10 @@ from MACS2.Pileup import quick_pileup, max_over_two_pv_array, se_all_in_one_pile
cdef INT_MAX = <int>((<unsigned int>-1)>>1)
cdef packed struct peLoc:
np.int32_t l
np.int32_t r
# Let numpy enforce the PE structure using a structured ndarray; gives a bonus speedup when sorting
# PE data doesn't have strandedness
cdef class PETrackI:
......@@ -72,14 +75,14 @@ cdef class PETrackI:
self.length = 0
self.average_template_length = 0.0
cpdef add_loc ( self, bytes chromosome, int start, int end):
cpdef void add_loc ( self, bytes chromosome, int start, int end):
"""Add a location to the list according to the sequence name.
chromosome -- mostly the chromosome name
start -- leftmost position of the fragment
end -- rightmost position of the fragment
"""
cdef:
long i
int i
if chromosome not in self.__locations:
self.__locations[chromosome] = np.zeros(shape=self.buffer_size, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
......@@ -90,10 +93,11 @@ cdef class PETrackI:
if i % self.buffer_size == 0:
self.__expand__ ( self.__locations[chromosome] )
self.__locations[chromosome][ i ] = ( start, end )
self.__pointer[chromosome] += 1
self.__pointer[chromosome] = i + 1
self.length += end - start
return
cpdef destroy ( self ):
cpdef void destroy ( self ):
"""Destroy this object and release mem.
"""
cdef:
......@@ -114,9 +118,9 @@ cdef class PETrackI:
self.__dup_locations.pop(chromosome)
self.__destroyed = True
return True
return
cpdef __expand__ ( self, np.ndarray arr ):
cpdef void __expand__ ( self, np.ndarray arr ):
arr.resize((arr.shape[0] + self.buffer_size), refcheck = False )
return
......@@ -151,8 +155,7 @@ cdef class PETrackI:
self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()])
return self.rlengths
def finalize ( self ):
cpdef void finalize ( self ):
""" Resize np arrays for 5' positions and sort them in place
Note: If this function is called, it's impossible to append more files to this PETrackI object. So remember to call it after all the files are read!
......@@ -177,7 +180,7 @@ cdef class PETrackI:
self.average_template_length = float( self.length ) / self.total
return
def get_locations_by_chr ( self, bytes chromosome ):
cpdef get_locations_by_chr ( self, bytes chromosome ):
"""Return a tuple of two lists of locations for certain chromosome.
"""
......@@ -203,7 +206,7 @@ cdef class PETrackI:
# l += (v[:,1] - v[:,0]).sum()
# return l
cpdef sort ( self ):
cpdef void sort ( self ):
"""Naive sorting for locations.
"""
......@@ -219,6 +222,7 @@ cdef class PETrackI:
self.__locations[c].sort( order=['l', 'r'] ) # sort by the leftmost location
#print "before", self.__locations[c][0:100]
self.__sorted = True
return
# def centered_fake_fragments(track, int d):
# """Return a copy of the PETrackI object so that its locations are all
......@@ -270,7 +274,7 @@ cdef class PETrackI:
return pmf
@cython.boundscheck(False) # do not check that np indices are valid
def separate_dups ( self , int maxint = 1 ):
cpdef void separate_dups ( self , int maxnum = 1 ):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
......@@ -285,6 +289,8 @@ cdef class PETrackI:
set chrnames
bytes k
if maxnum < 0: return # condition to return if not filtering
chrnames = self.get_chr_names()
if not self.__sorted: self.sort()
......@@ -296,86 +302,82 @@ cdef class PETrackI:
self.average_template_length = 0.0
for k in chrnames: # for each chromosome
# dups.__locations[k] = self.__locations[k].copy()
i_new = 0
i_dup = 0
locs = self.__locations[k]
size = locs.shape[0]
if size <= 1:
new_locs = locs
else:
new_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
dup_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
n = 1
current_loc_start = locs[0][0] # same as locs[0]['l']
current_loc_end = locs[0][1]# same as locs[0]['r']
new_locs[i_new][0] = current_loc_start
new_locs[i_new][1] = current_loc_end
i_new += 1
self.length += current_loc_end - current_loc_start
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
all_same = ((loc_start == current_loc_start) and
(loc_end == current_loc_end))
if all_same:
n += 1
else:
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n > maxint:
dup_locs[i_dup][0] = loc_start
dup_locs[i_dup][1] = loc_end
i_dup += 1
else:
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new , refcheck = False)
dup_locs.resize( i_dup , refcheck = False)
self.total += i_new
self.dup_total += i_dup
self.__pointer[k] = i_new
self.__dup_pointer[k] = i_dup
# unnecessary
# new_size = new_locs.shape[0]
# dup_size = dup_locs.shape[0]
# self.__pointer[k] = new_size
# dups.__pointer[k] = dup_size
# free memory?
if size == 1:
# do nothing and continue
self.total += size
self.__pointer[k] = size
self.length += locs[0][1] - locs[0][0]
self.__dup_locations[k] = []
continue
# save duplicate reads to dup_locations[k] and update locations[k]
new_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
dup_locs = np.zeros(self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
current_loc_start = locs[0][0] # same as locs[0]['l']
current_loc_end = locs[0][1]# same as locs[0]['r']
new_locs[0][0] = current_loc_start
new_locs[0][1] = current_loc_end
self.length += current_loc_end - current_loc_start
i_new = 1 # index of new_locs
i_dup = 0 # index of dup_locs
n = 1
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
if (loc_start == current_loc_start) and (loc_end == current_loc_end) :
# both ends are the same
n += 1
else:
# not the same, update current_loc and reset n
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n > maxnum:
# more than maxnum duplicates, put the duplicates in dup_locs
dup_locs[i_dup][0] = loc_start
dup_locs[i_dup][1] = loc_end
i_dup += 1
else:
# otherwise, put the fragment in new_locs
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new , refcheck = False)
dup_locs.resize( i_dup , refcheck = False)
self.total += i_new
self.dup_total += i_dup
self.__pointer[k] = i_new
self.__dup_pointer[k] = i_dup
# free memory?
# I know I should shrink it to 0 size directly,
# however, on Mac OSX, it seems directly assigning 0
# doesn't do a thing.
locs.resize( self.buffer_size, refcheck=False )
locs.resize( 0, refcheck=False )
# hope there would be no mem leak...
self.__locations[k] = new_locs
if size > 1:
self.__dup_locations[k] = dup_locs
self.average_template_length = float( self.length ) / self.total
self.__dup_locations[k] = dup_locs
self.average_template_length = self.length / self.total
return
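Unlike filter_dup, separate_dups keeps the overflow: once a (start, end) pair appears more than maxnum times, the extra fragments go into a duplicate track instead of being discarded. A pure-Python sketch of the split, assuming a sorted list of (start, end) tuples:

```python
def separate_dup_fragments(fragments, maxnum=1):
    """Split sorted (start, end) fragments into kept fragments (at most
    maxnum copies per location) and the overflow duplicates."""
    kept, dups = [], []
    current, n = None, 0
    for frag in fragments:
        if frag == current:
            n += 1                  # same fragment seen again
        else:
            current, n = frag, 1    # new location, reset the counter
        if n > maxnum:
            dups.append(frag)       # overflow goes to the dup track
        else:
            kept.append(frag)
    return kept, dups

frags = [(10, 60), (10, 60), (10, 60), (25, 80)]
print(separate_dup_fragments(frags, maxnum=1))
# ([(10, 60), (25, 80)], [(10, 60), (10, 60)])
```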
@cython.boundscheck(False) # do not check that np indices are valid
cpdef unsigned long filter_dup ( self, int maxnum=-1):
cpdef void filter_dup ( self, int maxnum=-1):
"""Filter the duplicated reads.
Run it right after you add all data into this object.
"""
cdef:
int i_chrom, n, start, end
# np.ndarray[np.int32_t, ndim=1] loc #= np.zeros([1,2], np.int32)
# np.ndarray[np.int32_t, ndim=1] current_loc #= np.zeros([1,2], np.int32)
int loc_start, loc_end, current_loc_start, current_loc_end
unsigned long i_old, i_new, size, new_size
bytes k
np.ndarray locs, new_locs
set chrnames
#peLoc current_range, this_range
if maxnum < 0: return self.total # condition to return if not filtering
if maxnum < 0: return # condition to return if not filtering
if not self.__sorted: self.sort()
......@@ -386,58 +388,62 @@ cdef class PETrackI:
chrnames = self.get_chr_names()
for k in chrnames: # for each chromosome
i_new = 0
locs = self.__locations[k]
size = locs.shape[0]
if size <= 1:
new_locs = locs
else:
new_locs = np.zeros( self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
n = 1 # the number of tags in the current location
current_loc_start = locs[0][0]
current_loc_end = locs[0][1]
new_locs[i_new][0] = current_loc_start
new_locs[i_new][1] = current_loc_end
i_new += 1
self.length += current_loc_end - current_loc_start
for i_old in range(1, size):
loc_start = locs[i_old][0]
loc_end = locs[i_old][1]
all_same = ((loc_start == current_loc_start) and
(loc_end == current_loc_end))
if all_same:
n += 1
if n <= maxnum:
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
else:
current_loc_start = loc_start
current_loc_end = loc_end
new_locs[i_new][0] = loc_start
new_locs[i_new][1] = loc_end
self.length += loc_end - loc_start
i_new += 1
n = 1
new_locs.resize( i_new, refcheck = False )
new_size = new_locs.shape[0]
self.__pointer[k] = new_size
self.total += new_size
# free memory?
size = locs.shape[0]
if size == 1:
# do nothing and continue
self.total += size
self.__pointer[k] = size
self.length += locs[0][1] - locs[0][0]
continue
# discard duplicate reads and make a new __locations[k]
new_locs = np.zeros( self.__pointer[k] + 1, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
# get the first loc
#current_loc_start = locs[0][0]
#current_loc_end = locs[0][1]
( current_loc_start, current_loc_end ) = locs[0]
#new_locs[0][0] = current_loc_start
#new_locs[0][1] = current_loc_end
new_locs[0] = ( current_loc_start, current_loc_end )
self.length += current_loc_end - current_loc_start
i_new = 1 # index of new_locs
n = 1 # the number of tags in the current genomic location
for i_old in range(1, size):
#loc_start = locs[i_old][0]
#loc_end = locs[i_old][1]
( loc_start, loc_end ) = locs[i_old]
if loc_start == current_loc_start and loc_end == current_loc_end:
# both ends are the same, add 1 to duplicate number n
n += 1
else:
# not the same, update current_loc, reset n
current_loc_start = loc_start
current_loc_end = loc_end
n = 1
if n <= maxnum:
# no more than maxnum copies so far: add to new_locs;
# otherwise, discard
#new_locs[i_new][0] = loc_start
#new_locs[i_new][1] = loc_end
new_locs[i_new] = (loc_start, loc_end)
self.length += loc_end - loc_start
i_new += 1
new_locs.resize( i_new, refcheck = False )
new_size = new_locs.shape[0]
self.__pointer[k] = new_size
self.total += new_size
# free memory?
# I know I should shrink it to 0 size directly,
# however, on Mac OSX, it seems directly assigning 0
# doesn't do a thing.
locs.resize( self.buffer_size, refcheck=False )
locs.resize( 0, refcheck=False )
# hope there would be no mem leak...
self.__locations[k] = new_locs
self.average_template_length = float( self.length ) / self.total
return self.total
self.average_template_length = self.length / self.total
return
def sample_percent (self, float percent, int seed = -1):
cpdef void sample_percent (self, float percent, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -470,7 +476,7 @@ cdef class PETrackI:
self.average_template_length = float( self.length )/ self.total
return
def sample_num (self, uint64_t samplesize, int seed = -1):
cpdef void sample_num (self, uint64_t samplesize, int seed = -1):
"""Sample the tags for a given percentage.
Warning: the current object is changed!
......@@ -481,7 +487,7 @@ cdef class PETrackI:
self.sample_percent ( percent, seed )
return
def print_to_bed (self, fhd=None):
cpdef void print_to_bed (self, fhd=None):
"""Output to BEDPE format files. If fhd is given, write to a
file, otherwise, output to standard output.
......@@ -510,7 +516,7 @@ cdef class PETrackI:
return
cpdef pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float baseline_value = 0.0 ):
cpdef list pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float baseline_value = 0.0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds.
......@@ -537,7 +543,7 @@ cdef class PETrackI:
return prev_pileup
cpdef pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0 ):
cpdef list pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float baseline_value = 0.0 ):
"""pileup a certain chromosome, return [p,v] (end position and value) list.
This function is for control track. Basically, here is a
......
......@@ -744,15 +744,6 @@ cdef class scoreTrackII:
pqtable = self.make_pq_table()
# convert p to q
# convert pvalue2qvalue to a simple dict based on khash
# khash has big advantage while checking keys for millions of times.
#s_p2q = Float64HashTable()
#for k in pqtable.keys():
# s_p2q.set_item(k,pqtable[k])
#g = s_p2q.get_item
for chrom in self.data.keys():
v = self.data[chrom][3]
l = self.datalength[chrom]
......
......@@ -251,14 +251,14 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
elif self.log_qvalue != None:
if self.opt.broad:
self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
......@@ -268,13 +268,13 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
scorecalculator.destroy()
return peaks
......@@ -362,14 +362,14 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
elif self.log_qvalue != None:
if self.opt.broad:
self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
......@@ -379,13 +379,13 @@ class PeakDetect:
min_length=self.minlen,
lvl1_max_gap=self.maxgap,
lvl2_max_gap=self.maxgap*4,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
else:
peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
min_length=self.minlen,
max_gap=self.maxgap,
call_summits=call_summits,
auto_cutoff=self.opt.cutoff_analysis )
cutoff_analysis=self.opt.cutoff_analysis )
scorecalculator.destroy()
return peaks
......
......@@ -39,9 +39,10 @@ from time import time as ttime
# functions
# ------------------------------------
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline long long_max(long a, long b): return a if a >= b else b
cdef inline float float_max(float a, float b): return a if a >= b else b
# use python3 max function instead
#cdef inline int int_max(int a, int b): return a if a > b else b
#cdef inline long long_max(long a, long b): return a if a > b else b
#cdef inline float float_max(float a, float b): return a if a > b else b
cdef void clean_up_ndarray ( np.ndarray x ):
# clean numpy ndarray in two steps
......@@ -122,6 +123,7 @@ cpdef pileup_and_write( trackI,
# function to pileup BAMPE/BEDPE stored in PETrackI object and write to a BEDGraph file
# this function uses a C-level function
cpdef pileup_and_write_pe( petrackI,
bytes output_filename,
float scale_factor = 1,
......@@ -572,7 +574,7 @@ cdef int * fix_coordinates_2 ( int * poss, int l_of_poss, int rlength) nogil:
return poss
# general pileup function
# general pileup function implemented in cython
cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarray[np.int32_t, ndim=1] minus_tags, long five_shift, long three_shift, int rlength, float scale_factor, float baseline_value ):
"""Return pileup given 5' end of fragment at plus or minus strand
separately, and given shift at both direction to recover a
......@@ -647,7 +649,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
if pre_p != 0:
# the first chunk of 0
ret_p_ptr[0] = pre_p
ret_v_ptr[0] = float_max(0,baseline_value)
ret_v_ptr[0] = max(0,baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -662,7 +664,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = start_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -674,7 +676,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -694,7 +696,7 @@ cpdef se_all_in_one_pileup ( np.ndarray[np.int32_t, ndim=1] plus_tags, np.ndarra
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -720,6 +722,7 @@ cdef int compare(const void * a, const void * b) nogil:
if a - b > 0: return 1
return 0
# quick pileup implemented in cython
cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.int32_t, ndim=1] end_poss, float scale_factor, float baseline_value ):
"""Return pileup given plus strand and minus strand positions of fragments.
......@@ -775,7 +778,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
if pre_p != 0:
# the first chunk of 0
ret_p_ptr[0] = pre_p
ret_v_ptr[0] = float_max(0,baseline_value)
ret_v_ptr[0] = max(0,baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -787,7 +790,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
p = start_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -801,7 +804,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
p = end_poss_ptr[0]
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -822,7 +825,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
#for p in minus_tags[i_e:]:
if p != pre_p:
ret_p_ptr[0] = p
ret_v_ptr[0] = float_max(pileup * scale_factor, baseline_value)
ret_v_ptr[0] = max(pileup * scale_factor, baseline_value)
ret_p_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -835,8 +838,7 @@ cpdef quick_pileup ( np.ndarray[np.int32_t, ndim=1] start_poss, np.ndarray[np.in
return tmp
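quick_pileup sweeps two sorted arrays of fragment start and end positions, raising or lowering the running depth at each event and emitting a (position, value) pair whenever the position advances; the diff also swaps the removed float_max helper for Python's built-in max. A pure-Python sketch of the sweep (the leading baseline chunk before the first position is omitted for brevity):

```python
def quick_pileup(starts, ends, scale=1.0, baseline=0.0):
    """Return [positions, values]; values[i] covers the interval ending
    at positions[i]. Both inputs must be sorted."""
    ret_p, ret_v = [], []
    i_s = i_e = 0
    pileup, pre_p = 0, 0
    while i_s < len(starts) and i_e < len(ends):
        if starts[i_s] < ends[i_e]:
            p = starts[i_s]
            if p != pre_p:
                ret_p.append(p)
                ret_v.append(max(pileup * scale, baseline))
                pre_p = p
            pileup += 1      # a fragment starts: depth rises
            i_s += 1
        else:
            p = ends[i_e]
            if p != pre_p:
                ret_p.append(p)
                ret_v.append(max(pileup * scale, baseline))
                pre_p = p
            pileup -= 1      # a fragment ends: depth falls
            i_e += 1
    for p in ends[i_e:]:     # remaining ends close the open fragments
        if p != pre_p:
            ret_p.append(p)
            ret_v.append(max(pileup * scale, baseline))
            pre_p = p
        pileup -= 1
    return [ret_p, ret_v]

print(quick_pileup([0, 5], [10, 15]))  # [[5, 10, 15], [1.0, 2.0, 1.0]]
```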
# general function to calculate maximum between two arrays.
# general function to calculate maximum between two arrays in cython.
cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
"""Merge two position-value arrays. For intersection regions, take
the maximum value within region.
......@@ -884,7 +886,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
if a1_pos_ptr[0] < a2_pos_ptr[0]:
# clip a region from pre_p to p1, then set pre_p as p1.
ret_pos_ptr[0] = a1_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -896,7 +898,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
elif a1_pos_ptr[0] > a2_pos_ptr[0]:
# clip a region from pre_p to p2, then set pre_p as p2.
ret_pos_ptr[0] = a2_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......@@ -908,7 +910,7 @@ cpdef list max_over_two_pv_array ( list tmparray1, list tmparray2 ):
else:
# from pre_p to p1 or p2, then set pre_p as p1 or p2.
ret_pos_ptr[0] = a1_pos_ptr[0]
ret_v_ptr[0] = float_max( a1_v_ptr[0], a2_v_ptr[0] )
ret_v_ptr[0] = max( a1_v_ptr[0], a2_v_ptr[0] )
ret_pos_ptr += 1
ret_v_ptr += 1
I += 1
......
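max_over_two_pv_array walks two such position-value lists with two pointers, clipping at whichever end position comes first and keeping the larger of the two current values; here too the float_max helper gives way to the built-in max. A pure-Python sketch of the merge, assuming each input is a [positions, values] pair with sorted end positions covering the same span:

```python
def max_over_two_pv_arrays(pv1, pv2):
    """Merge two [positions, values] pairs, taking the maximum value
    over every intersected region."""
    p1, v1 = pv1
    p2, v2 = pv2
    ret_p, ret_v = [], []
    i1 = i2 = 0
    while i1 < len(p1) and i2 < len(p2):
        ret_v.append(max(v1[i1], v2[i2]))  # max of the two current values
        if p1[i1] < p2[i2]:
            ret_p.append(p1[i1])  # clip at the nearer end position
            i1 += 1
        elif p1[i1] > p2[i2]:
            ret_p.append(p2[i2])
            i2 += 1
        else:
            ret_p.append(p1[i1])  # both end here: advance both pointers
            i1 += 1
            i2 += 1
    return [ret_p, ret_v]

a = [[100, 200, 300], [1.0, 3.0, 2.0]]
b = [[150, 300], [2.0, 1.0]]
print(max_over_two_pv_arrays(a, b))
# [[100, 150, 200, 300], [2.0, 3.0, 3.0, 2.0]]
```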