# Change log
All notable changes to this project will be documented in this file.
## Version 3.0.1 2019-08-01
#### TRANSIT:
- Add check for python3 (TRANSIT 3+ requires python3.6+)
- Minor fixes in GI and Pathway Enrichment
## Version 3.0.0 2019-07-18
#### TRANSIT:
- TRANSIT now supports python 3. (To use with python 2, use releases < 3.0.0)
......
-From r-base:3.4.1
-RUN apt-get update -y && apt-get install -y -f python2 python-dev python-pip
+From r-base:3.6.1
+RUN apt-get update -y && apt-get install -y -f python3 python-dev python3-pip
ADD src/ /src
ADD tests/ /tests
-RUN pip install pytest 'numpy~=1.15' 'scipy~=1.2' 'matplotlib~=2.2' 'pillow~=5.0' 'statsmodels~=0.9' 'rpy2<2.9.0'
+RUN pip3 install pytest 'numpy~=1.16' 'scipy~=1.2' 'matplotlib~=3.0' 'pillow~=6.0' 'statsmodels~=0.9' 'rpy2'
RUN R -e "install.packages('MASS')"
RUN R -e "install.packages('pscl')"
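The pip lines above pin the scientific stack with `~=` compatible-release specifiers. A quick smoke test along these lines (hypothetical, not part of the repo's Dockerfile or test suite) confirms the rebuilt Python 3 environment actually resolves; note that pillow installs under the import name PIL:

import importlib

# modules installed by the pip3 line above; "PIL" is pillow's import name
for mod in ("pytest", "numpy", "scipy", "matplotlib", "PIL", "statsmodels", "rpy2"):
    importlib.import_module(mod)
print("TRANSIT dependency stack imports cleanly")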
......
# TRANSIT
[![Version](https://img.shields.io/github/tag/mad-lab/transit.svg)](https://github.com/mad-lab/transit) [![Build Status](https://travis-ci.org/mad-lab/transit.svg?branch=master)](https://travis-ci.org/mad-lab/transit) [![Documentation Status](https://readthedocs.org/projects/transit/badge/?version=latest)](http://transit.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://pepy.tech/badge/tnseq-transit)](https://pepy.tech/project/tnseq-transit)
**NOTE: TRANSIT v3.0+ now requires python3.6+. If you want to use TRANSIT with python2, use version < 3.0.**
Welcome! This is the distribution for the TRANSIT and TPP tools developed by the [Ioerger Lab](http://orca2.tamu.edu/tom/iLab.html) at Texas A&M University.
TRANSIT is a tool for processing and statistical analysis of Tn-Seq data.
......
@@ -22,7 +25,7 @@ TRANSIT offers a variety of features including:
- Ability to analyze datasets from libraries constructed using **himar1 or tn5 transposons**.
-- **TrackView** to help visualize read-counts accross the genome.
+- **TrackView** to help visualize read-counts across the genome.
- Can **export datasets** into a variety of formats, including **IGV**.
......
-tnseq-transit (3.0.0-1) UNRELEASED; urgency=medium
+tnseq-transit (3.0.1-1) UNRELEASED; urgency=medium
[ Andreas Tille ]
* New upstream version
* New version supports Python3
Closes: #931711
......
@@ -9,6 +10,9 @@ tnseq-transit (3.0.0-1) UNRELEASED; urgency=medium
TODO: https://pypi.org/project/PyPubSub/
-> see new queue (#939493)
[ Michael R. Crusoe ]
* removed now-unneeded 2to3.patch
-- Andreas Tille <tille@debian.org> Thu, 25 Jul 2019 21:35:41 +0200
tnseq-transit (2.5.2-1) unstable; urgency=medium
......
Author: Andreas Tille <tille@debian.org>
Last-Update: Thu, 25 Jul 2019 21:35:41 +0200
Description: Use 2to3 to finalise Python2 -> Python3 migration
--- a/src/pytransit/analysis/multitnseq.py
+++ b/src/pytransit/analysis/multitnseq.py
@@ -9,7 +9,7 @@ try:
except Exception as e:
hasR = False
-import base
+from . import base
import numpy
import scipy
import pdfkit
@@ -18,7 +18,7 @@ import statsmodels.stats.multicomp
import pytransit.transit_tools as transit_tools
import pytransit.tnseq_tools as tnseq_tools
import pytransit.norm_tools as norm_tools
-from multitnseq_helpers import def_r_samples_corrplot, def_r_clustering, def_r_make_heatmap, def_r_conditions_corrplot
+from .multitnseq_helpers import def_r_samples_corrplot, def_r_clustering, def_r_make_heatmap, def_r_conditions_corrplot
###### GUI ELEMENTS ######
short_name = "MultiTnSeq"
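The import rewrites in this hunk are the heart of the 2to3 patch: Python 3 dropped implicit relative imports (PEP 328), so a bare `import base` inside the `pytransit.analysis` package no longer resolves to the sibling module. A commented sketch of the before/after, assuming the package layout implied by the file paths above:

# pytransit/analysis/multitnseq.py (sketch)
# Python 2 resolved sibling modules implicitly:
#     import base
#     from multitnseq_helpers import def_r_clustering
# Python 3 requires the relative import to be explicit:
#     from . import base
#     from .multitnseq_helpers import def_r_clustering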
@@ -60,7 +60,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
(args, kwargs) = transit_tools.cleanargs(rawargs)
if (kwargs.get('-help', False) or kwargs.get('h', False)):
- print(MultiTnSeqMethod.usage_string())
+ print((MultiTnSeqMethod.usage_string()))
sys.exit(0)
fna = args[0]
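The doubled parentheses in `print((MultiTnSeqMethod.usage_string()))` are a 2to3 artifact: in Python 2, `print(x)` was the print statement applied to a parenthesized expression, so the fixer wraps the argument defensively rather than assume it was already a function call. The extra pair is harmless and can be removed by hand; a self-contained illustration with a stand-in function:

def usage_string():
    return "usage: transit multitnseq <args>"   # stand-in for MultiTnSeqMethod.usage_string()

print((usage_string()))   # as emitted by 2to3: defensive extra parentheses
print(usage_string())     # hand-cleaned equivalent; identical output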
@@ -130,7 +130,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
if Samples.get(row,"Condition") == cond:
fname = Samples.get(row,"Filename")
if fname not in filenamesInCombWig:
- print "error: filename '%s' listed in samples metadata not found in combined wig file" % fname; sys.exit(0)
+ print("error: filename '%s' listed in samples metadata not found in combined wig file" % fname); sys.exit(0)
SampleIndexes.append(row)
wigindexes.append(filenamesInCombWig.index(fname))
@@ -149,7 +149,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
fname = Samples.get(row,"Filename")
cond2datasets[cond].append(filenamesInData.index(fname))
if len(cond2datasets[cond]) == 0:
- print "error: no samples found in metadata for condition %s" % cond; sys.exit(0)
+ print("error: no samples found in metadata for condition %s" % cond); sys.exit(0)
(normed, factors) = norm_tools.normalize_data(data, method='TTR')
Nds,Nsites = normed.shape # transposed: rows are datasets, cols are TA sites
@@ -196,15 +196,15 @@ class MultiTnSeqMethod(base.AnalysisMeth
if False: # print means for each gene for each dataset
- print '\t'.join(filenamesInData)
- print '\t'.join([Samples.get(x,"Id") for x in SampleIndexes])
+ print('\t'.join(filenamesInData))
+ print('\t'.join([Samples.get(x,"Id") for x in SampleIndexes]))
for (start,end,Rv,gene,strand) in genes:
if len(TAsites[Rv])>0: # skip genes with no TA sites
means = []
for j in range(Nds):
obs = normed[j,TAsites[Rv]]
means.append(numpy.mean(obs))
- print '\t'.join([Rv,gene,str(len(TAsites[Rv]))]+["%0.1f" % x for x in means])
+ print('\t'.join([Rv,gene,str(len(TAsites[Rv]))]+["%0.1f" % x for x in means]))
#sys.exit(0)
@@ -344,14 +344,14 @@ class MultiTnSeqMethod(base.AnalysisMeth
batches[batch].append(Means[Rv][r])
vals.append(Means[Rv][r]) ###
for b in "CC1 CC2 CC3 KO".split(): vals.append(numpy.mean(batches[b]))
- print '\t'.join([Rv,gene]+["%0.1f" % x for x in vals]) ###
+ print('\t'.join([Rv,gene]+["%0.1f" % x for x in vals])) ###
###sys.exit(0)
# remove batch effects (subtract batch means from mean gene counts; adjust each to global mean)
if self.debatch and "Batch" in Conditions.headers:
- print "<BR>correcting means for batch effects..."
+ print("<BR>correcting means for batch effects...")
for (start,end,Rv,gene,strand) in genes:
if Rv in Means:
batches = {}
@@ -372,11 +372,11 @@ class MultiTnSeqMethod(base.AnalysisMeth
if False: # print gene means for each condition (with batch corrections)
vals = ['ORF','gene']+Conditions.getcol("Condition")
- print '\t'.join(vals)
+ print('\t'.join(vals))
for (start,end,Rv,gene,strand) in genes:
if Rv not in Means: continue
vals = [Means[Rv][r] for r in range(Conditions.nrows)]
- print '\t'.join([Rv,gene]+["%0.1f" % x for x in vals])
+ print('\t'.join([Rv,gene]+["%0.1f" % x for x in vals]))
sys.exit(0)
################################
@@ -398,11 +398,11 @@ class MultiTnSeqMethod(base.AnalysisMeth
lfcs[Rv] = lfcvec
if self.stdnorm:
- print "<BR>applying standard normalization to LFCs"
+ print("<BR>applying standard normalization to LFCs")
for i,cond in enumerate(Conditions.keys):
- col = [row[i] for row in lfcs.values()]
+ col = [row[i] for row in list(lfcs.values())]
m,s = numpy.mean(col),numpy.std(col) # consider using quantiles(col,[0,0.25,0.5,0.75,1.0])
- for Rv in lfcs.keys():
+ for Rv in list(lfcs.keys()):
lfcs[Rv][i] = (lfcs[Rv][i]-m)/s
file = open(self.makefname("temp_LFCs.txt", self.rundir),"w")
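The `list(...)` wrappers in this hunk are 2to3's conservative handling of Python 3 dict views: `keys()` and `values()` no longer return lists, and snapshotting them keeps the loops safe if the dict is modified while iterating. A self-contained sketch of the standard-normalization step above, with made-up LFC values:

import numpy

lfcs = {"Rv0001": [1.2, -0.3], "Rv0002": [0.4, 0.9]}    # gene -> per-condition LFCs (illustrative)
for i in range(2):
    col = [row[i] for row in list(lfcs.values())]        # list() snapshots the Python 3 view
    m, s = numpy.mean(col), numpy.std(col)
    for Rv in list(lfcs.keys()):                         # safe to reassign lfcs[Rv][i] inside
        lfcs[Rv][i] = (lfcs[Rv][i] - m) / s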
@@ -415,7 +415,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
file.write('\t'.join([str(x) for x in vals])+EOL)
cnt += 1
file.close()
- if cnt==0: print "error: no significantly varying genes found by ANOVA"; return
+ if cnt==0: print("error: no significantly varying genes found by ANOVA"); return
r_conditions_corrplot = def_r_conditions_corrplot()
r_make_heatmap = def_r_make_heatmap()
skip_test_requiring_non_existing_input_data.patch
2to3.patch
......
@@ -101,6 +101,7 @@ setup(
description='TRANSIT is a tool for the analysis of Tn-Seq data. It provides an easy-to-use graphical interface and access to three different analysis methods that allow the user to determine essentiality in a single condition as well as between conditions.',
long_description=long_description,
long_description_content_type='text/markdown',
# The project's main homepage.
url='https://github.com/mad-lab/transit',
......
......
@@ -39,6 +39,10 @@ def run_main():
main(*args, **kwargs)
def main(*args, **kwargs):
# Check python version
if (sys.version_info[0] < 3):
print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
sys.exit(0)
vars = Globals()
# Check for arguments
if not args and not kwargs and hasWx:
......
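One observation on the version guard added above: it exits with status 0, which conventionally signals success, so a calling script cannot detect the unsupported interpreter. A stricter hypothetical variant (not the shipped code) that also enforces the 3.6 floor stated in the message:

import sys

if sys.version_info < (3, 6):       # check the minor version too, not just major < 3
    print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
    sys.exit(1)                     # nonzero status lets wrappers detect the failure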
......
@@ -1442,6 +1442,7 @@ def show_help():
print(' -maxreads <INT>')
print(' -mismatches <INT> # when searching for constant regions in reads 1 and 2; default is 1')
print(' -flags "<STRING>" # args to pass to BWA')
print(' -bwa-alg [aln|mem] # Default: mem. Algorithm to use for mapping reads with bwa' )
print(' -primer-start-window INT,INT # position in read to search for start of primer; default is [0,20]')
print(' -window-size INT # automatic method to set window')
print(' -barseq_catalog_in|-barseq_catalog_out <file>')
......
......
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.0.0"
+__version__ = "v3.0.1"
prefix = "[TRANSIT]"
......
@@ -43,6 +43,10 @@ def run_main():
main(*args, **kwargs)
def main(*args, **kwargs):
# Check python version
if (sys.version_info[0] < 3):
print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
sys.exit(0)
#If no arguments, show GUI:
DEBUG = "--debug" in sys.argv
if DEBUG:
......
......
@@ -40,10 +40,10 @@ long_name = "Genetic Interactions"
short_desc = "Genetic interactions analysis for change in enrichment"
long_desc = """Method for determining genetic interactions based on changes in enrichment (i.e. delta log fold-change in mean read counts).
-NOTE: This method requires 4 groups of datasets. Use the main interface to add datasets for the two strain backgrounds under the first condition. A window will allow you to add the datasets under the second condition.
+NOTE: This method requires 4 groups of datasets. Use the main interface to add datasets for the two strain backgrounds under the first condition. After pressing "Run GI", a subsequent window will allow you to add the datasets under the second condition.
"""
-transposons = []
+transposons = ["himar1"]
columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability (BFDR)", "Is HDI outside ROPE?", "Type of Interaction"]
......
from rpy2.robjects import r, globalenv
def def_r_clustering():
r('''
r_clustering = function(data_filename, K, D) {
# input: temp_LFCs.txt
data = read.table(data_filename,head=T,sep='\t')
lfcs = data[,3:length(colnames(data))]
labels = paste(data$Rv,'/',data$Gene,sep="")
rownames(lfcs) = labels
R = length(rownames(lfcs)) # genes
C = length(colnames(lfcs)) # conditions
library(corrplot)
fname = "lfcs_boxplot.png"
png(fname,width=300+15*C,height=500)
boxplot(lfcs,las=2,ylab="log2 fold change of genes (relative to the median)")
dev.off()
km = kmeans(lfcs,K)
clusters = km$cluster
write.table(data.frame(lfcs,clust=clusters),"temp_clust.txt", sep='\t', quote=F)
pca = prcomp(t(lfcs))
library(MASS)
mat = as.matrix(lfcs)
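# LDA (MASS) on the k-means labels: project genes onto the axes that best separate the clusters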
ldapca = lda(clusters~.,lfcs)
X = mat %*% ldapca$scaling[,1]
Y = mat %*% ldapca$scaling[,2]
fname = "pca_genes.png"
png(fname,width=1000,height=1000)
plot(X,Y,pch=20)
text(X,Y,label=labels,adj=c(0.5,1.5),col=clusters)
dev.off()
actpca = prcomp(lfcs,center=TRUE,scale=TRUE)
fname = "pca_conditions.png"
png(fname,width=500,height=500)
plot(actpca$rotation[,1:2],pch=20)#,xlim=c(-1,1),ylim=c(-1,1))
text(actpca$rotation[,1:2],label=colnames(lfcs),adj=c(0.5,1.5),cex.lab=1.5)
dev.off()
print(length(lfcs[,1]))
hc = hclust(dist(lfcs),method="ward.D2")
print("****hclust_genes****")
print(hc)
fname = "hclust_genes.png"
png(fname,width=300+15*R,height=600)
plot(hc)
K2 = max(2,min(K,max(hc$height)))
rect.hclust(hc,k=K2,border='red')
dev.off()
hc = hclust(dist(t(lfcs)),method="ward.D2")
print("****hclust_conditions****")
print(hc)
# fname = "hclust_conditions.png"
# png(fname,width=300+15*C,height=600)
# plot(hc)
# D2 = max(2,min(D,C-1))
# rect.hclust(hc,k=D2,border='red')
# dev.off()
fname = "cluster_opt.png"
png(fname)
library(factoextra)
fviz_nbclust(lfcs, kmeans, method = "wss",k.max=30) # or silhouette or gap_stat
#fviz_nbclust ( lfcs, kmeans, method = "wss",k.max=30)+geom_hline ( yintercept=52 )+geom_vline(xintercept=10)
dev.off()
####################################################
# requires the factoextra and corrplot libraries
var = get_pca_var(actpca)
fname = "condition_PCs.png"
png(fname,width=1000,height=1000)
corrplot(var$cor,main="Principal Components",mar=c(1,1,1,1))
dev.off()
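# varimax block: scale the first D principal-axis loadings by their stdevs, then rotate them toward sparser, more interpretable axes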
S = diag(actpca$sdev,D,D)
rawLoadings = actpca$rotation[,1:D] %*% S
vmax = varimax(rawLoadings)
rotatedLoadings = vmax$loadings
rotatedScores = scale(actpca$x[,1:D]) %*% vmax$rotmat
# https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r
# scores = U.S = X.V = actpca$x = scale(lfcs) %*% actpca$rotation
combined_transform = (actpca$rotation[,1:D] %*% S) %*% vmax$rotmat
# vmax$loadings is same as combined_transform, but with abs(vals)<0.1 blanked out
write.table(round(combined_transform,6),"varimax_loadings.txt",sep='\t',quote=F)
fname = "varimax.png"
png(fname,width=800,height=800)
corrplot(rotatedLoadings,main="Varimax loadings",mar=c(1,1,1,1))
dev.off()
scores = rotatedScores
colnames(scores) = paste("score",seq(1:D),sep="")
squares = scores*scores
ss = apply(squares,1,sum)
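# cos2: fraction of each gene's squared score norm lying along each rotated axis (closeness to that axis)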
cos2 = squares/ss
best = as.matrix(apply(cos2,1,max))
assoc = cos2*sign(scores)
colnames(assoc) = paste("assoc",seq(1:D),sep="")
# # generate null distribution for cos2 to closest axis
#
# library(MASS)
# SAMPLES = 10000
# sample = mvrnorm(n=SAMPLES,mu=rep(0,D),Sigma=diag(D))
# # no need to apply Varimax rotation
# squares = sample*sample
# ss = apply(squares,1,sum)
# Xcos2 = squares/ss
# Xbest = as.matrix(apply(Xcos2,1,max))
# #distn = ecdf(best)
# pvals = apply(best,1,function (x) { length(Xbest[Xbest>=x]) })
# pvals = pvals/SAMPLES
# padj = p.adjust(pvals,method="BH")
#res = data.frame(data,scores,assoc,best,pvals,padj)
res = data.frame(data,scores,assoc)
write.table(format(res,digits=3,scientific=F), "temp_scores.txt",sep='\t',quote=F,row.names=F)
}
''')
return globalenv['r_clustering']
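Each `def_r_*` helper evaluates its R function definition once and returns the binding from rpy2's `globalenv`, so the Python side can call it like an ordinary function. A usage sketch (the file name and K/D values are illustrative):

from pytransit.analysis.multitnseq_helpers import def_r_clustering

r_clustering = def_r_clustering()       # defines r_clustering() in R's global environment
# rpy2 converts the Python str/int arguments to R types at call time
r_clustering("temp_LFCs.txt", 10, 5)    # K=10 k-means clusters, D=5 varimax components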
def def_r_samples_corrplot():
r('''
r_samples_corrplot = function(data_filename) {
data = read.table(data_filename, sep='\t',head=T)
vals = as.matrix(data[,4:length(colnames(data))])
N = length(colnames(vals))
library(corrplot)
fname = "samples_corrplot.png"
png(fname,width=300+20*N,height=300+20*N)
#corrplot(cor(vals)) # among TAsites (not good)
TAsites = aggregate ( coord~ORF,data=data,length)
colnames(TAsites) = c("ORF","sites")
temp = aggregate(vals,by=list(data$ORF),FUN=mean)
corrplot(cor(temp[TAsites$sites>=10,2:length(colnames(temp))]))
dev.off()
}''')
return globalenv['r_samples_corrplot']
def def_r_conditions_corrplot():
r('''
r_conditions_corrplot = function(data_filename) {
data = read.table(data_filename, sep='\t', head=T)
vals = as.matrix(data[,3:length(colnames(data))])
N = length(colnames(vals))
library(corrplot)
png("conditions_corrplot.png", width=20*N+300, height=20*N+300)
corrplot(cor(vals))
dev.off()
}
''')
return globalenv['r_conditions_corrplot']
def def_r_make_heatmap():
r('''
r_make_heatmap = function(data_filename) {
data = read.table(data_filename, head=T, sep='\t')
lfcs = as.matrix(data[,3:length(colnames(data))])
labels = paste(data$Rv,'/',data$Gene,sep="")
rownames(lfcs) = labels
library(corrplot)
library(RColorBrewer)
library(gplots)
# Red for down-regulated, green for up-regulated.
redgreen <- function(n) { c( hsv(h=0/6, v=seq(1,0,length=n/2) ), hsv(h=2/6, v=seq(0,1,length=n/2) ) ) }
colors <- colorRampPalette(c("red", "white", "green"))(n = 1000)
C = length(colnames(lfcs))
R = length(rownames(lfcs))
W = 300+C*30
H = 300+R*15
fname = "heatmap.png"
png(fname,width=W,height=H)
#par(oma=c(10,1,1,10)) # b,l,t,r; units="lines of space"
heatmap.2(lfcs,col=colors,margin=c(12,12),lwid=c(1,7),lhei=c(1,7),trace="none",cexCol=1.4,cexRow=1.4) # make sure white=0
dev.off()
}
''')
return globalenv['r_make_heatmap']
......
@@ -177,13 +177,13 @@ class GSEAMethod(base.SingleConditionMethod):
n = len(dict_n[key])
I = set(dict_k) & set(dict_n[key])
k = len(I)
-result[key] = {"parameters":str(k)+"\t"+str(n),"p-value": hypergeom.sf(k,M,n,N), "Intersection":I}
+result[key] = {"parameters":str(n)+"\t"+str(k),"p-value": hypergeom.sf(k,M,n,N), "Intersection":I}
self.padjustForHypergeom(result)
return result
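For reference, scipy's parameter order here is `hypergeom.sf(k, M, n, N)` with M the total number of genes, n the category size, N the query-set size, and k the overlap. Note that `sf(k)` gives P(X > k); an "at least k" tail would use `sf(k - 1)`. A sketch with made-up counts:

from scipy.stats import hypergeom

M, n, N, k = 4000, 120, 200, 14      # illustrative counts, not TRANSIT output
p_gt = hypergeom.sf(k, M, n, N)      # P(X > k), matching the call above
p_ge = hypergeom.sf(k - 1, M, n, N)  # P(X >= k), the conventional enrichment tail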
def padjustForHypergeom(self,result):
-keys = result.keys()
+keys = list(result.keys())
pvalues = [result[k]["p-value"] for k in keys]
b,adj=multitest.fdrcorrection(pvalues, alpha=0.05, method='indep')
for i in range(len(keys)):
......
@@ -357,7 +357,7 @@ class GSEAMethod(base.SingleConditionMethod):
self.output.close()
def padjustTest(self,test):
-keys = test.keys()
+keys = list(test.keys())
pvals = [test[k][2] for k in keys]
b,adj=multitest.fdrcorrection(pvals, alpha=0.05, method='indep')
for i in range(len(keys)):
......
@@ -453,7 +453,7 @@ class GSEAMethod(base.SingleConditionMethod):
self.output.write("#%s\n" % "\t".join(columns))
self.saveExit(gseaVal,PathDict,rank,ORFNameDict,DESCR)
elif self.M =="HYPE":
-columns=["cat id descr","Total genes","Total in intersection","pval","padj","genes in intersection"]
+columns=["cat id", "descr","Total genes","Total in intersection","pval","padj","genes in intersection"]
self.output.write("#%s\n" % "\t".join(columns))
self.saveHyperGeometricTest(results,ORFNameDict,DESCR)
elif self.M =="GSEA-Z":
......