# Change log
All notable changes to this project will be documented in this file.
## Version 3.0.1 2019-08-01
#### TRANSIT:
- Add check for python3 (TRANSIT 3+ requires python3.6+)
- Minor fixes in GI and Pathway Enrichment
## Version 3.0.0 2019-07-18
#### TRANSIT:
- TRANSIT now supports python 3. (To use with python 2, use releases < 3.0.0)
......
-From r-base:3.4.1
-RUN apt-get update -y && apt-get install -y -f python2 python-dev python-pip
+From r-base:3.6.1
+RUN apt-get update -y && apt-get install -y -f python3 python-dev python3-pip
ADD src/ /src
ADD tests/ /tests
-RUN pip install pytest 'numpy~=1.15' 'scipy~=1.2' 'matplotlib~=2.2' 'pillow~=5.0' 'statsmodels~=0.9' 'rpy2<2.9.0'
+RUN pip3 install pytest 'numpy~=1.16' 'scipy~=1.2' 'matplotlib~=3.0' 'pillow~=6.0' 'statsmodels~=0.9' 'rpy2'
RUN R -e "install.packages('MASS')"
RUN R -e "install.packages('pscl')"
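The pip lines above pin the scientific stack with `~=` compatible-release specifiers. A quick smoke test along these lines (hypothetical, not part of the repo's Dockerfile or test suite) confirms the rebuilt Python 3 environment actually resolves; note that pillow installs under the import name PIL:

import importlib

# modules installed by the pip3 line above; "PIL" is pillow's import name
for mod in ("pytest", "numpy", "scipy", "matplotlib", "PIL", "statsmodels", "rpy2"):
    importlib.import_module(mod)
print("TRANSIT dependency stack imports cleanly")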
......
# TRANSIT
[![Version](https://img.shields.io/github/tag/mad-lab/transit.svg)](https://github.com/mad-lab/transit) [![Build Status](https://travis-ci.org/mad-lab/transit.svg?branch=master)](https://travis-ci.org/mad-lab/transit) [![Documentation Status](https://readthedocs.org/projects/transit/badge/?version=latest)](http://transit.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://pepy.tech/badge/tnseq-transit)](https://pepy.tech/project/tnseq-transit)
**NOTE: TRANSIT v3.0+ now requires python3.6+. If you want to use TRANSIT with python2, use version < 3.0.**
Welcome! This is the distribution for the TRANSIT and TPP tools developed by the [Ioerger Lab](http://orca2.tamu.edu/tom/iLab.html) at Texas A&M University.
TRANSIT is a tool for processing and statistical analysis of Tn-Seq data.
......
@@ -22,7 +25,7 @@ TRANSIT offers a variety of features including:
- Ability to analyze datasets from libraries constructed using **himar1 or tn5 transposons**.
-- **TrackView** to help visualize read-counts accross the genome.
+- **TrackView** to help visualize read-counts across the genome.
- Can **export datasets** into a variety of formats, including **IGV**.
......
-tnseq-transit (3.0.0-1) UNRELEASED; urgency=medium
+tnseq-transit (3.0.1-1) UNRELEASED; urgency=medium
[ Andreas Tille ]
* New upstream version
* New version supports Python3
Closes: #931711
......
@@ -9,6 +10,9 @@ tnseq-transit (3.0.0-1) UNRELEASED; urgency=medium
TODO: https://pypi.org/project/PyPubSub/
-> see new queue (#939493)
[ Michael R. Crusoe ]
* removed now-unneeded 2to3.patch
-- Andreas Tille <tille@debian.org> Thu, 25 Jul 2019 21:35:41 +0200
tnseq-transit (2.5.2-1) unstable; urgency=medium
......
Author: Andreas Tille <tille@debian.org>
Last-Update: Thu, 25 Jul 2019 21:35:41 +0200
Description: Use 2to3 to finalise Python2 -> Python3 migration
--- a/src/pytransit/analysis/multitnseq.py
+++ b/src/pytransit/analysis/multitnseq.py
@@ -9,7 +9,7 @@ try:
except Exception as e:
hasR = False
-import base
+from . import base
import numpy
import scipy
import pdfkit
@@ -18,7 +18,7 @@ import statsmodels.stats.multicomp
import pytransit.transit_tools as transit_tools
import pytransit.tnseq_tools as tnseq_tools
import pytransit.norm_tools as norm_tools
-from multitnseq_helpers import def_r_samples_corrplot, def_r_clustering, def_r_make_heatmap, def_r_conditions_corrplot
+from .multitnseq_helpers import def_r_samples_corrplot, def_r_clustering, def_r_make_heatmap, def_r_conditions_corrplot
###### GUI ELEMENTS ######
short_name = "MultiTnSeq"
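The import rewrites in this hunk are the heart of the 2to3 patch: Python 3 dropped implicit relative imports (PEP 328), so a bare `import base` inside the `pytransit.analysis` package no longer resolves to the sibling module. A commented sketch of the before/after, assuming the package layout implied by the file paths above:

# pytransit/analysis/multitnseq.py (sketch)
# Python 2 resolved sibling modules implicitly:
#     import base
#     from multitnseq_helpers import def_r_clustering
# Python 3 requires the relative import to be explicit:
#     from . import base
#     from .multitnseq_helpers import def_r_clustering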
@@ -60,7 +60,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
(args, kwargs) = transit_tools.cleanargs(rawargs)
if (kwargs.get('-help', False) or kwargs.get('h', False)):
- print(MultiTnSeqMethod.usage_string())
+ print((MultiTnSeqMethod.usage_string()))
sys.exit(0)
fna = args[0]
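The doubled parentheses in `print((MultiTnSeqMethod.usage_string()))` are a 2to3 artifact: in Python 2, `print(x)` was the print statement applied to a parenthesized expression, so the fixer wraps the argument defensively rather than assume it was already a function call. The extra pair is harmless and can be removed by hand; a self-contained illustration with a stand-in function:

def usage_string():
    return "usage: transit multitnseq <args>"   # stand-in for MultiTnSeqMethod.usage_string()

print((usage_string()))   # as emitted by 2to3: defensive extra parentheses
print(usage_string())     # hand-cleaned equivalent; identical output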
@@ -130,7 +130,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
if Samples.get(row,"Condition") == cond:
fname = Samples.get(row,"Filename")
if fname not in filenamesInCombWig:
- print "error: filename '%s' listed in samples metadata not found in combined wig file" % fname; sys.exit(0)
+ print("error: filename '%s' listed in samples metadata not found in combined wig file" % fname); sys.exit(0)
SampleIndexes.append(row)
wigindexes.append(filenamesInCombWig.index(fname))
@@ -149,7 +149,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
fname = Samples.get(row,"Filename")
cond2datasets[cond].append(filenamesInData.index(fname))
if len(cond2datasets[cond]) == 0:
- print "error: no samples found in metadata for condition %s" % cond; sys.exit(0)
+ print("error: no samples found in metadata for condition %s" % cond); sys.exit(0)
(normed, factors) = norm_tools.normalize_data(data, method='TTR')
Nds,Nsites = normed.shape # transposed: rows are datasets, cols are TA sites
@@ -196,15 +196,15 @@ class MultiTnSeqMethod(base.AnalysisMeth
if False: # print means for each gene for each dataset
- print '\t'.join(filenamesInData)
- print '\t'.join([Samples.get(x,"Id") for x in SampleIndexes])
+ print('\t'.join(filenamesInData))
+ print('\t'.join([Samples.get(x,"Id") for x in SampleIndexes]))
for (start,end,Rv,gene,strand) in genes:
if len(TAsites[Rv])>0: # skip genes with no TA sites
means = []
for j in range(Nds):
obs = normed[j,TAsites[Rv]]
means.append(numpy.mean(obs))
- print '\t'.join([Rv,gene,str(len(TAsites[Rv]))]+["%0.1f" % x for x in means])
+ print('\t'.join([Rv,gene,str(len(TAsites[Rv]))]+["%0.1f" % x for x in means]))
#sys.exit(0)
@@ -344,14 +344,14 @@ class MultiTnSeqMethod(base.AnalysisMeth
batches[batch].append(Means[Rv][r])
vals.append(Means[Rv][r]) ###
for b in "CC1 CC2 CC3 KO".split(): vals.append(numpy.mean(batches[b]))
- print '\t'.join([Rv,gene]+["%0.1f" % x for x in vals]) ###
+ print('\t'.join([Rv,gene]+["%0.1f" % x for x in vals])) ###
###sys.exit(0)
# remove batch effects (subtract batch means from mean gene counts; adjust each to global mean)
if self.debatch and "Batch" in Conditions.headers:
- print "<BR>correcting means for batch effects..."
+ print("<BR>correcting means for batch effects...")
for (start,end,Rv,gene,strand) in genes:
if Rv in Means:
batches = {}
@@ -372,11 +372,11 @@ class MultiTnSeqMethod(base.AnalysisMeth
if False: # print gene means for each condition (with batch corrections)
vals = ['ORF','gene']+Conditions.getcol("Condition")
- print '\t'.join(vals)
+ print('\t'.join(vals))
for (start,end,Rv,gene,strand) in genes:
if Rv not in Means: continue
vals = [Means[Rv][r] for r in range(Conditions.nrows)]
- print '\t'.join([Rv,gene]+["%0.1f" % x for x in vals])
+ print('\t'.join([Rv,gene]+["%0.1f" % x for x in vals]))
sys.exit(0)
################################
@@ -398,11 +398,11 @@ class MultiTnSeqMethod(base.AnalysisMeth
lfcs[Rv] = lfcvec
if self.stdnorm:
- print "<BR>applying standard normalization to LFCs"
+ print("<BR>applying standard normalization to LFCs")
for i,cond in enumerate(Conditions.keys):
- col = [row[i] for row in lfcs.values()]
+ col = [row[i] for row in list(lfcs.values())]
m,s = numpy.mean(col),numpy.std(col) # consider using quantiles(col,[0,0.25,0.5,0.75,1.0])
- for Rv in lfcs.keys():
+ for Rv in list(lfcs.keys()):
lfcs[Rv][i] = (lfcs[Rv][i]-m)/s
file = open(self.makefname("temp_LFCs.txt", self.rundir),"w")
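The `list(...)` wrappers in this hunk are 2to3's conservative handling of Python 3 dict views: `keys()` and `values()` no longer return lists, and snapshotting them keeps the loops safe if the dict is modified while iterating. A self-contained sketch of the standard-normalization step above, with made-up LFC values:

import numpy

lfcs = {"Rv0001": [1.2, -0.3], "Rv0002": [0.4, 0.9]}    # gene -> per-condition LFCs (illustrative)
for i in range(2):
    col = [row[i] for row in list(lfcs.values())]        # list() snapshots the Python 3 view
    m, s = numpy.mean(col), numpy.std(col)
    for Rv in list(lfcs.keys()):                         # safe to reassign lfcs[Rv][i] inside
        lfcs[Rv][i] = (lfcs[Rv][i] - m) / s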
@@ -415,7 +415,7 @@ class MultiTnSeqMethod(base.AnalysisMeth
file.write('\t'.join([str(x) for x in vals])+EOL)
cnt += 1
file.close()
- if cnt==0: print "error: no significantly varying genes found by ANOVA"; return
+ if cnt==0: print("error: no significantly varying genes found by ANOVA"); return
r_conditions_corrplot = def_r_conditions_corrplot()
r_make_heatmap = def_r_make_heatmap()
skip_test_requiring_non_existing_input_data.patch
2to3.patch
......
@@ -101,6 +101,7 @@ setup(
description='TRANSIT is a tool for the analysis of Tn-Seq data. It provides an easy-to-use graphical interface and access to three different analysis methods that allow the user to determine essentiality in a single condition as well as between conditions.',
long_description=long_description,
long_description_content_type='text/markdown',
# The project's main homepage.
url='https://github.com/mad-lab/transit',
......
......
@@ -39,6 +39,10 @@ def run_main():
main(*args, **kwargs)
def main(*args, **kwargs):
# Check python version
if (sys.version_info[0] < 3):
print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
sys.exit(0)
vars = Globals()
# Check for arguments
if not args and not kwargs and hasWx:
......
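One observation on the version guard added above: it exits with status 0, which conventionally signals success, so a calling script cannot detect the unsupported interpreter. A stricter hypothetical variant (not the shipped code) that also enforces the 3.6 floor stated in the message:

import sys

if sys.version_info < (3, 6):       # check the minor version too, not just major < 3
    print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
    sys.exit(1)                     # nonzero status lets wrappers detect the failure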
......
@@ -1442,6 +1442,7 @@ def show_help():
print(' -maxreads <INT>')
print(' -mismatches <INT> # when searching for constant regions in reads 1 and 2; default is 1')
print(' -flags "<STRING>" # args to pass to BWA')
print(' -bwa-alg [aln|mem] # Default: mem. Algorithm to use for mapping reads with bwa' )
print(' -primer-start-window INT,INT # position in read to search for start of primer; default is [0,20]')
print(' -window-size INT # automatic method to set window')
print(' -barseq_catalog_in|-barseq_catalog_out <file>')
......
......
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.0.0"
+__version__ = "v3.0.1"
prefix = "[TRANSIT]"
......
@@ -43,6 +43,10 @@ def run_main():
main(*args, **kwargs)
def main(*args, **kwargs):
# Check python version
if (sys.version_info[0] < 3):
print("TRANSIT v3.0+ requires python3.6+. To use with python2, please install TRANSIT version < 3.0")
sys.exit(0)
#If no arguments, show GUI:
DEBUG = "--debug" in sys.argv
if DEBUG:
......
......
@@ -40,10 +40,10 @@ long_name = "Genetic Interactions"
short_desc = "Genetic interactions analysis for change in enrichment"
long_desc = """Method for determining genetic interactions based on changes in enrichment (i.e. delta log fold-change in mean read counts).
-NOTE: This method requires 4 groups of datasets. Use the main interface to add datasets for the two strain backgrounds under the first condition. A window will allow you to add the datasets under the second condition.
+NOTE: This method requires 4 groups of datasets. Use the main interface to add datasets for the two strain backgrounds under the first condition. After pressing "Run GI", a subsequent window will allow you to add the datasets under the second condition.
"""
-transposons = []
+transposons = ["himar1"]
columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability (BFDR)", "Is HDI outside ROPE?", "Type of Interaction"]
......
from rpy2.robjects import r, globalenv
def def_r_clustering():
r('''
r_clustering = function(data_filename, K, D) {
# input: temp_LFCs.txt
data = read.table(data_filename,head=T,sep='\t')
lfcs = data[,3:length(colnames(data))]
labels = paste(data$Rv,'/',data$Gene,sep="")
rownames(lfcs) = labels
R = length(rownames(lfcs)) # genes
C = length(colnames(lfcs)) # conditions
library(corrplot)
fname = "lfcs_boxplot.png"
png(fname,width=300+15*C,height=500)
boxplot(lfcs,las=2,ylab="log2 fold change of genes (relative to the median)")
dev.off()
km = kmeans(lfcs,K)
clusters = km$cluster
write.table(data.frame(lfcs,clust=clusters),"temp_clust.txt", sep='\t', quote=F)
pca = prcomp(t(lfcs))
library(MASS)
mat = as.matrix(lfcs)
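# LDA (MASS) on the k-means labels: project genes onto the axes that best separate the clusters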
ldapca = lda(clusters~.,lfcs)
X = mat %*% ldapca$scaling[,1]
Y = mat %*% ldapca$scaling[,2]
fname = "pca_genes.png"
png(fname,width=1000,height=1000)
plot(X,Y,pch=20)
text(X,Y,label=labels,adj=c(0.5,1.5),col=clusters)
dev.off()
actpca = prcomp(lfcs,center=TRUE,scale=TRUE)
fname = "pca_conditions.png"
png(fname,width=500,height=500)
plot(actpca$rotation[,1:2],pch=20)#,xlim=c(-1,1),ylim=c(-1,1))
text(actpca$rotation[,1:2],label=colnames(lfcs),adj=c(0.5,1.5),cex.lab=1.5)
dev.off()
print(length(lfcs[,1]))
hc = hclust(dist(lfcs),method="ward.D2")
print("****hclust_genes****")
print(hc)
fname = "hclust_genes.png"
png(fname,width=300+15*R,height=600)
plot(hc)
K2 = max(2,min(K,max(hc$height)))
rect.hclust(hc,k=K2,border='red')
dev.off()
hc = hclust(dist(t(lfcs)),method="ward.D2")
print("****hclust_conditions****")
print(hc)
# fname = "hclust_conditions.png"
# png(fname,width=300+15*C,height=600)
# plot(hc)
# D2 = max(2,min(D,C-1))
# rect.hclust(hc,k=D2,border='red')
# dev.off()
fname = "cluster_opt.png"
png(fname)
library(factoextra)
fviz_nbclust(lfcs, kmeans, method = "wss",k.max=30) # or silhouette or gap_stat
#fviz_nbclust ( lfcs, kmeans, method = "wss",k.max=30)+geom_hline ( yintercept=52 )+geom_vline(xintercept=10)
dev.off()
####################################################
# requires the factoextra and corrplot libraries
var = get_pca_var(actpca)
fname = "condition_PCs.png"
png(fname,width=1000,height=1000)
corrplot(var$cor,main="Principal Components",mar=c(1,1,1,1))
dev.off()
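# varimax block: scale the first D principal-axis loadings by their stdevs, then rotate them toward sparser, more interpretable axes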
S = diag(actpca$sdev,D,D)
rawLoadings = actpca$rotation[,1:D] %*% S
vmax = varimax(rawLoadings)
rotatedLoadings = vmax$loadings
rotatedScores = scale(actpca$x[,1:D]) %*% vmax$rotmat
# https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r
# scores = U.S = X.V = actpca$x = scale(lfcs) %*% actpca$rotation
combined_transform = (actpca$rotation[,1:D] %*% S) %*% vmax$rotmat
# vmax$loadings is same as combined_transform, but with abs(vals)<0.1 blanked out
write.table(round(combined_transform,6),"varimax_loadings.txt",sep='\t',quote=F)
fname = "varimax.png"
png(fname,width=800,height=800)
corrplot(rotatedLoadings,main="Varimax loadings",mar=c(1,1,1,1))
dev.off()
scores = rotatedScores
colnames(scores) = paste("score",seq(1:D),sep="")
squares = scores*scores
ss = apply(squares,1,sum)
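# cos2: fraction of each gene's squared score norm lying along each rotated axis (closeness to that axis)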
cos2 = squares/ss
best = as.matrix(apply(cos2,1,max))
assoc = cos2*sign(scores)
colnames(assoc) = paste("assoc",seq(1:D),sep="")
# # generate null distribution for cos2 to closest axis
#
# library(MASS)
# SAMPLES = 10000
# sample = mvrnorm(n=SAMPLES,mu=rep(0,D),Sigma=diag(D))
# # no need to apply Varimax rotation
# squares = sample*sample
# ss = apply(squares,1,sum)
# Xcos2 = squares/ss
# Xbest = as.matrix(apply(Xcos2,1,max))
# #distn = ecdf(best)
# pvals = apply(best,1,function (x) { length(Xbest[Xbest>=x]) })
# pvals = pvals/SAMPLES
# padj = p.adjust(pvals,method="BH")
#res = data.frame(data,scores,assoc,best,pvals,padj)
res = data.frame(data,scores,assoc)
write.table(format(res,digits=3,scientific=F), "temp_scores.txt",sep='\t',quote=F,row.names=F)
}
''')
return globalenv['r_clustering']
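Each `def_r_*` helper evaluates its R function definition once and returns the binding from rpy2's `globalenv`, so the Python side can call it like an ordinary function. A usage sketch (the file name and K/D values are illustrative):

from pytransit.analysis.multitnseq_helpers import def_r_clustering

r_clustering = def_r_clustering()       # defines r_clustering() in R's global environment
# rpy2 converts the Python str/int arguments to R types at call time
r_clustering("temp_LFCs.txt", 10, 5)    # K=10 k-means clusters, D=5 varimax components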
def def_r_samples_corrplot():
r('''
r_samples_corrplot = function(data_filename) {
data = read.table(data_filename, sep='\t',head=T)
vals = as.matrix(data[,4:length(colnames(data))])
N = length(colnames(vals))
library(corrplot)
fname = "samples_corrplot.png"
png(fname,width=300+20*N,height=300+20*N)
#corrplot(cor(vals)) # among TAsites (not good)
TAsites = aggregate ( coord~ORF,data=data,length)
colnames(TAsites) = c("ORF","sites")
temp = aggregate(vals,by=list(data$ORF),FUN=mean)
corrplot(cor(temp[TAsites$sites>=10,2:length(colnames(temp))]))
dev.off()
}''')
return globalenv['r_samples_corrplot']
def def_r_conditions_corrplot():
r('''
r_conditions_corrplot = function(data_filename) {
data = read.table(data_filename, sep='\t', head=T)
vals = as.matrix(data[,3:length(colnames(data))])
N = length(colnames(vals))
library(corrplot)
png("conditions_corrplot.png", width=20*N+300, height=20*N+300)
corrplot(cor(vals))
dev.off()
}
''')
return globalenv['r_conditions_corrplot']
def def_r_make_heatmap():
r('''
r_make_heatmap = function(data_filename) {
data = read.table(data_filename, head=T, sep='\t')
lfcs = as.matrix(data[,3:length(colnames(data))])
labels = paste(data$Rv,'/',data$Gene,sep="")
rownames(lfcs) = labels
library(corrplot)
library(RColorBrewer)
library(gplots)
# Red for down-regulated, green for up-regulated.
redgreen <- function(n) { c( hsv(h=0/6, v=seq(1,0,length=n/2) ), hsv(h=2/6, v=seq(0,1,length=n/2) ) ) }
colors <- colorRampPalette(c("red", "white", "green"))(n = 1000)
C = length(colnames(lfcs))
R = length(rownames(lfcs))
W = 300+C*30
H = 300+R*15
fname = "heatmap.png"
png(fname,width=W,height=H)
#par(oma=c(10,1,1,10)) # b,l,t,r; units="lines of space"
heatmap.2(lfcs,col=colors,margin=c(12,12),lwid=c(1,7),lhei=c(1,7),trace="none",cexCol=1.4,cexRow=1.4) # make sure white=0
dev.off()
}
''')
return globalenv['r_make_heatmap']
......
@@ -177,13 +177,13 @@ class GSEAMethod(base.SingleConditionMethod):
n = len(dict_n[key])
I = set(dict_k) & set(dict_n[key])
k = len(I)
-result[key] = {"parameters":str(k)+"\t"+str(n),"p-value": hypergeom.sf(k,M,n,N), "Intersection":I}
+result[key] = {"parameters":str(n)+"\t"+str(k),"p-value": hypergeom.sf(k,M,n,N), "Intersection":I}
self.padjustForHypergeom(result)
return result
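For reference, scipy's parameter order here is `hypergeom.sf(k, M, n, N)` with M the total number of genes, n the category size, N the query-set size, and k the overlap. Note that `sf(k)` gives P(X > k); an "at least k" tail would use `sf(k - 1)`. A sketch with made-up counts:

from scipy.stats import hypergeom

M, n, N, k = 4000, 120, 200, 14      # illustrative counts, not TRANSIT output
p_gt = hypergeom.sf(k, M, n, N)      # P(X > k), matching the call above
p_ge = hypergeom.sf(k - 1, M, n, N)  # P(X >= k), the conventional enrichment tail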
def padjustForHypergeom(self,result):
-keys = result.keys()
+keys = list(result.keys())
pvalues = [result[k]["p-value"] for k in keys]
b,adj=multitest.fdrcorrection(pvalues, alpha=0.05, method='indep')
for i in range(len(keys)):
......
@@ -357,7 +357,7 @@ class GSEAMethod(base.SingleConditionMethod):
self.output.close()
def padjustTest(self,test):
-keys = test.keys()
+keys = list(test.keys())
pvals = [test[k][2] for k in keys]
b,adj=multitest.fdrcorrection(pvals, alpha=0.05, method='indep')
for i in range(len(keys)):
......
@@ -453,7 +453,7 @@ class GSEAMethod(base.SingleConditionMethod):
self.output.write("#%s\n" % "\t".join(columns))
self.saveExit(gseaVal,PathDict,rank,ORFNameDict,DESCR)
elif self.M =="HYPE":
-columns=["cat id descr","Total genes","Total in intersection","pval","padj","genes in intersection"]
+columns=["cat id", "descr","Total genes","Total in intersection","pval","padj","genes in intersection"]
self.output.write("#%s\n" % "\t".join(columns))
self.saveHyperGeometricTest(results,ORFNameDict,DESCR)
elif self.M =="GSEA-Z":
......