Skip to content
Commits on Source (6)
......@@ -2,7 +2,7 @@
INSTALLDIR=/usr/local/bin
CPP = g++
CPPFLAGS += -Wall -O3 -Wno-unused-result
CPPFLAGS += -Wall -O2 -std=gnu++11 -Wno-unused-result
all: proteinortho5_clustering proteinortho5_tree
......
proteinortho (5.16+dfsg-1) unstable; urgency=medium
* New upstream version
* Standards-Version: 4.1.3
* debhelper 11
* d/rules: do not parse d/changelog
-- Andreas Tille <tille@debian.org> Wed, 21 Feb 2018 08:35:47 +0100
proteinortho (5.15+dfsg-1) unstable; urgency=medium
* New upstream version
......
......@@ -3,9 +3,9 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9),
Build-Depends: debhelper (>= 11~),
ncbi-blast+
Standards-Version: 3.9.8
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/proteinortho.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/proteinortho.git
Homepage: https://www.bioinf.uni-leipzig.de/Software/proteinortho/
......
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: Proteinortho
Upstream-Contact: Marcus Lechner <lechner@staff.uni-marburg.de>
Source: https://www.bioinf.uni-leipzig.de/Software/proteinortho/
......
......@@ -2,7 +2,7 @@
# DH_VERBOSE := 1
DEBPKGNAME := $(shell dpkg-parsechangelog | awk '/^Source:/ {print $$2}')
include /usr/share/dpkg/default.mk
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
......@@ -10,4 +10,4 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all
dh $@
override_dh_auto_install:
dh_auto_install --buildsystem=makefile -- install INSTALLDIR=$(CURDIR)/debian/$(DEBPKGNAME)/usr/lib/proteinortho
dh_auto_install --buildsystem=makefile -- install INSTALLDIR=$(CURDIR)/debian/$(DEB_SOURCE)/usr/lib/proteinortho
......@@ -4,7 +4,7 @@
</head>
<body>
<h1>Proteinortho Manual / PoFF Manual</h1>
<small>This manual corresponds to version 5.15</small>
<small>This manual corresponds to version 5.16</small>
<h2>Introduction</h2>
Proteinortho is a tool to detect orthologous genes within different species.
For doing so, it compares similarities of given gene sequences and clusters them to find significant groups.
......@@ -154,17 +154,27 @@ They are:
<h2>Hints</h2>
<ul>
<li>Using <i>.faa</i> to indicate that your file contains amino acids and <i>.fna</i> to show it contains nucleotides makes life much easier.
<li>Each sequence ID in your FASTA files must be unique. Consider renaming the otherwise.
<li>Sequence IDs must be unique within a single FASTA file. Consider renaming otherwise. <br>Note: Up to version 5.15, sequence IDs had to be unique across the whole dataset. Proteinortho now keeps track of name and species to avoid the necessity of renaming.
<li>You need write permissions in the directory of your FASTA files as Proteinortho will create blast databases. If this is not the case, consider using symbolic links to the FASTA files.
<li>The directory <i>tools</i> cotains useful tools, e.g. <code>grab_proteins.pl</code> which fetches protein sequences of orthologous groups from Proteinortho output table
<li>The directory <i>tools</i> contains useful tools, e.g. <code>grab_proteins.pl</code> which fetches protein sequences of orthologous groups from Proteinortho output table
</ul>
<h2>Usage</h2>
<code>proteinortho5.pl [OPTIONS] FASTA1 FASTA2 [FASTAn...]</code> <p>
<table border="1" cellspacing="0" cellpadding="2">
<table border="0" cellspacing="2" cellpadding="2">
<tr><td><b>Option</b></td><td><b>Default value</b></td><td><b>Description</b></td></tr>
<tr><td colspan=3><hr><b>[General options]</b><hr></td></tr>
<tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
<tr><td>-cpus=</td><td>auto</td><td>use the given number of threads</td></tr>
<tr><td>-verbose</td><td> </td><td>give a lot of information about the current progress</td></tr>
<tr><td>-keep</td><td>-</td><td>store temporary blast results for reuse (advisable for larger jobs)</td></tr>
<tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
<tr><td>-force</td><td> </td><td>force recalculation of blast results in any case</td></tr>
<tr><td>-clean</td><td> </td><td>remove all unnecessary files after processing</td></tr>
<tr><td colspan=3><hr><b>[Search options]</b><hr></td></tr>
<tr><td>-e=</td><td>1e-05</td><td>E-value for blast</td></tr>
<tr><td>-p=</td><td>blastp+</td><td>blast program to use:
<ul>
......@@ -179,29 +189,34 @@ in case you only have access to blast legacy:
<li>blastp &#8594; sequences are given as amino acids
<li>tblastx &#8594; sequences are given as nucleotides and will be interpreted as (translated) amino acids in all three reading frames
</ul>
</td></tr>
<tr><td>-project=</td><td>myproject</td><td> prefix for all result file names</td></tr>
<tr><td>-temp=</td><td>working directory</td><td> path for temporary files</td></tr>
<tr><td>-synteny</td><td>-</td><td>activate PoFF extension to separate similar sequences using synteny (requires a GFF file for each FASTA file)</td></tr>
<tr><td>-nograph</td><td>-</td><td>do not generate .graph files with pairwise orthology data<br>saves some time</td></tr>
<tr><td>-keep</td><td>-</td><td>store temporary blast results for reuse</td></tr>
<tr><td>-clean</td><td>-</td><td>remove all unnecessary files after processing</td></tr>
<tr><td>-desc</td><td>-</td><td>write gene description file (XXX.descriptions); works only with NCBI-formated FASTA entries currently</td></tr>
<tr><td>-force</td><td>-</td><td>force recalculation of blast results in any case</td></tr>
<tr><td>-cpus=</td><td>auto</td><td>use the given number of threads</td></tr>
<tr><td>-selfblast</td><td>-</td><td>apply selfblast to directly paralogs; normally these are inferred indirectly from orthology data to other species (experimental!)</td></tr>
<tr><td>-singles</td><td>-</td><td>Also report genes without orthologs in table output</td></tr>
<tr><td>-selfblast</td><td> </td><td>apply selfblast to directly paralogs; normally these are inferred indirectly from orthology data to other species (experimental!)</td></tr>
<tr><td>-sim=</td><td>0.95</td><td>min. similarity for additional hits</td></tr>
<tr><td>-identity=</td><td>25</td><td>min. percent identity of best blast alignments</td></tr>
<tr><td>-cov=</td><td>50</td><td>min. coverage of best blast alignments in percent</td></tr>
<tr><td>-subpara=</td><td> </td><td>additional parameters for blast; set these in quotes (e.g. -subpara='-seg no') <br>This parameter was named -blastParameters in earlier versions</td></tr>
<tr><td colspan=3><hr><b>[Synteny options]</b><hr></td></tr>
<tr><td>-synteny</td><td>-</td><td>activate PoFF extension to separate similar sequences using synteny (requires a GFF file for each FASTA file)</td></tr>
<tr><td>-dups=</td><td>0</td><td> applied in combination with -synteny; number of reiterations for adjacencies heuristic to determine duplicated regions; <br>
if set to a higher number, co-orthologs will tend to get clustered together rather than getting separated
</td></tr>
<tr><td>-cs=</td><td>3</td><td> applied in combination with -synteny; size of a maximum common substring (MCS) for adjacency matches; <br> the longer this value becomes the longer syntenic regions need to be in order to be detected </td></tr>
<tr><td>-alpha=</td><td>0.5</td><td>weight of adjacencies vs. sequence similarity</td></tr>
<tr><td colspan=3><hr><b>[Clustering options]</b><hr></td></tr>
<tr><td>-singles</td><td> </td><td>also report genes without orthologs in table output</td></tr>
<tr><td>-purity=</td><td>0.75</td><td>avoid spurious graph assignments [range: 0.01-1]</td></tr>
<tr><td>-conn=</td><td>0.1</td><td>min. algebraic connectivity of orthologous groups during clustering</td></tr>
<tr><td>-sim=</td><td>0.95</td><td>min. similarity for additional hits</td></tr>
<tr><td>-blastpath=</td><td>-</td><td>path to your local blast installation (if not in present in default paths)</td></tr>
<tr><td>-blastParameters=</td><td>-</td><td>additional parameters for blast; set these in quotes (e.g. -blastParameters='-seg no')</td></tr>
<tr><td>-verbose</td><td>-</td><td>give a lot of information about the current progress</td></tr>
<tr><td>-debug</td><td>-</td><td>gives detailed information for bug tracking</td></tr>
<tr><td>-nograph</td><td>-</td><td>do not generate .graph files with pairwise orthology data<br>saves some time</td></tr>
<tr><td colspan=3><hr><b>[Misc options]</b><hr></td></tr>
<tr><td>-desc</td><td> </td><td>write gene description file (XXX.descriptions); works only with NCBI-formated FASTA entries currently</td></tr>
<tr><td>-blastpath=</td><td> </td><td>path to your local blast installation (if not in present in default paths)</td></tr>
<tr><td>-debug</td><td> </td><td>gives detailed information for bug tracking</td></tr>
<tr><td colspan=3><hr><b>[Large compute jobs]</b><hr></td></tr>
<tr><td colspan=3><b>Parameters needed to distribute the runs over several machines</b></td></tr>
<tr><td>-step=</td><td>0</td><td>
perform only specific steps of the analysis
......@@ -211,31 +226,32 @@ perform only specific steps of the analysis
<li>3 &#8594; perform clustering
<li>0 &#8594; perform all steps
</ul></td></tr>
<tr><td>startat=</td><td>0</td><td>file number to start with</td></tr>
<tr><td>stopat=</td><td>-1</td><td>file number to end with (very last one = -1)</td></tr>
<tr><td colspan=3><b>Parameters applied in combination with -synteny<b></td></tr>
<tr><td>-dups=</td><td>0</td><td> applied in combination with -synteny; number of reiterations for adjacencies heuristic to determine duplicated regions; <br>
if set to a higher number, co-orthologs will tend to get clustered together rather than getting separated
</td></tr>
<tr><td>-cs=</td><td>3</td><td> applied in combination with -synteny; size of a maximum common substring (MCS) for adjacency matches; <br> the longer this value becomes the longer syntenic regions need to be in order to be detected </td></tr>
<tr><td>alpha=</td><td>0.5</td><td>weight of adjacencies vs. sequence similarity</td></tr>
<tr><td>jobs=N/M</td><td> </td><td>distribute blast step into M subsets and run job number N out of M in this very process, only works in combination with -step=2</td></tr>
</table>
<h3>Using several machines</h3>
If you want to involve multiple machines or separate a Proteinortho run into smaller chunks, use the <i>-steps=</i> option.
If you want to involve multiple machines or separate a Proteinortho run into smaller chunks, use the <i>-jobs=N/M</i> option.
First, run <code>proteinortho5.pl -steps=1 ...</code> to generate the indices.
Then you can run <code>proteinortho5.pl -steps=2 -startat=X -stopat=Y ...</code> to run small chunks separately.
Instead of X and Y numbers must be set representing the file number to be calculated. X and Y can be equal.
E.g. you have four fasta files, the you can to run <br>
Then you can run <code>proteinortho5.pl -steps=2 -jobs=N/M ...</code> to run small chunks separately.
Instead of N and M, numbers must be set representing the total number of jobs the run is divided into (M) and the job number (N out of M) to be performed by this very process.
E.g. to divide a Proteinortho run into 4 jobs to run on several machines, use <br>
<code>
proteinortho5.pl -steps=1 ... <br>
</code>
on a single PC, then <br>
<code>
proteinortho5.pl -steps=2 -startat=0 -stopat=0 ... <br>
proteinortho5.pl -steps=2 -startat=1 -stopat=1 ... <br>
proteinortho5.pl -steps=2 -startat=2 -stopat=2 ... <br>
proteinortho5.pl -steps=2 -startat=3 -stopat=3 ... <br>
proteinortho5.pl -steps=2 -jobs=1/4 ... <br>
proteinortho5.pl -steps=2 -jobs=2/4 ... <br>
proteinortho5.pl -steps=2 -jobs=3/4 ... <br>
proteinortho5.pl -steps=2 -jobs=4/4 ... <br>
</code>
seperateltly on different machines (within the same shared working directory).
After all step 2 runs are done, <code>proteinortho5.pl -steps=3 ...</code> can be executed to perform the clustering and merge all calculations.
separately on different machines (can be run in parallel or iteratively within the same shared working directory).
After all step 2 runs are done, run
<br><code>proteinortho5.pl -steps=3 ...</code>
<br> to perform the clustering and merge all calculations on a single PC.
</body>
</html>
This diff is collapsed.
......@@ -11,13 +11,19 @@ if (!defined($ARGV[1])) {
# Proteinortho Output
print STDERR "Cleaning edge list...\n";
# Reading removal list
my %map;
open(IN,"<$ARGV[0]") || die("Could not open file $ARGV[0]: $!");
while (<IN>) {
chomp;
my ($a, $b) = sort split(' ',$_,2);
unless ($b) {die("Line does not match filter list pattern\n");}
my ($ta, $sa, $tb, $sb) = split("\t",$_,4); # name, species, name, species
unless ($sb) {die("Line does not match filter list pattern\n");}
$ta .= " ".$sa;
$tb .= " ".$sb;
my ($a, $b) = sort($ta,$tb); # bidirectional edges store once is fine
$map{$a.' '.$b} = 1;
}
close(IN);
......@@ -25,20 +31,48 @@ close(IN);
shift(@ARGV);
my $rm = 0;
my $all = 0;
# For all blastgraphs
my $filea = "";
my $fileb = "";
foreach my $file (@ARGV) {
open(IN,"<$file") || die("Could not open file $file: $!");
while (<IN>) {
if ($_ =~ /^#/) {print $_; next;}
if ($_ =~ /^#/) {
print $_;
chomp;
$_ =~ s/^#\s*//;
my ($a,$b,undef) = split("\t",$_,3);
unless ($b) {die("Line does not match Proteinortho graph format\n");}
if ($a eq "a" || $a eq "file_a") {next;}
unless (defined($b)) {die("Line does not match Proteinortho graph format. Filename missing.\n");}
# Keep track of files/species
$filea = $a;
$fileb = $b;
}
else {
unless ($_ =~ /\t/) {next;}
my ($ta,$tb,undef) = split("\t",$_,3);
unless ($tb) {die("Line does not match Proteinortho graph format\n");}
$all++;
$ta .= " ".$filea;
$tb .= " ".$fileb;
my ($a, $b) = sort($ta,$tb); # bidirectional edges store once is fine
($a,$b) = sort($a, $b);
if (exists($map{$a.' '.$b})) {
$rm++;
next;
}
if (exists($map{$a.' '.$b})) {$rm++; next;}
# if ($a eq $b) {$all--; next;} # Selfhits are ignored
print $_;
}
}
close(IN);
}
print STDERR "Removed $rm / $all edges\n";
print STDERR "Done.\n";
## TODO: Adjust to remove selfblast hits and watch
......@@ -3,7 +3,7 @@
* Reads edge list and splits connected components
* according to algebraic connectivity threshold
*
* Last updated: 2016/04/22
* Last updated: 2017/03/20
* Author: Marcus Lechner
*/
......@@ -40,6 +40,8 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& );
// Parameters
bool param_verbose = false;
double param_con_threshold = 0.1;
unsigned int debug_level = 0;
double param_sep_purity = 0.75;
string param_rmgraph = "remove.graph";
unsigned int min_iter = 100; // min number of alg con iterations
unsigned int max_iter = 10000; // max number of alg con iterations
......@@ -98,6 +100,14 @@ int main(int argc, char *argv[]) {
paras++;
param_con_threshold = string2double(string(argv[paras]));
}
else if (parameter == "-purity") {
paras++;
param_sep_purity = string2double(string(argv[paras]));
}
else if (parameter == "-debug") {
paras++;
debug_level = int(string2double(string(argv[paras])));
}
else if (parameter == "-rmgraph") {
paras++;
param_rmgraph = string(argv[paras]);
......@@ -109,9 +119,14 @@ int main(int argc, char *argv[]) {
}
}
if (debug_level > 0) cerr << "[DEBUG] Debug level " << debug_level << endl;
// Parse files
for (vector<string>::iterator it=files.begin() ; it != files.end(); it++) parse_file(*it);
for (vector<string>::iterator it=files.begin() ; it != files.end(); it++) {
if (debug_level > 0) cerr << "[DEBUG] Parsing file " << *it << endl;
parse_file(*it);
if (debug_level > 0) cerr << "[DEBUG] I know " << species_counter << " species with " << protein_counter << " proteins and " << edges << " edges in sum" << endl;
};
// Free memory
files.clear();
......@@ -122,6 +137,7 @@ int main(int argc, char *argv[]) {
if (param_verbose) cerr << species_counter << " species" << endl << protein_counter << " paired proteins" << endl << edges << " bidirectional edges" << endl;
// Prepare sort of output
if (debug_level > 0) cerr << "[DEBUG] Sorting known species" << endl;
sort_species();
// Write output header
......@@ -131,6 +147,7 @@ int main(int argc, char *argv[]) {
graph_clean.open(param_rmgraph.c_str());
// Clustering
if (debug_level > 0) cerr << "[DEBUG] Clustering" << endl;
partition_graph();
graph_clean.close();
}
......@@ -283,14 +300,19 @@ void partition_graph() {
// Do not report singles
if (current_group.size() < 2) {
if (debug_level > 1) cerr << "[DEBUG] Single gene skipped" << endl;
protein_id++;
continue;
}
// Connectivity analysis
if (debug_level > 0) cerr << "[DEBUG] Calculating connectivity of a group. " << current_group.size() << " proteins (ID: " << protein_id << ")" << endl;
if (current_group.size() > 16000) cerr << "[WARN] Current connected component conatins " << current_group.size() << " proteins. That might be way to much for me. Try raising the e-value!" << endl;
double connectivity = getConnectivity(current_group);
if (debug_level > 0) cerr << "[DEBUG] Connectivity was " << connectivity << endl;
if (connectivity < param_con_threshold && current_group.size() > 3) {
if (debug_level > 0) cerr << "[DEBUG] Reiterating" << endl;
// Split groups is done in getConnectivity function
// Reset flags and repeat without incrementing protein counter
for (unsigned int i = 0; i < current_group.size(); i++) done[current_group[i]] = false;
......@@ -432,7 +454,8 @@ void removeExternalEdges(map<unsigned int,bool>& a) {
for (vector<unsigned int>::iterator ita = graph[protein].edges.begin(); ita != graph[protein].edges.end(); ita++) {
// If it is not present the own group, set flag
if (a.find(*ita) == a.end()) {
graph_clean << graph[protein].full_name << "\t" << graph[*ita ].full_name << endl; // Improved graph cleaning
// 5.16 store name AND species in removal list
graph_clean << graph[protein].full_name << "\t" << species[graph[protein].species_id] << "\t" << graph[*ita ].full_name << "\t" << species[graph[*ita ].species_id] << endl; // Improved graph cleaning
swap = true;
}
// Otherwise, add it to the new edge list
......@@ -497,11 +520,15 @@ void splitGroups(vector<double>& y, vector<unsigned int>& nodes){
// return;
// }
// Store data about two groups
map<unsigned int,bool> groupA, groupB;
// Store data about two groups (Zero cannot be assigned with certainty)
map<unsigned int,bool> groupA, groupB, groupZero;
for (unsigned int i = 0; i < y.size(); i++) {
// cerr << i << " = " << y[i] << endl;
if (y[i] < 0) {groupA[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#95cde5}" << endl; }
unsigned int style = 1;
if (y[i] < 0) style = 2;
if (abs(y[i]) < param_sep_purity) style = 0;
if (debug_level > 2) cerr << "[DEBUG L3] " << i << " = " << y[i] << " -> " << style << endl;
if (abs(y[i]) < param_sep_purity) {groupZero[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#95cde5}" << endl; }
else if (y[i] < 0) {groupA[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#95cde5}" << endl; }
else {groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; }
}
......@@ -512,22 +539,18 @@ void splitGroups(vector<double>& y, vector<unsigned int>& nodes){
throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to lechner@staff.uni-marburg.de to help fixing this issue.";
}
removeExternalEdges(groupZero);
removeExternalEdges(groupA);
removeExternalEdges(groupB);
}
double getConnectivity(vector<unsigned int>& nodes) {
// Special cases hotfix
if (nodes.size() == 2) {return 1;}
if (nodes.size() == 3) {
vector<unsigned int> min = get_deg_one(nodes);
// fully connected
if (min.size() == 0) {return 1;}
// not
else {return 1/3;}
// Special case quicky
if (nodes.size() == 2) {
if (debug_level > 1) cerr << "[DEBUG L2] Shortcut for two-piece component -> conn = 1" << endl;
return 1;
}
// Hotfix end
// quicky end
// Get max degree of nodes
unsigned int max_degree = max_deg(nodes);
......@@ -546,7 +569,9 @@ double getConnectivity(vector<unsigned int>& nodes) {
// Repeat until difference < epsilon
unsigned int iter = 0; // catch huge clustering issues by keeping track here
while(iter++ < max_iter) {
if (debug_level > 2 && iter%100 == 0) cerr << "[DEBUG L2] Step " << iter << " / " << max_iter << endl;
last_length = current_length;
// Get a new x
x = get_new_x(norm, nodes, mapping);
......@@ -557,9 +582,10 @@ double getConnectivity(vector<unsigned int>& nodes) {
// Get lenght (lambda) & normalize vector
norm = nomalize(x_hat, &current_length);
/// cerr << "I " << iter << ": " << abs(current_length-last_length) << endl;
if (abs(current_length-last_length) < 0.00001 && iter >= min_iter) break; // min 100 iterations (prevent convergence by chance), converge to 1e-6
if (abs(current_length-last_length) < 0.0001 && iter >= min_iter) break; // min 100 iterations (prevent convergence by chance), converge to 1e-6
}
/// cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl;
if (debug_level > 0) cerr << "[DEBUG] " << iter << " / " << max_iter << " iterations required (error is " << abs(current_length-last_length) << ")" << endl;
double connectivity = (-current_length+2*max_degree)/(nodes.size());
......@@ -622,13 +648,13 @@ void parse_file(string file) {
if (file_a == "file_a" && file_b == "file_b") continue; // Init Header
// Map species a
// Map species a if not known so far
if (species2id.find(file_a) == species2id.end()) {
species.push_back(file_a);
// cerr << "Species " << species_counter << ": " << file_a << endl;
species2id[file_a] = species_counter++;
}
// Map species b
// Map species b if not known so far
if (species2id.find(file_b) == species2id.end()) {
// cerr << "Species " << species_counter << ": " << file_b << endl;
species.push_back(file_b);
......@@ -644,17 +670,27 @@ void parse_file(string file) {
// cerr << protein_counter << ": " << fields[0] << " <-> " << fields[1] << endl;
// 5.16 deal with duplicated IDs by adding file ID to protein ID
string ida = fields[0];
string idb = fields[1];
fields[0] += " "; fields[0] += file_a_id;
fields[1] += " "; fields[1] += file_b_id;
// 5.16 do not point to yourself
if (!fields[0].compare(fields[1])) {continue;}
// A new protein
if (protein2id.find(fields[0]) == protein2id.end()) {
protein a;
a.full_name = fields[0];
a.full_name = ida;
a.species_id = file_a_id;
protein2id[fields[0]] = protein_counter++;
graph.push_back(a);
}
if (protein2id.find(fields[1]) == protein2id.end()) {
protein b;
b.full_name = fields[1]; b.species_id = file_b_id;
b.full_name = idb;
b.species_id = file_b_id;
protein2id[fields[1]] = protein_counter++;
graph.push_back(b);
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -477,14 +477,6 @@ GRNCKAADQPGILTHNKMCQCGDFGLPHEKYGFAHLSRSSFGVSCTACSNPTIVSALHRA
ESIHRSGAWLNSQDSTETGTNLFGPTSKESRESICGNQFSVTATIGQLGYVTDVLAHGMV
LAFCDFAKTASAGPPKEKVNCDDGHGGLMLEANVFSEESDVRKHLGLSEKGYHCVLTSKT
LVKFLIKNQTFHCGAEGCVPTGSGAFGD
>L_7
KEDAIRWPQLGGVSLYGEDANMELGADGVPSTSALGMPWPVLFNANLSGKQCAQCRIFIV
CHLTQPHFVGAMQVGMSGDDELKDQVKLNGGACKDEFRGNRTLMAMYPFARVLFATPSTV
SFDKFILKEGFGFLGRCAAKVAATAPLNGVTTACPVQVVSKCCNKGKKELEPLFTAREHA
ADGCGYASAFTVALEKDSYHCDYYSDHAQYASKSYAKSSRALASYFLIQFISCTKGLCSE
SHECVKNEFLVKIWPGSKMGASIPTNYVMLSDPAYPYECDDHQNNHCSGEKMPKLQNHPG
YSAFTQRQKSFTKHLTPAKGERKFDMNGVHLDIGKVTPTTSDEYVEALSLVHPTALNPAF
GMKAIYYLKRAYKKGLFGLDVHVSKNNTEASKKDYHVVT
>L_8
CRLPGGVNERAFGHVNDKDLCMLPSPVFCTQKGKEHPPNKEYQMAGSKPPTWPAAQVVDC
RHELRTYTGQLPMRSATDLGPNVKSIYTVSERGFKVGQTNHAVCFEAGNVEDKKWKMCHG
......
#!/usr/bin/perl
# Reads a Proteinortho graph from STDIN and reports every gene ID that
# occurs together with more than one species (input file). Conflicts are
# printed to STDOUT, one line per re-occurrence; the graph itself is not
# modified.
use warnings;
use strict;

# gene ID -> species (file name) it was first seen with
my %genehash;

if (defined($ARGV[0])) {
    print STDERR "Usage: remove_graph_duplicates.pl <GRAPH\n\nChecks a given graph input for duplicates and removes them (species only)\nReads from STDIN\n";
    exit;
}

# Current species pair. NOTE: deliberately NOT named $a/$b -- those are
# sort()'s package globals and lexicals with those names invite subtle bugs.
my $file_a = "unk";
my $file_b = "unk";

while (<STDIN>) {
    my @row = split(/\s+/);
    if ($_ =~ /^#/) {
        # Header line announces the species pair ("# fileA  fileB") for the
        # edges that follow; the column-description headers are skipped.
        unless ($row[1]) { die("Header line does not match graph format: $_"); }
        unless ($row[1] eq "file_a" || $row[1] eq "a") {
            $file_a = $row[1];
            $file_b = $row[2];
        }
    }
    else {
        unless ($row[1]) { next; }
        # First column belongs to species $file_a ...
        if (defined($genehash{$row[0]})) {
            if ($genehash{$row[0]} ne $file_a) {
                print "$row[0] was used with $genehash{$row[0]} and $file_a\n";
            }
        }
        else {
            $genehash{$row[0]} = $file_a;
        }
        # ... second column belongs to species $file_b.
        if (defined($genehash{$row[1]})) {
            if ($genehash{$row[1]} ne $file_b) {
                print "$row[1] was used with $genehash{$row[1]} and $file_b\n";
            }
        }
        else {
            $genehash{$row[1]} = $file_b;
        }
    }
}
#!/usr/bin/perl
# extract_from_graph.pl PROTEINORTHO_TABLE <GRAPH
# Collects all genes listed in a Proteinortho result table (first argument)
# and echoes every graph edge (read from STDIN) that touches one of them.
use warnings;
use strict;

unless (defined($ARGV[0])) {
    print STDERR "Usage: extract_from_graph.pl PROTEINORTHO_TABLE <GRAPH\n\nExtracts an orthology group from a given graph (STDIN)\n";
    exit;
}

my %use_gene;    # "gene species" -> occurrence count in the table
my @species;     # species (file) names taken from the table header

# 3-arg open with a lexical handle (2-arg open is unsafe with odd filenames).
open(my $po, '<', $ARGV[0]) || die("Could not open file $ARGV[0]: $!");
while (<$po>) {
    chomp;
    my @list = split(/\t+/);
    if ($list[0] eq "# Species") {
        # Header row: drop the three leading count columns, keep file names.
        @species = @list;
        splice(@species, 0, 3);
    }
    else {
        # Data row: drop the three leading count columns, keep gene lists.
        splice(@list, 0, 3);
        for (my $i = 0; $i < scalar(@list); $i++) {
            foreach my $gene (split(",", $list[$i])) {
                # Since 5.16 genes are identified by name AND species.
                my $id = $gene . ' ' . $species[$i];
                $use_gene{$id}++;
            }
        }
    }
}
close($po);

print STDERR scalar(keys %use_gene);
print STDERR " genes found\n";

# Current species pair announced by the most recent graph header line.
my $file_a;
my $file_b;
while (<STDIN>) {
    if ($_ =~ /^#/) {
        print $_;
        my @row = split(/\s+/);
        unless ($row[1]) { die("Header line does not match graph format: $_"); }
        unless ($row[1] eq "file_a" || $row[1] eq "a") {
            $file_a = $row[1];
            $file_b = $row[2];
        }
    }
    else {
        my @row = split(/\s+/);
        unless ($row[1]) { next; }
        # Keep the edge if either endpoint belongs to a requested group.
        my $ida = $row[0] . ' ' . $file_a;
        my $idb = $row[1] . ' ' . $file_b;
        if ($use_gene{$ida} || $use_gene{$idb}) {
            print $_;
        }
    }
}
#!/usr/bin/perl
# Converts a Proteinortho graph (read from STDIN) into Graphviz dot
# format on STDOUT: one undirected edge per graph line.
use warnings;
use strict;

print "graph G {\n";
while (<STDIN>) {
    chomp;
    # Skip header lines only (anchored, like the other graph tools).
    # An unanchored /#/ would also drop edges whose IDs contain a '#'.
    if ($_ =~ /^#/) { next; }
    my @x = split(/\s/);
    if ($x[1]) {
        # Quote node IDs: FASTA identifiers may contain characters
        # (e.g. '|', '.', '-') that are not legal in bare dot IDs.
        print " \"$x[0]\" -- \"$x[1]\";\n"
    }
}
print " }";
#!/usr/bin/perl
# Removes duplicated species-pair sections from a Proteinortho graph.
# Reads the graph from STDIN; every section whose (unordered) file pair
# has already been seen is dropped, everything else is echoed to STDOUT.
use warnings;
use strict;

my %filehash;     # "fileA,fileB" (sorted pair) -> number of sections seen
my $flag = 1;     # print current section? init 1 so nothing before the
                  # first header is silently swallowed

if (defined($ARGV[0])) {
    print STDERR "Usage: remove_graph_duplicates.pl <GRAPH\n\nChecks a given graph input for duplicates and removes them (species only)\nReads from STDIN\n";
    exit;
}

while (<STDIN>) {
    if ($_ =~ /^#/) {
        $flag = 1;
        my @row = split(/\s+/);
        unless ($row[1]) { die("Header line does not match graph format: $_"); }
        unless ($row[1] eq "file_a" || $row[1] eq "a") {
            # Deliberately NOT named $a/$b -- those are sort()'s globals.
            my $file_a = $row[1];
            my $file_b = $row[2];
            # Direction does not matter, so sort the pair before hashing.
            my $id = join(",", sort($file_a, $file_b));
            if (++$filehash{$id} > 1) {
                $flag = 0;    # duplicate pair: drop header and its edges
                print STDERR "Found duplicate: $file_a vs $file_b\n";
            }
        }
    }
    if ($flag) {
        print $_;
    }
}