Skip to content
Commits on Source (2)
#!/bin/bash
# Implementation of Lima-Mendez et al. (DOI: 10.1093/molbev/msn023)
# method of clustering of bacteriophages based on gene shared content.
#
# Used a FASTA sequence from NCBI and Skunavirus inclusive
#
# Requirements:
# * Prodigal - ORFs caller (DOI: 10.1186/1471-2105-11-119)
# * Diamond- - homology search engine (DOI: https://doi.org/10.1038/nmeth.3176)
# * MCL - clustering algorithm (DOI: https://doi.org/10.1137/040608635)
# Modified implementation of the above paper in the public domain (scientific)
# Original file by kseniaarkhipova
threads=$(nproc)
prodigal='prodigal'
diamond='/usr/lib/debian-med/bin/diamond'
LM_scripts='./'
mcl=''
fasta=${1}
#Predicting ORFs, diamond all-vs-all
${prodigal} -i ${fasta} -a predicted_prot.faa -o orfs.gff -p meta -q -f gff
"$diamond" makedb --in predicted_prot.faa -d proteins_database.dmnd
"$diamond" blastp --db proteins_database.dmnd -q predicted_prot.faa -o diamond_result.txt --quiet
#Filtering of an alignment result, proteins with hits with bitscore > 50 considered as related
awk -F'\t' -v OFS='\t' '($1 != $2) && $12 > 50 {print $1, $2, $11}' diamond_result.txt > protein_graphs.txt
#Generation of protein families
"$mcl"mcxload -abc protein_graphs.txt --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o protein_graphs.mci -write-tab protein_graphs.tab
"$mcl"mcl protein_graphs.mci -I 2
"$mcl"mcxdump -icl out.protein_graphs.mci.I20 -tabr protein_graphs.tab -o clusters.protein_graphs.mci.I20
#Pairwise comparison of contigs and assessment of shared gene content
"$LM_scripts"hypergeom.py clusters.protein_graphs.mci.I20
#Building clusters of genomes
"$mcl"mcxload -abc genome_graphs.txt --stream-mirror -o genome_graphs.mci -write-tab genome_graphs.tab
"$mcl"mcl genome_graphs.mci -I 1.2
"$mcl"mcl genome_graphs.mci -I 2
"$mcl"mcl genome_graphs.mci -I 4
"$mcl"mcl genome_graphs.mci -I 6
"$mcl"mcxdump -icl out.genome_graphs.mci.I12 -tabr genome_graphs.tab -o clusters.genome.mci.I12
"$mcl"mcxdump -icl out.genome_graphs.mci.I20 -tabr genome_graphs.tab -o clusters.genome.mci.I20
"$mcl"mcxdump -icl out.genome_graphs.mci.I40 -tabr genome_graphs.tab -o clusters.genome.mci.I40
"$mcl"mcxdump -icl out.genome_graphs.mci.I60 -tabr genome_graphs.tab -o clusters.genome.mci.I60
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/env python2
# Pairwise comparison of contigs and assessment of shared gene content
from __future__ import division
import scipy.stats as stats
import re
import math
import sys
clusters = sys.argv[1]
families = {}
names2 = []
with open(clusters) as file2:
for line in file2:
dat = line.strip().split('\t')
for i in dat:
contig = re.search('(\S+)_\d+', i).group(1)
families[contig] = set()
names2.append(contig)
names = list(set(names2))
with open(clusters) as file1:
count = 1
for line in file1:
dat = line.strip().split('\t')
for i in dat:
k = re.search('(\S+)_\d+', i).group(1)
if k in families:
families[k].add('family_' + str(count))
count += 1
total = []
file_out = open('genome_graphs.txt', 'w')
for i in range(len(names)):
for j in range(len(names)):
if names[i] != names[j]:
common = len(families[names[i]].intersection(families[names[j]]))
c = max(len(families[names[i]]), len(families[names[j]]))
b = min(len(families[names[i]]), len(families[names[j]]))
p = stats.hypergeom.sf(common - 1, count - 1, c, b)
if p != 0:
link_strength = - math.log10(p * (len(names) * (len(names) - 1) / 2))
if link_strength > 0 and names[i] != names[j]:
l = [names[i], names[j], str(link_strength)]
file_out.write('\t'.join(l) + '\n')
else:
l = [names[i], names[j], "100"]
file_out.write('\t'.join(l) + '\n')
file_out.close()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.