Commit b74ffb2c authored by Andreas Tille

New upstream version 1.2.0

parent 98ebe318
#!/usr/bin/perl -w
##
# Author: Ben Langmead
# Date: 2/14/2010
#
# Routines for obtaining AWS access credentials from the environment
# or from the Hadoop configuration.
#
package AWS;

##
# Holds the AWS access/secret key pair and knows how to find values
# for them from the environment or from the Hadoop configuration.
#
use strict;
use warnings;
use File::Basename qw(dirname);

our $accessKey = "";
our $secretKey = "";

##
# If either $accessKey or $secretKey are not already set, look some
# more places for them: first the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY entries of %$env, then the fs.s3n.* properties
# in the XML files under $HADOOP_HOME/conf.  Dies if either key cannot
# be found anywhere.
#
# Arguments:
#   $hadoop     - default path to the hadoop executable
#   $hadoop_arg - user-specified hadoop path (takes precedence when non-empty)
#   $env        - reference to an environment hash (typically \%ENV)
#
sub ensureKeys($$$) {
	my ($hadoop, $hadoop_arg, $env) = @_;
	my $hadoopHome = $env->{HADOOP_HOME};
	if(!defined($hadoopHome)) {
		# Derive HADOOP_HOME from the hadoop executable's location
		$hadoop = $hadoop_arg if $hadoop_arg ne "";
		if(-x $hadoop) {
			# Use File::Basename::dirname rather than shelling out to
			# the external dirname utility
			$hadoopHome = dirname($hadoop);
			$hadoopHome .= "/..";
		}
	}
	if($accessKey eq "") {
		if(defined($env->{AWS_ACCESS_KEY_ID})) {
			$accessKey = $env->{AWS_ACCESS_KEY_ID};
		} elsif(defined($hadoopHome)) {
			# NOTE(review): $hadoopHome is interpolated into a shell
			# pipeline; paths with shell metacharacters are not handled
			# safely here
			$accessKey = `grep fs.s3n.awsAccessKeyId $hadoopHome/conf/*.xml | sed 's/.*<value>//' | sed 's/<\\/value>.*//'`;
			$accessKey =~ s/\s.*$//; # In case we got multiple lines back
			if($accessKey eq "") {
				print STDERR "Couldn't get access key from $hadoopHome/conf/*.xml\n";
			}
		}
		if($accessKey eq "") {
			# Typo "retrived" fixed in this user-facing message
			die "--accesskey was not specified, nor could the access ".
			    "key be retrieved from an environment variable or from ".
			    "the \$HADOOP_HOME/conf directory\n";
		}
	}
	if($secretKey eq "") {
		if(defined($env->{AWS_SECRET_ACCESS_KEY})) {
			$secretKey = $env->{AWS_SECRET_ACCESS_KEY};
		} elsif(defined($hadoopHome)) {
			$secretKey = `grep fs.s3n.awsSecretAccessKey $hadoopHome/conf/*.xml | sed 's/.*<value>//' | sed 's/<\\/value>.*//'`;
			$secretKey =~ s/\s.*$//; # In case we got multiple lines back
			if($secretKey eq "") {
				print STDERR "Couldn't get secret key from $hadoopHome/conf/*.xml\n";
			}
		}
		if($secretKey eq "") {
			die "--secretkey was not specified, nor could the secret ".
			    "key be retrieved from an environment variable or from ".
			    "the \$HADOOP_HOME/conf directory\n";
		}
	}
}
1;
This diff is collapsed.
#!/usr/bin/perl
##
# BinSort.pl
#
# A utility for binning and sorting input data in parallel. Input
# files are partitioned into per-key bins, each of which is then
# sorted in a separate process.
#
use strict;
use warnings;
use Getopt::Long;
use FindBin qw($Bin);
use lib $Bin;
use lib "$Bin/contrib";
use Cwd 'abs_path';
use ForkManager;
use IO::File;
use List::Util qw[min max];
# ------------------------------------------------------------------------
# Command-line options and defaults
# ------------------------------------------------------------------------
my $input = "";          # comma-separated list of input files/directories
my $output = "";         # final output directory
my $intermediate = "";   # scratch directory (defaults to "$output.pre")
my $prefix = "";         # prefix for every output file name
my $suffix = "";         # suffix for every output file name
my $delim = "\t";        # input field delimiter
my $sortSize = "";       # sort -S memory budget (recomputed before sorting)
my $cores = 0;           # number of parallel binners/sorters
my $sortArgs = "";       # extra arguments passed through to sort
my $verbose = 0;         # verbose logging
my $force = 0;           # overwrite existing directories
my $keep = 0;            # keep intermediate files
my $excludeUnmapped = 0; # drop records whose bin key has no -bin-map entry
my @bin = ();            # 1-based field indexes that form the bin key
my $binmapStr = "";      # name of file mapping bin keys to file names
my %binmap = ();         # in-memory bin-key -> sanitized-name map
GetOptions (
	"input:s" => \$input,
	"intermediate:s" => \$intermediate,
	"output:s" => \$output,
	"bin:s" => \@bin,
	"sort:s" => \$sortArgs,
	"delim:s" => \$delim,
	"S:i" => \$sortSize,
	"size:i" => \$sortSize,
	"cores:i" => \$cores,
	"bin-map:s" => \$binmapStr,
	"binmap:s" => \$binmapStr,
	"exclude-unmapped" => \$excludeUnmapped,
	"prefix:s" => \$prefix,
	"suffix:s" => \$suffix,
	"keep-all" => \$keep,
	"verbose" => \$verbose,
	"force" => \$force) || die "Bad option\n";
# Leftover positional arguments are treated as additional inputs
if(scalar(@ARGV) > 0) {
	$input .= "," if $input ne "";
	$input .= join(",", @ARGV);
}
# Default the delimiter to tab if an empty one was given
$delim = "\t" if $delim eq "";
print STDERR "# parallel binners/sorters: $cores\n";
print STDERR "Input: $input\n";
print STDERR "Output: $output\n";
print STDERR "Sort memory footprint (total): $sortSize\n";
print STDERR "Output prefix/suffix: $prefix/$suffix\n";
print STDERR "Delimiter (ascii): ".ord($delim)."\n";
print STDERR "Options: [ ";
print STDERR "-keep-all " if $keep;
print STDERR "-force " if $force;
print STDERR "]\n";
##
# Ensure directory $dir exists and is fresh: if it already exists, die
# unless -force was given, in which case it is removed and recreated.
#
sub checkDir($) {
	my $dir = shift;
	if(-d $dir) {
		# Without -force, refuse to clobber an existing directory
		die "Output directory $dir already exists" unless $force;
		# $force is necessarily true past the unless-die above, so the
		# original redundant if($force) test has been dropped
		print STDERR "Removing directory $dir due to -force\n";
		# NOTE(review): $dir is interpolated into a shell command;
		# paths containing shell metacharacters are not handled safely
		system("rm -rf $dir >/dev/null 2>/dev/null");
		-d $dir && die "Could not remove directory $dir";
	}
	system("mkdir -p $dir >/dev/null 2>/dev/null");
	-d $dir || die "Could not create new directory $dir";
}
# Create (or forcibly replace) the output and intermediate directories
checkDir("$output");
# Derive a scratch directory name from the output path if none given
$intermediate = "$output.pre" if $intermediate eq "";
my $binsOut = "$intermediate/bins";     # per-bin data files live here
my $binsErr = "$intermediate/bins.err"; # per-bin sort error logs live here
checkDir($binsOut);
checkDir($binsErr);
# Resolve to an absolute path since forked children redirect into it
$output = abs_path($output);
##
# Make a string into an acceptable filename.
#
##
# Make a string into an acceptable filename by replacing every
# character outside [.,#A-Za-z0-9_-] with an underscore.  The original
# character-by-character loop is replaced with a single substitution;
# the original class [01-9] is equivalent to [0-9].
#
sub fsSanitize($) {
	my $f = shift;
	(my $ret = $f) =~ s/[^.,#A-Za-z0-9_-]/_/g;
	return $ret;
}
# Load the optional bin-key -> file-name map.  Each line of the map
# file is "key<TAB>value"; values are sanitized into safe file names.
if($binmapStr ne "") {
	open (BINMAP, $binmapStr) || die "Could not open $binmapStr for reading\n";
	print "Bin map = {\n" if $verbose;
	while(<BINMAP>) {
		chomp;
		my @s = split /\t/;
		scalar(@s) == 2 || die "Expected key-tab-value, got:\n$_\n";
		my ($k, $v) = @s;
		# Later mappings silently win; warn so the user notices
		defined($binmap{$k}) && print "WARNING: Key $k is mapped more than once\n";
		$binmap{$k} = fsSanitize($v);
		print " $k => $binmap{$k}\n" if $verbose;
	}
	print "}\n" if $verbose;
	close(BINMAP);
}
print "Starting fork manager\n" if $verbose;
# Parallel::ForkManager caps the number of concurrently forked workers.
# Method-call syntax is used instead of indirect-object "new Class(...)"
# syntax, which Perl can misparse.
my $pm = Parallel::ForkManager->new($cores);
# All bins must be >= 1
for my $b (@bin) { $b > 0 || die "A -bin was $b, but must be > 0\n"; }
# Setup a callback for when a child finishes up so we can
# get its exit code
my $childFailed = 0;
my $childFailedPid = 0;
$pm->run_on_finish(
	sub {
		my ($pid, $exit_code, $ident) = @_;
		if($exit_code != 0) {
			$childFailed = $exit_code;
			$childFailedPid = $pid;
		}
	}
);
# First, determine the number of input files
my $ninputs = 0;
for my $inp (split(/,/, $input)) {
	$inp = abs_path($inp);
	-d $inp || -f $inp || die "No such input file or directory as \"$inp\"\n";
	my @fs = ();
	if(-d $inp) { @fs = <$inp/*>; }
	else { push @fs, $inp; }
	$ninputs += scalar(@fs);
}
print STDERR "Found $ninputs input files\n";
# ------------------------------------------------------------------------
# Bin phase: fork one child per input file; each child splits its file's
# records into per-bin files under $binsOut/<basename>/<binval>.
# ------------------------------------------------------------------------
my %filesDone = ();
my %bases = ();
print STDERR "--- Bin ---\n";
my $fi = 0;
for my $inp (split(/,/, $input)) {
	$inp = abs_path($inp) if $inp ne "-";
	-d $inp || -f $inp || $inp eq "-" || die "No such input file or directory as \"$inp\"\n";
	my @fs = ();
	if(-d $inp) { @fs = <$inp/*>; }
	else { push @fs, $inp; }
	scalar(@fs) > 0 || die "No input files in directory \"$inp\"\n";
	# For each input file (in current dir)
	for my $f (@fs) {
		my $base = `basename $f`;
		chomp($base);
		defined($bases{$base}) && die "Attempted to process file $base more than once\n";
		$bases{$base} = 1; # parent keeps track of all the basenames
		$fi++;
		if($childFailed) {
			print STDERR "Aborting master loop because child failed\n";
			last;
		}
		$pm->start and next; # fork off a mapper for this input file
		print STDERR "Pid $$ processing input $f [$fi of $ninputs]...\n";
		# Open the input, decompressing if necessary.  Low-precedence
		# "or" is used so a failed open actually triggers the die; the
		# original "open ... || die" bound || to the always-true string
		# argument, making the die unreachable.
		if($f =~ /\.gz$/) {
			open(INP, "gzip -dc $f |") or die "Could not open pipe 'gzip -dc $f |'";
		} elsif($f =~ /\.bz2$/) {
			open(INP, "bzip2 -dc $f |") or die "Could not open pipe 'bzip2 -dc $f |'";
		} else {
			open(INP, $f) or die "Could not open $f for reading\n";
		}
		my $lastBin = undef;    # bin key of the previous record
		my $lastBinval = undef; # mapped/sanitized file name for $lastBin
		my %outfhs = ();        # per-bin output filehandles
		while(<INP>) {
			chomp;
			my @s = split /$delim/;
			my $binkey = "";
			# For each binning dimension
			for my $b (@bin) {
				$b <= scalar(@s) || die "Bad bin index $b; line only had ".scalar(@s)." tokens:\n$_\n";
				$binkey .= $s[$b-1];
			}
			if(defined($lastBin) && $binkey eq $lastBin) {
				# Fast, common case; do what we did last time
				defined($lastBinval) || die;
				print {$outfhs{$lastBinval}} "$_\n";
			} else {
				# Use -binmap to map the bin key. If no mapping exists,
				# keep the same key (but sanitized).
				unless(defined($binmap{$binkey})) {
					next if $excludeUnmapped;
					# Make a mapping to a sanitized version of binkey
					$binmap{$binkey} = fsSanitize($binkey);
				}
				my $binval = $binmap{$binkey};
				unless(defined($outfhs{$binval})) {
					system("mkdir -p $binsOut/$base");
					my $ofn = "$binsOut/$base/$binval";
					print STDERR "Opened filehandle $ofn" if $verbose;
					print STDERR "; ".scalar(keys %outfhs)." open in PID $$\n" if $verbose;
					$outfhs{$binval} = new IO::File($ofn, "w");
					$outfhs{$binval} || die "Could not open $ofn for writing\n";
				}
				print {$outfhs{$binval}} "$_\n";
				$lastBin = $binkey;
				$lastBinval = $binval;
			}
		}
		# Close output handles
		for my $bin (keys %outfhs) { $outfhs{$bin}->close() };
		# Close input handle
		close(INP);
		$? == 0 || die "Bad exitlevel from input slurp: $?\n";
		$pm->finish; # end of fork
	}
}
# Now collect a list of all the binvals. We couldn't have (easily)
# collected them in the previous loop because the binvals were known
# only to the child processes and not to the parent. But we can
# reconstitute them based on the file names.
my %binvals = ();
for my $base (keys %bases) {
	for my $f (<$binsOut/$base/*>) {
		my $b = `basename $f`;
		chomp($b);
		$binvals{$b} = 1;
	}
}
# Divide the overall sort-memory budget among the sorters that can run
# concurrently.  Guard against division by zero, which the original
# code hit whenever -cores was left at its default of 0 or no bins
# were produced.
my $nsorters = min($cores, scalar(keys %binvals));
$nsorters = 1 if $nsorters < 1;
$sortSize = int((3 * 1024 * 1024)/$nsorters);
my $bi = 0;
my $sortCmd = "sort -S $sortSize $sortArgs";
print STDERR "--- Sort ---\n";
print STDERR "Sort command: $sortCmd\n";
# Sort phase: fork one child per bin value; each child concatenates
# that bin's files from every input basename and sorts them into the
# final output directory.
for my $binval (sort keys %binvals) {
	$bi++;
	if($childFailed) {
		print STDERR "Aborting master loop because child failed\n";
		last;
	}
	$pm->start and next; # fork off a mapper for this input file
	print STDERR "Pid $$ processing bin $binval [$bi of ".scalar(keys %binvals)."]...\n";
	# Collect this bin's file from each input basename that produced one
	my $inps = "";
	for my $base (keys %bases) {
		if(-f "$binsOut/$base/$binval") {
			$inps .= "$binsOut/$base/$binval ";
		}
	}
	my $ret = system("$sortCmd $inps >$output/$prefix$binval$suffix 2>$binsErr/$binval");
	if($ret == 0 && !$keep) {
		# Delete all the files that were inputs to the sort
		system("rm -f $inps");
	}
	exit $ret; # child exits with the (raw) status of the sort
}
$pm->wait_all_children;
if($childFailed) {
	die "Aborting because child with PID $childFailedPid exited abnormally\nSee previous output\n";
} else {
	print STDERR "All children succeeded\n";
}
print STDERR "DONE\n";
# No errors; remove scratch space unless the user asked to keep it
unless($keep) {
	print STDERR "Removing $intermediate (to keep, specify -keep-all)\n";
	system("rm -rf $intermediate");
}
#!/usr/bin/perl -w
##
# CBFinish.pl
#
# Authors: Ben Langmead & Michael C. Schatz
# Date: October 20, 2009
#
# Put a proper chromosome name back onto all Crossbow records.
#
# Author: Ben Langmead
# Date: February 11, 2010
#
use strict;
use warnings;
use 5.004;
use Getopt::Long;
use IO::File;
use Carp;
use FindBin qw($Bin);
use lib $Bin;
use Counters;
use Get;
use Util;
use Tools;
use AWS;
use File::Path qw(mkpath);
{
	# Force STDERR to autoflush so log lines appear immediately
	select((select(STDERR), $| = 1)[0]);
}
##
# Echo a command to STDERR, execute it with system(), and return the
# raw status value system() produced.
#
sub run($) {
	my $command = shift;
	print STDERR "Postprocess.pl: Running \"$command\"\n";
	return system($command);
}
# We want to manipulate counters before opening stdin, but Hadoop seems
# to freak out when counter updates come before the first <STDIN>. So
# instead, we append counter updates to this list.
my @counterUpdates = ();

##
# Emit a single Hadoop counter-update directive on STDERR; croak if
# given an undefined update string.
#
sub counter($) {
	my $update = shift;
	croak("Undefined counter update") unless defined($update);
	print STDERR "reporter:counter:$update\n";
}

##
# Emit every queued counter update, then empty the queue.
#
sub flushCounters() {
	counter($_) for @counterUpdates;
	@counterUpdates = ();
}
push @counterUpdates, "Postprocess,Invoked,1";
# Command-line option storage (filled in by GetOptions below)
my $cmap_file = ""; # chromosome-map file (name<whitespace>index pairs)
my $cmap_jar = "";  # archive expected to contain cmap.txt
my $dest_dir = "";  # local directory where the jar is expanded
my $output = "";    # destination dir/URL (s3, hdfs, or local path)
my $cntfn = "";     # counters output file
##
# Print a message to STDERR and exit with the given exitlevel
# (defaulting to 1 when none, or a false value, is supplied).
#
sub dieusage {
	my ($text, $exitlevel) = @_;
	$exitlevel ||= 1;
	print STDERR "$text\n";
	exit $exitlevel;
}
##
# Log a message line to STDERR, prefixed with the script name; croak
# if handed an undefined message.  Trailing CR/LF are stripped first.
#
sub msg($) {
	my $text = shift;
	croak("Undefined message") unless defined($text);
	$text =~ s/[\r\n]*$//;
	print STDERR "CBFinish.pl: $text\n";
}
# Locate the external tools (s3cmd, jar, hadoop, wget) and snapshot the
# environment before Tools::purgeEnv() scrubs it
Tools::initTools();
my %env = %ENV;
GetOptions (
	"output:s" => \$output,
	"s3cmd:s" => \$Tools::s3cmd_arg,
	"s3cfg:s" => \$Tools::s3cfg,
	"jar:s" => \$Tools::jar_arg,
	"accessid:s" => \$AWS::accessKey,
	"secretid:s" => \$AWS::secretKey,
	"hadoop:s" => \$Tools::hadoop_arg,
	"wget:s" => \$Tools::wget_arg,
	"cmap:s" => \$cmap_file,
	"cmapjar:s" => \$cmap_jar,
	"destdir:s" => \$dest_dir,
	"counters:s" => \$cntfn) || dieusage("Bad option", 1);
Tools::purgeEnv();
# Default the jar-extraction directory to the working directory
$dest_dir = "." if $dest_dir eq "";
msg("s3cmd: found: $Tools::s3cmd, given: $Tools::s3cmd_arg");
msg("jar: found: $Tools::jar, given: $Tools::jar_arg");
msg("hadoop: found: $Tools::hadoop, given: $Tools::hadoop_arg");
msg("wget: found: $Tools::wget, given: $Tools::wget_arg");
msg("s3cfg: $Tools::s3cfg");
msg("cmap_file: $cmap_file");
msg("cmap_jar: $cmap_jar");
msg("local destination dir: $dest_dir");
msg("Output dir: $output");
msg("ls -al");
msg(`ls -al`);
# If a cmap jar was given, the extraction directory must exist
if($cmap_jar ne "") {
	mkpath($dest_dir);
	(-d $dest_dir) || die "-destdir $dest_dir does not exist or isn't a directory, and could not be created\n";
}
if($cmap_file ne "" && ! -f $cmap_file) {
	die "-cmap file $cmap_file doesn't exist or isn't readable\n";
}
##
# Push result file $fn to the output destination, which may be an S3
# URL, an HDFS URL, or a local directory (chosen by prefix of $output).
#
# NOTE(review): this appends a trailing "/" to the global $output on
# first call — a side effect visible to later callers.
#
sub pushResult($) {
	my $fn = shift;
	msg("Pushing $fn");
	$output .= "/" unless $output =~ /\/$/;
	if($output =~ /^s3/i) {
		Get::do_s3_put($fn, $output, \@counterUpdates, \%env);
	} elsif($output =~ /^hdfs/i) {
		my $ret = Get::do_hdfs_put($fn, $output, \@counterUpdates);
		if($ret != 0) {
			msg("Fatal error: could not put result file $fn into HDFS directory $output");
			exit 1;
		}
	} else {
		# Local filesystem destination
		mkpath($output);
		(-d $output) || die "Could not create output directory: $output\n";
		run("cp $fn $output") == 0 || die;
	}
}
# Maps short chromosome indexes back to their proper names
my %cmap = ();

##
# Load the chromosome map from file $f into %cmap.  Each whitespace-
# separated line "name index" yields $cmap{index} = name.  Lines with
# fewer than two fields or an empty first field are skipped.
#
sub loadCmap($) {
	my $f = shift;
	if($f ne "" && -e $f) {
		# Checked, three-argument open with a lexical handle; the
		# original bareword two-argument open was never error-checked.
		open(my $cmapfh, '<', $f) or die "Could not open $f for reading\n";
		while(<$cmapfh>) {
			chomp;
			my @s = split;
			# Test the field count first so $s[0] is never compared
			# while undefined (avoids an uninitialized-value warning)
			next if $#s < 1 || $s[0] eq "";
			$cmap{$s[1]} = $s[0];
			push @counterUpdates, "Postprocess,Chromosome map entries loaded,1";
		}
		close($cmapfh);
	}
}
# If the chromosome map arrives inside a jar, fetch and expand it, then
# expect cmap.txt at the top of the extraction directory.
if($cmap_jar ne "") {
	msg("Ensuring cmap jar is installed");
	Get::ensureFetched($cmap_jar, $dest_dir, \@counterUpdates, undef, undef, \%env);
	push @counterUpdates, "Postprocess,Calls to ensureJar,1";
	$cmap_file = "$dest_dir/cmap.txt";
	msg("Examining extracted files");
	msg("find $dest_dir");
	print STDERR `find $dest_dir`;
	unless(-f $cmap_file) {
		die "Extracting jar didn't create \"$dest_dir/cmap.txt\" file.\n";
	}
}
loadCmap($cmap_file) if $cmap_file ne "";
my %outfhs = (); # per-chromosome output filehandles
my %recs = ();   # per-chromosome record counts
my $lines = 0;
# Main loop: relabel each record's chromosome via %cmap and partition
# records into one temporary file per chromosome.
while(<STDIN>) {
	next if /^\s*FAKE\s*$/; # skip placeholder records
	next if /^\s*$/;
	$lines++;
	flushCounters() if scalar(@counterUpdates) > 0;
	next unless $_ ne "";
	my @ss = split(/\t/);
	my $chr = $ss[0];
	$chr = $cmap{$chr} if defined($cmap{$chr});
	unless(defined($outfhs{$chr})) {
		counter("Postprocess,Chromosomes observed,1");
		# NOTE(review): $chr is used verbatim in a file name; assumes
		# map values are filesystem-safe — confirm against cmap source
		$outfhs{$chr} = new IO::File(".tmp.CBFinish.pl.$$.$chr", "w");
	}
	$ss[0] = $chr;
	$ss[1] = int($ss[1]); # remove leading 0s
	# No explicit newline needed: the line was never chomped, so the
	# final element of @ss still carries the original terminator
	print {$outfhs{$chr}} join("\t", @ss);
	$recs{$chr}++;
}
msg("Read $lines lines of output");
# Compress each per-chromosome file and push it to the output location
for my $chr (keys %outfhs) {
	counter("Postprocess,SNPs for chromosome $chr,$recs{$chr}");
	$outfhs{$chr}->close();
	my $fn = ".tmp.CBFinish.pl.$$.$chr";
	run("gzip -c < $fn > $chr.gz") == 0 || die "Couldn't gzip $fn\n";
	$fn = "$chr.gz";
	pushResult($fn);
	counter("Postprocess,Chromosome files pushed,1");
};
counter("Postprocess,0-SNP invocations,1") if $lines == 0;
flushCounters() if scalar(@counterUpdates) > 0;
#!/usr/bin/perl
##
# MapWrap.pl
#
# Simple wrapper that mimics some of Hadoop's behavior during the
# Map step of a MapReduce computation.
#
use strict;
use warnings;
use Getopt::Long;
use FindBin qw($Bin);
use lib $Bin;
use lib "$Bin/contrib";
use Cwd 'abs_path';
use Wrap;
use File::Path qw(mkpath);
use POSIX qw/strftime/;
my $input = "";        # input directory
my $output = "";       # output directory
my $intermediate = ""; # intermediate (scratch) directory
my $force = 0;         # remove pre-existing directories
my $verbose = 0;       # NOTE(review): declared but unused in this excerpt
# Version string read from the VERSION file alongside this script
my $VERSION = `cat $Bin/VERSION`; $VERSION =~ s/\s//g;
# Boilerplate appended to every fatal error message
my $support = qq!
When requesting support, please include the full output printed here.
If a child process was the cause of the error, the output should
include the relevant error message from the child's error log. You may
be asked to provide additional files as well.
!;
##
# Printer that prints to STDERR and, optionally, to a file for messages.
#
# Optional message-log file name and its handle ("" / undef = STDERR only)
my $msgfn = "";
my $msgfh = undef;

##
# Log one line to STDERR and, when a message file is open, to that
# file as well.  Trailing CR/LF characters are stripped first.
#
sub msg($) {
	my $text = shift;
	$text =~ s/[\r\n]*$//;
	print STDERR "$text\n";
	print {$msgfh} "$text\n" if defined($msgfh);
}
##
# Printer that prints to STDERR and, optionally, to a file for counters.
#
# Optional counters file/directory names and the counters filehandle
my ($cntfn, $cntdir) = ("", "");
my $cntfh = undef;

##
# Log a counter line to STDERR and, when a counters file is open, to
# that file as well.  Trailing CR/LF characters are stripped first.
#
sub cnt($) {
	my $text = shift;
	$text =~ s/[\r\n]*$//;
	print STDERR "$text\n";
	print {$cntfh} "$text\n" if defined($cntfh);
}
##
# Print an error message, a support message, then die with given
# exitlevel.
#
##
# Report a fatal error (tagged with the version and exitlevel), print
# the support blurb, then exit with the given exitlevel.
#
sub mydie($$) {
	my ($text, $level) = @_;
	msg("Fatal error $VERSION:D$level: $text");
	msg($support);
	exit $level;
}
GetOptions (
	"messages:s" => \$msgfn,
	"counters:s" => \$cntdir,
	"intermediate:s" => \$intermediate,
	"input:s" => \$input,
	"output:s" => \$output,
	"force" => \$force) || die "Bad option\n";
# Open the optional message log (append mode) before any further output
if($msgfn ne "") {
	open($msgfh, ">>$msgfn") || mydie("Could not open message-out file $msgfn for writing", 15);
}
# All four directories are mandatory
$input ne "" || mydie("Must specify input directory with --input", 10);
$intermediate ne "" || mydie("Must specify intermediate directory with --intermediate", 10);
$output ne "" || mydie("Must specify output directory with --output", 10);
$cntdir ne "" || mydie("Must specify counters directory with --counters", 10);
msg("=== Directory checker ===");
msg("Time: ".strftime('%H:%M:%S %d-%b-%Y', localtime));
msg("Input: $input");
msg("Output: $output");
msg("Intermediate: $intermediate");
msg("Counters: $cntdir");
msg("Options: [ ".($force ? "--force " : "")."]");
##
# Ensure directory $dir exists, honoring --force.
#
# Arguments:
#   $dir           - directory to create
#   $forceoverride - when true, an existing directory is kept (not
#                    removed) even under --force
#
# NOTE(review): when the directory exists, --force is still required
# even if $forceoverride is set; otherwise this aborts (exit 20).
#
sub checkDir {
	my ($dir, $forceoverride) = @_;
	if(-d $dir) {
		mydie("Output directory $dir already exists", 20) unless $force;
		if($force && !$forceoverride) {
			msg("Removing directory $dir due to --force");
			# NOTE(review): $dir is interpolated into a shell command;
			# paths with shell metacharacters are unsafe here
			system("rm -rf $dir >/dev/null 2>/dev/null");
			-d $dir && mydie("Could not remove directory $dir", 30);
		}
	}
	mkpath($dir);
	(-d $dir) || mydie("Could not create new directory $dir", 40);
}
# Validate/prepare the output, intermediate, and counters directories,
# then close the optional message log
checkDir($output);
checkDir($intermediate);
if(defined($cntdir) && $cntdir ne "") {
	checkDir($cntdir);
}
close($msgfh) if $msgfn ne "";
This diff is collapsed.
#!/usr/bin/perl -w
##
# Counters.pl
#
# Authors: Ben Langmead
# Date: February 14, 2010
#
# Get all the counters and put them in the output directory.
#
use strict;
use warnings;
use Getopt::Long;
use POSIX qw/strftime/;
use FindBin qw($Bin);
use lib $Bin;
use Get;