vcf2dag.cpp 4.97 KB
Newer Older
1 2
#include "Variant.h"
#include "BedReader.h"
3
#include "IntervalTree.h"
4
#include <getopt.h>
5
#include "Fasta.h"
6 7 8 9 10
#include <algorithm>
#include <list>
#include <set>

using namespace std;
11
using namespace vcflib;
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168


void printSummary(char** argv) {
    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
         << endl
         << "options:" << endl 
         << "    -r, --reference FILE         FASTA reference file." << endl
         << endl
         << "Modify the VCF file so that homozygous regions are included as REF/. calls." << endl
         << "For each ref and alt allele, assign an index.  These steps are sufficient to" << endl
         << "enable use of the VCF as a DAG (specifically a partially-ordered graph)." << endl;
    exit(0);
}

int main(int argc, char** argv) {

    string vcfFileName;
    string fastaFileName;

    bool adjustVcf = false;

    if (argc == 1)
        printSummary(argv);

    int c;
    while (true) {
        static struct option long_options[] =
            {
                /* These options set a flag. */
                //{"verbose", no_argument,       &verbose_flag, 1},
                {"help", no_argument, 0, 'h'},
                {"reference", required_argument, 0, 'r'},
                {0, 0, 0, 0}
            };
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long (argc, argv, "hr:",
                         long_options, &option_index);

        if (c == -1)
            break;

        switch (c) {

	    case 'r':
            fastaFileName = string(optarg);
            break;

        case 'h':
            printSummary(argv);
            break;

        case '?':
            printSummary(argv);
            exit(1);
            break;

        default:
            abort ();
        }
    }

    VariantCallFile variantFile;
    string inputFilename;
    if (optind == argc - 1) {
        inputFilename = argv[optind];
        variantFile.open(inputFilename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        cerr << "could not open VCF file" << endl;
        exit(1);
    }

    FastaReference reference;
    if (fastaFileName.empty()) {
        cerr << "a reference is required" << endl;
        exit(1);
    } else {
        reference.open(fastaFileName);
    }
    
    string idname = "id";
    long int uid = 0;

    variantFile.addHeaderLine("##INFO=<ID="+idname+".alt,Number=A,Type=Integer,Description=\"Unique numerical identifier of alt allele.\">");
    variantFile.addHeaderLine("##INFO=<ID="+idname+".ref,Number=1,Type=Integer,Description=\"Unique numerical identifier of ref allele.\">");
    cout << variantFile.header << endl;

    long int last_end = 1;
    string sequenceName;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {

        if (sequenceName.empty()) {
            sequenceName = var.sequenceName;
        } else if (sequenceName != var.sequenceName) {
            // emit last record from previous chrom
            // these should be refactored.....
            Variant refvar(variantFile);
            if (var.position - last_end > 0) {
                refvar.ref = reference.getSubSequence(sequenceName, last_end - 1, var.position - last_end);
                refvar.quality = 0;
                refvar.position = last_end;
                refvar.sequenceName = sequenceName;
                refvar.info[idname+".ref"].push_back(convert(uid++));
                cout << refvar << endl;
            }
            last_end = 1;
            sequenceName = var.sequenceName;
        }

        // generate the last reference record if we have sequence between variants
        if (var.position - last_end > 0) {
            Variant refvar(variantFile);
            refvar.quality = 0;
            refvar.position = last_end;
            refvar.sequenceName = sequenceName;
            refvar.ref = reference.getSubSequence(sequenceName, last_end - 1, var.position - last_end);
            refvar.info[idname+".ref"].push_back(convert(uid++));
            cout << refvar << endl;
        }

        // now manipulate this record
        vector<string>& refidx = var.info[idname+".ref"];
        refidx.clear(); refidx.push_back(convert(uid++));

        vector<string>& idxs = var.info[idname+".alt"];
        idxs.clear();
        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
            idxs.push_back(convert(uid++));
        }
        cout << var << endl;

        last_end = var.position + var.ref.size();

    }

    if (reference.sequenceLength(sequenceName) - last_end > 0) {
        Variant refvar(variantFile);
        refvar.quality = 0;
        refvar.position = last_end;
        refvar.sequenceName = sequenceName;
        refvar.ref = reference.getSubSequence(sequenceName, last_end,
                                              reference.sequenceLength(sequenceName) - last_end);
        refvar.info[idname+".ref"].push_back(convert(uid++));
        cout << refvar << endl;
    }

    return 0;

}