Skip to content

Commit

Permalink
Merge pull request #36 from ComparativeGenomicsToolkit/chop
Browse files Browse the repository at this point in the history
Add dupe remover and name munger
  • Loading branch information
glennhickey authored Feb 9, 2021
2 parents 5db9207 + 07f2965 commit 068fea8
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 15 deletions.
17 changes: 14 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
rootPath = ./
include ./include.mk

all : hal2vg clip-vg
all : hal2vg clip-vg halRemoveDupes

# Note: hdf5 from apt doesn't seem to work for static builds. It should be installed
# from source and configured with "--enable-static --disable-shared", then have its
Expand All @@ -23,12 +23,17 @@ ifeq ($(shell ldd clip-vg | grep "not a dynamic" | wc -l), $(shell ls clip-vg |
else
$(error ldd found dnymaic linked dependency in clip-vg)
endif
ifeq ($(shell ldd halRemoveDupes | grep "not a dynamic" | wc -l), $(shell ls halRemoveDupes | wc -l))
$(info ldd verified that halRemoveDupes static)
else
$(error ldd found dnymaic linked dependency in halRemoveDupes)
endif

cleanFast :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o

clean :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o
cd deps/sonLib && make clean
cd deps/pinchesAndCacti && make clean
cd deps/hal && make clean
Expand Down Expand Up @@ -58,6 +63,12 @@ clip-vg.o : clip-vg.cpp ${basicLibsDependencies}
clip-vg : clip-vg.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread clip-vg.o ${basicLibs} -o clip-vg

halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -I . halRemoveDupes.cpp -c

halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o ${basicLibs} -o halRemoveDupes

test :
make
cd tests && prove -v t
3 changes: 2 additions & 1 deletion build-tools/makeBinRelease
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ git fetch --tags origin
REL_TAG=$(getLatestReleaseTag)
git checkout "${REL_TAG}"
git submodule update --init --recursive
cp ~/dev/CMakeLists.txt ./deps/libbdsg-easy/deps/libhandlegraph/CMakeLists.txt

if [ $(man gcc | grep nehalem | wc -l) -ge 1 ]
then
Expand All @@ -36,5 +37,5 @@ else
make check-static
fi

cp hal2vg clip-vg ${buildDir}/
cp hal2vg clip-vg halRemoveDupes ${buildDir}/

100 changes: 89 additions & 11 deletions clip-vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ void help(char** argv) {
<< "Chop out path intervals from a vg graph" << endl
<< endl
<< "options: " << endl
<< " -b, --bed FILE Intervals to clip in BED format" << endl
<< " -m, --min-length N Only clip paths of length < N" << endl
<< " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl
<< " -p, --progress Print progress" << endl
<< " -b, --bed FILE Intervals to clip in BED format" << endl
<< " -m, --min-length N Only clip paths of length < N" << endl
<< " -f, --force-clip Don't abort with error if clipped node overlapped by multiple paths" << endl
<< " -r, --name-replace S1>S2 Replace (first occurrence of) S1 with S2 in all path names" << endl
<< " -p, --progress Print progress" << endl
<< endl;
}

Expand All @@ -44,17 +45,21 @@ static void chop_path_intervals(MutablePathMutableHandleGraph* graph,
static unordered_set<handle_t> chop_path(MutablePathMutableHandleGraph* graph,
path_handle_t path_handle,
const vector<pair<int64_t, int64_t>>& intervals);
static void replace_path_name_substrings(MutablePathMutableHandleGraph* graph, const vector<string>& to_replace,
bool progress);
// Create a subpath name (todo: make same function in vg consistent (it only includes start))
static inline string make_subpath_name(const string& path_name, size_t offset, size_t length) {
return path_name + "[" + std::to_string(offset) + "-" + std::to_string(offset + length) + "]";
}


int main(int argc, char** argv) {

string bed_path;
int64_t min_length = 0;
bool force_clip = false;
bool progress = false;
vector<string> replace_list;
int c;
optind = 1;
while (true) {
Expand All @@ -64,13 +69,14 @@ int main(int argc, char** argv) {
{"bed", required_argument, 0, 'b'},
{"min-length", required_argument, 0, 'm'},
{"force-clip", no_argument, 0, 'f'},
{"name-replace", required_argument, 0, 'r'},
{"progress", no_argument, 0, 'p'},
{0, 0, 0, 0}
};

int option_index = 0;

c = getopt_long (argc, argv, "hpb:m:f",
c = getopt_long (argc, argv, "hpb:m:fr:",
long_options, &option_index);

// Detect the end of the options.
Expand All @@ -88,6 +94,9 @@ int main(int argc, char** argv) {
case 'f':
force_clip = true;
break;
case 'r':
replace_list.push_back(optarg);
break;
case 'p':
progress = true;
break;
Expand Down Expand Up @@ -120,8 +129,13 @@ int main(int argc, char** argv) {
return 1;
}

if (bed_path.empty() == (min_length == 0)) {
cerr << "[clip-vg] error: Exactly one of either -b or -m must be specified to select input" << endl;
if (!bed_path.empty() && min_length != 0) {
cerr << "[clip-vg] error: -b and -m must cannot be used together" << endl;
return 1;
}

if (bed_path.empty() && min_length == 0 && replace_list.empty()) {
cerr << "[clip-vg] error: at east one of -b, -m or -r must be specified" << endl;
return 1;
}

Expand Down Expand Up @@ -167,8 +181,14 @@ int main(int argc, char** argv) {
}
cerr << "[clip-vg]: Loaded " << num_intervals << " BED intervals over " << bed_intervals.size() << " sequences" << endl;
}

chop_path_intervals(graph.get(), bed_intervals, force_clip, progress);

if (!bed_intervals.empty()) {
chop_path_intervals(graph.get(), bed_intervals, force_clip, progress);
}

if (!replace_list.empty()) {
replace_path_name_substrings(graph.get(), replace_list, progress);
}

dynamic_cast<SerializableHandleGraph*>(graph.get())->serialize(cout);

Expand Down Expand Up @@ -334,8 +354,11 @@ void chop_path_intervals(MutablePathMutableHandleGraph* graph,
if (progress) {
cerr << "[clip-vg]: Clipped "
<< chopped_bases << " bases from "
<< chopped_nodes << " nodes in "
<< chopped_paths << " paths" << endl;
<< chopped_nodes << " nodes";
if (!force_clip) {
cerr << " in " << chopped_paths << " paths";
}
cerr << endl;
}
}

Expand Down Expand Up @@ -470,3 +493,58 @@ unordered_set<handle_t> chop_path(MutablePathMutableHandleGraph* graph,

return chopped_handles;
}

void replace_path_name_substrings(MutablePathMutableHandleGraph* graph, const vector<string>& to_replace,
bool progress) {
// parse the strings
vector<pair<string, string>> replace;
for (const string& repstring : to_replace) {
size_t sep = repstring.find('>');
if (sep == string::npos || sep == 0 || sep == repstring.length() - 1) {
cerr << "[clip-vg]: Unable to find separator '>' in " << repstring << ". Replacement must be"
<< " specified with \"s1>s2\"" << endl;
exit(1);
}
replace.push_back(make_pair(repstring.substr(0, sep), repstring.substr(sep + 1)));
if (replace.back().first == replace.back().second) {
replace.pop_back();
}
}

size_t replacement_count = 0;
size_t path_count = 0;
// take care to not modify path handles while iterating path handles, just in case
vector<string> path_names;
graph->for_each_path_handle([&](path_handle_t path_handle) {
path_names.push_back(graph->get_path_name(path_handle));
});
for (string& path_name : path_names) {
path_handle_t path_handle = graph->get_path_handle(path_name);
bool changed = false;
for (auto& rep : replace) {
size_t p = path_name.find(rep.first);
if (p != string::npos) {
path_name.replace(p, rep.first.length(), rep.second);
++replacement_count;
changed = true;
}
}
if (changed) {
++path_count;
if (graph->has_path(path_name)) {
cerr << "[clip-vg] error: cannot change name of path from " << graph->get_path_name(path_handle) << " to "
<< path_name << " because the latter already exists in the graph" << endl;
exit(1);
}
path_handle_t new_path_handle = graph->create_path_handle(path_name, graph->get_is_circular(path_handle));
graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
graph->append_step(new_path_handle, graph->get_handle_of_step(step_handle));
});
graph->destroy_path(path_handle);
}
}

if (progress) {
cerr << "[clip-vg]: Replaced " << replacement_count << " substrings in " << path_count << " path names" << endl;
}
}
87 changes: 87 additions & 0 deletions halRemoveDupes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright (C) 2016 by Glenn Hickey ([email protected])
*
* Released under the MIT license, see LICENSE.txt
*/

// This file was created by merging hal2sg.cpp and sg2vg.cpp with
// a small amount of glue for the interface.

//#define debug

#include <cstdlib>
#include <iostream>
#include <cassert>
#include <fstream>
#include <deque>
#include <unordered_map>

#include "hal.h"

using namespace std;
using namespace hal;

static void initParser(CLParser* optionsParser) {
optionsParser->addArgument("halFile", "input hal file");
optionsParser->addArgument("genome", "remove all paralogy edges from this genome");
optionsParser->setDescription("Remove paralogy edges from given genome (in place)");
}

int main(int argc, char** argv) {
CLParser optionsParser(WRITE_ACCESS);
initParser(&optionsParser);
string halPath;
string genomeName;
try {
optionsParser.parseOptions(argc, argv);
halPath = optionsParser.getArgument<string>("halFile");
genomeName = optionsParser.getArgument<string>("genome");
}
catch(exception& e) {
cerr << e.what() << endl;
optionsParser.printUsage(cerr);
exit(1);
}
try {
AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, READ_ACCESS | WRITE_ACCESS));
if (alignment->getNumGenomes() == 0) {
throw hal_exception("input hal alignmenet is empty");
}

Genome* genome = alignment->openGenome(genomeName);
if (genome == NULL) {
throw hal_exception("Genome " + genomeName + " not found in alignment");
}

if (genomeName == alignment->getRootName()) {
throw hal_exception("Cannot run on root");
}

TopSegmentIteratorPtr topIt = genome->getTopSegmentIterator();

size_t total_length = 0;
size_t total_edges = 0;
for (; not topIt->atEnd(); topIt->toRight()) {
TopSegment* topSeg = topIt->tseg();
if (topSeg->hasNextParalogy()) {
topSeg->setNextParalogyIndex(NULL_INDEX);
total_length += topSeg->getLength();
++total_edges;
}
}

if (total_length > 0) {
cerr << "[halRemoveDupes]: " << total_edges << " paralogy edges removed from " << genomeName
<< " with total length " << total_length << endl;
} else {
cerr << "[halRemoveDupes] : No paralogy edges found in " << genomeName << endl;
}
}

catch(exception& e) {
cerr << e.what() << endl;
exit(1);
}

return 0;
}

0 comments on commit 068fea8

Please sign in to comment.