Skip to content

Commit

Permalink
Merge pull request #3696 from vgteam/restrict-dictionary-paths
Browse files Browse the repository at this point in the history
Restrict sequence dictionary paths
  • Loading branch information
adamnovak authored Jul 8, 2022
2 parents 2a029a2 + 0518360 commit d03d0f1
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 104 deletions.
13 changes: 10 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,18 @@ ifneq ($(shell uname -s),Darwin)
LIB_DEPS += $(LIB_DIR)/libelf.a
endif

# Control variable for allocator
# On the command line, you can `make jemalloc=off` if you definitely don't want jemalloc.
jemalloc = on
ifeq ($(shell uname -s),Darwin)
jemalloc = off
endif

# Only depend on these files for the final linking stage.
# These libraries provide no headers to affect the vg build.
LINK_DEPS =

ifneq ($(shell uname -s),Darwin)
ifeq ($(jemalloc),on)
# Use jemalloc at link time
LINK_DEPS += $(LIB_DIR)/libjemalloc.a
# We have to use it statically or we can't get at its secret symbols.
Expand Down Expand Up @@ -390,11 +397,11 @@ $(LIB_DIR)/libvg.a: $(LIBVG_DEPS)
# For a normal dynamic build we remove the static build marker
$(BIN_DIR)/$(EXE): $(LIB_DIR)/libvg.a $(EXE_DEPS)
-rm -f $(LIB_DIR)/vg_is_static
. ./source_me.sh && $(CXX) $(LDFLAGS) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) -lvg $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS)
. ./source_me.sh && $(CXX) $(LDFLAGS) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.a $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS)
# We keep a file that we touch on the last static build.
# If the vg linkables are newer than the last static build, we do a build
$(LIB_DIR)/vg_is_static: $(INC_DIR)/vg_environment_version.hpp $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS)
$(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) -lvg $(STATIC_FLAGS) $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS)
$(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.a $(STATIC_FLAGS) $(LD_LIB_DIR_FLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS)
-touch $(LIB_DIR)/vg_is_static

# We don't want to always rebuild the static vg if no files have changed.
Expand Down
274 changes: 216 additions & 58 deletions src/hts_alignment_emitter.cpp

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions src/hts_alignment_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,14 @@ unique_ptr<AlignmentEmitter> get_alignment_emitter(const string& filename, const
* get_alignment_emitter_with_surjection(), by parsing a file. The file may be
* an HTSlib-style "sequence dictionary" (consisting of SAM @SQ header lines),
* or a plain list of sequence names (which do not start with "@SQ"). If the
* file is not openable or contains no entries, reports an error and quits. If
* the filename is itself an empty string, all non-alt-allele paths from the
* file is not openable or contains no entries, reports an error and quits.
*
* If path_names has entries, they are treated as path names that supplement
* those in the file, if any.
*
* If the filename is itself an empty string, and no path names are passed,
* then all reference-sense paths from the graph will be collected in arbitrary
* order. If there are none, all non-alt-allele generic sense paths from the
* graph will be collected in arbitrary order.
*
* TODO: Be able to generate the autosomes human-sort, X, Y, MT order typical
Expand All @@ -79,7 +85,7 @@ unique_ptr<AlignmentEmitter> get_alignment_emitter(const string& filename, const
* This information needs to come from the user in order to be correct, but
* if it's not specified, it'll be guessed from the graph
*/
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const PathPositionHandleGraph& graph);
vector<tuple<path_handle_t, size_t, size_t>> get_sequence_dictionary(const string& filename, const vector<string>& path_names, const PathPositionHandleGraph& graph);

/**
* Given a list of path handles and size info (from get_sequence_dictionary), return two things:
Expand Down
2 changes: 1 addition & 1 deletion src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1440,7 +1440,7 @@ int main_giraffe(int argc, char** argv) {
if (hts_output) {
// For htslib we need a non-empty list of paths.
assert(path_position_graph != nullptr);
paths = get_sequence_dictionary(ref_paths_name, *path_position_graph);
paths = get_sequence_dictionary(ref_paths_name, {}, *path_position_graph);
}

// Set up output to an emitter that will handle serialization and surjection.
Expand Down
2 changes: 1 addition & 1 deletion src/subcommand/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ int main_map(int argc, char** argv) {
// Look up all the paths we might need to surject to.
vector<tuple<path_handle_t, size_t, size_t>> paths;
if (hts_output) {
paths = get_sequence_dictionary(ref_paths_name, *xgidx);
paths = get_sequence_dictionary(ref_paths_name, {}, *xgidx);
}

// Set up output to an emitter that will handle serialization and surjection
Expand Down
2 changes: 1 addition & 1 deletion src/subcommand/mpmap_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1858,7 +1858,7 @@ int main_mpmap(int argc, char** argv) {
}

// Load all the paths in the right order
vector<tuple<path_handle_t, size_t, size_t>> paths = get_sequence_dictionary(ref_paths_name, *path_position_handle_graph);
vector<tuple<path_handle_t, size_t, size_t>> paths = get_sequence_dictionary(ref_paths_name, {}, *path_position_handle_graph);
// Make them into a set for directing surjection.
for (const auto& path_info : paths) {
surjection_paths.insert(get<0>(path_info));
Expand Down
54 changes: 18 additions & 36 deletions src/subcommand/surject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,8 @@ void help_surject(char** argv) {
<< "options:" << endl
<< " -x, --xg-name FILE use this graph or xg index (required)" << endl
<< " -t, --threads N number of threads to use" << endl
<< " -p, --into-path NAME surject into this path (many allowed, default: all in xg)" << endl
<< " -F, --into-paths FILE surject into nonoverlapping path names listed in FILE (one per line)" << endl
<< " --ref-paths FILE ordered list of paths in the graph, one per line or HTSlib .dict, for HTSLib @SQ headers" << endl
<< " -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic)" << endl
<< " -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE" << endl
<< " -l, --subpath-local let the multipath mapping surjection produce local (rather than global) alignments" << endl
<< " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl
<< " -G, --gaf-input input file is GAF instead of GAM" << endl
Expand All @@ -61,12 +60,9 @@ int main_surject(int argc, char** argv) {
return 1;
}

#define OPT_REF_PATHS 1001

string xg_name;
set<string> path_names;
vector<string> path_names;
string path_file;
string ref_paths_name;
string output_format = "GAM";
string input_format = "GAM";
bool spliced = false;
Expand All @@ -90,7 +86,7 @@ int main_surject(int argc, char** argv) {
{"threads", required_argument, 0, 't'},
{"into-path", required_argument, 0, 'p'},
{"into-paths", required_argument, 0, 'F'},
{"ref-paths", required_argument, 0, OPT_REF_PATHS},
{"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths
{"subpath-local", required_argument, 0, 'l'},
{"interleaved", no_argument, 0, 'i'},
{"gaf-input", no_argument, 0, 'G'},
Expand Down Expand Up @@ -124,17 +120,13 @@ int main_surject(int argc, char** argv) {
break;

case 'p':
path_names.insert(optarg);
path_names.push_back(optarg);
break;

case 'F':
path_file = optarg;
break;

case OPT_REF_PATHS:
ref_paths_name = optarg;
break;

case 'l':
subpath_global = false;
break;
Expand Down Expand Up @@ -218,17 +210,7 @@ int main_surject(int argc, char** argv) {
};

string file_name = get_input_file_name(optind, argc, argv);

if (!path_file.empty()){
// open the file
ifstream in(path_file);
string line;
while (std::getline(in,line)) {
// Load up all the paths to surject into from the file
path_names.insert(line);
}
}


PathPositionHandleGraph* xgidx = nullptr;
unique_ptr<PathHandleGraph> path_handle_graph;
bdsg::PathPositionOverlayHelper overlay_helper;
Expand All @@ -240,17 +222,20 @@ int main_surject(int argc, char** argv) {
cerr << "error[vg surject] XG index (-x) is required for surjection" << endl;
exit(1);
}

// Get the paths to surject into and their length information, either from
// the given file, or from the provided list, or from sniffing the graph.
vector<tuple<path_handle_t, size_t, size_t>> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *xgidx);
// Clear out path_names so we don't accidentally use it
path_names.clear();

// add the target paths. if there are no path names, add them all. otherwise, take into account possible subpath names.
// Convert to a set for membership testing
unordered_set<path_handle_t> paths;
xgidx->for_each_path_handle([&](path_handle_t path_handle) {
if (path_names.empty() || path_names.count(Paths::get_base_name(xgidx->get_path_name(path_handle)))) {
paths.insert(path_handle);
}
});
// don't use this anymore: it's no longer updated to include all paths when none given.
path_names.clear();

paths.reserve(sequence_dictionary.size());
for (auto& entry : sequence_dictionary) {
paths.insert(get<0>(entry));
}

// Make a single thread-safe Surjector.
Surjector surjector(xgidx);
surjector.adjust_alignments_for_base_quality = qual_adj;
Expand All @@ -264,9 +249,6 @@ int main_surject(int argc, char** argv) {
surjector.min_splice_length = numeric_limits<int64_t>::max();
}

// Get the paths to use in the HTSLib header sequence dictionary
vector<tuple<path_handle_t, size_t, size_t>> sequence_dictionary = get_sequence_dictionary(ref_paths_name, *xgidx);

// Count our threads
int thread_count = vg::get_thread_count();

Expand Down
12 changes: 11 additions & 1 deletion test/t/15_vg_surject.t
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap
PATH=../bin:$PATH # for vg


plan tests 39
plan tests 40

vg construct -r small/x.fa >j.vg
vg index -x j.xg j.vg
Expand Down Expand Up @@ -36,6 +36,16 @@ is $(vg surject -p x -x x.xg -t 1 -s j.gam | grep -v "@" | cut -f3 | grep x | wc

is $(vg surject -x x.xg -t 1 -s j.gam | grep -v "@" | cut -f3 | grep x | wc -l) \
100 "vg surject doesn't need to be told which path to use"

vg paths -X -x x.vg | vg view -aj - | jq '.name = "sample#0#x#0"' | vg view -JGa - > paths.gam
vg paths -X -x x.vg | vg view -aj - | jq '.name = "ref#0#x[55]"' | vg view -JGa - >> paths.gam
vg augment x.vg -i paths.gam > x.aug.vg
vg index -x x.aug.xg x.aug.vg

is $(vg surject -x x.aug.xg -t 1 -s j.gam | grep -v "@" | cut -f3 | grep "ref#0#x" | wc -l) \
100 "vg surject picks a reference-sense path if it is present"

rm x.aug.vg x.aug.xg paths.gam

is $(vg surject -p x -x x.xg -t 1 x.gam | vg view -a - | wc -l) \
100 "vg surject works for every read simulated from a dense graph"
Expand Down

2 comments on commit d03d0f1

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 12778 seconds

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.42.0. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 9655 seconds

Please sign in to comment.