From 8f2d41573f5d2c32f972ea633c5251c164fe1323 Mon Sep 17 00:00:00 2001 From: ivartb Date: Mon, 14 Aug 2023 14:29:15 +0300 Subject: [PATCH] Add `comp2graph` tool for graph components output in GFA format --- src/algo/Comp2Graph.java | 153 +++++++++++++++++++++++++++++++ src/algo/SingleNode.java | 21 +++++ src/io/GFAWriter.java | 86 +++++++++++++++++ src/tools/ComponentsToGraph.java | 142 ++++++++++++++++++++++++++++ 4 files changed, 402 insertions(+) create mode 100644 src/algo/Comp2Graph.java create mode 100644 src/algo/SingleNode.java create mode 100644 src/io/GFAWriter.java create mode 100644 src/tools/ComponentsToGraph.java diff --git a/src/algo/Comp2Graph.java b/src/algo/Comp2Graph.java new file mode 100644 index 0000000..d5f1491 --- /dev/null +++ b/src/algo/Comp2Graph.java @@ -0,0 +1,153 @@ +package algo; + +import io.GFAWriter; +import org.apache.log4j.Logger; +import ru.ifmo.genetics.dna.DnaTools; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.utils.KmerUtils; +import structures.ConnectedComponent; + +import java.io.ByteArrayOutputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static io.GFAWriter.normalizeDna; + +public class Comp2Graph implements Runnable { + private final int k; + private final int compID; + private final Logger logger; + + private final ByteArrayOutputStream outputStream; + + private final ConnectedComponent comp; + private final BigLong2ShortHashMap all_kmers; + private final BigLong2ShortHashMap comp_kmers; + private final Map subgraph; + + private int size; + private SingleNode[] nodes; + + public Comp2Graph(int k, int icomp, ConnectedComponent comp, BigLong2ShortHashMap all_kmers, ByteArrayOutputStream outputStream, Logger logger) { + this.k = k; + this.compID = icomp; + this.logger = logger; + + this.comp = comp; + this.all_kmers = all_kmers; + this.outputStream = outputStream; + this.comp_kmers = new BigLong2ShortHashMap(8, 12, true); + this.subgraph = new HashMap<>(); + } + + + @Override + public void run() { + buildComponent(); + initStructures(); + mergePaths(); + createGFA(); + } + + private void buildComponent() { + logger.debug("Building graph component..."); + if (all_kmers != null) { + for (long kmer: comp.kmers) { + comp_kmers.put(kmer, all_kmers.getWithZero(kmer)); + } + } else { + for (long kmer: comp.kmers) { + comp_kmers.put(kmer, (short)1); + } + } + + for (long kmer: comp.kmers) { + subgraph.put(normalizeDna(KmerUtils.kmer2String(kmer, k)), comp_kmers.getWithZero(kmer)); + } + } + + private void initStructures() { + logger.debug("Initializing structures for graph traversal..."); + Map> nodeByKmer = new HashMap<>(); + size = subgraph.size() * 2; + nodes = new SingleNode[size]; + { + int id = 0; + for (Map.Entry stringShortEntry : subgraph.entrySet()) { + String seq = stringShortEntry.getKey(); + String rc = DnaTools.reverseComplement(seq); + nodes[id] = new SingleNode(seq, id); + nodes[id + 1] = new SingleNode(rc, id); + nodes[id].rc = nodes[id + 1]; + nodes[id + 1].rc = nodes[id]; + id += 2; + } + } + for (int i = 0; i < size; i++) { + String key = nodes[i].sequence.substring(0, k - 1); + if (!nodeByKmer.containsKey(key)) { + nodeByKmer.put(key, new ArrayList<>()); + } + nodeByKmer.get(key).add(nodes[i]); + } + for (int i = 0; i < size; i++) { + String lastK = nodes[i].sequence.substring(1); + if (nodeByKmer.containsKey(lastK)) { + nodes[i].rc.neighbors.addAll(nodeByKmer.get(lastK)); + } + } + } + + private void mergePaths() { + logger.debug("Merging graph paths..."); + while (true) { + boolean acted = false; + for (int i = 0; i < size; i++) { + if (!nodes[i].deleted && nodes[i].neighbors.size() == 1) { + SingleNode other = nodes[i].neighbors.get(0); + if (other.neighbors.size() != 1) { + continue; + } + mergeNodes(nodes[i], other); + acted = true; + } + } + if (!acted) { + break; + } + } + } + + private void mergeNodes(SingleNode firstPlus, SingleNode secondMinus) { + // first k-1 symbols of firstPlus coincide with complement of first k-1 symbols of secondMinus + SingleNode firstMinus = firstPlus.rc, secondPlus = secondMinus.rc; + String newSeq = mergeLabels(secondPlus.sequence, firstPlus.sequence); + String newSeqRC = mergeLabels(firstMinus.sequence, secondMinus.sequence); + + secondPlus.sequence = newSeq; + firstMinus.sequence = newSeqRC; + secondPlus.rc = firstMinus; + firstMinus.rc = secondPlus; + + firstPlus.deleted = secondMinus.deleted = true; + } + + private void checkLabels(String a, String b) { + if (!a.substring(a.length() - (k - 1)).equals(b.substring(0, k - 1))) { + throw new AssertionError("Labels should be merged, but can not: " + a + " and " + b); + } + } + + private String mergeLabels(String a, String b) { + checkLabels(a, b); + return a + b.substring(k - 1); + } + + private void createGFA() { + logger.debug("Preparing GFA output..."); + GFAWriter writer = new GFAWriter(k, compID, outputStream, nodes, subgraph); + writer.generateOutput(); + } +} diff --git a/src/algo/SingleNode.java b/src/algo/SingleNode.java new file mode 100644 index 0000000..7f1a5db --- /dev/null +++ b/src/algo/SingleNode.java @@ -0,0 +1,21 @@ +package algo; + +import java.util.ArrayList; +import java.util.List; + +public class SingleNode { + public String sequence; + public int id; + public boolean deleted; + public SingleNode rc; + public List neighbors; + + public SingleNode(String sequence, int id) { + this.sequence = sequence; + this.id = id; + + this.deleted = false; + this.neighbors = new ArrayList<>(); + } + +} diff --git a/src/io/GFAWriter.java b/src/io/GFAWriter.java new file mode 100644 index 0000000..5ea0b2c --- /dev/null +++ b/src/io/GFAWriter.java @@ -0,0 +1,86 @@ +package io; + +import algo.SingleNode; + + +import java.io.ByteArrayOutputStream; +import java.io.PrintWriter; +import java.util.Map; + +import static ru.ifmo.genetics.dna.DnaTools.reverseComplement; + +public class GFAWriter { + private final int k; + private final int compID; + private final ByteArrayOutputStream outputStream; + private final SingleNode[] nodes; + private final Map subgraph; + private final int size; + + + public GFAWriter(int k, int compID, ByteArrayOutputStream outputStream, SingleNode[] nodes, Map subgraph) { + this.k = k; + this.compID = compID; + this.outputStream = outputStream; + this.nodes = nodes; + this.subgraph = subgraph; + this.size = nodes.length; + } + + public void generateOutput() { + PrintWriter out = new PrintWriter(outputStream, false); + for (int i = 0; i < size; i++) { + if (!nodes[i].deleted && nodes[i].sequence.compareTo(nodes[i].rc.sequence) <= 0) { + printLabel(out, nodes[i]); + } + } + for (SingleNode i : nodes) { + if (!i.deleted) { + for (SingleNode j : i.neighbors) { + if (!j.deleted) { + printEdge(out, i, j); + } + } + } + } + out.flush(); + out.close(); + } + + private void printEdge(PrintWriter out, SingleNode first, SingleNode second) { + StringBuilder edge = new StringBuilder("L\t"); + edge.append(getNodeId(first)).append('\t') + .append((first.sequence.compareTo(first.rc.sequence) >= 0 ? "+" : "-")).append('\t') + .append(getNodeId(second)).append('\t') + .append((second.sequence.compareTo(second.rc.sequence) <= 0 ? "+" : "-")).append('\t') + .append(k - 1).append("M"); + out.println(edge); + } + + private String getNodeId(SingleNode node) { + return (Math.min(node.rc.id, node.id) + 1) + "_i" + compID; + } + + private void printLabel(PrintWriter out, SingleNode node) { + out.print("S\t" + getNodeId(node) + "\t" + node.sequence); + long coverage = 0; + for (int i = 0; i + k <= node.sequence.length(); i++) { + String kmer = node.sequence.substring(i, i + k); + coverage += subgraph.get(normalizeDna(kmer)); + } + coverage += subgraph.get(normalizeDna(node.sequence.substring(node.sequence.length()-k)))*(k-1); + //coverage += coverage / (node.sequence.length() - k + 1) * (k-1); + + out.println("\tLN:i:" + (node.sequence.length()) + "\tKC:i:" + coverage); + } + + public static String normalizeDna(String s) { + String rc = reverseComplement(s); + if (s.compareTo(rc) < 0) { + return s; + } else { + return rc; + } + } +} + diff --git a/src/tools/ComponentsToGraph.java b/src/tools/ComponentsToGraph.java new file mode 100644 index 0000000..a8285c9 --- /dev/null +++ b/src/tools/ComponentsToGraph.java @@ -0,0 +1,142 @@ +package tools; + +import algo.Comp2Graph; +import io.IOUtils; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.structures.map.MutableLongShortEntry; +import ru.ifmo.genetics.utils.Misc; +import ru.ifmo.genetics.utils.tool.ExecutionFailedException; +import ru.ifmo.genetics.utils.tool.Parameter; +import ru.ifmo.genetics.utils.tool.Tool; +import ru.ifmo.genetics.utils.tool.inputParameterBuilder.BoolParameterBuilder; +import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileMVParameterBuilder; +import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileParameterBuilder; +import ru.ifmo.genetics.utils.tool.inputParameterBuilder.IntParameterBuilder; +import structures.ConnectedComponent; + + +import java.io.*; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +public class ComponentsToGraph extends Tool { + public static final String NAME = "comp2graph"; + public static final String DESCRIPTION = "Transforms components in binary format to de Bruijn graph in GFA format"; + + + // input parameters + public final Parameter k = addParameter(new IntParameterBuilder("k") + .mandatory() + .withShortOpt("k") + .withDescription("k-mer size") + .create()); + + public final Parameter componentsFile = addParameter(new FileParameterBuilder("components-file") + .mandatory() + .withShortOpt("cf") + .withDescription("binary components file") + .create()); + + public final Parameter kmersFiles = addParameter(new FileMVParameterBuilder("k-mers") + .optional() + .withShortOpt("i") + .withDescription("list of input files with k-mers in binary format for graph coverage") + .create()); + + + public final Parameter coverageMode = addParameter(new BoolParameterBuilder("coverage") + .optional() + .withShortOpt("cov") + .withDefaultValue(false) + .withDescription("if SET count graph coverage as total k-mer occurrences in samples, otherwise as number of samples with k-mer (works only with `-i` option)") + .create()); + + + public final Parameter graphFile = addParameter(new FileParameterBuilder("graph-file") + .withDescription("file to write found components to") + .withDefaultValue(workDir.append("components-graph.gfa")) + .create()); + + + @Override + protected void cleanImpl() { + + } + + @Override + protected void runImpl() throws ExecutionFailedException { + if (k.get() <= 0) { + error("The size of k-mer must be at least 1."); + System.exit(1); + } + if (k.get() > 31) { + error("The size of k-mer must be no more than 31."); + System.exit(1); + } + + List components = ConnectedComponent.loadComponents(componentsFile.get()); + info(components.size() + " components loaded from " + componentsFile.get()); + + ExecutorService execService = Executors.newFixedThreadPool(availableProcessors.get()); + + BigLong2ShortHashMap hm = null; + if (kmersFiles.get() != null) { + hm = IOUtils.loadKmers(kmersFiles.get(), 0, availableProcessors.get(), logger); + debug("Memory used = " + Misc.usedMemoryAsString()); + if (!coverageMode.get()) { + hm.resetValues(); + for (File file : kmersFiles.get()) { + BigLong2ShortHashMap filt_hm = IOUtils.loadKmers(new File[]{file}, 0, availableProcessors.get(), logger); + debug("Memory used = " + Misc.usedMemoryAsString()); + Iterator it = filt_hm.entryIterator(); + while (it.hasNext()) { + MutableLongShortEntry entry = it.next(); + long key = entry.getKey(); + hm.put(key, (short) (hm.getWithZero(key) + 1)); + } + } + } + } + debug("Memory used = " + Misc.usedMemoryAsString()); + + + + ByteArrayOutputStream[] compFiles = new ByteArrayOutputStream[components.size()]; + for (int icomp=0; icomp < components.size(); icomp++) { + ConnectedComponent cmp = components.get(icomp); + compFiles[icomp] = new ByteArrayOutputStream(); + execService.execute(new Comp2Graph(k.get(), icomp, cmp, hm, compFiles[icomp], logger)); + } + + execService.shutdown(); + try { + execService.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + throw new ExecutionFailedException("Error while transforming components to graph image: " + e); + } + + try(OutputStream outputStream = new FileOutputStream(graphFile.get())) { + for (ByteArrayOutputStream out: compFiles) { + out.writeTo(outputStream); + } + } catch (IOException e) { + throw new ExecutionFailedException("Error while writing GFA image to file: " + e); + } + + info("Graph components saved to GFA format!"); + } + + + + public static void main(String[] args) { + new ComponentsToGraph().mainImpl(args); + } + + public ComponentsToGraph() { + super(NAME, DESCRIPTION); + } +} +