Skip to content

Commit

Permalink
Add comp2graph tool for graph components output in GFA format
Browse files Browse the repository at this point in the history
  • Loading branch information
ivartb committed Aug 14, 2023
1 parent 77c6794 commit 8f2d415
Show file tree
Hide file tree
Showing 4 changed files with 402 additions and 0 deletions.
153 changes: 153 additions & 0 deletions src/algo/Comp2Graph.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package algo;

import io.GFAWriter;
import org.apache.log4j.Logger;
import ru.ifmo.genetics.dna.DnaTools;
import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap;
import ru.ifmo.genetics.utils.KmerUtils;
import structures.ConnectedComponent;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static io.GFAWriter.normalizeDna;

public class Comp2Graph implements Runnable {
private final int k;
private final int compID;
private final Logger logger;

private final ByteArrayOutputStream outputStream;

private final ConnectedComponent comp;
private final BigLong2ShortHashMap all_kmers;
private final BigLong2ShortHashMap comp_kmers;
private final Map<String, Short> subgraph;

private int size;
private SingleNode[] nodes;

public Comp2Graph(int k, int icomp, ConnectedComponent comp, BigLong2ShortHashMap all_kmers, ByteArrayOutputStream outputStream, Logger logger) {
this.k = k;
this.compID = icomp;
this.logger = logger;

this.comp = comp;
this.all_kmers = all_kmers;
this.outputStream = outputStream;
this.comp_kmers = new BigLong2ShortHashMap(8, 12, true);
this.subgraph = new HashMap<>();
}


@Override
public void run() {
buildComponent();
initStructures();
mergePaths();
createGFA();
}

private void buildComponent() {
logger.debug("Building graph component...");
if (all_kmers != null) {
for (long kmer: comp.kmers) {
comp_kmers.put(kmer, all_kmers.getWithZero(kmer));
}
} else {
for (long kmer: comp.kmers) {
comp_kmers.put(kmer, (short)1);
}
}

for (long kmer: comp.kmers) {
subgraph.put(normalizeDna(KmerUtils.kmer2String(kmer, k)), comp_kmers.getWithZero(kmer));
}
}

private void initStructures() {
logger.debug("Initializing structures for graph traversal...");
Map<String, List<SingleNode>> nodeByKmer = new HashMap<>();
size = subgraph.size() * 2;
nodes = new SingleNode[size];
{
int id = 0;
for (Map.Entry<String, Short> stringShortEntry : subgraph.entrySet()) {
String seq = stringShortEntry.getKey();
String rc = DnaTools.reverseComplement(seq);
nodes[id] = new SingleNode(seq, id);
nodes[id + 1] = new SingleNode(rc, id);
nodes[id].rc = nodes[id + 1];
nodes[id + 1].rc = nodes[id];
id += 2;
}
}
for (int i = 0; i < size; i++) {
String key = nodes[i].sequence.substring(0, k - 1);
if (!nodeByKmer.containsKey(key)) {
nodeByKmer.put(key, new ArrayList<>());
}
nodeByKmer.get(key).add(nodes[i]);
}
for (int i = 0; i < size; i++) {
String lastK = nodes[i].sequence.substring(1);
if (nodeByKmer.containsKey(lastK)) {
nodes[i].rc.neighbors.addAll(nodeByKmer.get(lastK));
}
}
}

private void mergePaths() {
logger.debug("Merging graph paths...");
while (true) {
boolean acted = false;
for (int i = 0; i < size; i++) {
if (!nodes[i].deleted && nodes[i].neighbors.size() == 1) {
SingleNode other = nodes[i].neighbors.get(0);
if (other.neighbors.size() != 1) {
continue;
}
mergeNodes(nodes[i], other);
acted = true;
}
}
if (!acted) {
break;
}
}
}

private void mergeNodes(SingleNode firstPlus, SingleNode secondMinus) {
// first k-1 symbols of firstPlus coincide with complement of first k-1 symbols of secondMinus
SingleNode firstMinus = firstPlus.rc, secondPlus = secondMinus.rc;
String newSeq = mergeLabels(secondPlus.sequence, firstPlus.sequence);
String newSeqRC = mergeLabels(firstMinus.sequence, secondMinus.sequence);

secondPlus.sequence = newSeq;
firstMinus.sequence = newSeqRC;
secondPlus.rc = firstMinus;
firstMinus.rc = secondPlus;

firstPlus.deleted = secondMinus.deleted = true;
}

private void checkLabels(String a, String b) {
if (!a.substring(a.length() - (k - 1)).equals(b.substring(0, k - 1))) {
throw new AssertionError("Labels should be merged, but can not: " + a + " and " + b);
}
}

private String mergeLabels(String a, String b) {
checkLabels(a, b);
return a + b.substring(k - 1);
}

private void createGFA() {
logger.debug("Preparing GFA output...");
GFAWriter writer = new GFAWriter(k, compID, outputStream, nodes, subgraph);
writer.generateOutput();
}
}
21 changes: 21 additions & 0 deletions src/algo/SingleNode.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package algo;

import java.util.ArrayList;
import java.util.List;

public class SingleNode {
public String sequence;
public int id;
public boolean deleted;
public SingleNode rc;
public List<SingleNode> neighbors;

public SingleNode(String sequence, int id) {
this.sequence = sequence;
this.id = id;

this.deleted = false;
this.neighbors = new ArrayList<>();
}

}
86 changes: 86 additions & 0 deletions src/io/GFAWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package io;

import algo.SingleNode;


import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.util.Map;

import static ru.ifmo.genetics.dna.DnaTools.reverseComplement;

public class GFAWriter {
private final int k;
private final int compID;
private final ByteArrayOutputStream outputStream;
private final SingleNode[] nodes;
private final Map<String, Short> subgraph;
private final int size;


public GFAWriter(int k, int compID, ByteArrayOutputStream outputStream, SingleNode[] nodes, Map<String, Short> subgraph) {
this.k = k;
this.compID = compID;
this.outputStream = outputStream;
this.nodes = nodes;
this.subgraph = subgraph;
this.size = nodes.length;
}

public void generateOutput() {
PrintWriter out = new PrintWriter(outputStream, false);
for (int i = 0; i < size; i++) {
if (!nodes[i].deleted && nodes[i].sequence.compareTo(nodes[i].rc.sequence) <= 0) {
printLabel(out, nodes[i]);
}
}
for (SingleNode i : nodes) {
if (!i.deleted) {
for (SingleNode j : i.neighbors) {
if (!j.deleted) {
printEdge(out, i, j);
}
}
}
}
out.flush();
out.close();
}

private void printEdge(PrintWriter out, SingleNode first, SingleNode second) {
StringBuilder edge = new StringBuilder("L\t");
edge.append(getNodeId(first)).append('\t')
.append((first.sequence.compareTo(first.rc.sequence) >= 0 ? "+" : "-")).append('\t')
.append(getNodeId(second)).append('\t')
.append((second.sequence.compareTo(second.rc.sequence) <= 0 ? "+" : "-")).append('\t')
.append(k - 1).append("M");
out.println(edge);
}

private String getNodeId(SingleNode node) {
return (Math.min(node.rc.id, node.id) + 1) + "_i" + compID;
}

private void printLabel(PrintWriter out, SingleNode node) {
out.print("S\t" + getNodeId(node) + "\t" + node.sequence);
long coverage = 0;
for (int i = 0; i + k <= node.sequence.length(); i++) {
String kmer = node.sequence.substring(i, i + k);
coverage += subgraph.get(normalizeDna(kmer));
}
coverage += subgraph.get(normalizeDna(node.sequence.substring(node.sequence.length()-k)))*(k-1);
//coverage += coverage / (node.sequence.length() - k + 1) * (k-1);

out.println("\tLN:i:" + (node.sequence.length()) + "\tKC:i:" + coverage);
}

public static String normalizeDna(String s) {
String rc = reverseComplement(s);
if (s.compareTo(rc) < 0) {
return s;
} else {
return rc;
}
}
}

Loading

0 comments on commit 8f2d415

Please sign in to comment.