diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..d84e315 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import argparse\n", + "from Bio import SeqIO" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def read_sequences(directory):\n", + " \"\"\"\n", + " Reads in the sequences from the fastq files\n", + " :param directory:\n", + " :return:\n", + " \"\"\"\n", + " files = os.listdir(directory)\n", + " sequences = []\n", + "\n", + " for file in files:\n", + " sequence = \"\"\n", + " filename = os.path.join(directory, file)\n", + " for record in SeqIO.parse(filename, \"fastq\"):\n", + " sequence = sequence + ''.join(list(record.seq))\n", + " sequences.append(sequence)\n", + " return sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def split_into_kmers(sequence, k, threshold):\n", + " kmers = {}\n", + "\n", + " for i in range(0, len(sequence), k):\n", + " kmer = sequence[i: i + k + 1]\n", + " if len(kmer) > k:\n", + " if kmer in kmers.keys():\n", + " kmers[kmer] = kmers[kmer] + 1\n", + " else:\n", + " kmers[kmer] = 1\n", + "\n", + " for key, count in kmers:\n", + " if count <= threshold:\n", + " kmers.pop(key)\n", + "\n", + " return kmers" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_de_bruijin_graph(kmers):\n", + " # Set of all nodes in the DB Graph\n", + " nodes = set()\n", + " # Set of nodes having in-degrees\n", + " not_starts = set()\n", + " # List of all directed edges in the graph\n", + " edges = []\n", + "\n", + " for kmer in kmers:\n", + " # From k-mers, get k-1mers\n", + " k1mer1 = kmer[:-1]\n", + " k1mer2 = kmer[1:]\n", + " # Add k-1 mers to the nodes list\n", + " nodes.add(k1mer1)\n", + " nodes.add(k1mer2)\n", + "\n", + " # Add an edge between the two k-1mers\n", + " edges.append((k1mer1, k1mer2))\n", + "\n", + " # Add destination node to the set of nodes having in degrees\n", + " not_starts.add(k1mer2)\n", + "\n", + " start_nodes = list(nodes - not_starts)\n", + "\n", + " # Return the nodes, edges and the starting nodes in the graph.\n", + " return nodes, edges, start_nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_de_bruijin_graph_alt(kmers):\n", + " \n", + " # to keep track of degree of nodes\n", + " node_degree = {}\n", + " \n", + " # Set of all nodes in the DB Graph\n", + " nodes = set()\n", + " \n", + " # Set of nodes having in-degrees\n", + " not_starts = set()\n", + " \n", + " # List of all directed edges in the graph\n", + " edges = []\n", + "\n", + " for kmer in kmers:\n", + " \n", + " # From k-mers, get k-1mers\n", + " prefix = kmer[:-1]\n", + " suffix = kmer[1:]\n", + " \n", + " # Add k-1 mers to the nodes list\n", + " nodes.add(prefix)\n", + " nodes.add(suffix)\n", + "\n", + " # Add an edge between the two k-1mers\n", + " edges.append((prefix, suffix))\n", + " \n", + " # fetching and updating degrees\n", + " p_count = node_degree.setdefault(prefix, 0)\n", + " node_degree[prefix] = p_count+1\n", + " s_count = node_degree.setdefault(suffix, 0)\n", + " node_degree[suffix] = s_count-1\n", + " \n", + " degree_node = {v:k for k, v in node_degree.items() if v != 0}\n", + " \n", + " start = None\n", + " \n", + " if len(degree_node) > 0:\n", + " start_stop = sorted(degree_node, reverse=True)\n", + " start = degree_node[start_stop[0]]\n", + " \n", + " # Return the nodes, edges and the starting nodes in the graph.\n", + " return nodes, edges, start" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def make_node_edge_map(edges):\n", + "\n", + " # Make a map of starting nodes to the adjacency list of that node\n", + " node_edge_map = {}\n", + "\n", + " # Go through all edges\n", + " for e in edges:\n", + " n = e[0]\n", + " # If start node exists, add destination node to adjacency list\n", + " if n in node_edge_map:\n", + " node_edge_map[n].append(e[1])\n", + " # Add start node to map and initialize the adjacency list with the destination node\n", + " else:\n", + " node_edge_map[n] = [e[1]]\n", + " return node_edge_map" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def traverse_graph(graph, start):\n", + " # if there is no explicit start node\n", + " if len(start) == 0:\n", + " # pick any node as the start\n", + " start = list(graph.keys())[0]\n", + " else:\n", + " start = start[0]\n", + " \n", + " # maintain a stack to store the nodes to visit\n", + " path = [start]\n", + " \n", + " # accumulates the eulerian path\n", + " eulerian_path = []\n", + " \n", + " # while stack is non-empty\n", + " while path:\n", + " # pick up the topmost node in the stack\n", + " curr_node = str(path[-1])\n", + " \n", + " # if the current node is a key and has entries in its ajacency list\n", + " if curr_node in graph and graph[curr_node]:\n", + " \n", + " # get the adjacency list\n", + " adj_nodes = graph[curr_node]\n", + " next_node = None\n", + " \n", + " # if only one entry, proceed to that node\n", + " if len(adj_nodes) == 1:\n", + " next_node = adj_nodes.pop()\n", + " \n", + " # if not, proceed to next node that would allow us to traverse\n", + " # the rest of the graph\n", + " else:\n", + " # iterate through all adj_nodes\n", + " for node in adj_nodes:\n", + " # if the node leads to other nodes, set that as next\n", + " if node in graph.keys():\n", + " next_node = node\n", + " break\n", + " # clear the edge from the current node\n", + " graph[curr_node].remove(next_node)\n", + " # update the path we want to explore\n", + " path.append(next_node)\n", + " else:\n", + " # if we can go any further, append the node to the path\n", + " eulerian_path.append(path.pop())\n", + " \n", + " # reverse the accumulator as the path was populated in reverse\n", + " eulerian_path.reverse()\n", + " \n", + " return eulerian_path" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "def traverse_graph_alt(graph, start):\n", + " \n", + " # to collect all eulerian paths/cycles in a graph\n", + " all_trails = list()\n", + " \n", + " # the eularian tour of the entire graph\n", + " tour = []\n", + " tour.append(start)\n", + " \n", + " skip_trail = True\n", + " \n", + " while (True):\n", + " # start an eulerian trail\n", + " trail = []\n", + " \n", + " curr_node = start\n", + " \n", + " # traverse a trail until we can't go further\n", + " while (True):\n", + " \n", + " # terminate if can't go further\n", + " if curr_node not in graph:\n", + " break\n", + " \n", + " # pick a next node\n", + " next_node = graph[curr_node].pop()\n", + " \n", + " # if the adjacency list becomes emtpy for the current node, delete\n", + " if len(graph[curr_node]) == 0:\n", + " del graph[curr_node]\n", + " \n", + " # append next node to trail\n", + " trail.append(next_node)\n", + " \n", + " # if we circle back to start we have covered the trail\n", + " if next_node == start:\n", + " break;\n", + " \n", + " # if not move on to next node\n", + " curr_node = next_node\n", + " \n", + " # we skip adding the first trail as it would reflect in the tour\n", + " if skip_trail == False:\n", + " # after finishing a trail, add it to all tours\n", + " all_trails.append(list(trail))\n", + " \n", + " skip_trail = False\n", + " \n", + " # where to append the trail in the tour\n", + " append_at = tour.index(start)\n", + " \n", + " # introducing the trail inbetween the tour\n", + " tour = tour[:append_at+1] + trail + tour[append_at+1:]\n", + " \n", + " # done if no more nodes to explore\n", + " if len(graph) == 0:\n", + " break\n", + " \n", + " new_start_possible = False\n", + " \n", + " for node in tour:\n", + " if node in graph:\n", + " start = node\n", + " new_start_possible = True\n", + " break\n", + " \n", + " if not new_start_possible:\n", + " print(\"error, tour exploration terminated with remaining graph:\")\n", + " print(graph)\n", + " break\n", + " \n", + " return tour, all_trails" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "def eulerian_trail(m,v):\n", + " nemap = m\n", + " result_trail = []\n", + " start = v\n", + " result_trail.append(start)\n", + " while(True):\n", + " trail = []\n", + " previous = start\n", + " while(True):\n", + " \n", + " if(previous not in nemap):\n", + " break\n", + " next = nemap[previous].pop()\n", + " if(len(nemap[previous]) == 0):\n", + " nemap.pop(previous,None)\n", + " trail.append(next)\n", + " if(next == start):\n", + " break;\n", + " previous = next\n", + " # completed one trail\n", + " print(trail)\n", + " index = result_trail.index(start)\n", + " result_trail = result_trail[0:index+1] + trail + result_trail[index+1:len(result_trail)]\n", + " # choose new start\n", + " if(len(nemap)==0):\n", + " break\n", + " found_new_start = False\n", + " for n in result_trail:\n", + " if n in nemap:\n", + " start = n\n", + " found_new_start = True\n", + " break # from for loop\n", + " if not found_new_start:\n", + " print(\"error\")\n", + " print(\"result_trail\",result_trail)\n", + " print(nemap)\n", + " break\n", + " return result_trail" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "def align_to_reference(assembled_seq, args):\n", + " return []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " # add the command line arguments\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--k\", default=30, help=\"k-mer size\")\n", + " parser.add_argument(\"--threshold\", default=2, help=\"min count of k-mers to consider\")\n", + " parser.add_argument(\"--input\", default=\"sars-cov-2-trimmed/SE-SW-4-15\", help=\"input directory\")\n", + " parser.add_argument(\"--reference\", default=None, help=\"reference genome\")\n", + "\n", + " args = parser.parse_args()\n", + "\n", + " # each file is a seperate sequence\n", + " sequences = read_sequences(args.input)\n", + "\n", + " for sequence in sequences:\n", + " kmers = split_into_kmers(sequence, args.k, args.threshold)\n", + " k1mers = split_into_kmers(sequence, args.k - 1, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "k = 30\n", + "threshold = 2\n", + "input_dir = \"./sars-cov-2-trimmed/SE-SW-4-15\"" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[WinError 3] The system cannot find the path specified: './sars-cov-2-trimmed/SE-SW-4-15'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0msequences\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mread_sequences\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m\u001b[0m in \u001b[0;36mread_sequences\u001b[1;34m(directory)\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;32mreturn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \"\"\"\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mfiles\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdirectory\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[0msequences\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [WinError 3] The system cannot find the path specified: './sars-cov-2-trimmed/SE-SW-4-15'" + ] + } + ], + "source": [ + "sequences = read_sequences(input_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "kmers = [\"ATG\", \"GGT\", \"GTG\", \"TAT\", \"TGC\", \"TGG\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "kmers = [\"AAA\", \"AAB\", \"ABB\", \"BBB\", \"BBA\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [], + "source": [ + "kmers = [\"ATG\", \"TGG\", \"GGC\", \"GCG\", \"CGT\", \"GCA\", \"GTG\", \"CAA\", \"AAT\", \"TGC\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "kmers = [\"ATG\", \"TGC\", \"GCT\", \"CTA\", \"TAG\", \"AGC\", \"GCA\", \"CAC\", \"ACA\", \"CAT\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "kmers = ['ATCG', 'TCGT', 'CGTT', 'GTTG', 'TTGC', 'TGCG', 'GCGC', 'CGCG', 'GCGA', 'CGAC', 'GACC', 'ACCG']" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "nodes, edges, start = generate_de_bruijin_graph_alt(kmers)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "if start == None:\n", + " nodes = list(nodes)\n", + " start = nodes[0]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ATC'" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "start" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('ATC', 'TCG'),\n", + " ('TCG', 'CGT'),\n", + " ('CGT', 'GTT'),\n", + " ('GTT', 'TTG'),\n", + " ('TTG', 'TGC'),\n", + " ('TGC', 'GCG'),\n", + " ('GCG', 'CGC'),\n", + " ('CGC', 'GCG'),\n", + " ('GCG', 'CGA'),\n", + " ('CGA', 'GAC'),\n", + " ('GAC', 'ACC'),\n", + " ('ACC', 'CCG')]" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ACC',\n", + " 'ATC',\n", + " 'CCG',\n", + " 'CGA',\n", + " 'CGC',\n", + " 'CGT',\n", + " 'GAC',\n", + " 'GCG',\n", + " 'GTT',\n", + " 'TCG',\n", + " 'TGC',\n", + " 'TTG'}" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ATC'" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "start" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "graph = make_node_edge_map(edges)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ATC': ['TCG'],\n", + " 'TCG': ['CGT'],\n", + " 'CGT': ['GTT'],\n", + " 'GTT': ['TTG'],\n", + " 'TTG': ['TGC'],\n", + " 'TGC': ['GCG'],\n", + " 'GCG': ['CGC', 'CGA'],\n", + " 'CGC': ['GCG'],\n", + " 'CGA': ['GAC'],\n", + " 'GAC': ['ACC'],\n", + " 'ACC': ['CCG']}" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "path = traverse_graph(graph, start)" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ATC',\n", + " 'TCG',\n", + " 'CGT',\n", + " 'GTT',\n", + " 'TTG',\n", + " 'TGC',\n", + " 'GCG',\n", + " 'CGC',\n", + " 'GCG',\n", + " 'CGA',\n", + " 'GAC',\n", + " 'ACC',\n", + " 'CCG']" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['CGC', 'GCG']]" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'eulerian_trail' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0meulerian_trail\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgraph\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'eulerian_trail' is not defined" + ] + } + ], + "source": [ + "path = eulerian_trail(graph, start)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "dict = {'Name': 'Zara', 'Age': 7}" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "n = dict.setdefault('Name', 'Tara')" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Zara\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/assembler.py b/assembler.py index ce7319b..10bd1f5 100644 --- a/assembler.py +++ b/assembler.py @@ -66,6 +66,51 @@ def generate_de_bruijin_graph(kmers): # Return the nodes, edges and the starting nodes in the graph. return nodes, edges, start_nodes +def generate_de_bruijin_graph_alt(kmers): + + # to keep track of degree of nodes + node_degree = {} + + # Set of all nodes in the DB Graph + nodes = set() + + # Set of nodes having in-degrees + not_starts = set() + + # List of all directed edges in the graph + edges = [] + + for kmer in kmers: + + # From k-mers, get k-1mers + prefix = kmer[:-1] + suffix = kmer[1:] + + # Add k-1 mers to the nodes list + nodes.add(prefix) + nodes.add(suffix) + + # Add an edge between the two k-1mers + edges.append((prefix, suffix)) + + # fetching and updating degrees + p_count = node_degree.setdefault(prefix, 0) + node_degree[prefix] = p_count+1 + s_count = node_degree.setdefault(suffix, 0) + node_degree[suffix] = s_count-1 + + # reverse mapping to figure out start and stop nodes + degree_node = {v:k for k, v in node_degree.items() if v != 0} + + start = None + + # reverse mapping would be empty if there is not eulerian path, but only cycles + if len(degree_node) > 0: + start_stop = sorted(degree_node, reverse=True) + start = degree_node[start_stop[0]] + + # Return the nodes, edges and the starting nodes in the graph. + return nodes, edges, start def make_node_edge_map(edges): @@ -77,16 +122,138 @@ def make_node_edge_map(edges): n = e[0] # If start node exists, add destination node to adjacency list if n in node_edge_map: - node_edge_map[n].append(e[1]) + node_edge_map[n].add(e[1]) # Add start node to map and initialize the adjacency list with the destination node else: - node_edge_map[n] = [e[1]] + adjacency_set = set() # using a set to support faster udpates + adjacency_set.add(e[1]) + node_edge_map[n] = adjacency_set + return node_edge_map +def traverse_graph(graph, start): + # if there is no explicit start node + if len(start) == 0: + # pick any node as the start + start = list(graph.keys())[0] + else: + start = start[0] -def traverse_graph(graph): - return [] + # maintain a stack to store the nodes to visit + path = [start] + + # accumulates the eulerian path + eulerian_path = [] + + # while stack is non-empty + while path: + # pick up the topmost node in the stack + curr_node = str(path[-1]) + + # if the current node is a key and has entries in its ajacency list + if curr_node in graph and graph[curr_node]: + + # get the adjacency list + adj_nodes = graph[curr_node] + next_node = None + + # if only one entry, proceed to that node + if len(adj_nodes) == 1: + next_node = adj_nodes.pop() + + # if not, proceed to next node that would allow us to traverse + # the rest of the graph + else: + # iterate through all adj_nodes + for node in adj_nodes: + # if the node leads to other nodes, set that as next + if node in graph.keys(): + next_node = node + break + # clear the edge from the current node + graph[curr_node].remove(next_node) + # update the path we want to explore + path.append(next_node) + else: + # if we can go any further, append the node to the path + eulerian_path.append(path.pop()) + # reverse the accumulator as the path was populated in reverse + eulerian_path.reverse() + + return eulerian_path +def traverse_graph_alt(graph, start): + + # to collect all eulerian paths/cycles in a graph + all_trails = list() + + # the eularian tour of the entire graph + tour = [] + tour.append(start) + + skip_trail = True + + while (True): + # start an eulerian trail + trail = [] + + curr_node = start + + # traverse a trail until we can't go further + while (True): + + # terminate if can't go further + if curr_node not in graph: + break + + # pick a next node + next_node = graph[curr_node].pop() + + # if the adjacency list becomes emtpy for the current node, delete + if len(graph[curr_node]) == 0: + del graph[curr_node] + + # append next node to trail + trail.append(next_node) + + # if we circle back to start we have covered the trail + if next_node == start: + break; + + # if not move on to next node + curr_node = next_node + + # we skip adding the first trail as it would reflect in the tour + if skip_trail == False: + # after finishing a trail, add it to all tours + all_trails.append(list(trail)) + + skip_trail = False + + # where to append the trail in the tour + append_at = tour.index(start) + + # introducing the trail inbetween the tour + tour = tour[:append_at+1] + trail + tour[append_at+1:] + + # done if no more nodes to explore + if len(graph) == 0: + break + + new_start_possible = False + + for node in tour: + if node in graph: + start = node + new_start_possible = True + break + + if not new_start_possible: + print("error, tour exploration terminated with remaining graph:") + print(graph) + break + + return tour, all_trails def align_to_reference(assembled_seq, args): return [] @@ -108,6 +275,14 @@ def align_to_reference(assembled_seq, args): for sequence in sequences: kmers = split_into_kmers(sequence, args.k, args.threshold) k1mers = split_into_kmers(sequence, args.k - 1, 0) + nodes, edges, start = generate_de_bruijin_graph_alt(kmers) + # start would be None if there is not eulerian path, but only cycles + if start == None: + nodes = list(nodes) + start = nodes[0] + graph = make_node_edge_map(edges) + tour, all_trails = traverse_graph(graph, start) + # graph = generate_de_bruijin_graph(kmers, k1mers, args) # save the graph for plotting # assembled_seq = traverse_graph(graph) # mismatches = align_to_reference(assembled_seq, args) # save the mismatches for plotting