From 96bcd2500d465ec815d30f6edd4c426ac10c533a Mon Sep 17 00:00:00 2001 From: Steven Weaver Date: Tue, 5 Mar 2019 17:21:35 -0500 Subject: [PATCH] adding cycle report option --- hivclustering/mtnetwork.py | 94 +++++++++++++++++------------------ hivclustering/networkbuild.py | 42 ++++++++++------ 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/hivclustering/mtnetwork.py b/hivclustering/mtnetwork.py index 3ec94a6..7325dd0 100644 --- a/hivclustering/mtnetwork.py +++ b/hivclustering/mtnetwork.py @@ -56,8 +56,8 @@ def parseHeader(str): try: patient_description['date'] = time.strptime(groups[1], pattern) except ValueError: - continue - + continue + except: print("Warning: could not parse the following ID as the reg.exp. header: %s" % str, file = sys.stderr) patient_description['id'] = str @@ -207,7 +207,7 @@ def __init__(self, patient1, patient2, date1, date2, visible, attribute=None, se self.p1 = patient2 self.date2 = date1 self.date1 = date2 - if sequence_ids is not None: + if sequence_ids is not None: self.sequences = list(self.sequences) self.sequences.reverse() self.sequences = tuple (self.sequences) @@ -830,7 +830,7 @@ def create_a_pref_attachment_network(self, network_size=100, start_with=1, rando #print(max(dates_by_chain), file = sys.stderr) return simulation_start - + def construct_cluster_representation(self, root_node, already_simulated, the_cluster): if root_node in self.adjacency_list: @@ -1912,7 +1912,7 @@ def find_all_cycles_old (self, edge_set, cycle_length, maximum_number=2**18, ign will store cycles of length `cycle_length`, as lists of nodes, with the convention that the cycle 'starts' with the node that has the lowest sort order (lexicographically) ''' - + def handle_a_cycle (node_list): node_list.rotate(-node_list.index (min (node_list, key = lambda x : x[0].id))) # also orient the cycles so x1->x2->x3->...-x_N has x_2 <= x_N @@ -1920,19 +1920,19 @@ def handle_a_cycle (node_list): if node_list[1][0].id > node_list[-1][0].id: node_list.rotate (-1) node_list.reverse() - + #extract sequences that correspond to individual nodes in the cycles - + sequences = [] sequence_set = set () - - - # iterate over the list of sequence IDs and check that the sequences used to make + + + # iterate over the list of sequence IDs and check that the sequences used to make # the same sequences were used to make each of the links, i.e. that # if there is an N1--N2--N3 chain, and N2 has two sequences associated with it, say N2-1 and N2-2 # then it is not the case that N1 is linked to N2 via N2-1 and N3 is linked to N2 via N2-2 - - for n, s in node_list: + + for n, s in node_list: #my_seq = [seq for seq in s.sequences if seq in self.sequence_ids[n.id]] my_seq = None sequence_set.add (s.sequences) @@ -1941,32 +1941,32 @@ def handle_a_cycle (node_list): if nd == n: my_seq = s.sequences[i] break - + if not my_seq: return None sequences.append (my_seq) if len (sequence_set) != len (node_list): return None - + cycles.add (tuple(sequences)) - - + + #print ([n[1].sequences for n in node_list], file = sys.stderr) #cycles.add (tuple(sorted (node_list, key = lambda x : x.id))) - + if (cycle_length >= 4): node_and_edge_am = {} self.compute_adjacency(both=True, edge_set=edge_set, storage=node_and_edge_am) - - #perform depth wise-traversal + + #perform depth wise-traversal visited = set () - + def DFS (current_node, current_edge, current_node_chain): visited.add (current_node) current_node_chain.append ([current_node, current_edge]) - - + + for neighbor, edge in node_and_edge_am[current_node]: if neighbor not in visited: current_node_chain[len(current_node_chain)-1][1] = edge @@ -1975,24 +1975,24 @@ def DFS (current_node, current_edge, current_node_chain): if len (current_node_chain) == cycle_length and neighbor == current_node_chain[0][0]: current_node_chain [-1][1] = edge handle_a_cycle (current_node_chain) - + current_node_chain.pop() - + for n in node_and_edge_am: - if n not in visited: + if n not in visited: chain = collections.deque () DFS (n, None, chain) - - + + else: raise UserWarning('Will only count cycles of length 4 or greater') - + print ("\n", cycles, file = sys.stderr) return cycles def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_set = None, do_quads = False): # if do_quads is set, then look for simple cycles of length 4, otherwise look for triangles - + cycles = set() #sequences_involved_in_links = set () #sequence_pairs = set () @@ -2002,7 +2002,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se #print ("Locating triangles in the network", file = sys.stderr) - + # create a list of the form # node[id] = list of # [node_id] = edge_id @@ -2030,17 +2030,17 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se nodes = [node, node2, node3, node4] if len (set (nodes)) < 4: continue - + quad = collections.deque (nodes) quad.rotate(-quad.index (min (quad, key = lambda x : x.id))) if quad[1].id > quad[-1].id: quad.rotate (-1) quad.reverse() - + sequences = [] sequence_set = set () quad = tuple (quad) - + if quad not in cycle_nodes: for n, quad_edge in enumerate ([adjacency_map[quad[0]][quad[1]], adjacency_map[quad[1]][quad[2]], adjacency_map[quad[2]][quad[3]], adjacency_map[quad[3]][quad[0]]]): my_seq = None @@ -2050,22 +2050,22 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se if nd == quad[n]: my_seq = quad_edge.sequences[i] break - + if my_seq: - sequences.append (my_seq) + sequences.append (my_seq) if len (sequence_set) == 4 and len (sequences) == 4: sequence_set = tuple (sequences) if ignore_this_set and sequence_set in ignore_this_set: continue - + cycle_nodes.add(quad) cycles.add(sequence_set) for s in sequence_set: if s not in count_by_sequence: count_by_sequence[s] = 1 else: - count_by_sequence[s] += 1 - + count_by_sequence[s] += 1 + ''' sequence_set = set() for quad_edge [adjacency_map[triad[0]][triad[1]], adjacency_map[triad[0]][triad[2]], adjacency_map[triad[1]][triad[2]]]: @@ -2084,9 +2084,9 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se if s not in count_by_sequence: count_by_sequence[s] = 1 else: - count_by_sequence[s] += 1 - ''' - + count_by_sequence[s] += 1 + ''' + else: if node in adjacency_map[node3]: triad = sorted([node, node2, node3]) @@ -2118,7 +2118,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se #print (sequence_set) #triangle_nodes_all.add(triad) - + if len(cycle_nodes) >= maximum_number: raise UserWarning( '\nToo many cycles to attempt full filtering; stopped at %d' % maximum_number) @@ -2131,7 +2131,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se if do_quads: sorted_result = sorted([(t[0], t[1], t[2], t[3], sum([count_by_sequence[k] for k in t])) for t in cycles], key=lambda x: (x[4], x[0], x[1], x[2], x[3]), reverse=True) - + else: sorted_result = sorted([(t[0], t[1], t[2], sum([count_by_sequence[k] for k in t])) for t in cycles], key=lambda x: (x[3], x[0], x[1], x[2]), reverse=True) @@ -2202,7 +2202,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance if len(cycles) == 0: return None - + evaluator = partial(_test_edge_support, sequence_records=sequence_records, hy_instance=hy_instance, p_value_cutoff=p_value_cutoff, test_quads = test_quads) #processed_objects = evaluator (cycles) @@ -2224,7 +2224,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance upper_bound = 4 if test_quads else 3 processed_objects = sorted([k for p in processed_objects for k in p], key=lambda x: x[0][upper_bound]) - + edges_removed = set() must_keep = set() @@ -2242,7 +2242,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance unsupported_edges = set() removed_edges = set() bridges = set() - + edge_enumerator = ((0, 1), (1, 2), (2, 3), (3, 0)) if test_quads else ((0, 1), (0, 2), (1, 2)) for t, p_values in processed_objects: @@ -2278,7 +2278,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance this_edge.edge_reject_p = max(p_values[pair_index], this_edge.edge_reject_p) else: this_edge.edge_reject_p = max(p_values[2 - pair_index], this_edge.edge_reject_p) - + if this_edge.edge_reject_p > p_value_cutoff: if this_edge not in unsupported_edges: #print (this_edge, this_edge.edge_reject_p, seq_id) diff --git a/hivclustering/networkbuild.py b/hivclustering/networkbuild.py index 84a3e2e..39975f7 100755 --- a/hivclustering/networkbuild.py +++ b/hivclustering/networkbuild.py @@ -137,7 +137,7 @@ def describe_network(network, json_output=False, keep_singletons=False): print(reasons, file=sys.stderr) if not settings().skip_degrees: - print("Fitting the degree distribution to various densities", file=sys.stderr) + print("Fitting the degree distribution to various densities", file=sys.stderr) distro_fit = network.fit_degree_distribution() ci = distro_fit['rho_ci'][distro_fit['Best']] rho = distro_fit['rho'][distro_fit['Best']] @@ -159,10 +159,10 @@ def describe_network(network, json_output=False, keep_singletons=False): print("Best distribution is '%s'" % (distro_fit['Best']), file=sys.stderr) else: distro_fit = {'degrees' : network.get_degree_distribution()} - + if json_output: return_json['Degrees'] = distro_fit['degrees'] - + # find diffs in directed edges '''for anEdge in network.edges: @@ -333,6 +333,7 @@ def build_a_network(extra_arguments = None): arguments.add_argument('-n', '--edge-filtering', dest='edge_filtering', choices=['remove', 'report'], help='Compute edge support and mark edges for removal using sequence-based triangle tests (requires the -s argument) and either only report them or remove the edges before doing other analyses ', required=False) arguments.add_argument('-y', '--centralities', help='Output a CSV file with node centralities') arguments.add_argument('-l', '--edge-filter-cycles', dest = 'filter_cycles', help='Filter edges that are in cycles other than triangles', action='store_true') + arguments.add_argument('--cycle-report-file', dest = 'cycle_report_filename', help='Prints cycle report to specified file', default = sys.stdout, type = argparse.FileType('w'), required=False) arguments.add_argument('-g', '--triangles', help='Maximum number of triangles to consider in each filtering pass', type = int, default = 2**15) arguments.add_argument('-C', '--contaminants', help='Screen for contaminants by marking or removing sequences that cluster with any of the contaminant IDs (-F option) [default is not to screen]', choices=['report', 'remove']) arguments.add_argument('-F', '--contaminant-file', dest='contaminant_file',help='IDs of contaminant sequences', type=str) @@ -442,7 +443,11 @@ def build_a_network(extra_arguments = None): if len([k for k in [run_settings.edge_filtering, run_settings.sequences] if k is None]) == 1: raise ValueError('Two arguments (-n and -s) are needed for edge filtering options') - network = transmission_network(multiple_edges=run_settings.multiple_edges) + if not run_settings.filter_cycles and run_settings.cycle_report_filename: + raise ValueError('-l option is necessary to report cycles') + + + network = transmission_network(multiple_edges=run_settings.multiple_edges) network.read_from_csv_file(run_settings.input, formatter, run_settings.threshold, 'BULK') uds_settings = None @@ -515,7 +520,7 @@ def build_a_network(extra_arguments = None): # network.apply_id_filter(list=run_settings.filter, do_clear=False) network.compute_clusters () # this allocates nodes to clusters - + def generate_edges_by_cluster () : edges_by_clusters = {} current_edge_set = network.reduce_edge_set() @@ -528,9 +533,9 @@ def generate_edges_by_cluster () : edges_by_clusters = [set(v) for c,v in edges_by_clusters.items() if len (v) >= 3] edges_by_clusters.sort (key = lambda x : len (x)) # smallest first return edges_by_clusters - + edges_by_clusters = generate_edges_by_cluster() - + # load sequence data @@ -550,14 +555,14 @@ def generate_edges_by_cluster () : for seq_id in all_referenced_sequences: referenced_sequence_data[seq_id] = hy_instance.getvar(seq_id, hy.HyphyInterface.STRING) - + if run_settings.extract is not None: sequence_set = set () for anEdge in edges_by_clusters[run_settings.extract]: sequence_set.update (anEdge.sequences) for s in sequence_set: print (">%s\n%s\n" % (s, referenced_sequence_data[s]), file = run_settings.output) - + # partition edges into clusters @@ -591,13 +596,20 @@ def handle_a_cluster (edge_set, cluster_count, total_count): maximum_number += run_settings.triangles edge_set.difference_update (set ([edge for edge in edge_set if not edge.has_support()])) - + if run_settings.filter_cycles: maximum_number = run_settings.triangles supported_quads = set () - + for filtering_pass in range (8): - edge_stats = network.test_edge_support(referenced_sequence_data, *network.find_all_simple_cycles(edge_set, maximum_number = maximum_number, ignore_this_set = supported_quads, do_quads = True), supported_cycles = supported_quads, test_quads = True) + simple_cycles = network.find_all_simple_cycles(edge_set, maximum_number = maximum_number, ignore_this_set = supported_quads, do_quads = True) + + # If reporting cycle option set, pickle output to file + if run_settings.cycle_report_filename: + print(json.dumps({"cycles" : simple_cycles[0]}), file=run_settings.cycle_report_filename) + + edge_stats = network.test_edge_support(referenced_sequence_data, *simple_cycles, supported_cycles = supported_quads, test_quads = True) + if not edge_stats: break else: @@ -641,13 +653,13 @@ def handle_a_cluster (edge_set, cluster_count, total_count): total_removed += handle_a_cluster (current_edge_set, cluster_count, len (edges_by_clusters)) print ("\nEdge filtering identified %d edges for removal" % total_removed, file = sys.stderr) - + network.set_edge_visibility(edge_visibility) # restore edge visibility if run_settings.edge_filtering == 'remove': print("Edge filtering removed %d edges" % network.conditional_prune_edges(), file=sys.stderr) - - + + return network