Commit

adding cycle report option
stevenweaver committed Mar 5, 2019
1 parent 6d07fa1 commit 96bcd25
Showing 2 changed files with 74 additions and 62 deletions.
94 changes: 47 additions & 47 deletions hivclustering/mtnetwork.py
@@ -56,8 +56,8 @@ def parseHeader(str):
try:
patient_description['date'] = time.strptime(groups[1], pattern)
except ValueError:
continue

except:
print("Warning: could not parse the following ID as the reg.exp. header: %s" % str, file = sys.stderr)
patient_description['id'] = str
@@ -207,7 +207,7 @@ def __init__(self, patient1, patient2, date1, date2, visible, attribute=None, se
self.p1 = patient2
self.date2 = date1
self.date1 = date2
if sequence_ids is not None:
self.sequences = list(self.sequences)
self.sequences.reverse()
self.sequences = tuple (self.sequences)
@@ -830,7 +830,7 @@ def create_a_pref_attachment_network(self, network_size=100, start_with=1, rando

#print(max(dates_by_chain), file = sys.stderr)
return simulation_start


def construct_cluster_representation(self, root_node, already_simulated, the_cluster):
if root_node in self.adjacency_list:
@@ -1912,27 +1912,27 @@ def find_all_cycles_old (self, edge_set, cycle_length, maximum_number=2**18, ign
will store cycles of length `cycle_length`, as lists of nodes, with the convention that
the cycle 'starts' with the node that has the lowest sort order (lexicographically)
'''
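The canonical form described here, and implemented by handle_a_cycle below, can be sketched on bare node IDs. This is an illustration only; the real function rotates a deque of (node, edge) pairs and keys on node.id.

```python
import collections

def canonical_cycle(node_ids):
    # Rotate so the lexicographically smallest ID starts the cycle ...
    cycle = collections.deque(node_ids)
    cycle.rotate(-cycle.index(min(cycle)))
    # ... then orient it so the second node sorts no higher than the last,
    # i.e. 1-2-3-4 is preferred over 1-4-3-2.
    if cycle[1] > cycle[-1]:
        cycle.rotate(-1)
        cycle.reverse()
    return tuple(cycle)

print(canonical_cycle(["n3", "n2", "n1", "n4"]))  # ('n1', 'n2', 'n3', 'n4')
```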

def handle_a_cycle (node_list):
node_list.rotate(-node_list.index (min (node_list, key = lambda x : x[0].id)))
# also orient the cycles so x1->x2->x3->...-x_N has x_2 <= x_N
# i.e. 1-2-3-4 is chosen instead of 1-4-3-2
if node_list[1][0].id > node_list[-1][0].id:
node_list.rotate (-1)
node_list.reverse()

#extract sequences that correspond to individual nodes in the cycles

sequences = []
sequence_set = set ()

# iterate over the list of sequence IDs and check that the sequences used to make
# the same sequences were used to make each of the links, i.e. that
# if there is an N1--N2--N3 chain, and N2 has two sequences associated with it, say N2-1 and N2-2
# then it is not the case that N1 is linked to N2 via N2-1 and N3 is linked to N2 via N2-2
for n, s in node_list:
#my_seq = [seq for seq in s.sequences if seq in self.sequence_ids[n.id]]
my_seq = None
sequence_set.add (s.sequences)
@@ -1941,32 +1941,32 @@ def handle_a_cycle (node_list):
if nd == n:
my_seq = s.sequences[i]
break

if not my_seq:
return None
sequences.append (my_seq)

if len (sequence_set) != len (node_list):
return None

cycles.add (tuple(sequences))


#print ([n[1].sequences for n in node_list], file = sys.stderr)
#cycles.add (tuple(sorted (node_list, key = lambda x : x.id)))
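The requirement spelled out in the comment above can be illustrated with plain dictionaries standing in for edges (hypothetical node and sequence IDs; the module's edge objects carry this pairing in their sequences tuples):

```python
def sequences_consistent(cycle_edges):
    # cycle_edges: one dict per edge in the cycle, mapping node ID -> the
    # sequence ID that edge attributes to that node.
    chosen = {}
    for edge in cycle_edges:
        for node, seq in edge.items():
            if chosen.setdefault(node, seq) != seq:
                return False  # a shared node enters two edges via different sequences
    return True

# N2 is linked to N1 via N2-1 but to N3 via N2-2, so this cycle is rejected.
edges = [{"N1": "N1-1", "N2": "N2-1"},
         {"N2": "N2-2", "N3": "N3-1"},
         {"N3": "N3-1", "N1": "N1-1"}]
print(sequences_consistent(edges))  # False
```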

if (cycle_length >= 4):
node_and_edge_am = {}
self.compute_adjacency(both=True, edge_set=edge_set, storage=node_and_edge_am)
#perform depth wise-traversal
visited = set ()

def DFS (current_node, current_edge, current_node_chain):
visited.add (current_node)
current_node_chain.append ([current_node, current_edge])


for neighbor, edge in node_and_edge_am[current_node]:
if neighbor not in visited:
current_node_chain[len(current_node_chain)-1][1] = edge
@@ -1975,24 +1975,24 @@ def DFS (current_node, current_edge, current_node_chain):
if len (current_node_chain) == cycle_length and neighbor == current_node_chain[0][0]:
current_node_chain [-1][1] = edge
handle_a_cycle (current_node_chain)

current_node_chain.pop()

for n in node_and_edge_am:
if n not in visited:
chain = collections.deque ()
DFS (n, None, chain)


else:
raise UserWarning('Will only count cycles of length 4 or greater')

print ("\n", cycles, file = sys.stderr)
return cycles
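The depth-first search above amounts to enumerating simple cycles of a fixed length in an undirected graph. A self-contained sketch of the same idea on a plain adjacency dict, deduplicating by edge set rather than by the rotation convention used in handle_a_cycle, might look like this:

```python
def cycles_of_length(adjacency, k):
    # adjacency: {node: set(neighbors)} for an undirected graph.
    # Returns one representative node tuple per simple cycle of exactly k nodes.
    found = {}

    def dfs(chain, on_chain):
        for nxt in adjacency[chain[-1]]:
            if len(chain) == k and nxt == chain[0]:
                closed = chain + [nxt]
                key = frozenset(frozenset(e) for e in zip(closed, closed[1:]))
                found.setdefault(key, tuple(chain))
            elif len(chain) < k and nxt not in on_chain:
                chain.append(nxt)
                on_chain.add(nxt)
                dfs(chain, on_chain)
                on_chain.discard(chain.pop())

    for start in adjacency:
        dfs([start], {start})
    return list(found.values())

adj = {1: {2, 4}, 2: {1, 3}, 3: {2, 4}, 4: {1, 3}}
print(cycles_of_length(adj, 4))  # [(1, 2, 3, 4)]
```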

def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_set = None, do_quads = False):
# if do_quads is set, then look for simple cycles of length 4, otherwise look for triangles

cycles = set()
#sequences_involved_in_links = set ()
#sequence_pairs = set ()
@@ -2002,7 +2002,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se

#print ("Locating triangles in the network", file = sys.stderr)


# create a list of the form
# node[id] = list of
# [node_id] = edge_id
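In other words, the structure is a nested mapping from each node to its neighbours, and from each neighbour to the edge joining them. With hypothetical string placeholders in place of the module's node and edge objects:

```python
# adjacency_map[node][neighbour] -> the edge joining them (placeholders, not real objects)
adjacency_map = {
    "N1": {"N2": "edge_N1_N2", "N3": "edge_N1_N3"},
    "N2": {"N1": "edge_N1_N2", "N3": "edge_N2_N3"},
    "N3": {"N1": "edge_N1_N3", "N2": "edge_N2_N3"},
}

# A triangle N1-N2-N3 can then be read off directly:
triangle_edges = [adjacency_map["N1"]["N2"],
                  adjacency_map["N1"]["N3"],
                  adjacency_map["N2"]["N3"]]
print(triangle_edges)
```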
@@ -2030,17 +2030,17 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se
nodes = [node, node2, node3, node4]
if len (set (nodes)) < 4:
continue

quad = collections.deque (nodes)
quad.rotate(-quad.index (min (quad, key = lambda x : x.id)))
if quad[1].id > quad[-1].id:
quad.rotate (-1)
quad.reverse()

sequences = []
sequence_set = set ()
quad = tuple (quad)

if quad not in cycle_nodes:
for n, quad_edge in enumerate ([adjacency_map[quad[0]][quad[1]], adjacency_map[quad[1]][quad[2]], adjacency_map[quad[2]][quad[3]], adjacency_map[quad[3]][quad[0]]]):
my_seq = None
@@ -2050,22 +2050,22 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se
if nd == quad[n]:
my_seq = quad_edge.sequences[i]
break

if my_seq:
sequences.append (my_seq)
if len (sequence_set) == 4 and len (sequences) == 4:
sequence_set = tuple (sequences)
if ignore_this_set and sequence_set in ignore_this_set:
continue

cycle_nodes.add(quad)
cycles.add(sequence_set)
for s in sequence_set:
if s not in count_by_sequence:
count_by_sequence[s] = 1
else:
count_by_sequence[s] += 1

'''
sequence_set = set()
for quad_edge [adjacency_map[triad[0]][triad[1]], adjacency_map[triad[0]][triad[2]], adjacency_map[triad[1]][triad[2]]]:
@@ -2084,9 +2084,9 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se
if s not in count_by_sequence:
count_by_sequence[s] = 1
else:
count_by_sequence[s] += 1
'''

else:
if node in adjacency_map[node3]:
triad = sorted([node, node2, node3])
@@ -2118,7 +2118,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se
#print (sequence_set)

#triangle_nodes_all.add(triad)

if len(cycle_nodes) >= maximum_number:
raise UserWarning(
'\nToo many cycles to attempt full filtering; stopped at %d' % maximum_number)
@@ -2131,7 +2131,7 @@ def find_all_simple_cycles (self, edge_set, maximum_number=2**18, ignore_this_se
if do_quads:
sorted_result = sorted([(t[0], t[1], t[2], t[3], sum([count_by_sequence[k] for k in t]))
for t in cycles], key=lambda x: (x[4], x[0], x[1], x[2], x[3]), reverse=True)

else:
sorted_result = sorted([(t[0], t[1], t[2], sum([count_by_sequence[k] for k in t]))
for t in cycles], key=lambda x: (x[3], x[0], x[1], x[2]), reverse=True)
@@ -2202,7 +2202,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance

if len(cycles) == 0:
return None

evaluator = partial(_test_edge_support, sequence_records=sequence_records,
hy_instance=hy_instance, p_value_cutoff=p_value_cutoff, test_quads = test_quads)
#processed_objects = evaluator (cycles)
@@ -2224,7 +2224,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance

upper_bound = 4 if test_quads else 3
processed_objects = sorted([k for p in processed_objects for k in p], key=lambda x: x[0][upper_bound])

edges_removed = set()
must_keep = set()

@@ -2242,7 +2242,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance
unsupported_edges = set()
removed_edges = set()
bridges = set()

edge_enumerator = ((0, 1), (1, 2), (2, 3), (3, 0)) if test_quads else ((0, 1), (0, 2), (1, 2))

for t, p_values in processed_objects:
@@ -2278,7 +2278,7 @@ def test_edge_support(self, sequence_records, cycles, adjacency_set, hy_instance
this_edge.edge_reject_p = max(p_values[pair_index], this_edge.edge_reject_p)
else:
this_edge.edge_reject_p = max(p_values[2 - pair_index], this_edge.edge_reject_p)

if this_edge.edge_reject_p > p_value_cutoff:
if this_edge not in unsupported_edges:
#print (this_edge, this_edge.edge_reject_p, seq_id)
42 changes: 27 additions & 15 deletions hivclustering/networkbuild.py
@@ -137,7 +137,7 @@ def describe_network(network, json_output=False, keep_singletons=False):
print(reasons, file=sys.stderr)

if not settings().skip_degrees:
print("Fitting the degree distribution to various densities", file=sys.stderr)
print("Fitting the degree distribution to various densities", file=sys.stderr)
distro_fit = network.fit_degree_distribution()
ci = distro_fit['rho_ci'][distro_fit['Best']]
rho = distro_fit['rho'][distro_fit['Best']]
@@ -159,10 +159,10 @@ def describe_network(network, json_output=False, keep_singletons=False):
print("Best distribution is '%s'" % (distro_fit['Best']), file=sys.stderr)
else:
distro_fit = {'degrees' : network.get_degree_distribution()}

if json_output:
return_json['Degrees'] = distro_fit['degrees']


# find diffs in directed edges
'''for anEdge in network.edges:
@@ -333,6 +333,7 @@ def build_a_network(extra_arguments = None):
arguments.add_argument('-n', '--edge-filtering', dest='edge_filtering', choices=['remove', 'report'], help='Compute edge support and mark edges for removal using sequence-based triangle tests (requires the -s argument) and either only report them or remove the edges before doing other analyses ', required=False)
arguments.add_argument('-y', '--centralities', help='Output a CSV file with node centralities')
arguments.add_argument('-l', '--edge-filter-cycles', dest = 'filter_cycles', help='Filter edges that are in cycles other than triangles', action='store_true')
arguments.add_argument('--cycle-report-file', dest = 'cycle_report_filename', help='Prints cycle report to specified file', default = sys.stdout, type = argparse.FileType('w'), required=False)
arguments.add_argument('-g', '--triangles', help='Maximum number of triangles to consider in each filtering pass', type = int, default = 2**15)
arguments.add_argument('-C', '--contaminants', help='Screen for contaminants by marking or removing sequences that cluster with any of the contaminant IDs (-F option) [default is not to screen]', choices=['report', 'remove'])
arguments.add_argument('-F', '--contaminant-file', dest='contaminant_file',help='IDs of contaminant sequences', type=str)
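A note on the new --cycle-report-file option above: argparse.FileType('w') with a default of sys.stdout means cycle_report_filename is always an open, writable destination, whether or not the flag is given. A minimal stand-alone sketch of that argparse behaviour (file names are illustrative):

```python
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--cycle-report-file', dest='cycle_report_filename',
                    default=sys.stdout, type=argparse.FileType('w'))

args = parser.parse_args([])                        # flag omitted -> the sys.stdout default
print(args.cycle_report_filename is sys.stdout)     # True

args = parser.parse_args(['--cycle-report-file', 'cycles.json'])
print(args.cycle_report_filename.name)              # 'cycles.json', opened for writing
```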
@@ -442,7 +443,11 @@ def build_a_network(extra_arguments = None):
if len([k for k in [run_settings.edge_filtering, run_settings.sequences] if k is None]) == 1:
raise ValueError('Two arguments (-n and -s) are needed for edge filtering options')

if not run_settings.filter_cycles and run_settings.cycle_report_filename:
raise ValueError('-l option is necessary to report cycles')


network = transmission_network(multiple_edges=run_settings.multiple_edges)
network.read_from_csv_file(run_settings.input, formatter, run_settings.threshold, 'BULK')

uds_settings = None
@@ -515,7 +520,7 @@ def build_a_network(extra_arguments = None):
# network.apply_id_filter(list=run_settings.filter, do_clear=False)

network.compute_clusters () # this allocates nodes to clusters

def generate_edges_by_cluster () :
edges_by_clusters = {}
current_edge_set = network.reduce_edge_set()
@@ -528,9 +533,9 @@ def generate_edges_by_cluster () :
edges_by_clusters = [set(v) for c,v in edges_by_clusters.items() if len (v) >= 3]
edges_by_clusters.sort (key = lambda x : len (x)) # smallest first
return edges_by_clusters

edges_by_clusters = generate_edges_by_cluster()
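The helper above groups the reduced edge set by cluster, drops clusters with fewer than three edges, and processes the smallest clusters first. A toy sketch of the same pattern with hypothetical edge IDs:

```python
from collections import defaultdict

# Hypothetical (edge, cluster) pairs standing in for the network's edge objects.
edges = [("e1", 1), ("e2", 1), ("e3", 1),
         ("e4", 2), ("e5", 2),
         ("e6", 3), ("e7", 3), ("e8", 3), ("e9", 3)]

by_cluster = defaultdict(set)
for edge_id, cluster_id in edges:
    by_cluster[cluster_id].add(edge_id)

# Keep clusters with at least three edges and sort smallest first.
edge_sets = sorted((v for v in by_cluster.values() if len(v) >= 3), key=len)
print([sorted(s) for s in edge_sets])  # [['e1', 'e2', 'e3'], ['e6', 'e7', 'e8', 'e9']]
```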


# load sequence data

@@ -550,14 +555,14 @@ def generate_edges_by_cluster () :

for seq_id in all_referenced_sequences:
referenced_sequence_data[seq_id] = hy_instance.getvar(seq_id, hy.HyphyInterface.STRING)

if run_settings.extract is not None:
sequence_set = set ()
for anEdge in edges_by_clusters[run_settings.extract]:
sequence_set.update (anEdge.sequences)
for s in sequence_set:
print (">%s\n%s\n" % (s, referenced_sequence_data[s]), file = run_settings.output)



# partition edges into clusters
@@ -591,13 +596,20 @@ def handle_a_cluster (edge_set, cluster_count, total_count):

maximum_number += run_settings.triangles
edge_set.difference_update (set ([edge for edge in edge_set if not edge.has_support()]))

if run_settings.filter_cycles:
maximum_number = run_settings.triangles
supported_quads = set ()

for filtering_pass in range (8):
edge_stats = network.test_edge_support(referenced_sequence_data, *network.find_all_simple_cycles(edge_set, maximum_number = maximum_number, ignore_this_set = supported_quads, do_quads = True), supported_cycles = supported_quads, test_quads = True)
simple_cycles = network.find_all_simple_cycles(edge_set, maximum_number = maximum_number, ignore_this_set = supported_quads, do_quads = True)

# If the cycle report option is set, write the cycle list to the report file as JSON
if run_settings.cycle_report_filename:
print(json.dumps({"cycles" : simple_cycles[0]}), file=run_settings.cycle_report_filename)

edge_stats = network.test_edge_support(referenced_sequence_data, *simple_cycles, supported_cycles = supported_quads, test_quads = True)

if not edge_stats:
break
else:
@@ -641,13 +653,13 @@ def handle_a_cluster (edge_set, cluster_count, total_count):
total_removed += handle_a_cluster (current_edge_set, cluster_count, len (edges_by_clusters))

print ("\nEdge filtering identified %d edges for removal" % total_removed, file = sys.stderr)

network.set_edge_visibility(edge_visibility) # restore edge visibility

if run_settings.edge_filtering == 'remove':
print("Edge filtering removed %d edges" % network.conditional_prune_edges(), file=sys.stderr)


return network
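For readers of the new cycle report produced by build_a_network above: one JSON object is written per filtering pass, and, assuming the serialized payload is the sorted quad list built by find_all_simple_cycles (four sequence IDs followed by a combined per-sequence count), a report line could be read back with a sketch like this (IDs and counts are made up):

```python
import json

# Hypothetical contents of one report line written with -l --cycle-report-file cycles.json
line = '{"cycles": [["seq_A", "seq_B", "seq_C", "seq_D", 5], ["seq_B", "seq_E", "seq_F", "seq_G", 3]]}'

for *sequence_ids, shared_count in json.loads(line)["cycles"]:
    print(sequence_ids, shared_count)
```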

