From 788338a1e534f0982594add5997d54328cb23b6f Mon Sep 17 00:00:00 2001
From: Sergei Pond
Date: Mon, 22 Jan 2018 13:43:39 -0500
Subject: [PATCH 1/2] Can now supply multiple CSV and sequence files as input,
 making it easier to include database comparisons

---
 hivclustering/data/HBL/TriangleSupport.bf | 73 +++++++++++++----------
 hivclustering/mtnetwork.py                | 67 ++++++++++++---------
 hivclustering/networkbuild.py             | 49 ++++++++-------
 3 files changed, 108 insertions(+), 81 deletions(-)

diff --git a/hivclustering/data/HBL/TriangleSupport.bf b/hivclustering/data/HBL/TriangleSupport.bf
index 3570137..3646e77 100644
--- a/hivclustering/data/HBL/TriangleSupport.bf
+++ b/hivclustering/data/HBL/TriangleSupport.bf
@@ -4,58 +4,58 @@ first_call = 1;
 
 function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
 // seq_i is the list of sequence indices within the datafilter
-    
-    if (first_call) {    
+
+    if (first_call) {
         global R1 = 2;
         global R2 = 2;
 
         qTN93 = {{*,t,R1*t,t}
                  {t,*,t,R2*t}
                  {R1*t,t,*,t}
                  {t,R2*t,t,*}};
-    
+
         Model TN93 = (qTN93, efv, 1);
         Tree T3 = (1,2,3);
-        first_call = 0;    
+        first_call = 0;
     } else {
         R1 = 2;
         R2 = 2;
     }
-    
+
     filterString = Join (",", {{seq1,seq2,seq3}});
     DataSetFilter D3 = CreateFilter (^filter, 1,, filterString);
-    
+
     USE_LAST_RESULTS = 0;
     OPTIMIZATION_METHOD = 4;
-    
+
     ClearConstraints (T3);
     LikelihoodFunction L3 = (D3,T3);
     Optimize (full, L3);
-    
-    
+
+
     // stash all MLEs
-    
+
     GetString (_lfInfo,L3,-1);
     MLE_stash = {};
-    _paramList = _lfInfo["Global Independent"];    
+    _paramList = _lfInfo["Global Independent"];
     for (k = 0; k < Columns (_paramList); k+=1) {
         id = _paramList[k];
         MLE_stash [id] = Eval (id);
     }
-    _paramList = _lfInfo["Local Independent"];    
+    _paramList = _lfInfo["Local Independent"];
     for (k = 0; k < Columns (_paramList); k+=1) {
         id = _paramList[k];
         MLE_stash [id] = Eval (id);
-    }    
-    
+    }
+
     USE_LAST_RESULTS = 1;
     OPTIMIZATION_METHOD = 0;
-    
+
     p_values = {3,1};
     bl = BranchLength (T3, -1);
-    
+
     for (b = 0; b < 3; b+=1) {
-        // restore MLEs    
+        // restore MLEs
         if (bl[b] > 0) {
             for (k = 0; k < Abs (MLE_stash); k+=1) {
                 id = MLE_stash["INDEXORDER"][k];
@@ -69,9 +69,9 @@ function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
             p_values [b] = 0.5;
         }
     }
-    
+
     return p_values;
-    
+
 }
 
 
@@ -79,7 +79,16 @@ function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
 map = {triangle_count, 3};
 
 //fprintf (stdout, "Loading...\n");
-DataSet ds = ReadDataFile (_py_sequence_file);
+
+file_count = Rows(_py_sequence_file) * Columns (_py_sequence_file);
+
+DataSet ds = ReadDataFile (_py_sequence_file[0]);
+
+for (k = 1; k < file_count; k += 1) {
+    DataSet read_more = ReadDataFile (_py_sequence_file[k]);
+    DataSet ds = Combine (ds, read_more);
+}
+
 DataSetFilter filteredData = CreateFilter (ds,1);
 
 nameToIndex = {};
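For readers who do not write HBL daily: _py_sequence_file now arrives as a matrix of file paths instead of a single path, so the script seeds ds from the first file and merges each remaining alignment into it via Combine. A rough Python analogue of this accumulate-and-merge pattern (illustrative only; read_fasta is a hypothetical stand-in for a parser that returns one name-to-sequence dict per file):

    def merge_alignments(paths, read_fasta):
        # Seed from the first file, as the HBL does with
        # ReadDataFile (_py_sequence_file[0]).
        combined = read_fasta(paths[0])
        # Fold each remaining file in, mirroring Combine (ds, read_more).
        for path in paths[1:]:
            combined.update(read_fasta(path))
        return combined
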
@@ -102,26 +111,26 @@ all_p_values = {triangle_count, 3};
 
 for (_t = 0; _t < triangle_count; _t += 1) {
     _toffset = _t * 3;
-    
+
     //fprintf (stdout, _t, "\n");
-    
+
     _sidx1 = nameToIndex[_py_triangle_sequences[_toffset]] - 1;
     assert (_sidx1 >= 0, "Failed to map " + _py_triangle_sequences[_toffset]);
     _sidx2 = nameToIndex[_py_triangle_sequences[_toffset + 1]] - 1;
     assert (_sidx2 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 1]);
     _sidx3 = nameToIndex[_py_triangle_sequences[_toffset + 2]] - 1;
     assert (_sidx3 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 2]);
-    
-    
-    lpv = _testNetworkTriangle ("filteredData", globalFreqs, 
+
+
+    lpv = _testNetworkTriangle ("filteredData", globalFreqs,
                                 _sidx1, _sidx2, _sidx3);
-    
-    for (z = 0; z < 3; z+=1) {
-        all_p_values [_t][z] = lpv[z];
-    }
-    
+
+    for (z = 0; z < 3; z+=1) {
+        all_p_values [_t][z] = lpv[z];
+    }
+
 }
@@ -131,8 +140,8 @@ function _THyPhyAskFor(key)
     if (key_n >= 0 && key_n < triangle_count) {
         return all_p_values[key_n][-1];
     }
-    
-    
+
+
     return "_THyPhy_NOT_HANDLED_";
 }
 
diff --git a/hivclustering/mtnetwork.py b/hivclustering/mtnetwork.py
index c6fe70f..e2536f7 100644
--- a/hivclustering/mtnetwork.py
+++ b/hivclustering/mtnetwork.py
@@ -84,6 +84,11 @@ def parsePlain(str):
 
     return patient_description, None
 
+def _ensure_list (arg):
+    if type (arg) is list:
+        return arg
+    return [arg]
+
 def tm_to_datetime(tm_object):
     if tm_object is None:
         return None
@@ -512,9 +517,9 @@ def get_baseline_date(self, complete=False):
 
         if complete:
             return min(self.dates)
-            
-        #print (self.dates)    
-            
+
+        #print (self.dates)
+
         return min([k.tm_year for k in self.dates if k is not None])
 
     def get_latest_date(self, complete=False):
@@ -586,25 +591,31 @@ def __init__(self, multiple_edges=False):
         self.sequence_ids = {}  # this will store unique sequence ids keyed by edge information (pid and date)
 
     def read_from_csv_file(self, file_name, formatter=None, distance_cut=None, default_attribute=None, bootstrap_mode=False):
+
+        file_names = _ensure_list(file_name)
+
         if formatter is None:
-            formatter = parseAEH
-        edgeReader = csv.reader(file_name)
-        header = next(edgeReader)
-        edgeAnnotations = {}
-        if len(header) < 3:
-            raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input')
+            formatter = [parseAEH for f in file_names]
 
+        edgeAnnotations = {}
         handled_ids = set()
-        for line in edgeReader:
-            distance = float(line[2])
-            if distance_cut is not None and distance > distance_cut:
-                self.ensure_node_is_added(line[0], formatter, default_attribute, bootstrap_mode, handled_ids)
-                self.ensure_node_is_added(line[1], formatter, default_attribute, bootstrap_mode, handled_ids)
-                continue
-            edge = self.add_an_edge(line[0], line[1], distance, formatter, default_attribute, bootstrap_mode)
-            if edge is not None and len(line) > 3:
-                edgeAnnotations[edge] = line[2:]
+
+        for index, file_object in enumerate (file_names):
+            edgeReader = csv.reader(file_object)
+            header = next(edgeReader)
+            if len(header) < 3:
+                raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input (file %s)' % file_object.name)
+
+            for line in edgeReader:
+                distance = float(line[2])
+                if distance_cut is not None and distance > distance_cut:
+                    self.ensure_node_is_added(line[0], formatter[index], default_attribute, bootstrap_mode, handled_ids)
+                    self.ensure_node_is_added(line[1], formatter[index], default_attribute, bootstrap_mode, handled_ids)
+                    continue
+                edge = self.add_an_edge(line[0], line[1], distance, formatter[index], default_attribute, bootstrap_mode)
+                if edge is not None and len(line) > 3:
+                    edgeAnnotations[edge] = line[2:]
 
         return edgeAnnotations
 
@@ -928,8 +939,8 @@ def add_edi_json(self, edi):
                         node.add_vl(vl_record[1], vl_record[0])
                 else:
                     node.add_named_attribute (k, v)
-                    
-                    
+
+
     def clustering_coefficients(self, node_list=None):
 
         clustering_coefficiencts = {}
@@ -1080,9 +1091,9 @@ def add_an_edge(self, id1, id2, distance, header_parser=None, edge_attribute=No
                 return new_edge
 
         return None
-        
+
     def sequence_set_for_edge_filtering (self):
-        ''' return the set of all sequences necessary to perform edge filtering 
+        ''' return the set of all sequences necessary to perform edge filtering
         '''
         sequence_set = set ()
         for an_edge in self.edge_iterator ():
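The mtnetwork.py side of the change, in short: read_from_csv_file now accepts either one open file or a list of them (normalized through _ensure_list), with one formatter applied per file by position. A minimal sketch of the new calling convention (file names are hypothetical; assumes transmission_network and the parsers are importable from hivclustering.mtnetwork as elsewhere in the package):

    from hivclustering.mtnetwork import transmission_network, parseAEH, parseLANL

    network = transmission_network()
    # Hypothetical inputs: a study CSV plus a reference-database CSV,
    # each with its own sequence ID format.
    with open('study_links.csv') as study, open('lanl_links.csv') as lanl:
        annotations = network.read_from_csv_file(
            [study, lanl],                    # input files are now a list
            formatter=[parseAEH, parseLANL],  # one parser per file, by position
            distance_cut=0.015)

A single file object still works, since _ensure_list wraps it, but an explicitly supplied formatter must now be a list, even for one file.
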
@@ -1376,9 +1387,9 @@ def get_edge_node_count(self, attributes_to_check=None):
 
                 edge_set.add((edge.p1, edge.p2))
 
-        return {'edges': len(edge_set), 'nodes': len(vis_nodes), 'total_edges': edge_count, 
-                'multiple_dates': [[k[0], k[1].days] for k in multiple_samples], 
-                'total_sequences': len(vis_nodes) + sum([k[0] for k in multiple_samples]) - len(multiple_samples), 
+        return {'edges': len(edge_set), 'nodes': len(vis_nodes), 'total_edges': edge_count,
+                'multiple_dates': [[k[0], k[1].days] for k in multiple_samples],
+                'total_sequences': len(vis_nodes) + sum([k[0] for k in multiple_samples]) - len(multiple_samples),
                 'stages': nodes_by_stage, 'edge-stages': edges_by_stage}
 
     def clear_adjacency(self, clear_filter=True):
@@ -1706,7 +1717,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=N
 
         nodes_drawn = set ()
         directed = {'undirected': 0, 'directed': 0}
-        
+
         for edge in self.edge_iterator() if reduce_edges == False else self.reduce_edge_set():
             if edge.visible:
                 distance = self.distances[edge]
@@ -1728,7 +1739,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=N
                 if edge.check_date(year_vis) == False:
                     file.write('%s [style="invis" arrowhead = "%s"];\n' % (edge_attr[0], edge_attr[1]))
                     continue
-            
+
             if attribute_color is not None:
                 color = attribute_color (edge)
                 if color is not None:
@@ -2227,7 +2238,7 @@ def get_degree_distribution(self, **kwargs):
 
             connect_me = False
 
-            
+
             if outdegree:
                 connect_me = dir is not None and dir == node
             if indegree:
diff --git a/hivclustering/networkbuild.py b/hivclustering/networkbuild.py
index fef6dbf..61534a8 100755
--- a/hivclustering/networkbuild.py
+++ b/hivclustering/networkbuild.py
@@ -291,10 +291,11 @@ def get_sequence_ids(fn):
 
 #-------------------------------------------------------------------------------
 def get_fasta_ids(fn):
-    fh = open(fn)
-    for line in fh:
-        if line[0] == '>':
-            yield line[1:].strip()
+    for f in fn:
+        fh = open(f)
+        for line in fh:
+            if line[0] == '>':
+                yield line[1:].strip()
 
 
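One subtlety in this hunk: get_fasta_ids now iterates over a list of FASTA paths, so passing a bare string would iterate over its characters and try to open each one. Callers with a single alignment must wrap the path in a list (file names here are hypothetical):

    ids = set(get_fasta_ids(['study_msa.fasta']))         # single file, still a list
    ids |= set(get_fasta_ids(['a.fasta', 'b.fasta']))     # or several at once
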
@@ -304,22 +305,22 @@ def build_a_network():
     random.seed()
     arguments = argparse.ArgumentParser(description='Read filenames.')
 
-    arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Must be a CSV file with three columns: ID1,ID2,distance.')
+    arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Can be specified multiple times for multiple input files (e.g. to include a reference database). Must be a CSV file with three columns: ID1,ID2,distance.', action = 'append')
     arguments.add_argument('-u', '--uds', help='Input CSV file with UDS data. Must be a CSV file with three columns: ID1,ID2,distance.')
     arguments.add_argument('-d', '--dot', help='Output DOT file for GraphViz (or stdout if omitted)')
     arguments.add_argument('-c', '--cluster', help='Output a CSV file with cluster assignments for each sequence')
     arguments.add_argument('-t', '--threshold', help='Only count edges where the distance is less than this threshold')
    arguments.add_argument('-e', '--edi', help='A .json file with clinical information')
     arguments.add_argument('-z', '--old_edi', help='A .csv file with legacy EDI dates')
-    arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | otherfiels default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta)')
+    arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | other fields default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta); one per input argument if specified', action = 'append')
     arguments.add_argument('-x', '--exclude', help='Exclude any sequence which belongs to a cluster containing a "reference" strain, defined by the year of isolation. The value of this argument is an integer year (e.g. 1984) so that any sequence isolated in or before that year (e.g. <=1983) is considered to be a lab strain. This option makes sense for LANL or AEH data.')
     arguments.add_argument('-r', '--resistance',help='Load a JSON file with resistance annotation by sequence', type=argparse.FileType('r'))
-    arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp', required=False, type=str)
+    arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp (specify N times for N input files, even if empty)', required=False, type=str, action = 'append')
     arguments.add_argument('-a', '--attributes',help='Load a CSV file with optional node attributes', type=argparse.FileType('r'))
     arguments.add_argument('-j', '--json', help='Output the network report as a JSON object',required=False, action='store_true', default=False)
     arguments.add_argument('-o', '--singletons', help='Include singletons in JSON output',required=False, action='store_true', default=False)
     arguments.add_argument('-k', '--filter', help='Only return clusters with ids listed by a newline separated supplied file. ', required=False)
-    arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. ', required=False)
+    arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. Can be specified multiple times to include multiple MSA files', required=False, action = 'append')
     arguments.add_argument('-n', '--edge-filtering', dest='edge_filtering', choices=['remove', 'report'], help='Compute edge support and mark edges for removal using sequence-based triangle tests (requires the -s argument) and either only report them or remove the edges before doing other analyses ', required=False)
     arguments.add_argument('-y', '--centralities', help='Output a CSV file with node centralities')
     arguments.add_argument('-g', '--triangles', help='Maximum number of triangles to consider in each filtering pass', type = int, default = 2**16)
@@ -332,10 +333,10 @@ def build_a_network():
     run_settings = arguments.parse_args()
 
     if run_settings.input == None:
-        run_settings.input = sys.stdin
+        run_settings.input = [sys.stdin]
     else:
         try:
-            run_settings.input = open(run_settings.input, 'r')
+            run_settings.input = [open(file, 'r') for file in run_settings.input]
         except IOError:
             print("Failed to open '%s' for reading" % (run_settings.input), file=sys.stderr)
             raise
@@ -381,17 +382,23 @@ def build_a_network():
             print("Failed to open '%s' for writing" % (run_settings.cluster), file=sys.stderr)
             raise
 
-    formatter = parseAEH
+    formatter = []
 
     if run_settings.format is not None:
-        formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
-            None if run_settings.parser is None else re.compile(run_settings.parser))}
-        try:
-            formatter = formats[run_settings.format]
-        except KeyError:
-            print("%s is not a valid setting for 'format' (must be in %s)" %
-                  (run_settings.format, str(list(formats.keys()))), file=sys.stderr)
-            raise
+        for index, format_k in enumerate (run_settings.format):
+            formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
+                None if run_settings.parser is None or len(run_settings.parser) <= index else re.compile(run_settings.parser[index]))}
+            try:
+                formatter.append (formats[format_k])
+            except KeyError:
+                print("%s is not a valid setting for 'format' (must be in %s)" %
+                      (format_k, str(list(formats.keys()))), file=sys.stderr)
+                raise
+
+        if len (run_settings.format) != len (run_settings.input):
+            raise Exception ("Must specify as many formatters as there are input files when at least one formatter is specified explicitly")
+    else:
+        formatter = [parseAEH for k in run_settings.input]
 
     if run_settings.exclude is not None:
         try:
@@ -479,8 +486,8 @@ def build_a_network():
             maximum_number = run_settings.triangles
 
             for filtering_pass in range (64):
-                edge_stats = network.test_edge_support(os.path.abspath(
-                    run_settings.sequences), *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
+                edge_stats = network.test_edge_support([os.path.abspath(
+                    s) for s in run_settings.sequences], *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
                 if not edge_stats or edge_stats['removed edges'] == 0:
                     break
                 else:
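A usage note on the repeatable flags introduced above: each -i pairs positionally with the matching -f (and, for regexp formats, the matching -p), and all -s alignments are pooled for triangle-based edge filtering. A hypothetical invocation merging a study dataset with a reference database (file names are placeholders; hivnetworkcsv is assumed to be the console entry point wrapping build_a_network):

    hivnetworkcsv -i study.csv -f AEH \
                  -i lanl.csv  -f LANL \
                  -s study_msa.fasta -s lanl_msa.fasta \
                  -t 0.015 -n report -j > network.json

If any -f is given, the code requires exactly one per -i; with no -f at all, every input defaults to the AEH parser.
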
From 1a58a7f90087aa0ede1002a66f9ca028b34ff180 Mon Sep 17 00:00:00 2001
From: Steven Weaver
Date: Wed, 24 Jan 2018 12:46:38 -0500
Subject: [PATCH 2/2] bumping version to 1.3.0

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e229278..6c90fc4 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 sys.path.insert(0, join(split(abspath(__file__))[0], 'lib'))
 
 setup(name='hivclustering',
-      version="1.2.8",
+      version="1.3.0",
       description='HIV molecular clustering tools',
       author='Sergei Kosakovsky Pond',
       author_email='spond@ucsd.edu',