Skip to content

Commit

Permalink
Merge pull request #14 from veg/develop
Browse files Browse the repository at this point in the history
Can now supply multiple CSV and sequence files as input, making inclu…
  • Loading branch information
stevenweaver authored Jan 24, 2018
2 parents a65e416 + 1a58a7f commit a13e781
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 82 deletions.
73 changes: 41 additions & 32 deletions hivclustering/data/HBL/TriangleSupport.bf
Original file line number Diff line number Diff line change
Expand Up @@ -4,58 +4,58 @@ first_call = 1;

function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
// seq_i is the list of sequence indices within the datafilter
if (first_call) {

if (first_call) {
global R1 = 2;
global R2 = 2;
qTN93 = {{*,t,R1*t,t}
{t,*,t,R2*t}
{R1*t,t,*,t}
{t,R2*t,t,*}};

Model TN93 = (qTN93, efv, 1);
Tree T3 = (1,2,3);
first_call = 0;
first_call = 0;
} else {
R1 = 2;
R2 = 2;
}

filterString = Join (",", {{seq1,seq2,seq3}});
DataSetFilter D3 = CreateFilter (^filter, 1,, filterString);

USE_LAST_RESULTS = 0;
OPTIMIZATION_METHOD = 4;

ClearConstraints (T3);
LikelihoodFunction L3 = (D3,T3);
Optimize (full, L3);


// stash all MLEs

GetString (_lfInfo,L3,-1);
MLE_stash = {};

_paramList = _lfInfo["Global Independent"];
_paramList = _lfInfo["Global Independent"];
for (k = 0; k < Columns (_paramList); k+=1) {
id = _paramList[k];
MLE_stash [id] = Eval (id);
}
_paramList = _lfInfo["Local Independent"];
_paramList = _lfInfo["Local Independent"];
for (k = 0; k < Columns (_paramList); k+=1) {
id = _paramList[k];
MLE_stash [id] = Eval (id);
}
}

USE_LAST_RESULTS = 1;
OPTIMIZATION_METHOD = 0;

p_values = {3,1};
bl = BranchLength (T3, -1);

for (b = 0; b < 3; b+=1) {
// restore MLEs
// restore MLEs
if (bl[b] > 0) {
for (k = 0; k < Abs (MLE_stash); k+=1) {
id = MLE_stash["INDEXORDER"][k];
Expand All @@ -69,17 +69,26 @@ function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
p_values [b] = 0.5;
}
}

return p_values;

}



map = {triangle_count, 3};

//fprintf (stdout, "Loading...\n");
DataSet ds = ReadDataFile (_py_sequence_file);

file_count = Rows(_py_sequence_file) * Columns (_py_sequence_file);

DataSet ds = ReadDataFile (_py_sequence_file[0]);

for (k = 1; k < file_count; k += 1) {
DataSet read_more = ReadDataFile (_py_sequence_file[k]);
DataSet ds = Combine (ds, read_more);
}

DataSetFilter filteredData = CreateFilter (ds,1);

nameToIndex = {};
Expand All @@ -102,26 +111,26 @@ all_p_values = {triangle_count, 3};

for (_t = 0; _t < triangle_count; _t += 1) {
_toffset = _t * 3;

//fprintf (stdout, _t, "\n");

_sidx1 = nameToIndex[_py_triangle_sequences[_toffset]] - 1;
assert (_sidx1 >= 0, "Failed to map " + _py_triangle_sequences[_toffset]);
_sidx2 = nameToIndex[_py_triangle_sequences[_toffset + 1]] - 1;
assert (_sidx2 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 1]);
_sidx3 = nameToIndex[_py_triangle_sequences[_toffset + 2]] - 1;
assert (_sidx3 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 2]);
lpv = _testNetworkTriangle ("filteredData", globalFreqs,


lpv = _testNetworkTriangle ("filteredData", globalFreqs,
_sidx1,
_sidx2,
_sidx3);
for (z = 0; z < 3; z+=1) {
all_p_values [_t][z] = lpv[z];
}

for (z = 0; z < 3; z+=1) {
all_p_values [_t][z] = lpv[z];
}

}


Expand All @@ -131,8 +140,8 @@ function _THyPhyAskFor(key)
if (key_n >= 0 && key_n < triangle_count) {
return all_p_values[key_n][-1];
}





return "_THyPhy_NOT_HANDLED_";
}
67 changes: 39 additions & 28 deletions hivclustering/mtnetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ def parsePlain(str):
return patient_description, None


def _ensure_list (arg):
if type (arg) is list:
return arg
return [arg]

def tm_to_datetime(tm_object):
if tm_object is None:
return None
Expand Down Expand Up @@ -512,9 +517,9 @@ def get_baseline_date(self, complete=False):

if complete:
return min(self.dates)
#print (self.dates)

#print (self.dates)

return min([k.tm_year for k in self.dates if k is not None])

def get_latest_date(self, complete=False):
Expand Down Expand Up @@ -586,25 +591,31 @@ def __init__(self, multiple_edges=False):
self.sequence_ids = {} # this will store unique sequence ids keyed by edge information (pid and date)

def read_from_csv_file(self, file_name, formatter=None, distance_cut=None, default_attribute=None, bootstrap_mode=False):

file_names = _ensure_list(file_name)

if formatter is None:
formatter = parseAEH
edgeReader = csv.reader(file_name)
header = next(edgeReader)
edgeAnnotations = {}
if len(header) < 3:
raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input')
formatter = [parseAEH for f in file_names]

edgeAnnotations = {}
handled_ids = set()

for line in edgeReader:
distance = float(line[2])
if distance_cut is not None and distance > distance_cut:
self.ensure_node_is_added(line[0], formatter, default_attribute, bootstrap_mode, handled_ids)
self.ensure_node_is_added(line[1], formatter, default_attribute, bootstrap_mode, handled_ids)
continue
edge = self.add_an_edge(line[0], line[1], distance, formatter, default_attribute, bootstrap_mode)
if edge is not None and len(line) > 3:
edgeAnnotations[edge] = line[2:]
for index, file_object in enumerate (file_names):
edgeReader = csv.reader(file_object)
header = next(edgeReader)
if len(header) < 3:
raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input (file %s)' % file_object.name)


for line in edgeReader:
distance = float(line[2])
if distance_cut is not None and distance > distance_cut:
self.ensure_node_is_added(line[0], formatter[index], default_attribute, bootstrap_mode, handled_ids)
self.ensure_node_is_added(line[1], formatter[index], default_attribute, bootstrap_mode, handled_ids)
continue
edge = self.add_an_edge(line[0], line[1], distance, formatter[index], default_attribute, bootstrap_mode)
if edge is not None and len(line) > 3:
edgeAnnotations[edge] = line[2:]

return edgeAnnotations

Expand Down Expand Up @@ -928,8 +939,8 @@ def add_edi_json(self, edi):
node.add_vl(vl_record[1], vl_record[0])
else:
node.add_named_attribute (k, v)



def clustering_coefficients(self, node_list=None):
clustering_coefficiencts = {}
Expand Down Expand Up @@ -1080,9 +1091,9 @@ def add_an_edge(self, id1, id2, distance, header_parser=None, edge_attribute=Non
return new_edge

return None

def sequence_set_for_edge_filtering (self):
''' return the set of all sequences necessary to perform edge filtering
''' return the set of all sequences necessary to perform edge filtering
'''
sequence_set = set ()
for an_edge in self.edge_iterator ():
Expand Down Expand Up @@ -1376,9 +1387,9 @@ def get_edge_node_count(self, attributes_to_check=None):

edge_set.add((edge.p1, edge.p2))

return {'edges': len(edge_set), 'nodes': len(vis_nodes), 'total_edges': edge_count,
'multiple_dates': [[k[0], k[1].days] for k in multiple_samples],
'total_sequences': len(vis_nodes) + sum([k[0] for k in multiple_samples]) - len(multiple_samples),
return {'edges': len(edge_set), 'nodes': len(vis_nodes), 'total_edges': edge_count,
'multiple_dates': [[k[0], k[1].days] for k in multiple_samples],
'total_sequences': len(vis_nodes) + sum([k[0] for k in multiple_samples]) - len(multiple_samples),
'stages': nodes_by_stage, 'edge-stages': edges_by_stage}

def clear_adjacency(self, clear_filter=True):
Expand Down Expand Up @@ -1706,7 +1717,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=N
nodes_drawn = set ()

directed = {'undirected': 0, 'directed': 0}

for edge in self.edge_iterator() if reduce_edges == False else self.reduce_edge_set():
if edge.visible:
distance = self.distances[edge]
Expand All @@ -1728,7 +1739,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=N
if edge.check_date(year_vis) == False:
file.write('%s [style="invis" arrowhead = "%s"];\n' % (edge_attr[0], edge_attr[1]))
continue

if attribute_color is not None:
color = attribute_color (edge)
if color is not None:
Expand Down Expand Up @@ -2227,7 +2238,7 @@ def get_degree_distribution(self, **kwargs):


connect_me = False

if outdegree:
connect_me = dir is not None and dir == node
if indegree:
Expand Down
49 changes: 28 additions & 21 deletions hivclustering/networkbuild.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,11 @@ def get_sequence_ids(fn):
#-------------------------------------------------------------------------------

def get_fasta_ids(fn):
fh = open(fn)
for line in fh:
if line[0] == '>':
yield line[1:].strip()
for f in fn:
fh = open(f)
for line in fh:
if line[0] == '>':
yield line[1:].strip()



Expand All @@ -304,22 +305,22 @@ def build_a_network():
random.seed()
arguments = argparse.ArgumentParser(description='Read filenames.')

arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Must be a CSV file with three columns: ID1,ID2,distance.')
arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Can be specified multiple times for multiple input files (e.g. to include a reference database). Must be a CSV file with three columns: ID1,ID2,distance.', action = 'append')
arguments.add_argument('-u', '--uds', help='Input CSV file with UDS data. Must be a CSV file with three columns: ID1,ID2,distance.')
arguments.add_argument('-d', '--dot', help='Output DOT file for GraphViz (or stdout if omitted)')
arguments.add_argument('-c', '--cluster', help='Output a CSV file with cluster assignments for each sequence')
arguments.add_argument('-t', '--threshold', help='Only count edges where the distance is less than this threshold')
arguments.add_argument('-e', '--edi', help='A .json file with clinical information')
arguments.add_argument('-z', '--old_edi', help='A .csv file with legacy EDI dates')
arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | otherfiels default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta)')
arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | otherfiels default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta); one per input argument if specified', action = 'append')
arguments.add_argument('-x', '--exclude', help='Exclude any sequence which belongs to a cluster containing a "reference" strain, defined by the year of isolation. The value of this argument is an integer year (e.g. 1984) so that any sequence isolated in or before that year (e.g. <=1983) is considered to be a lab strain. This option makes sense for LANL or AEH data.')
arguments.add_argument('-r', '--resistance',help='Load a JSON file with resistance annotation by sequence', type=argparse.FileType('r'))
arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp', required=False, type=str)
arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp (specify N times for N input files, even if empty)', required=False, type=str, action = 'append')
arguments.add_argument('-a', '--attributes',help='Load a CSV file with optional node attributes', type=argparse.FileType('r'))
arguments.add_argument('-j', '--json', help='Output the network report as a JSON object',required=False, action='store_true', default=False)
arguments.add_argument('-o', '--singletons', help='Include singletons in JSON output',required=False, action='store_true', default=False)
arguments.add_argument('-k', '--filter', help='Only return clusters with ids listed by a newline separated supplied file. ', required=False)
arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. ', required=False)
arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. Can be specified multiple times to include mutliple MSA files', required=False, action = 'append')
arguments.add_argument('-n', '--edge-filtering', dest='edge_filtering', choices=['remove', 'report'], help='Compute edge support and mark edges for removal using sequence-based triangle tests (requires the -s argument) and either only report them or remove the edges before doing other analyses ', required=False)
arguments.add_argument('-y', '--centralities', help='Output a CSV file with node centralities')
arguments.add_argument('-g', '--triangles', help='Maximum number of triangles to consider in each filtering pass', type = int, default = 2**16)
Expand All @@ -332,10 +333,10 @@ def build_a_network():
run_settings = arguments.parse_args()

if run_settings.input == None:
run_settings.input = sys.stdin
run_settings.input = [sys.stdin]
else:
try:
run_settings.input = open(run_settings.input, 'r')
run_settings.input = [open(file, 'r') for file in run_settings.input]
except IOError:
print("Failed to open '%s' for reading" % (run_settings.input), file=sys.stderr)
raise
Expand Down Expand Up @@ -381,17 +382,23 @@ def build_a_network():
print("Failed to open '%s' for writing" % (run_settings.cluster), file=sys.stderr)
raise

formatter = parseAEH
formatter = []

if run_settings.format is not None:
formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
None if run_settings.parser is None else re.compile(run_settings.parser))}
try:
formatter = formats[run_settings.format]
except KeyError:
print("%s is not a valid setting for 'format' (must be in %s)" %
(run_settings.format, str(list(formats.keys()))), file=sys.stderr)
raise
for index, format_k in enumerate (run_settings.format):
formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
None if run_settings.parser is None or len(run_settings.parser) < index else re.compile(run_settings.parser[index]))}
try:
formatter.append (formats[format_k])
except KeyError:
print("%s is not a valid setting for 'format' (must be in %s)" %
(run_settings.format, str(list(formats.keys()))), file=sys.stderr)
raise

if len (run_settings.format) != len (run_settings.input):
raise Exception ("Must specify as many formatters as there are input files when at least one formatter is specified explicitly")
else:
formatter = [parseAEH for k in run_settings.input]

if run_settings.exclude is not None:
try:
Expand Down Expand Up @@ -479,8 +486,8 @@ def build_a_network():
maximum_number = run_settings.triangles

for filtering_pass in range (64):
edge_stats = network.test_edge_support(os.path.abspath(
run_settings.sequences), *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
edge_stats = network.test_edge_support([os.path.abspath(
s) for s in run_settings.sequences], *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
if not edge_stats or edge_stats['removed edges'] == 0:
break
else:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
sys.path.insert(0, join(split(abspath(__file__))[0], 'lib'))

setup(name='hivclustering',
version="1.2.8",
version="1.3.0",
description='HIV molecular clustering tools',
author='Sergei Kosakovsky Pond',
author_email='[email protected]',
Expand Down

0 comments on commit a13e781

Please sign in to comment.