Can now supply multiple CSV and sequence files as input, making inclu… #14

Merged 2 commits on Jan 24, 2018

Changes from 1 commit
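With this PR the link-file, ID-format, regexp-parser, and MSA arguments become repeatable (argparse `action='append'`), so several CSV and sequence files can be combined in one run. A minimal invocation sketch, assuming the `hivnetworkcsv` entry point; the file names are hypothetical, while the flags and the AEH/LANL format names come from the diff below:

```python
import subprocess

# Two link files (e.g. a local set plus a reference database), each with its
# own sequence ID format, and the matching MSAs for triangle-based edge filtering.
subprocess.run([
    "hivnetworkcsv",
    "-i", "local_links.csv", "-i", "reference_links.csv",
    "-f", "AEH", "-f", "LANL",
    "-s", "local_msa.fas", "-s", "reference_msa.fas",
    "-n", "report",
], check=True)
```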
73 changes: 41 additions & 32 deletions hivclustering/data/HBL/TriangleSupport.bf
@@ -4,58 +4,58 @@ first_call = 1;

function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
// seq_i is the list of sequence indices within the datafilter

if (first_call) {
global R1 = 2;
global R2 = 2;
qTN93 = {{*,t,R1*t,t}
{t,*,t,R2*t}
{R1*t,t,*,t}
{t,R2*t,t,*}};

Model TN93 = (qTN93, efv, 1);
Tree T3 = (1,2,3);
first_call = 0;
} else {
R1 = 2;
R2 = 2;
}

filterString = Join (",", {{seq1,seq2,seq3}});
DataSetFilter D3 = CreateFilter (^filter, 1,, filterString);

USE_LAST_RESULTS = 0;
OPTIMIZATION_METHOD = 4;

ClearConstraints (T3);
LikelihoodFunction L3 = (D3,T3);
Optimize (full, L3);


// stash all MLEs

GetString (_lfInfo,L3,-1);
MLE_stash = {};

_paramList = _lfInfo["Global Independent"];
_paramList = _lfInfo["Global Independent"];
for (k = 0; k < Columns (_paramList); k+=1) {
id = _paramList[k];
MLE_stash [id] = Eval (id);
}
_paramList = _lfInfo["Local Independent"];
_paramList = _lfInfo["Local Independent"];
for (k = 0; k < Columns (_paramList); k+=1) {
id = _paramList[k];
MLE_stash [id] = Eval (id);
}

USE_LAST_RESULTS = 1;
OPTIMIZATION_METHOD = 0;

p_values = {3,1};
bl = BranchLength (T3, -1);

for (b = 0; b < 3; b+=1) {
// restore MLEs
if (bl[b] > 0) {
for (k = 0; k < Abs (MLE_stash); k+=1) {
id = MLE_stash["INDEXORDER"][k];
@@ -69,17 +69,26 @@ function _testNetworkTriangle (filter, efv, seq1, seq2, seq3) {
p_values [b] = 0.5;
}
}

return p_values;

}



map = {triangle_count, 3};

//fprintf (stdout, "Loading...\n");
-DataSet ds = ReadDataFile (_py_sequence_file);
+file_count = Rows(_py_sequence_file) * Columns (_py_sequence_file);
+
+DataSet ds = ReadDataFile (_py_sequence_file[0]);
+
+for (k = 1; k < file_count; k += 1) {
+    DataSet read_more = ReadDataFile (_py_sequence_file[k]);
+    DataSet ds = Combine (ds, read_more);
+}

DataSetFilter filteredData = CreateFilter (ds,1);

nameToIndex = {};
Expand All @@ -102,26 +111,26 @@ all_p_values = {triangle_count, 3};

for (_t = 0; _t < triangle_count; _t += 1) {
_toffset = _t * 3;

//fprintf (stdout, _t, "\n");

_sidx1 = nameToIndex[_py_triangle_sequences[_toffset]] - 1;
assert (_sidx1 >= 0, "Failed to map " + _py_triangle_sequences[_toffset]);
_sidx2 = nameToIndex[_py_triangle_sequences[_toffset + 1]] - 1;
assert (_sidx2 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 1]);
_sidx3 = nameToIndex[_py_triangle_sequences[_toffset + 2]] - 1;
assert (_sidx3 >= 0, "Failed to map " + _py_triangle_sequences[_toffset + 2]);

lpv = _testNetworkTriangle ("filteredData", globalFreqs,
_sidx1,
_sidx2,
_sidx3);
for (z = 0; z < 3; z+=1) {
    all_p_values [_t][z] = lpv[z];
}

}


@@ -131,8 +140,8 @@ function _THyPhyAskFor(key)
if (key_n >= 0 && key_n < triangle_count) {
return all_p_values[key_n][-1];
}
return "_THyPhy_NOT_HANDLED_";
}
67 changes: 39 additions & 28 deletions hivclustering/mtnetwork.py
@@ -84,6 +84,11 @@ def parsePlain(str):
return patient_description, None


+def _ensure_list (arg):
+    if type (arg) is list:
+        return arg
+    return [arg]
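
A quick behavior sketch of the new helper (restated standalone): scalars are wrapped, lists pass through unchanged, which is what keeps single-file callers working:

```python
def _ensure_list(arg):
    if type(arg) is list:
        return arg
    return [arg]

assert _ensure_list("links.csv") == ["links.csv"]
assert _ensure_list(["a.csv", "b.csv"]) == ["a.csv", "b.csv"]
```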

def tm_to_datetime(tm_object):
if tm_object is None:
return None
@@ -512,9 +517,9 @@ def get_baseline_date(self, complete=False):

if complete:
return min(self.dates)
#print (self.dates)

return min([k.tm_year for k in self.dates if k is not None])

def get_latest_date(self, complete=False):
@@ -586,25 +591,31 @@ def __init__(self, multiple_edges=False):
self.sequence_ids = {} # this will store unique sequence ids keyed by edge information (pid and date)

def read_from_csv_file(self, file_name, formatter=None, distance_cut=None, default_attribute=None, bootstrap_mode=False):

+    file_names = _ensure_list(file_name)
+
    if formatter is None:
-        formatter = parseAEH
-    edgeReader = csv.reader(file_name)
-    header = next(edgeReader)
-    edgeAnnotations = {}
-    if len(header) < 3:
-        raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input')
+        formatter = [parseAEH for f in file_names]
+
+    edgeAnnotations = {}
    handled_ids = set()

-    for line in edgeReader:
-        distance = float(line[2])
-        if distance_cut is not None and distance > distance_cut:
-            self.ensure_node_is_added(line[0], formatter, default_attribute, bootstrap_mode, handled_ids)
-            self.ensure_node_is_added(line[1], formatter, default_attribute, bootstrap_mode, handled_ids)
-            continue
-        edge = self.add_an_edge(line[0], line[1], distance, formatter, default_attribute, bootstrap_mode)
-        if edge is not None and len(line) > 3:
-            edgeAnnotations[edge] = line[2:]
+    for index, file_object in enumerate (file_names):
+        edgeReader = csv.reader(file_object)
+        header = next(edgeReader)
+        if len(header) < 3:
+            raise IOError('transmission_network.read_from_csv_file() : Expected a .csv file with at least 3 columns as input (file %s)' % file_object.name)
+
+        for line in edgeReader:
+            distance = float(line[2])
+            if distance_cut is not None and distance > distance_cut:
+                self.ensure_node_is_added(line[0], formatter[index], default_attribute, bootstrap_mode, handled_ids)
+                self.ensure_node_is_added(line[1], formatter[index], default_attribute, bootstrap_mode, handled_ids)
+                continue
+            edge = self.add_an_edge(line[0], line[1], distance, formatter[index], default_attribute, bootstrap_mode)
+            if edge is not None and len(line) > 3:
+                edgeAnnotations[edge] = line[2:]

    return edgeAnnotations
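
Call-pattern sketch for the updated method: `transmission_network`, `parseAEH`, and `parseLANL` are this repo's names, but the import path, file names, and the 0.015 cutoff are assumptions. When `formatter` is passed explicitly it must now be a list with one entry per input file, since it is indexed per file:

```python
from hivclustering.mtnetwork import transmission_network, parseAEH, parseLANL

network = transmission_network()
with open("local.csv") as f1, open("reference.csv") as f2:
    # One formatter per file; a bare handle (not a list) still works via _ensure_list.
    annotations = network.read_from_csv_file(
        [f1, f2],
        formatter=[parseAEH, parseLANL],
        distance_cut=0.015,
    )
```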

@@ -928,8 +939,8 @@ def add_edi_json(self, edi):
node.add_vl(vl_record[1], vl_record[0])
else:
node.add_named_attribute (k, v)
def clustering_coefficients(self, node_list=None):
clustering_coefficiencts = {}
@@ -1080,9 +1091,9 @@ def add_an_edge(self, id1, id2, distance, header_parser=None, edge_attribute=None
return new_edge

return None

def sequence_set_for_edge_filtering (self):
''' return the set of all sequences necessary to perform edge filtering
'''
sequence_set = set ()
for an_edge in self.edge_iterator ():
@@ -1376,9 +1387,9 @@ def get_edge_node_count(self, attributes_to_check=None):

edge_set.add((edge.p1, edge.p2))

return {'edges': len(edge_set), 'nodes': len(vis_nodes), 'total_edges': edge_count,
        'multiple_dates': [[k[0], k[1].days] for k in multiple_samples],
        'total_sequences': len(vis_nodes) + sum([k[0] for k in multiple_samples]) - len(multiple_samples),
'stages': nodes_by_stage, 'edge-stages': edges_by_stage}

def clear_adjacency(self, clear_filter=True):
@@ -1706,7 +1717,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=None
nodes_drawn = set ()

directed = {'undirected': 0, 'directed': 0}

for edge in self.edge_iterator() if reduce_edges == False else self.reduce_edge_set():
if edge.visible:
distance = self.distances[edge]
@@ -1728,7 +1739,7 @@ def generate_dot(self, file, year_vis=None, reduce_edges=True, attribute_color=None
if edge.check_date(year_vis) == False:
file.write('%s [style="invis" arrowhead = "%s"];\n' % (edge_attr[0], edge_attr[1]))
continue

if attribute_color is not None:
color = attribute_color (edge)
if color is not None:
@@ -2227,7 +2238,7 @@ def get_degree_distribution(self, **kwargs):


connect_me = False

if outdegree:
connect_me = dir is not None and dir == node
if indegree:
49 changes: 28 additions & 21 deletions hivclustering/networkbuild.py
@@ -291,10 +291,11 @@ def get_sequence_ids(fn):
#-------------------------------------------------------------------------------

def get_fasta_ids(fn):
-    fh = open(fn)
-    for line in fh:
-        if line[0] == '>':
-            yield line[1:].strip()
+    for f in fn:
+        fh = open(f)
+        for line in fh:
+            if line[0] == '>':
+                yield line[1:].strip()
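
A usage sketch with hypothetical file names: the generator now expects a list of FASTA paths and chains the headers from each file, so a single path must be wrapped in a list (a bare string would be iterated character by character):

```python
ids = set(get_fasta_ids(["local_msa.fas", "reference_msa.fas"]))
```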
@@ -304,22 +305,22 @@ def build_a_network():
random.seed()
arguments = argparse.ArgumentParser(description='Read filenames.')

-arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Must be a CSV file with three columns: ID1,ID2,distance.')
+arguments.add_argument('-i', '--input', help='Input CSV file with inferred genetic links (or stdin if omitted). Can be specified multiple times for multiple input files (e.g. to include a reference database). Must be a CSV file with three columns: ID1,ID2,distance.', action = 'append')
arguments.add_argument('-u', '--uds', help='Input CSV file with UDS data. Must be a CSV file with three columns: ID1,ID2,distance.')
arguments.add_argument('-d', '--dot', help='Output DOT file for GraphViz (or stdout if omitted)')
arguments.add_argument('-c', '--cluster', help='Output a CSV file with cluster assignments for each sequence')
arguments.add_argument('-t', '--threshold', help='Only count edges where the distance is less than this threshold')
arguments.add_argument('-e', '--edi', help='A .json file with clinical information')
arguments.add_argument('-z', '--old_edi', help='A .csv file with legacy EDI dates')
-arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | otherfields default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta)')
+arguments.add_argument('-f', '--format', help='Sequence ID format. One of AEH (ID | sample_date | otherfields default), LANL (e.g. B_HXB2_K03455_1983 : subtype_country_id_year -- could have more fields), regexp (match a regular expression, use the first group as the ID), or plain (treat as sequence ID only, no meta); one per input argument if specified', action = 'append')
arguments.add_argument('-x', '--exclude', help='Exclude any sequence which belongs to a cluster containing a "reference" strain, defined by the year of isolation. The value of this argument is an integer year (e.g. 1984) so that any sequence isolated in or before that year (e.g. <=1983) is considered to be a lab strain. This option makes sense for LANL or AEH data.')
arguments.add_argument('-r', '--resistance',help='Load a JSON file with resistance annotation by sequence', type=argparse.FileType('r'))
-arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp', required=False, type=str)
+arguments.add_argument('-p', '--parser', help='The reg.exp pattern to split up sequence ids; only used if format is regexp (specify N times for N input files, even if empty)', required=False, type=str, action = 'append')
arguments.add_argument('-a', '--attributes',help='Load a CSV file with optional node attributes', type=argparse.FileType('r'))
arguments.add_argument('-j', '--json', help='Output the network report as a JSON object',required=False, action='store_true', default=False)
arguments.add_argument('-o', '--singletons', help='Include singletons in JSON output',required=False, action='store_true', default=False)
arguments.add_argument('-k', '--filter', help='Only return clusters with ids listed by a newline separated supplied file. ', required=False)
-arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. ', required=False)
+arguments.add_argument('-s', '--sequences', help='Provide the MSA with sequences which were used to make the distance file. Can be specified multiple times to include multiple MSA files', required=False, action = 'append')
arguments.add_argument('-n', '--edge-filtering', dest='edge_filtering', choices=['remove', 'report'], help='Compute edge support and mark edges for removal using sequence-based triangle tests (requires the -s argument) and either only report them or remove the edges before doing other analyses ', required=False)
arguments.add_argument('-y', '--centralities', help='Output a CSV file with node centralities')
arguments.add_argument('-g', '--triangles', help='Maximum number of triangles to consider in each filtering pass', type = int, default = 2**16)
@@ -332,10 +333,10 @@ def build_a_network():
run_settings = arguments.parse_args()

if run_settings.input == None:
-run_settings.input = sys.stdin
+run_settings.input = [sys.stdin]
else:
try:
-run_settings.input = open(run_settings.input, 'r')
+run_settings.input = [open(file, 'r') for file in run_settings.input]
except IOError:
print("Failed to open '%s' for reading" % (run_settings.input), file=sys.stderr)
raise
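
For reviewers unfamiliar with `action='append'`: an appended option is `None` when the flag never appears and a list otherwise, which is why the branch above wraps stdin in a one-element list. A standalone sketch of that behavior (not the project's actual parser):

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument('-i', '--input', action='append')

assert p.parse_args([]).input is None                    # flag omitted
assert p.parse_args(['-i', 'a.csv']).input == ['a.csv']  # one file
assert p.parse_args(['-i', 'a.csv', '-i', 'b.csv']).input == ['a.csv', 'b.csv']
```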
@@ -381,17 +382,23 @@ def build_a_network():
print("Failed to open '%s' for writing" % (run_settings.cluster), file=sys.stderr)
raise

-formatter = parseAEH
+formatter = []

if run_settings.format is not None:
-    formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
-        None if run_settings.parser is None else re.compile(run_settings.parser))}
-    try:
-        formatter = formats[run_settings.format]
-    except KeyError:
-        print("%s is not a valid setting for 'format' (must be in %s)" %
-              (run_settings.format, str(list(formats.keys()))), file=sys.stderr)
-        raise
+    for index, format_k in enumerate (run_settings.format):
+        formats = {"AEH": parseAEH, "LANL": parseLANL, "plain": parsePlain, "regexp": parseRegExp(
+            None if run_settings.parser is None or len(run_settings.parser) <= index else re.compile(run_settings.parser[index]))}
+        try:
+            formatter.append (formats[format_k])
+        except KeyError:
+            print("%s is not a valid setting for 'format' (must be in %s)" %
+                  (run_settings.format, str(list(formats.keys()))), file=sys.stderr)
+            raise
+
+    if len (run_settings.format) != len (run_settings.input):
+        raise Exception ("Must specify as many formatters as there are input files when at least one formatter is specified explicitly")
+else:
+    formatter = [parseAEH for k in run_settings.input]

if run_settings.exclude is not None:
try:
Expand Down Expand Up @@ -479,8 +486,8 @@ def build_a_network():
maximum_number = run_settings.triangles

for filtering_pass in range (64):
-edge_stats = network.test_edge_support(os.path.abspath(
-    run_settings.sequences), *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
+edge_stats = network.test_edge_support([os.path.abspath(
+    s) for s in run_settings.sequences], *network.find_all_triangles(current_edge_set, maximum_number = maximum_number))
if not edge_stats or edge_stats['removed edges'] == 0:
break
else: