diff --git a/replace_rare_tree.py b/replace_rare_tree.py index d60b00e..444e3ff 100644 --- a/replace_rare_tree.py +++ b/replace_rare_tree.py @@ -1,7 +1,7 @@ """This script and its memeber fuunctions can be used to take a JSON encripted list of trees, separated by newlines, and a file of the unary rule counts (i.e. NOUN -> apple) and replace the terminals with (apple) fewer than 5 counts given -a specific tag (NOUN) with '_RARE_'""" +a specific tag (NOUN) with '_RARE_' """ import json from sys import argv @@ -21,13 +21,18 @@ def get_rares(countsname): if int(count) < 5: rares.add((word, tag)) return rares + +def get_json_trees_from_file(filename): + """Takes the name of a file with json format lexical trees, and laods them, + returning an array of the loaded trees""" + + tree_strings = open(filename).readlines() + return [json.loads(line) for line in tree_strings] + def recursive_rr(tree, rareset): """Takes a nested list in loaded JSON format and a set of rare (word, tag) pairs - and returns the """ #TODO fix this thingy. -#tree must be a json object -#TODO: Make flexible, make a decorator with two functions as arguments? one for terminals -#one for non-terminals + and returns the """ if len(tree) == 2: tag = tree[0] word = tree[1] @@ -38,14 +43,15 @@ def recursive_rr(tree, rareset): recursive_rr(tree[2], rareset) else: error("Not lexical tree") -#TODO: Apply to every item in a list more directly? Well that's what list comprehension does -# -def replace_all_trees(jtrees, rareset): #Apply to every tree in list, make more general. +def replace_all_trees(jtrees, rareset): + """Apply replace_rare to all of the trees in the list of trees""" for tree in jtrees: recursive_rr(tree, rareset) def write_trees(dest_name, json_trees): + """Takes a name for the destination function and a list of json encoded trees + and prints to file, separated by newline characters""" dest = open(dest_name, 'w') @@ -53,13 +59,9 @@ def write_trees(dest_name, json_trees): dest.write(json.dumps(tree)) dest.write('\n') -def get_json_trees_from_file(filename): - tree_strings = open(filename).readlines() - return [json.loads(line) for line in tree_strings] - -#TODO: Make ordering more logical - if __name__ == "__main__": + """To run in one go using command line arguments""" + script, counts_file, train_file, dest_file = argv rares = get_rares(counts_file)