diff --git a/metasyn/metaframe.py b/metasyn/metaframe.py index f8fe29a1..c46681bb 100644 --- a/metasyn/metaframe.py +++ b/metasyn/metaframe.py @@ -371,7 +371,18 @@ def save_toml(self, fp: Optional[Union[pathlib.Path, str]], doc = tomlkit.loads(tomlkit.dumps(self_dict)) doc["n_rows"].comment("Number of rows") - doc["n_columns"].comment("Number of columns") + doc["n_columns"].comment("""Number of columns + +# This is a metadata file with (limited) statistical information about each column separately in +# a dataset. No information about correlations or other relationships between columns is included. +# This file can be used to generate privacy-conscious synthetic data, which consequently has zero +# expected correlations and relationships between columns. +# For each column, the statistics can be either manually specified or estimated from real data. +# This information, including how the estimation was done, is shown in the metadata below. +# +# For more information, see https://github.com/sodascience/metasyn +""" + ) for i in range(self.n_columns): var = self.meta_vars[i] doc["vars"][i].comment(f"Metadata for column with name {var.name}") @@ -388,10 +399,11 @@ def save_toml(self, fp: Optional[Union[pathlib.Path, str]], and np.all(var.distribution.labels == var.distribution.default_distribution().labels) ) - print(var.name, multi_default) if "parameters" in var.creation_method: - parameter_comments.append(f"The above parameters for column '{var.name}' were " - "manually set by the user, no data was (directly) used.") + parameters = ", ".join(var.creation_method["parameters"]) + parameter_comments.append( + f"The parameters {parameters} for column '{var.name}' were " + "manually set by the user, no data was (directly) used.") elif (var.distribution.matches_name("multinoulli") and multi_default): parameter_comments.append("This mulinoulli distribution is the default one, no data" " was used.") @@ -401,8 +413,6 @@ def save_toml(self, fp: Optional[Union[pathlib.Path, str]], if (var.distribution.matches_name("multinoulli") and not multi_default): counts = (var.distribution.probs*(1-var.prop_missing)*self.n_rows).round() parameter_comments.append(f"Counts: {counts.astype(int)}\n") - # doc["vars"][i]["distribution"].add(tomlkit.comment( - # f"Counts: {counts.astype(int)}\n")) par_comment = "\n# ".join(parameter_comments) + "\n\n" doc["vars"][i]["distribution"]["parameters"].add(tomlkit.comment(par_comment)) if fp is None: diff --git a/metasyn/privacy.py b/metasyn/privacy.py index e1b1975f..52db2cb3 100644 --- a/metasyn/privacy.py +++ b/metasyn/privacy.py @@ -65,6 +65,7 @@ def comment(self, var): # noqa """ return "Above are the parameters for the column {var.name}" + class BasicPrivacy(BasePrivacy): """Class representing no privacy level.