diff --git a/docs/paper/build_command.txt b/docs/paper/build_command.txt index 098ee820..ebdc6d34 100644 --- a/docs/paper/build_command.txt +++ b/docs/paper/build_command.txt @@ -1,4 +1,8 @@ +# Word count: +wc -w docs/paper/paper.md + +# Building the paper: On windows: docker run --rm --volume %cd%/docs/paper:/data --env JOURNAL=joss openjournals/inara @@ -6,3 +10,4 @@ docker run --rm --volume %cd%/docs/paper:/data --env JOURNAL=joss openjournals/i on unix: docker run --rm --volume $PWD/docs/paper:/data --user $(id -u):$(id -g) --env JOURNAL=joss openjournals/inara + diff --git a/docs/paper/paper.bib b/docs/paper/paper.bib index 23b14cb2..7b49c275 100644 --- a/docs/paper/paper.bib +++ b/docs/paper/paper.bib @@ -1,7 +1,9 @@ @misc{bates2019ons, title={ONS methodology working paper series number 16—Synthetic data pilot}, author={Bates, AG and Spakulov{\'a}, I and Dove, I and Mealor, A}, - year={2019} + year={2019}, + url={https://www.ons.gov.uk/methodology/methodologicalpublications/generalmethodology/onsworkingpaperseries/onsmethodologyworkingpaperseriesnumber16syntheticdatapilot}, + urldate={2024-08-12} } @inproceedings{dwork2006differential, @@ -10,14 +12,16 @@ @inproceedings{dwork2006differential booktitle={International colloquium on automata, languages, and programming}, pages={1--12}, year={2006}, - organization={Springer} + organization={Springer}, + doi={10.1007/11787006_1} } -@book{dewolf2012statistical, +@book{hundepool2012statistical, title={Statistical disclosure control}, - author={de Wolf, Peter-Paul}, + author={Hundepool, Anco and Domingo-Ferrer, Josep and Franconi, Luisa and Giessing, Sarah and Schulte Nordholt, Eric and Spicer, Keith and de Wolf, Peter-Paul}, year={2012}, - publisher={Wiley \& Sons, Chichester} + publisher={Wiley \& Sons, Chichester}, + doi={10.1002/9781118348239} } @article{sweeney2002k, @@ -28,22 +32,17 @@ @article{sweeney2002k number={05}, pages={557--570}, year={2002}, - publisher={World
Scientific}, + doi={10.1142/S0218488502001648} } @misc{bond2015guidelines, - title={Guidelines for Output Checking. Eurostat}, + title={Guidelines for the checking of output based on microdata research}, + publisher={Eurostat}, author={Bond, S and Brandt, M and de Wolf, PP}, - year={2015} -} - -@article{dwork2010differential, - title={Differential privacy for statistics: What we know and what we want to learn}, - author={Dwork, Cynthia and Smith, Adam}, - journal={Journal of Privacy and Confidentiality}, - volume={1}, - number={2}, - year={2010} + year={2015}, + url={https://web.archive.org/web/20160408145718/http://dwbproject.org/export/sites/default/about/public_deliveraples/dwb_d11-8_synthetic-data_cta-ecta_output-checking-guidelines_final-reports.zip}, + urldate={2024-08-12} } @book{hastie2009elements, @@ -51,16 +50,8 @@ @book{hastie2009elements author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H and Friedman, Jerome H}, volume={2}, year={2009}, - publisher={Springer} -} - -@inproceedings{akaike1973information, - title={Information theory and an extension of the maximum likelihood principle}, - author={Akaike, H}, - booktitle={2nd International Symposium on Information Theory}, - pages={267--281}, - year={1973}, - organization={Akad{\'e}miai Kiad{\'o} Location Budapest, Hungary} + publisher={Springer}, + doi={10.1007/978-0-387-84858-7} } @article{neath2012bayesian, @@ -71,8 +62,10 @@ @article{neath2012bayesian number={2}, pages={199--203}, year={2012}, - publisher={Wiley Online Library} + publisher={Wiley Online Library}, + doi={10.1002/wics.199} } + @software{vink2024polars, author = {Ritchie Vink and Stijn de Gooijer and @@ -131,7 +124,8 @@ @article{nowok2016synthpop journal={Journal of statistical software}, volume={74}, pages={1--26}, - year={2016} + year={2016}, + doi={10.18637/jss.v074.i11} } @article{templ2017simulation, @@ -142,7 +136,8 @@ @article{templ2017simulation number={10}, pages={1--38}, year={2017}, - publisher={UCLA, Dept. 
of Statistics} + publisher={UCLA, Dept. of Statistics}, + doi={10.18637/jss.v079.i10} } @inproceedings{ping2017datasynthesizer, @@ -150,12 +145,14 @@ @inproceedings{ping2017datasynthesizer author={Ping, Haoyue and Stoyanovich, Julia and Howe, Bill}, booktitle={Proceedings of the 29th International Conference on Scientific and Statistical Database Management}, pages={1--5}, - year={2017} + year={2017}, + doi={10.1145/3085504.3091117} } @article{vankesteren2024democratize, title={To democratize research with sensitive data, we should make synthetic data more accessible}, author={{van Kesteren}, Erik-Jan}, journal={arXiv preprint arXiv:2404.17271}, - year={2024} + year={2024}, + doi={10.48550/arXiv.2404.17271} } \ No newline at end of file diff --git a/docs/paper/paper.md b/docs/paper/paper.md index b02fcff7..2cd0758e 100644 --- a/docs/paper/paper.md +++ b/docs/paper/paper.md @@ -29,38 +29,34 @@ bibliography: paper.bib --- # Summary -Synthetic data is a promising tool for improving the accessibility of datasets that are otherwise too sensitive to be shared publicly. To this end, we introduce `metasyn`, a Python package for generating synthetic data from tabular datasets. Unlike existing synthetic data generation software, `metasyn` is built on a simple generative model with a "naïve" marginal independence assumption --- an explicit choice that removes multivariate information from the synthetic data. It makes this trade-off in order to maintain transparency and auditability, to keep information leakage to a minimum, and even to enable privacy or disclosure risk guarantees through a plug-in system. While the analytical validity of the generated data is thus intentionally limited, its potential uses are broad, including exploratory analyses, code development and testing, and external communication and teaching [@vankesteren2024democratize]. `Metasyn` is flexible, scalable, and easily extended to meet diverse privacy needs. 
+Synthetic data is a promising tool for improving the accessibility of datasets which are too sensitive to be shared publicly. To this end, we introduce `metasyn`, a Python package for generating synthetic data from tabular datasets. Unlike existing synthetic data generation software, `metasyn` is built on a simple generative model that omits multivariate information. This choice enables transparency and auditability, keeps information leakage to a minimum, and enables privacy guarantees through a plug-in system. While the analytical validity of the generated data is thus intentionally limited, its potential uses are broad, including exploratory analyses, code development and testing, and external communication and teaching [@vankesteren2024democratize]. ![Logo of the `metasyn` project.](img/logo.svg) # Statement of need -`Metasyn` is a python package for generating synthetic data with a focus on privacy and disclosure control. It is aimed at owners of sensitive datasets such as public organisations, research groups, and individual researchers who want to improve the accessibility of their data for research and reproducibility by others. The goal of `metasyn` is to make it easy for data owners to share the structure and an approximation of the content of their data with others while keeping privacy concerns to a minimum. +`Metasyn` is aimed at owners of sensitive datasets such as public organisations, research groups, and individual researchers who want to improve the accessibility of their data for research and reproducibility by others. The goal of `metasyn` is to make it easy for data owners to share the structure and an approximation of the content of their data with others while keeping privacy concerns to a minimum. 
-With this goal in mind, `metasyn` distinguishes itself from existing software for generating synthetic data [e.g., @nowok2016synthpop; @templ2017simulation; @ping2017datasynthesizer] by strictly limiting the statistical information from the real data in the produced synthetic data. This choice enables the software to generate synthetic data with __privacy and disclosure guarantees__ through a plug-in system. Moreover, our system provides an __auditable and editable intermediate representation__ in the form of a human- and machine-readable `.json` metadata file from which new data can be synthesized. +With this goal in mind, `metasyn` distinguishes itself from existing software for generating synthetic data [e.g., @nowok2016synthpop; @templ2017simulation; @ping2017datasynthesizer] by strictly limiting the statistical information from the real data in the synthetic data. `Metasyn` explicitly avoids generating synthetic data with high analytical validity; instead, the synthetic data has realistic structure and plausible values, but multivariate relations are omitted ("augmented plausible synthetic data"; [@bates2019ons]). Moreover, our system provides an __auditable and editable intermediate representation__ in the form of a `.json` metadata file from which new data can be synthesized. -Through our focus on privacy and transparency, `metasyn` explicitly avoids generating synthetic data with high analytical validity. The data generated by our system is realistic in terms of data structure and plausible in terms of values for each variable --- the "augmented plausible" category of synthetic data [@bates2019ons] --- but multivariate relations or conditional patterns not learnt from the real data. This has implications for how this synthetic data can be used: not for statistical analysis and inference, but rather for initial exploration, analysis script development, and communication outside the data owner's institution. 
In the intended use case, an external researcher can make use of the synthetic data to assess the feasibility of their intended research before making the (often time-consuming) step of requesting access to the sensitive source data for the final analysis. - -As mentioned before, the privacy capacities of `metasyn` are extensible through a plug-in system, recognizing that different data owners have different needs and definitions of privacy. A data owner can define under which conditions they would accept open distribution of their synthetic data --- be it based on differential privacy [@dwork2006differential], statistical disclosure control [@dewolf2012statistical], k-anonymity [@sweeney2002k], or another specific definition of privacy. As part of the initial release of `metasyn`, we publish a plugin following the disclosure control guidelines from Eurostat [@bond2015guidelines]. +These choices enable the software to generate synthetic data with __privacy and disclosure guarantees__ through a plug-in system, recognizing that different data owners have different needs and definitions of privacy. A data owner can define under which conditions they would accept open distribution of their synthetic data --- be it based on differential privacy [@dwork2006differential], statistical disclosure control [@hundepool2012statistical], k-anonymity [@sweeney2002k], or another specific definition of privacy. As part of the initial release of `metasyn`, we publish a [plug-in](https://github.com/sodascience/metasyn-disclosure-control) following the disclosure control guidelines from Eurostat [@bond2015guidelines]. # Software features -At its core, `metasyn` is designed for three functions, which are briefly described in this section: +At its core, `metasyn` has three main functions: -1. __Estimation__: Automatically select univariate distributions and fit them to a properly formatted tabular dataset, optionally with additional privacy guarantees. +1. 
__Estimation__: Fit a generative model to a properly formatted tabular dataset, optionally with privacy guarantees. 2. __(De)serialization__: Create an intermediate representation of the fitted model for auditing, editing, and exporting. -3. __Generation__: Generate new synthetic datasets based on the fitted model or its serialized representation. +3. __Generation__: Synthesize new datasets based on a fitted model. ## Estimation -The generative model for multivariate datasets in `metasyn` makes the simplifying assumption of marginal independence: each column is considered separately, just as is done in e.g., naïve Bayes classifiers [@hastie2009elements]. Formally, this leads to the following generative model for the $K$-variate data $\mathbf{x}$: +The generative model in `metasyn` makes the assumption of marginal independence: each column is considered separately, similar to naïve Bayes classifiers [@hastie2009elements]. Some key advantages of this naïve approach are transparency and explainability, flexibility in handling mixed data types, and computational scalability to high-dimensional datasets. Formally, the generative model for $K$-variate data $\mathbf{x}$ is: \begin{equation} \label{eq:model} p(\mathbf{x}) = \prod_{k = 1}^K p(x_k) \end{equation} -There are many advantages to this naïve approach when compared to more advanced generative models: it is transparent and explainable, it is able to flexibly handle data of mixed types, and it is computationally scalable to high-dimensional datasets. As mentioned before, the tradeoff is the limited analytical validity when the independence assumption does not hold: in the synthetic data, the expected value of correlations, regression parameters, and other measures of association is 0. - -Model estimation starts with an appropriately pre-processed data frame. For `metasyn`, this means the data frame is tidy [@wickham2014tidy], each column has the correct data type, and missing data are represented by a missing value. 
Internally, our software uses the `polars` data frame library [@vink2024polars], as it is performant, has consistent data types, and natively supports missing data (i.e., `null` values). A simple example source table could look like this (note that categorical data has the appropriate `cat` data type, not `str`): +Model estimation starts with an appropriately pre-processed data frame, meaning it is tidy [@wickham2014tidy], each column has the correct data type, and missing data are represented by a missing value. Internally, our software uses the `polars` data frame library [@vink2024polars], as it is performant, has consistent data types, and natively supports missing data (i.e., `null` values). An example source table is printed below (NB: categorical data are appropriately encoded as `cat`, not `str`): ``` ┌─────┬────────┬─────┬────────┬──────────┐ @@ -76,54 +72,29 @@ Model estimation starts with an appropriately pre-processed data frame. For `met └─────┴────────┴─────┴────────┴──────────┘ ``` -For each data type supported by `metasyn`, there is a set of candidate distributions that can be fitted to that data type (see Table \autoref{tbl:dist}). To estimate the generative model of Equation \autoref{eq:model}, for each variable the software fits all compatible candidate distributions --- by default with maximum likelihood estimation --- and then selects the one with the lowest BIC [@neath2012bayesian]. For distributions where this is not possible, such as those for the string data type, a pseudo-BIC is created that trades off fit and complexity of the underlying models. +For each data type, a set of candidate distributions is fitted (see \autoref{tbl:dist}), and then `metasyn` selects the one with the lowest BIC [@neath2012bayesian]. For distributions where BIC computation is impossible (e.g., for the string data type) a pseudo-BIC is created that trades off fit and complexity of the underlying models. 
Table: \label{tbl:dist} Candidate distributions associated with data types in the core `metasyn` package. -| Variable type | Example | Candidate distributions | -| :------------ | :--------------------- | :----------------------------------------------------------------- | -| categorical | yes/no, country | Categorical (Multinoulli), Constant | -| continuous | 1.0, 2.1, ... | Uniform, Normal, LogNormal, TruncatedNormal, Exponential, Constant | -| discrete | 1, 2, ... | Poisson, Uniform, Normal, TruncatedNormal, Categorical, Constant | -| string | A108, C122, some words | Regex, Categorical, Faker, FreeText, Constant | -| date/time | 2021-01-13, 01:40:12 | Uniform, Constant | +| Data type | Candidate distributions | +| :---------- | :----------------------------------------------------------------- | +| Categorical | Categorical, Constant | +| Continuous | Uniform, Normal, LogNormal, TruncatedNormal, Exponential, Constant | +| Discrete | Poisson, Uniform, Normal, TruncatedNormal, Categorical, Constant | +| String | Regex, Categorical, Faker, FreeText, Constant | +| Date/time | Uniform, Constant | -From this table, the string distributions deserve special attention as they are not commonly encountered as probability distributions. Regex (regular expression) inference is performed on structured strings using the companion package [RegexModel](https://pypi.org/project/regexmodel/). It is able to automatically detect structure such as room numbers (A108, C122, B109), e-mail addresses, websites, and more, which it summarizes using a probabilistic variant of regular expressions. Another option, should Regex inference fail for lack of structure, is to detect the language (using [lingua](https://pypi.org/project/lingua-language-detector/)) and randomly pick words from that language. We call this approach FreeText. 
The final alternative is for the data owner to specify that a certain variable should be synthesized using the popular [Faker](https://pypi.org/project/Faker/) package, which can generate specific data types such as localized addresses. +From this table, the string distributions deserve special attention as they are not common probability distributions. The regex (regular expression) distribution uses the package [`regexmodel`](https://pypi.org/project/regexmodel/) to automatically detect structure such as room numbers (A108, C122, B109), e-mail addresses, or websites. The FreeText distribution detects the language (using [lingua](https://pypi.org/project/lingua-language-detector/)) and randomly picks words from that language. The [Faker](https://pypi.org/project/Faker/) distribution can generate specific data types such as localized names and addresses pre-specified by the user. Generative model estimation with `metasyn` can be performed as follows: ```python -from metasyn import MetaFrame, VarSpec - -# "ID" column is the primary key, -# thus should generate unique values. -# "B" column is not, despite unique -# values in the dataframe -specs = [ - VarSpec("ID", unique=True), - VarSpec("B", unique=False), -] - -# create metaframe -mf = MetaFrame.fit_dataframe(df, var_specs=specs) +from metasyn import MetaFrame +mf = MetaFrame.fit_dataframe(df) ``` ## Serialization and deserialization -After a fitted model object is created, `metasyn` allows it to be transparently stored in a human- and machine-readable `.json` file. This file can be considered as metadata: it contains dataset-level descriptive information as well as variable-level information. The metadata format has a specific structure, which we call the `generative metadata format`, or `gmf`. 
The header contains the following dataset-level information: - -```json -"n_rows": 5, -"n_columns": 5, -"provenance": { - "created by": { - "name": "metasyn", - "version": "1.0.1" - }, - "creation time": "2024-08-07T12:20:36.022017" -} -``` - -Then, for each variable the `gmf` file contains information the name, the data type, the proportion of missing values, and the distribution fitted on the data. For example, a table column containing different types of fruits could result in the following `.json`: +After fitting a model, `metasyn` can transparently store it in a human- and machine-readable `.json` metadata file. This file contains dataset-level descriptive information as well as the following variable-level information: ```json { @@ -146,39 +117,24 @@ Then, for each variable the `gmf` file contains information the name, the data t } ``` -There are several advantages to creating such a serialized representation. First, it can be audited: the data owner can see exactly what information from the real data is made public through exporting the synthetic data, namely, the parameters of the distribution. Second, the file can be edited. For example, if a data owner thinks some of the labels of the "fruit" column contain sensitive information, these can simply be pseudonymized in the metadata file. Third, after exporting this file, an unlimited number of synthetic records can be created without incurring additional privacy risks, because the original data is no longer part of the synthetization process. - - -Serialization and deserialization with `metasyn` can be performed as follows: +This `.json` file can be manually audited and edited; after it is exported, an unlimited number of synthetic records can be created without incurring additional privacy risks.
Serialization and deserialization with `metasyn` can be performed as follows: ```python -# write a fitted MetaFrame to json -mf.export("fruits_gmf.json") - -# then, audit and export json from secure environment - -# outside the secure environment, load json into MetaFrame -mf_out = MetaFrame.from_json("fruits_gmf.json") +mf.export("fruits.json") +mf_new = MetaFrame.from_json("fruits.json") ``` ## Data generation -After creating the fitted model object, either from the original data or by deserializing a model object from a `.json` file, new data can be generated by the object. For each variable in the model object, the software randomly samples from the fitted distribution to create a synthetic version of the data. Data generation (or synthetization) in `metasyn` can be performed as follows: +For each variable in a `MetaFrame` object, `metasyn` can randomly sample synthetic datapoints. Data generation (or synthetization) in `metasyn` can be performed as follows: ```python -from metasyn import MetaFrame - -# load json into a metadataset object -mf = MetaFrame.from_json("metasyn_example.json") - -# create a fake dataset -df_syn = mf.synthesize(10) +df_syn = mf.synthesize(3) ``` -This may result in the following `polars` data frame[^1]. Note that missing values in the `optional` column are appropriately reproduced as well, courtesy of the "prop_missing" entry in the metadata format. +This may result in the following data frame. Note that missing values in the `optional` column are appropriately reproduced as well. 
``` -shape: (10, 5) ┌─────┬────────┬─────┬────────┬──────────┐ │ ID ┆ fruits ┆ B ┆ cars ┆ optional │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ @@ -186,39 +142,24 @@ shape: (10, 5) ╞═════╪════════╪═════╪════════╪══════════╡ │ 1 ┆ banana ┆ 4 ┆ beetle ┆ null │ │ 2 ┆ banana ┆ 3 ┆ audi ┆ null │ -│ 3 ┆ banana ┆ 1 ┆ beetle ┆ 223 │ -│ 4 ┆ banana ┆ 0 ┆ beetle ┆ 258 │ -│ … ┆ … ┆ … ┆ … ┆ … │ -│ 7 ┆ banana ┆ 3 ┆ beetle ┆ 298 │ -│ 8 ┆ banana ┆ 2 ┆ beetle ┆ 67 │ -│ 9 ┆ banana ┆ 4 ┆ beetle ┆ -30 │ -│ 10 ┆ banana ┆ 2 ┆ beetle ┆ 172 │ +│ 3 ┆ banana ┆ 2 ┆ beetle ┆ 172 │ └─────┴────────┴─────┴────────┴──────────┘ ``` -[^1]: This `polars` dataframe can be easily converted to a `pandas` dataframe using `df_syn.to_pandas()` - # Plug-ins and automatic privacy -In addition to the core features described above, the `metasyn` package allows for plug-ins: add-on packages that alter the behaviour of the parameter estimation. Through this system, privacy guarantees can be built into `metasyn` ([privacy plugin template](https://github.com/sodascience/metasyn-privacy-template)) and additional distributions can be supported ([distribution plugin template](https://github.com/sodascience/metasyn-distribution-template)). For example, a plugin package called [`metasyn-disclosure-control`](https://github.com/sodascience/metasyn-disclosure-control) implements the disclosure control output guidelines from Eurostat [@bond2015guidelines] by re-implementing the `fit()` method of the candidate distributions shown in Table \autoref{tbl:dist} to include a micro-aggregation step. In this way, information transfer from the sensitive real data to the synthetic public data can be further reduced. - -This plug-in system is user-friendly: the user only needs to `pip install` the package and then `metasyn` can automatically find it to make the methods accessible: +The `metasyn` package also allows for plug-ins: packages that alter the distribution fitting behaviour. 
Through this system, privacy guarantees can be built into `metasyn` ([privacy plug-in template](https://github.com/sodascience/metasyn-privacy-template)) and additional distributions can be supported ([distribution plug-in template](https://github.com/sodascience/metasyn-distribution-template)). The [`metasyn-disclosure-control`](https://github.com/sodascience/metasyn-disclosure-control) plug-in implements output guidelines from Eurostat [@bond2015guidelines] by including micro-aggregation. In this way, information transfer from the sensitive real data to the synthetic public data can be further limited. Disclosure control is performed as follows: ```python -from metasyn import MetaDataset +from metasyn import MetaFrame from metasyncontrib.disclosure import DisclosurePrivacy mf = MetaFrame.fit_dataframe(df, privacy=DisclosurePrivacy()) ``` -# Conclusion -Synthetic data is a valuable tool for communicating about sensitive datasets. In this work, we have presented the software `metasyn`, which allows data owners to generate a synthetic version of their sensitive tabular data with a focus on privacy and transparency. Unlike existing tools for generating synthetic data, we choose to aim for low analytic validity to enable strong privacy guarantees: the underlying model makes a simplifying independence assumption, resulting in few parameters and thus a very limited information transfer. This approach additionally allows for disclosure guarantees through a plug-in system. - -Further documentation and examples can be found on [metasyn.readthedocs.io](https://metasyn.readthedocs.io/). 
- # Acknowledgements This research was conducted in whole or in part using ODISSEI, the Open Data Infrastructure for Social Science and Economic Innovations (https://ror.org/03m8v6t10) -The `metasyn` project is supported by the FAIR Research IT Innovation Fund of Utrecht University (March 2023) +`Metasyn` was supported by the Utrecht University FAIR Research IT Innovation Fund (March 2023). # References \ No newline at end of file diff --git a/docs/paper/paper.pdf b/docs/paper/paper.pdf index af7cf901..d5c6ea79 100644 Binary files a/docs/paper/paper.pdf and b/docs/paper/paper.pdf differ