diff --git a/.github/workflows/joss-paper-draft.yml b/.github/workflows/joss-paper-draft.yml
new file mode 100644
index 00000000..d8713456
--- /dev/null
+++ b/.github/workflows/joss-paper-draft.yml
@@ -0,0 +1,28 @@
+name: Draft PDF
+on:
+  push:
+    paths:
+      - docs/paper/**
+      - .github/workflows/joss-paper-draft.yml
+
+jobs:
+  paper:
+    runs-on: ubuntu-latest
+    name: Paper Draft
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build draft PDF
+        uses: openjournals/openjournals-draft-action@master
+        with:
+          journal: joss
+          # This should be the path to the paper within your repo.
+          paper-path: docs/paper/paper.md
+      - name: Upload
+        uses: actions/upload-artifact@v4
+        with:
+          name: paper
+          # This is the output path where Pandoc will write the compiled
+          # PDF. Note that this should be the same directory as the input
+          # paper.md.
+          path: docs/paper/paper.pdf
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 1606f864..b03d5aea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,8 +141,13 @@ dmypy.json
 
 #DS_Store
 .DS_Store
-.DS_Store
 docs/source/images/pipeline.afphoto~lock~
 docs/source/images/.$ClassDiagram.drawio.bkp
 docs/source/images/.$distributions.drawio.bkp
+
+# JOSS paper artefacts
+docs/paper/*.jats
+docs/paper/media
+
+# Generated API docs
 docs/source/api/generated
diff --git a/docs/paper/build_command.txt b/docs/paper/build_command.txt
new file mode 100644
index 00000000..098ee820
--- /dev/null
+++ b/docs/paper/build_command.txt
@@ -0,0 +1,8 @@
+
+On Windows:
+
+docker run --rm --volume %cd%/docs/paper:/data --env JOURNAL=joss openjournals/inara
+
+On Unix:
+
+docker run --rm --volume $PWD/docs/paper:/data --user $(id -u):$(id -g) --env JOURNAL=joss openjournals/inara
diff --git a/docs/paper/img/logo.svg b/docs/paper/img/logo.svg
new file mode 100644
index 00000000..53babdff
--- /dev/null
+++ b/docs/paper/img/logo.svg
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/paper/paper.bib b/docs/paper/paper.bib
new file mode 100644
index 00000000..23b14cb2
--- /dev/null
+++ b/docs/paper/paper.bib
@@ -0,0 +1,161 @@
+@misc{bates2019ons,
+  title={ONS methodology working paper series number 16—Synthetic data pilot},
+  author={Bates, AG and Spakulov{\'a}, I and Dove, I and Mealor, A},
+  year={2019}
+}
+
+@inproceedings{dwork2006differential,
+  title={Differential privacy},
+  author={Dwork, Cynthia},
+  booktitle={International Colloquium on Automata, Languages, and Programming},
+  pages={1--12},
+  year={2006},
+  organization={Springer}
+}
+
+@book{dewolf2012statistical,
+  title={Statistical disclosure control},
+  author={de Wolf, Peter-Paul},
+  year={2012},
+  publisher={John Wiley \& Sons},
+  address={Chichester}
+}
+
+@article{sweeney2002k,
+  title={k-anonymity: A model for protecting privacy},
+  author={Sweeney, Latanya},
+  journal={International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
+  volume={10},
+  number={5},
+  pages={557--570},
+  year={2002},
+  publisher={World Scientific}
+}
+
+@misc{bond2015guidelines,
+  title={Guidelines for Output Checking},
+  author={Bond, S and Brandt, M and de Wolf, PP},
+  howpublished={Eurostat},
+  year={2015}
+}
+
+@article{dwork2010differential,
+  title={Differential privacy for statistics: What we know and what we want to learn},
+  author={Dwork, Cynthia and Smith, Adam},
+  journal={Journal of Privacy and Confidentiality},
+  volume={1},
+  number={2},
+  year={2010}
+}
+
+@book{hastie2009elements,
+  title={The elements of statistical learning: data mining, inference, and prediction},
+  author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H},
+  volume={2},
+  year={2009},
+  publisher={Springer}
+}
+
+@inproceedings{akaike1973information,
+  title={Information theory and an extension of the maximum likelihood principle},
+  author={Akaike, H},
+  booktitle={2nd International Symposium on Information Theory},
+  pages={267--281},
+  year={1973},
+  organization={Akad{\'e}miai Kiad{\'o}},
+  address={Budapest, Hungary}
+}
+
+@article{neath2012bayesian,
+  title={The Bayesian information criterion: background, derivation, and applications},
+  author={Neath, Andrew A and Cavanaugh, Joseph E},
+  journal={Wiley Interdisciplinary Reviews: Computational Statistics},
+  volume={4},
+  number={2},
+  pages={199--203},
+  year={2012},
+  publisher={Wiley Online Library}
+}
+
+@software{vink2024polars,
+  author = {Ritchie Vink and
+            Stijn de Gooijer and
+            Alexander Beedie and
+            Marco Edward Gorelli and
+            Weijie Guo and
+            J van Zundert and
+            Orson Peters and
+            Gert Hulselmans and
+            nameexhaustion and
+            Cory Grinstead and
+            Marshall and
+            Gijs Burghoorn and
+            chielP and
+            Itamar Turner-Trauring and
+            Matteo Santamaria and
+            Daniël Heres and
+            Lawrence Mitchell and
+            Josh Magarick and
+            ibENPC and
+            Karl Genockey and
+            Moritz Wilksch and
+            Jorge Leitao and
+            Mick van Gelderen and
+            Petros Barbagiannis and
+            Oliver Borchert and
+            deanm0000 and
+            Jonas Haag and
+            Henry Harbeck and
+            Liam Brannigan},
+  title = {pola-rs/polars: Python Polars},
+  year = 2024,
+  publisher = {Zenodo},
+  version = {py-1.4.1},
+  doi = {10.5281/zenodo.7697217},
+  url = {https://doi.org/10.5281/zenodo.7697217}
+}
+
+@article{wickham2014tidy,
+  title={Tidy Data},
+  volume={59},
+  url={https://www.jstatsoft.org/index.php/jss/article/view/v059i10},
+  doi={10.18637/jss.v059.i10},
+  abstract={A huge amount of effort is spent cleaning data to get it ready for analysis, but there has been little research on how to make data cleaning as easy and effective as possible. This paper tackles a small, but important, component of data cleaning: data tidying. Tidy datasets are easy to manipulate, model and visualize, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table. This framework makes it easy to tidy messy datasets because only a small set of tools are needed to deal with a wide range of un-tidy datasets. This structure also makes it easier to develop tidy tools for data analysis, tools that both input and output tidy datasets. The advantages of a consistent data structure and matching tools are demonstrated with a case study free from mundane data manipulation chores.},
+  number={10},
+  journal={Journal of Statistical Software},
+  author={Wickham, Hadley},
+  year={2014},
+  pages={1--23}
+}
+
+@comment{alternative synthetic data packages}
+
+@article{nowok2016synthpop,
+  title={synthpop: Bespoke creation of synthetic data in R},
+  author={Nowok, Beata and Raab, Gillian M and Dibben, Chris},
+  journal={Journal of Statistical Software},
+  volume={74},
+  pages={1--26},
+  year={2016}
+}
+
+@article{templ2017simulation,
+  title={Simulation of synthetic complex data: The R package simPop},
+  author={Templ, Matthias and Meindl, Bernhard and Kowarik, Alexander and Dupriez, Olivier},
+  journal={Journal of Statistical Software},
+  volume={79},
+  number={10},
+  pages={1--38},
+  year={2017}
+}
+
+@inproceedings{ping2017datasynthesizer,
+  title={{DataSynthesizer}: Privacy-preserving synthetic datasets},
+  author={Ping, Haoyue and Stoyanovich, Julia and Howe, Bill},
+  booktitle={Proceedings of the 29th International Conference on Scientific and Statistical Database Management},
+  pages={1--5},
+  year={2017}
+}
+
+@article{vankesteren2024democratize,
+  title={To democratize research with sensitive data, we should make synthetic data more accessible},
+  author={{van Kesteren}, Erik-Jan},
+  journal={arXiv preprint arXiv:2404.17271},
+  year={2024}
+}
\ No newline at end of file
diff --git a/docs/paper/paper.md b/docs/paper/paper.md
new file mode 100644
index 00000000..b02fcff7
--- /dev/null
+++ b/docs/paper/paper.md
@@ -0,0 +1,224 @@
+---
+title: 'Metasyn: Transparent Generation of Synthetic Tabular Data with Privacy Guarantees'
+tags:
+  - Python
+  - synthetic data
+  - privacy
+  - generative models
+  - data management
+authors:
+  - name: Raoul Schram
+    orcid: 0000-0001-6616-230X
+    equal-contrib: true
+    affiliation: 1
+  - name: Samuel Spithorst
+    orcid: 0009-0000-4140-0658
+    affiliation: 1
+  - name: Erik-Jan van Kesteren
+    orcid: 0000-0003-1548-1663
+    corresponding: true
+    equal-contrib: true
+    affiliation: "1, 2"
+affiliations:
+  - name: Utrecht University, The Netherlands
+    index: 1
+  - name: "ODISSEI: Open Data Infrastructure for Social Science and Economic Innovations, The Netherlands"
+    index: 2
+date: 8 August 2024
+bibliography: paper.bib
+---
+
+# Summary
+Synthetic data is a promising tool for improving the accessibility of datasets that are otherwise too sensitive to be shared publicly. To this end, we introduce `metasyn`, a Python package for generating synthetic data from tabular datasets. Unlike existing synthetic data generation software, `metasyn` is built on a simple generative model with a "naïve" marginal independence assumption --- an explicit choice that removes multivariate information from the synthetic data. It makes this trade-off to maintain transparency and auditability, to keep information leakage to a minimum, and even to enable privacy or disclosure risk guarantees through a plug-in system. While the analytical validity of the generated data is thus intentionally limited, its potential uses are broad, including exploratory analyses, code development and testing, and external communication and teaching [@vankesteren2024democratize]. `Metasyn` is flexible, scalable, and easily extended to meet diverse privacy needs.
+
+![Logo of the `metasyn` project.](img/logo.svg)
+
+# Statement of need
+
+`Metasyn` is a Python package for generating synthetic data with a focus on privacy and disclosure control. It is aimed at owners of sensitive datasets, such as public organisations, research groups, and individual researchers, who want to improve the accessibility of their data for research and reproducibility by others. The goal of `metasyn` is to make it easy for data owners to share the structure and an approximation of the content of their data with others while keeping privacy concerns to a minimum.
+
+With this goal in mind, `metasyn` distinguishes itself from existing software for generating synthetic data [e.g., @nowok2016synthpop; @templ2017simulation; @ping2017datasynthesizer] by strictly limiting the statistical information from the real data in the produced synthetic data. This choice enables the software to generate synthetic data with __privacy and disclosure guarantees__ through a plug-in system. Moreover, our system provides an __auditable and editable intermediate representation__ in the form of a human- and machine-readable `.json` metadata file from which new data can be synthesized.
+
+Through our focus on privacy and transparency, `metasyn` explicitly avoids generating synthetic data with high analytical validity. The data generated by our system is realistic in terms of data structure and plausible in terms of values for each variable --- the "augmented plausible" category of synthetic data [@bates2019ons] --- but multivariate relations or conditional patterns are not learnt from the real data. This has implications for how the synthetic data can be used: not for statistical analysis and inference, but rather for initial exploration, analysis script development, and communication outside the data owner's institution. In the intended use case, an external researcher can use the synthetic data to assess the feasibility of their intended research before making the (often time-consuming) step of requesting access to the sensitive source data for the final analysis.
+
+As mentioned before, the privacy capabilities of `metasyn` are extensible through a plug-in system, recognizing that different data owners have different needs and definitions of privacy. A data owner can define under which conditions they would accept open distribution of their synthetic data --- be it based on differential privacy [@dwork2006differential], statistical disclosure control [@dewolf2012statistical], k-anonymity [@sweeney2002k], or another specific definition of privacy. As part of the initial release of `metasyn`, we publish a plug-in following the disclosure control guidelines from Eurostat [@bond2015guidelines].
+
+# Software features
+
+At its core, `metasyn` is designed for three functions, which are briefly described in this section:
+
+1. __Estimation__: Automatically select univariate distributions and fit them to a properly formatted tabular dataset, optionally with additional privacy guarantees.
+2. __(De)serialization__: Create an intermediate representation of the fitted model for auditing, editing, and exporting.
+3. __Generation__: Generate new synthetic datasets based on the fitted model or its serialized representation.
+
+## Estimation
+The generative model for multivariate datasets in `metasyn` makes the simplifying assumption of marginal independence: each column is considered separately, as is done in, e.g., naïve Bayes classifiers [@hastie2009elements]. Formally, this leads to the following generative model for the $K$-variate data $\mathbf{x}$:
+
+\begin{equation} \label{eq:model}
+p(\mathbf{x}) = \prod_{k = 1}^K p(x_k)
+\end{equation}
+
+There are many advantages to this naïve approach when compared to more advanced generative models: it is transparent and explainable, it can flexibly handle data of mixed types, and it is computationally scalable to high-dimensional datasets. As mentioned before, the trade-off is limited analytical validity when the independence assumption does not hold: in the synthetic data, the expected value of correlations, regression parameters, and other measures of association is 0.
+
+Model estimation starts with an appropriately pre-processed data frame. For `metasyn`, this means the data frame is tidy [@wickham2014tidy], each column has the correct data type, and missing data are represented by a missing value. Internally, our software uses the `polars` data frame library [@vink2024polars], as it is performant, has consistent data types, and natively supports missing data (i.e., `null` values). A simple example source table could look like this (note that categorical data has the appropriate `cat` data type, not `str`):
+
+```
+┌─────┬────────┬─────┬────────┬──────────┐
+│ ID  ┆ fruits ┆ B   ┆ cars   ┆ optional │
+│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
+│ i64 ┆ cat    ┆ i64 ┆ cat    ┆ i64      │
+╞═════╪════════╪═════╪════════╪══════════╡
+│ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
+│ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
+│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ null     │
+│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ 2        │
+│ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
+└─────┴────────┴─────┴────────┴──────────┘
+```
+
+For each data type supported by `metasyn`, there is a set of candidate distributions that can be fitted to that data type (see \autoref{tbl:dist}). To estimate the generative model of \autoref{eq:model}, for each variable the software fits all compatible candidate distributions --- by default with maximum likelihood estimation --- and then selects the one with the lowest BIC [@neath2012bayesian]. For distributions where this is not possible, such as those for the string data type, a pseudo-BIC is created that trades off fit and complexity of the underlying models.
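+
+This selection step can be sketched in a few lines. The snippet below is a simplified illustration using `scipy` rather than `metasyn`'s internal API; the candidate set and the simulated column are hypothetical:
+
+```python
+import numpy as np
+from scipy import stats
+
+def bic(log_lik: float, n_params: int, n_obs: int) -> float:
+    """Bayesian information criterion: lower is better."""
+    return n_params * np.log(n_obs) - 2 * log_lik
+
+# a hypothetical continuous column
+values = np.random.default_rng(1).exponential(scale=2.0, size=1000)
+
+# fit each candidate by maximum likelihood, then score it
+candidates = {"normal": stats.norm, "lognormal": stats.lognorm,
+              "exponential": stats.expon}
+scores = {}
+for name, dist in candidates.items():
+    params = dist.fit(values)
+    log_lik = dist.logpdf(values, *params).sum()
+    scores[name] = bic(log_lik, len(params), len(values))
+
+best = min(scores, key=scores.get)  # "exponential" for this column
+```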
+
+Table: \label{tbl:dist} Candidate distributions associated with data types in the core `metasyn` package.
+
+| Variable type | Example                | Candidate distributions                                            |
+| :------------ | :--------------------- | :----------------------------------------------------------------- |
+| categorical   | yes/no, country        | Categorical (Multinoulli), Constant                                 |
+| continuous    | 1.0, 2.1, ...          | Uniform, Normal, LogNormal, TruncatedNormal, Exponential, Constant  |
+| discrete      | 1, 2, ...              | Poisson, Uniform, Normal, TruncatedNormal, Categorical, Constant    |
+| string        | A108, C122, some words | Regex, Categorical, Faker, FreeText, Constant                       |
+| date/time     | 2021-01-13, 01:40:12   | Uniform, Constant                                                   |
+
+From this table, the string distributions deserve special attention as they are not commonly encountered as probability distributions. Regex (regular expression) inference is performed on structured strings using the companion package [RegexModel](https://pypi.org/project/regexmodel/). It is able to automatically detect structure such as room numbers (A108, C122, B109), e-mail addresses, websites, and more, which it summarizes using a probabilistic variant of regular expressions.
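+
+As a conceptual illustration only (this is not the `RegexModel` API, and the pattern and weights are invented for the example), a probabilistic regular expression pairs each position of a pattern with sampling weights estimated from the observed strings:
+
+```python
+import random
+
+rng = random.Random(42)
+
+# hypothetical fitted structure for values like A108, B109, C122:
+# a weighted letter followed by three weighted digits
+pattern = [
+    {"A": 0.4, "B": 0.2, "C": 0.4},    # letter
+    {"1": 1.0},                        # hundreds digit
+    {"0": 0.5, "1": 0.25, "2": 0.25},  # tens digit
+    {"2": 0.3, "8": 0.4, "9": 0.3},    # units digit
+]
+
+def sample_string(nodes) -> str:
+    """Draw one synthetic string by sampling each position independently."""
+    return "".join(
+        rng.choices(list(node), weights=list(node.values()))[0]
+        for node in nodes
+    )
+
+print(sample_string(pattern))  # e.g., "A108"
+```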
+
+Another option, should Regex inference fail for lack of structure, is to detect the language (using [lingua](https://pypi.org/project/lingua-language-detector/)) and randomly pick words from that language; we call this approach FreeText. The final alternative is for the data owner to specify that a certain variable should be synthesized using the popular [Faker](https://pypi.org/project/Faker/) package, which can generate specific data types such as localized addresses.
+
+Generative model estimation with `metasyn` can be performed as follows:
+
+```python
+import polars as pl
+from metasyn import MetaFrame, VarSpec
+
+# the example data frame shown above
+df = pl.DataFrame(
+    {
+        "ID": [1, 2, 3, 4, 5],
+        "fruits": ["banana", "banana", "apple", "apple", "banana"],
+        "B": [5, 4, 3, 2, 1],
+        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
+        "optional": [28, 300, None, 2, -30],
+    },
+    schema_overrides={"fruits": pl.Categorical, "cars": pl.Categorical},
+)
+
+# "ID" is the primary key and should thus generate unique values;
+# "B" should not, despite containing only unique values in the data frame
+specs = [
+    VarSpec("ID", unique=True),
+    VarSpec("B", unique=False),
+]
+
+# create the metaframe
+mf = MetaFrame.fit_dataframe(df, var_specs=specs)
+```
+
+## Serialization and deserialization
+After a fitted model object is created, `metasyn` allows it to be transparently stored in a human- and machine-readable `.json` file. This file can be considered metadata: it contains dataset-level descriptive information as well as variable-level information. The metadata format has a specific structure, which we call the `generative metadata format`, or `gmf`. The header contains the following dataset-level information:
+
+```json
+"n_rows": 5,
+"n_columns": 5,
+"provenance": {
+  "created by": {
+    "name": "metasyn",
+    "version": "1.0.1"
+  },
+  "creation time": "2024-08-07T12:20:36.022017"
+}
+```
+
+Then, for each variable, the `gmf` file contains information about the name, the data type, the proportion of missing values, and the distribution fitted on the data. For example, a table column containing different types of fruits could result in the following `.json`:
+
+```json
+{
+  "name": "fruits",
+  "type": "categorical",
+  "dtype": "Categorical(ordering='physical')",
+  "prop_missing": 0.0,
+  "distribution": {
+    "implements": "core.multinoulli",
+    "version": "1.0",
+    "provenance": "builtin",
+    "class_name": "MultinoulliDistribution",
+    "unique": false,
+    "parameters": {
+      "labels": ["apple", "banana"],
+      "probs": [0.4, 0.6]
+    }
+  },
+  "creation_method": { "created_by": "metasyn" }
+}
+```
+
+There are several advantages to creating such a serialized representation. First, it can be audited: the data owner can see exactly which information from the real data is made public through exporting the synthetic data, namely, the parameters of the fitted distributions. Second, the file can be edited. For example, if a data owner thinks some of the labels of the "fruits" column contain sensitive information, these can simply be pseudonymized in the metadata file, as sketched below. Third, after exporting this file, an unlimited number of synthetic records can be created without incurring additional privacy risks, because the original data is no longer part of the synthetization process.
+
+Serialization and deserialization with `metasyn` can be performed as follows:
+
+```python
+# write a fitted MetaFrame to json
+mf.export("fruits_gmf.json")
+
+# audit (and, if needed, edit) the json file inside the secure
+# environment, then export it
+
+# outside the secure environment, load the json into a MetaFrame
+mf_out = MetaFrame.from_json("fruits_gmf.json")
+```
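+
+Because the `gmf` file is plain `.json`, such an edit needs no special tooling. As a minimal sketch (assuming, hypothetically, that the per-variable entries live under a top-level `"vars"` key), pseudonymizing the labels of the "fruits" column could look like this:
+
+```python
+import json
+
+# load the exported metadata file
+with open("fruits_gmf.json") as f:
+    gmf = json.load(f)
+
+# hypothetical edit: replace the category labels with pseudonyms
+for var in gmf["vars"]:  # assumed key for the variable list
+    if var["name"] == "fruits":
+        var["distribution"]["parameters"]["labels"] = ["fruit_a", "fruit_b"]
+
+with open("fruits_gmf.json", "w") as f:
+    json.dump(gmf, f, indent=2)
+```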
+
+## Data generation
+
+After creating the fitted model object, either from the original data or by deserializing a model object from a `.json` file, new data can be generated by the object. For each variable in the model object, the software randomly samples from the fitted distribution to create a synthetic version of the data. Data generation (or synthetization) in `metasyn` can be performed as follows:
+
+```python
+from metasyn import MetaFrame
+
+# load the json into a MetaFrame object
+mf = MetaFrame.from_json("metasyn_example.json")
+
+# create a synthetic dataset of 10 rows
+df_syn = mf.synthesize(10)
+```
+
+This may result in the following `polars` data frame[^1]. Note that missing values in the `optional` column are appropriately reproduced as well, courtesy of the `prop_missing` entry in the metadata format.
+
+```
+shape: (10, 5)
+┌─────┬────────┬─────┬────────┬──────────┐
+│ ID  ┆ fruits ┆ B   ┆ cars   ┆ optional │
+│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
+│ i64 ┆ cat    ┆ i64 ┆ cat    ┆ i64      │
+╞═════╪════════╪═════╪════════╪══════════╡
+│ 1   ┆ banana ┆ 4   ┆ beetle ┆ null     │
+│ 2   ┆ banana ┆ 3   ┆ audi   ┆ null     │
+│ 3   ┆ banana ┆ 1   ┆ beetle ┆ 223      │
+│ 4   ┆ banana ┆ 0   ┆ beetle ┆ 258      │
+│ …   ┆ …      ┆ …   ┆ …      ┆ …        │
+│ 7   ┆ banana ┆ 3   ┆ beetle ┆ 298      │
+│ 8   ┆ banana ┆ 2   ┆ beetle ┆ 67       │
+│ 9   ┆ banana ┆ 4   ┆ beetle ┆ -30      │
+│ 10  ┆ banana ┆ 2   ┆ beetle ┆ 172      │
+└─────┴────────┴─────┴────────┴──────────┘
+```
+
+[^1]: This `polars` data frame can easily be converted to a `pandas` data frame using `df_syn.to_pandas()`.
+
+# Plug-ins and automatic privacy
+In addition to the core features described above, the `metasyn` package allows for plug-ins: add-on packages that alter the behaviour of the parameter estimation. Through this system, privacy guarantees can be built into `metasyn` ([privacy plug-in template](https://github.com/sodascience/metasyn-privacy-template)) and additional distributions can be supported ([distribution plug-in template](https://github.com/sodascience/metasyn-distribution-template)). For example, a plug-in package called [`metasyn-disclosure-control`](https://github.com/sodascience/metasyn-disclosure-control) implements the disclosure control output guidelines from Eurostat [@bond2015guidelines] by re-implementing the `fit()` method of the candidate distributions shown in \autoref{tbl:dist} to include a micro-aggregation step. In this way, information transfer from the sensitive real data to the synthetic public data can be further reduced.
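+
+The idea behind micro-aggregation can be sketched independently of the plug-in's actual implementation: before parameters are estimated, sorted values are replaced by small-group means, so that no fitted parameter depends on any single record. A minimal sketch, not the `metasyn-disclosure-control` code:
+
+```python
+import numpy as np
+
+def microaggregate(values, group_size: int = 3) -> np.ndarray:
+    """Replace each sorted group of `group_size` values by its mean."""
+    x = np.sort(np.asarray(values, dtype=float))
+    n_groups = max(len(x) // group_size, 1)
+    for g in range(n_groups):
+        lo = g * group_size
+        # the last group absorbs any remainder
+        hi = lo + group_size if g < n_groups - 1 else len(x)
+        x[lo:hi] = x[lo:hi].mean()
+    return x
+
+# distributions are then fitted to the aggregated values
+# instead of the raw ones
+safe_values = microaggregate([28, 300, 2, -30, 57, 42, 5])
+```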
+
+This plug-in system is user-friendly: the user only needs to `pip install` the plug-in package, after which `metasyn` automatically detects it and makes its methods accessible:
+
+```python
+from metasyn import MetaFrame
+from metasyncontrib.disclosure import DisclosurePrivacy
+
+mf = MetaFrame.fit_dataframe(df, privacy=DisclosurePrivacy())
+```
+
+# Conclusion
+Synthetic data is a valuable tool for communicating about sensitive datasets. In this work, we have presented the software `metasyn`, which allows data owners to generate a synthetic version of their sensitive tabular data with a focus on privacy and transparency. Unlike existing tools for generating synthetic data, we deliberately trade analytical validity for strong privacy guarantees: the underlying model makes a simplifying independence assumption, resulting in few parameters and thus very limited information transfer. This approach additionally allows for disclosure guarantees through a plug-in system.
+
+Further documentation and examples can be found on [metasyn.readthedocs.io](https://metasyn.readthedocs.io/).
+
+# Acknowledgements
+
+This research was conducted in whole or in part using ODISSEI, the Open Data Infrastructure for Social Science and Economic Innovations (https://ror.org/03m8v6t10).
+
+The `metasyn` project is supported by the FAIR Research IT Innovation Fund of Utrecht University (March 2023).
+
+# References
\ No newline at end of file
diff --git a/docs/paper/paper.pdf b/docs/paper/paper.pdf
new file mode 100644
index 00000000..af7cf901
Binary files a/docs/paper/paper.pdf and b/docs/paper/paper.pdf differ