diff --git a/joss.04707/10.21105.joss.04707.crossref.xml b/joss.04707/10.21105.joss.04707.crossref.xml
new file mode 100644
index 0000000000..0185e98cd1
--- /dev/null
+++ b/joss.04707/10.21105.joss.04707.crossref.xml
@@ -0,0 +1,320 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<doi_batch xmlns="http://www.crossref.org/schema/5.3.1"
+           xmlns:ai="http://www.crossref.org/AccessIndicators.xsd"
+           xmlns:rel="http://www.crossref.org/relations.xsd"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           version="5.3.1"
+           xsi:schemaLocation="http://www.crossref.org/schema/5.3.1 http://www.crossref.org/schemas/crossref5.3.1.xsd">
+  <head>
+    <doi_batch_id>20221213T203705-b9e5e2b1dfe502ad5ac93ecef9b780f6da9c0bbd</doi_batch_id>
+    <timestamp>20221213203705</timestamp>
+    <depositor>
+      <depositor_name>JOSS Admin</depositor_name>
+      <email_address>admin@theoj.org</email_address>
+    </depositor>
+    <registrant>The Open Journal</registrant>
+  </head>
+  <body>
+    <journal>
+      <journal_metadata>
+        <full_title>Journal of Open Source Software</full_title>
+        <abbrev_title>JOSS</abbrev_title>
+        <issn media_type="electronic">2475-9066</issn>
+        <doi_data>
+          <doi>10.21105/joss</doi>
+          <resource>https://joss.theoj.org/</resource>
+        </doi_data>
+      </journal_metadata>
+      <journal_issue>
+        <publication_date media_type="online">
+          <month>12</month>
+          <year>2022</year>
+        </publication_date>
+        <journal_volume>
+          <volume>7</volume>
+        </journal_volume>
+        <issue>80</issue>
+      </journal_issue>
+      <journal_article publication_type="full_text">
+        <titles>
+          <title>dtrackr: An R package for tracking the provenance of
+data</title>
+        </titles>
+        <contributors>
+          <person_name sequence="first" contributor_role="author">
+            <given_name>Robert</given_name>
+            <surname>Challen</surname>
+            <ORCID>https://orcid.org/0000-0002-5504-7768</ORCID>
+          </person_name>
+        </contributors>
+        <publication_date>
+          <month>12</month>
+          <day>13</day>
+          <year>2022</year>
+        </publication_date>
+        <pages>
+          <first_page>4707</first_page>
+        </pages>
+        <publisher_item>
+          <identifier id_type="doi">10.21105/joss.04707</identifier>
+        </publisher_item>
+        <ai:program name="AccessIndicators">
+          <ai:license_ref applies_to="vor">http://creativecommons.org/licenses/by/4.0/</ai:license_ref>
+          <ai:license_ref applies_to="am">http://creativecommons.org/licenses/by/4.0/</ai:license_ref>
+          <ai:license_ref applies_to="tdm">http://creativecommons.org/licenses/by/4.0/</ai:license_ref>
+        </ai:program>
+        <rel:program>
+          <rel:related_item>
+            <rel:description>Software archive</rel:description>
+            <rel:inter_work_relation relationship-type="references" identifier-type="doi">10.5281/zenodo.7433514</rel:inter_work_relation>
+          </rel:related_item>
+          <rel:related_item>
+            <rel:description>GitHub review issue</rel:description>
+            <rel:inter_work_relation relationship-type="hasReview" identifier-type="uri">https://github.com/openjournals/joss-reviews/issues/4707</rel:inter_work_relation>
+          </rel:related_item>
+        </rel:program>
+        <doi_data>
+          <doi>10.21105/joss.04707</doi>
+          <resource>https://joss.theoj.org/papers/10.21105/joss.04707</resource>
+          <collection property="text-mining">
+            <item>
+              <resource mime_type="application/pdf">https://joss.theoj.org/papers/10.21105/joss.04707.pdf</resource>
+            </item>
+          </collection>
+        </doi_data>
+        <citation_list>
+          <citation key="schulzCONSORT2010Statement2010">
+            <article_title>CONSORT 2010 Statement: Updated guidelines
+for reporting parallel group randomised trials</article_title>
+            <author>Schulz</author>
+            <journal_title>BMJ</journal_title>
+            <volume>340</volume>
+            <doi>10.1136/bmj.c332</doi>
+            <issn>0959-8138</issn>
+            <cYear>2010</cYear>
+            <unstructured_citation>Schulz, K. F., Altman, D. G., &amp;
+Moher, D. (2010). CONSORT 2010 Statement: Updated guidelines for
+reporting parallel group randomised trials. BMJ, 340, c332.
+https://doi.org/10.1136/bmj.c332</unstructured_citation>
+          </citation>
+          <citation key="vonelmStrengtheningReportingObservational2008">
+            <article_title>The Strengthening the Reporting of
+Observational Studies in Epidemiology (STROBE) statement: Guidelines for
+reporting observational studies</article_title>
+            <author>von Elm</author>
+            <journal_title>Journal of Clinical
+Epidemiology</journal_title>
+            <issue>4</issue>
+            <volume>61</volume>
+            <cYear>2008</cYear>
+            <unstructured_citation>von Elm, E., Altman, D. G., Egger,
+M., Pocock, S. J., Gøtzsche, P. C., Vandenbroucke, J. P., &amp; STROBE
+Initiative. (2008). The Strengthening the Reporting of Observational
+Studies in Epidemiology (STROBE) statement: Guidelines for reporting
+observational studies. Journal of Clinical Epidemiology, 61(4),
+344–349.</unstructured_citation>
+          </citation>
+          <citation key="collinsTransparentReportingMultivariable2015">
+            <article_title>Transparent reporting of a multivariable
+prediction model for individual prognosis or diagnosis (TRIPOD): The
+TRIPOD Statement</article_title>
+            <author>Collins</author>
+            <journal_title>BMC Medicine</journal_title>
+            <issue>1</issue>
+            <volume>13</volume>
+            <doi>10.1186/s12916-014-0241-z</doi>
+            <issn>1741-7015</issn>
+            <cYear>2015</cYear>
+            <unstructured_citation>Collins, G. S., Reitsma, J. B.,
+Altman, D. G., &amp; Moons, K. G. (2015). Transparent reporting of a
+multivariable prediction model for individual prognosis or diagnosis
+(TRIPOD): The TRIPOD Statement. BMC Medicine, 13(1), 1.
+https://doi.org/10.1186/s12916-014-0241-z</unstructured_citation>
+          </citation>
+          <citation key="wickhamWelcomeTidyverse2019a">
+            <article_title>Welcome to the Tidyverse</article_title>
+            <author>Wickham</author>
+            <journal_title>Journal of Open Source
+Software</journal_title>
+            <issue>43</issue>
+            <volume>4</volume>
+            <doi>10.21105/joss.01686</doi>
+            <issn>2475-9066</issn>
+            <cYear>2019</cYear>
+            <unstructured_citation>Wickham, H., Averick, M., Bryan, J.,
+Chang, W., McGowan, L. D., François, R., Grolemund, G., Hayes, A.,
+Henry, L., Hester, J., Kuhn, M., Pedersen, T. L., Miller, E., Bache, S.
+M., Müller, K., Ooms, J., Robinson, D., Seidel, D. P., Spinu, V., …
+Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source
+Software, 4(43), 1686.
+https://doi.org/10.21105/joss.01686</unstructured_citation>
+          </citation>
+          <citation key="challenRiskMortalityPatients2021">
+            <article_title>Risk of mortality in patients infected with
+SARS-CoV-2 variant of concern 202012/1: Matched cohort
+study</article_title>
+            <author>Challen</author>
+            <journal_title>BMJ</journal_title>
+            <volume>372</volume>
+            <doi>10.1136/bmj.n579</doi>
+            <issn>1756-1833</issn>
+            <cYear>2021</cYear>
+            <unstructured_citation>Challen, R., Brooks-Pollock, E.,
+Read, J. M., Dyson, L., Tsaneva-Atanasova, K., &amp; Danon, L. (2021).
+Risk of mortality in patients infected with SARS-CoV-2 variant of
+concern 202012/1: Matched cohort study. BMJ, 372, n579.
+https://doi.org/10.1136/bmj.n579</unstructured_citation>
+          </citation>
+          <citation key="hyamsIncidenceCommunityAcquired2022">
+            <article_title>Incidence of Community Acquired Lower
+Respiratory Tract Disease in Bristol, UK During the COVID-19
+Pandemic</article_title>
+            <author>Hyams</author>
+            <doi>10.2139/ssrn.4087373</doi>
+            <cYear>2022</cYear>
+            <unstructured_citation>Hyams, C., Challen, R., Begier, E.,
+Southern, J., King, J., Morley, A., Szasz-Benczur, Z., Garcia Gonzalez,
+M., Kinney, J., Campling, J., Gray, S., Oliver, J., Hubler, R., Valluri,
+S. R., Vyse, A., McLaughlin, J. M., Ellsbury, G., Maskell, N., Gessner,
+B., … Finn, A. (2022). Incidence of Community Acquired Lower Respiratory
+Tract Disease in Bristol, UK During the COVID-19 Pandemic [SSRN
+Scholarly Paper].
+https://doi.org/10.2139/ssrn.4087373</unstructured_citation>
+          </citation>
+          <citation key="gansnerOpenGraphVisualization2000">
+            <article_title>An open graph visualization system and its
+applications to software engineering</article_title>
+            <author>Gansner</author>
+            <journal_title>Software - Practice and
+Experience</journal_title>
+            <issue>11</issue>
+            <volume>30</volume>
+            <doi>10.1002/1097-024X(200009)30:11&lt;1203::AID-SPE338&gt;3.0.CO;2-N</doi>
+            <cYear>2000</cYear>
+            <unstructured_citation>Gansner, E. R., &amp; North, S. C.
+(2000). An open graph visualization system and its applications to
+software engineering. Software - Practice and Experience, 30(11),
+1203–1233.
+https://doi.org/10.1002/1097-024X(200009)30:11&lt;1203::AID-SPE338&gt;3.0.CO;2-N</unstructured_citation>
+          </citation>
+          <citation key="alterCapturingDataProvenance2021">
+            <article_title>Capturing Data Provenance from Statistical
+Software</article_title>
+            <author>Alter</author>
+            <journal_title>International Journal of Digital
+Curation</journal_title>
+            <issue>1</issue>
+            <volume>16</volume>
+            <doi>10.2218/ijdc.v16i1.763</doi>
+            <issn>1746-8256</issn>
+            <cYear>2021</cYear>
+            <unstructured_citation>Alter, G. C., Gager, J., Heus, P.,
+Hunter, C., Ionescu, S., Iverson, J., Jagadish, H. V., Lyle, J.,
+Mueller, A., Nordgaard, S., Risnes, O., Smith, D., &amp; Song, J.
+(2021). Capturing Data Provenance from Statistical Software.
+International Journal of Digital Curation, 16(1), 14–14.
+https://doi.org/10.2218/ijdc.v16i1.763</unstructured_citation>
+          </citation>
+          <citation key="DoltGitData2022">
+            <volume_title>Dolt is Git for Data!</volume_title>
+            <cYear>2022</cYear>
+            <unstructured_citation>Dolt is Git for Data! (2022).
+[Computer software]. DoltHub. https://github.com/dolthub/dolt (Original
+work published 2019)</unstructured_citation>
+          </citation>
+          <citation key="hyamsSeverityOmicron5292022">
+            <article_title>Severity of Omicron (B.1.1.529) and Delta
+(B.1.1.617.2) SARS-CoV-2 infection among hospitalised adults: A
+prospective cohort study</article_title>
+            <author>Hyams</author>
+            <doi>10.1101/2022.06.29.22277044</doi>
+            <cYear>2022</cYear>
+            <unstructured_citation>Hyams, C., Challen, R., Marlow, R.,
+Nguyen, J., Begier, E., Southern, J., King, J., Morley, A., Kinney, J.,
+Clout, M., Oliver, J., Ellsbury, G., Maskell, N., Jodar, L., Gessner,
+B., McLaughlin, J., Danon, L., Finn, A., &amp; Group, T. A. C. R.
+(2022). Severity of Omicron (B.1.1.529) and Delta (B.1.1.617.2)
+SARS-CoV-2 infection among hospitalised adults: A prospective cohort
+study (p. 2022.06.29.22277044). medRxiv.
+https://doi.org/10.1101/2022.06.29.22277044</unstructured_citation>
+          </citation>
+          <citation key="landauTargetsPackageDynamic2021">
+            <article_title>The targets R package: A dynamic Make-like
+function-oriented pipeline toolkit for reproducibility and
+high-performance computing</article_title>
+            <author>Landau</author>
+            <journal_title>Journal of Open Source
+Software</journal_title>
+            <issue>57</issue>
+            <volume>6</volume>
+            <doi>10.21105/joss.02959</doi>
+            <issn>2475-9066</issn>
+            <cYear>2021</cYear>
+            <unstructured_citation>Landau, W. M. (2021). The targets R
+package: A dynamic Make-like function-oriented pipeline toolkit for
+reproducibility and high-performance computing. Journal of Open Source
+Software, 6(57), 2959.
+https://doi.org/10.21105/joss.02959</unstructured_citation>
+          </citation>
+          <citation key="pimentelSurveyCollectingManaging2019">
+            <article_title>A Survey on Collecting, Managing, and
+Analyzing Provenance from Scripts</article_title>
+            <author>Pimentel</author>
+            <journal_title>ACM Computing Surveys</journal_title>
+            <issue>3</issue>
+            <volume>52</volume>
+            <doi>10.1145/3311955</doi>
+            <issn>0360-0300</issn>
+            <cYear>2019</cYear>
+            <unstructured_citation>Pimentel, J. F., Freire, J., Murta,
+L., &amp; Braganholo, V. (2019). A Survey on Collecting, Managing, and
+Analyzing Provenance from Scripts. ACM Computing Surveys, 52(3),
+47:1–47:38. https://doi.org/10.1145/3311955</unstructured_citation>
+          </citation>
+          <citation key="rossDoltrClientDolt2022">
+            <volume_title>Doltr: A client for the dolt
+database</volume_title>
+            <author>Ross</author>
+            <cYear>2022</cYear>
+            <unstructured_citation>Ross, N. (2022). Doltr: A client for
+the dolt database [Manual].</unstructured_citation>
+          </citation>
+          <citation key="survival-package">
+            <volume_title>A package for survival analysis in
+R</volume_title>
+            <author>Therneau</author>
+            <cYear>2022</cYear>
+            <unstructured_citation>Therneau, T. M. (2022). A package for
+survival analysis in R.
+https://CRAN.R-project.org/package=survival</unstructured_citation>
+          </citation>
+          <citation key="survival-book">
+            <volume_title>Modeling survival data: Extending the Cox
+model</volume_title>
+            <author>Therneau</author>
+            <isbn>0-387-98784-3</isbn>
+            <cYear>2000</cYear>
+            <unstructured_citation>Terry M. Therneau, &amp; Patricia M.
+Grambsch. (2000). Modeling survival data: Extending the Cox model.
+Springer. ISBN: 0-387-98784-3</unstructured_citation>
+          </citation>
+          <citation key="lerner_using_2018">
+            <article_title>Using Introspection to Collect Provenance in
+R</article_title>
+            <author>Lerner</author>
+            <journal_title>Informatics</journal_title>
+            <issue>1</issue>
+            <volume>5</volume>
+            <doi>10.3390/informatics5010012</doi>
+            <issn>2227-9709</issn>
+            <cYear>2018</cYear>
+            <unstructured_citation>Lerner, B., Boose, E., &amp; Perez,
+L. (2018). Using Introspection to Collect Provenance in R. Informatics,
+5(1), 12.
+https://doi.org/10.3390/informatics5010012</unstructured_citation>
+          </citation>
+        </citation_list>
+      </journal_article>
+    </journal>
+  </body>
+</doi_batch>
diff --git a/joss.04707/10.21105.joss.04707.jats b/joss.04707/10.21105.joss.04707.jats
new file mode 100644
index 0000000000..a1876f1a6c
--- /dev/null
+++ b/joss.04707/10.21105.joss.04707.jats
@@ -0,0 +1,559 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN"
+                  "JATS-publishing1.dtd">
+<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="1.2" article-type="other">
+<front>
+<journal-meta>
+<journal-id></journal-id>
+<journal-title-group>
+<journal-title>Journal of Open Source Software</journal-title>
+<abbrev-journal-title>JOSS</abbrev-journal-title>
+</journal-title-group>
+<issn publication-format="electronic">2475-9066</issn>
+<publisher>
+<publisher-name>Open Journals</publisher-name>
+</publisher>
+</journal-meta>
+<article-meta>
+<article-id pub-id-type="publisher-id">4707</article-id>
+<article-id pub-id-type="doi">10.21105/joss.04707</article-id>
+<title-group>
+<article-title>dtrackr: An R package for tracking the provenance of
+data</article-title>
+</title-group>
+<contrib-group>
+<contrib contrib-type="author">
+<contrib-id contrib-id-type="orcid">0000-0002-5504-7768</contrib-id>
+<name>
+<surname>Challen</surname>
+<given-names>Robert</given-names>
+</name>
+<xref ref-type="aff" rid="aff-1"/>
+<xref ref-type="aff" rid="aff-2"/>
+</contrib>
+<aff id="aff-1">
+<institution-wrap>
+<institution>Engineering Mathematics, University of Bristol, Bristol,
+UK</institution>
+</institution-wrap>
+</aff>
+<aff id="aff-2">
+<institution-wrap>
+<institution>College of Engineering, Mathematics and Physical Sciences,
+University of Exeter, Devon, UK</institution>
+</institution-wrap>
+</aff>
+</contrib-group>
+<pub-date date-type="pub" publication-format="electronic" iso-8601-date="2022-12-13">
+<day>13</day>
+<month>12</month>
+<year>2022</year>
+</pub-date>
+<volume>7</volume>
+<issue>80</issue>
+<fpage>4707</fpage>
+<permissions>
+<copyright-statement>Authors of papers retain copyright and release the
+work under a Creative Commons Attribution 4.0 International License (CC
+BY 4.0)</copyright-statement>
+<copyright-year>2022</copyright-year>
+<copyright-holder>The article authors</copyright-holder>
+<license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
+<license-p>Authors of papers retain copyright and release the work under
+a Creative Commons Attribution 4.0 International License (CC BY
+4.0)</license-p>
+</license>
+</permissions>
+<kwd-group kwd-group-type="author">
+<kwd>R</kwd>
+<kwd>data pipeline</kwd>
+<kwd>consort diagram</kwd>
+<kwd>strobe statement</kwd>
+<kwd>data quality</kwd>
+<kwd>reproducible research</kwd>
+</kwd-group>
+</article-meta>
+</front>
+<body>
+<sec id="summary">
+  <title>Summary</title>
+  <p>An accurate statement of the provenance of data is essential in
+  bio-medical research. Powerful data manipulation tools available in
+  the <monospace>tidyverse</monospace> R package ecosystem
+  (<xref alt="Wickham et al., 2019" rid="ref-wickhamWelcomeTidyverse2019a" ref-type="bibr">Wickham
+  et al., 2019</xref>) provide the infrastructure to assemble, clean and
+  filter data prior to statistical analysis. Manual documentation of the
+  steps taken in the data pipeline and the provenance of data is a
+  cumbersome and error prone task which may restrict reproducibility.
+  <monospace>dtrackr</monospace> is a wrapper around a subset of the
+  standard <monospace>tidyverse</monospace> data manipulation tools that
+  allows automatic tracking of the processing steps applied to a data
+  set, prior to statistical analysis. It allows early detection and
+  reporting of data quality problems, and automatically documents a
+  pipeline of data transformations as a flowchart in a format suitable
+  for scientific publication, including, but not limited to, CONSORT
+  diagrams
+  (<xref alt="Schulz et al., 2010" rid="ref-schulzCONSORT2010Statement2010" ref-type="bibr">Schulz
+  et al., 2010</xref>).</p>
+  <p><monospace>dtrackr</monospace> is first and foremost a utility to
+  accelerate and improve research by facilitating documentation,
+  supporting extraction of knowledge from data sets, and the execution
+  of research by helping identify data quality issues. The general
+  capability, however, fits into a broader context of other provenance or
+  data pipeline research. This includes initiatives such as
+  <monospace>C2Metadata</monospace>
+  (<xref alt="Alter et al., 2021" rid="ref-alterCapturingDataProvenance2021" ref-type="bibr">Alter
+  et al., 2021</xref>), which focus on a language independent
+  representation of a data pipeline, and R packages such as
+  <monospace>targets</monospace>
+  (<xref alt="Landau, 2021" rid="ref-landauTargetsPackageDynamic2021" ref-type="bibr">Landau,
+  2021</xref>) which focus on documenting pipeline code, and managing
+  the execution of a pipeline, or <monospace>RDataTracker</monospace>
+  which focusses on tracking the execution of an arbitrary R script
+  (<xref alt="Lerner et al., 2018" rid="ref-lerner_using_2018" ref-type="bibr">Lerner
+  et al., 2018</xref>). <monospace>dtrackr</monospace> takes a more data
+  oriented approach, which could be complementary, in which we remain
+  agnostic to the detail of a data pipeline script or nature of its
+  execution, but capture a subset of the transformations applied to data
+  alongside the data itself, thereby documenting the data state as it is
+  being manipulated. This is achieved by overriding the execution of
+  <monospace>dplyr</monospace> pipeline functions and results in a
+  retrospective record of provenance
+  (<xref alt="Pimentel et al., 2019" rid="ref-pimentelSurveyCollectingManaging2019" ref-type="bibr">Pimentel
+  et al., 2019</xref>). <monospace>dtrackr</monospace> also has the
+  ability to insert secondary analysis as annotations into the pipeline,
+  and allows control over what information is collected, ultimately with
+  a view to producing simple human readable output. The approach of
+  <monospace>dtrackr</monospace> is analogous to a
+  <monospace>git</monospace> commit history for dataframes, and there is
+  potential synergy with emerging versioned databases such as
+  <monospace>dolt</monospace>
+  (<xref alt="Dolt Is Git for Data!, 2019/2022" rid="ref-DoltGitData2022" ref-type="bibr"><italic>Dolt
+  Is Git for Data!</italic>, 2019/2022</xref>;
+  <xref alt="Ross, 2022" rid="ref-rossDoltrClientDolt2022" ref-type="bibr">Ross,
+  2022</xref>).</p>
+</sec>
+<sec id="statement-of-need">
+  <title>Statement of need</title>
+  <p>The collection of experimental or observational data for research
+  is often an iterative endeavour, involving curation of complex data
+  sets designed for multiple goals. Systematic data quality checking for
+  such sets is a major challenge, particularly when they are assembled
+  to identify emerging or rapidly evolving issues. Feedback from early
+  data analysis can identify specific data quality issues, resolution of
+  which can considerably improve data for the task at hand. However, this
+  requires a clear understanding of why and when individual data items
+  are excluded, which is potentially tedious and may be seen as lower
+  priority compared to statistical analysis.</p>
+  <p>Data analysis using <monospace>tidyverse</monospace> in R is a
+  rapid means of transforming raw data into a format suitable for
+  statistical analysis. The transformations involved can, however, affect
+  the results of statistical analysis, and meticulous care must be taken
+  to ensure that any assumptions made during data processing are well
+  documented. It is often too easy to inadvertently exclude data when
+  filtering on missing items, or joining linked data sets with
+  incomplete foreign key relationships.</p>
+  <p>In complex data analysis, the use of interactive programming
+  environments such as Read-Eval-Print Loops (REPL) in R markdown
+  documents, interim caching of results, or conditional branching data
+  pipelines, can result in the current state of a processed data set
+  becoming decoupled from the code that is designed to generate
+  it.</p>
+  <p>To surface these issues, bio-medical journal articles are usually
+  required to report data manipulation to an agreed standard. For
+  example, CONSORT diagrams are part of the requirements in reporting
+  parallel group clinical trials. They are described in the updated 2010
+  CONSORT statement
+  (<xref alt="Schulz et al., 2010" rid="ref-schulzCONSORT2010Statement2010" ref-type="bibr">Schulz
+  et al., 2010</xref>), and clarify how patients were recruited,
+  selected, randomized and followed up. For observational studies, such
+  as case control designs, an equivalent requirement is the STROBE
+  statement
+  (<xref alt="von Elm et al., 2008" rid="ref-vonelmStrengtheningReportingObservational2008" ref-type="bibr">von
+  Elm et al., 2008</xref>). There are many other similar requirements
+  for other types of study, such as the TRIPOD statement for
+  multivariable models
+  (<xref alt="Collins et al., 2015" rid="ref-collinsTransparentReportingMultivariable2015" ref-type="bibr">Collins
+  et al., 2015</xref>). Maintaining such a CONSORT diagram over the course
+  of a study when data sets are being actively collected and data
+  quality issues being addressed is time-consuming.</p>
+  <p><monospace>dtrackr</monospace> addresses these issues by
+  instrumenting a commonly used subset of standard
+  <monospace>tidyverse</monospace> data manipulation pipeline functions
+  from <monospace>dplyr</monospace> and <monospace>tidyr</monospace>. It
+  can automatically record the steps taken, records excluded and a
+  summary of the result of each data processing step, as part of the
+  data set itself in a “history graph”. In this way data sets retain an
+  accurate history of their own provenance regardless of the actual
+  route taken to assemble them. This history includes a complete record
+  of any data quality issues that lead to excluded records. The history
+  is a directed graph which can be expressed in the commonly used
+  <monospace>GraphViz</monospace> language
+  (<xref alt="Gansner &amp; North, 2000" rid="ref-gansnerOpenGraphVisualization2000" ref-type="bibr">Gansner
+  &amp; North, 2000</xref>) and may be visualised as a flowchart such as
+  in <xref alt="Figure 1" rid="figU003Afigure1">Figure 1</xref>; this
+  uses the Chronic Granulomatous Disease dataset from the
+  <monospace>survival</monospace> package
+  (<xref alt="Terry M. Therneau &amp; Patricia M. Grambsch, 2000" rid="ref-survival-book" ref-type="bibr">Terry
+  M. Therneau &amp; Patricia M. Grambsch, 2000</xref>;
+  <xref alt="Therneau, 2022" rid="ref-survival-package" ref-type="bibr">Therneau,
+  2022</xref>) as an example of a parallel group study and produces a
+  STROBE-like flowchart.</p>
+  <fig>
+    <caption><p>An example flowchart derived directly from a simple
+    analysis of the Chronic Granulomatous Disease dataset demonstrating
+    use of <monospace>dtrackr</monospace> to generate the key parts of a
+    STROBE or CONSORT diagram.
+    <styled-content id="figU003Afigure1"></styled-content></p></caption>
+    <graphic mimetype="application" mime-subtype="pdf" xlink:href="media/figure1-consort.pdf" xlink:title="" />
+  </fig>
+  <p><monospace>dtrackr</monospace> was originally conceptualized during
+  an analysis I undertook of the severity of the Alpha variant of
+  SARS-CoV-2
+  (<xref alt="Challen et al., 2021" rid="ref-challenRiskMortalityPatients2021" ref-type="bibr">Challen
+  et al., 2021</xref>), and has since been used for other
+  epidemiological studies including an analysis of the incidence of
+  hospitalization of acute lower respiratory tract disease in Bristol
+  (<xref alt="Hyams, Challen, Begier, et al., 2022" rid="ref-hyamsIncidenceCommunityAcquired2022" ref-type="bibr">Hyams,
+  Challen, Begier, et al., 2022</xref>), and a comparative analysis of
+  the severity of the SARS-CoV-2 Omicron variant, versus the Delta
+  variant against a range of hospital outcomes
+  (<xref alt="Hyams, Challen, Marlow, et al., 2022" rid="ref-hyamsSeverityOmicron5292022" ref-type="bibr">Hyams,
+  Challen, Marlow, et al., 2022</xref>).</p>
+  <p>Although the specific example presented here is in the bio-medical
+  domain, tracking the provenance of data is a much broader issue, and
+  we anticipate there are many other applications for
+  <monospace>dtrackr</monospace>.</p>
+</sec>
+<sec id="acknowledgements">
+  <title>Acknowledgements</title>
+  <p>Thanks for contributions from TJ McKinley. I gratefully acknowledge
+  the financial support of the EPSRC via grants EP/N014391/1,
+  EP/T017856/1, the MRC (MC/PC/19067), and from the Somerset NHS
+  Foundation Trust, Global Digital Exemplar programme.</p>
+</sec>
+</body>
+<back>
+<ref-list>
+  <ref id="ref-schulzCONSORT2010Statement2010">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Schulz</surname><given-names>Kenneth F.</given-names></name>
+        <name><surname>Altman</surname><given-names>Douglas G.</given-names></name>
+        <name><surname>Moher</surname><given-names>David</given-names></name>
+      </person-group>
+      <article-title>CONSORT 2010 Statement: Updated guidelines for reporting parallel group randomised trials</article-title>
+      <source>BMJ</source>
+      <publisher-name>British Medical Journal Publishing Group</publisher-name>
+      <year iso-8601-date="2010-03-24">2010</year><month>03</month><day>24</day>
+      <volume>340</volume>
+      <issn>0959-8138</issn>
+      <pub-id pub-id-type="doi">10.1136/bmj.c332</pub-id>
+      <fpage>c332</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-vonelmStrengtheningReportingObservational2008">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>von Elm</surname><given-names>Erik</given-names></name>
+        <name><surname>Altman</surname><given-names>Douglas G</given-names></name>
+        <name><surname>Egger</surname><given-names>Matthias</given-names></name>
+        <name><surname>Pocock</surname><given-names>Stuart J</given-names></name>
+        <name><surname>Gøtzsche</surname><given-names>Peter C</given-names></name>
+        <name><surname>Vandenbroucke</surname><given-names>Jan P</given-names></name>
+        <string-name>STROBE Initiative</string-name>
+      </person-group>
+      <article-title>The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: Guidelines for reporting observational studies</article-title>
+      <source>Journal of Clinical Epidemiology</source>
+      <year iso-8601-date="2008-04">2008</year><month>04</month>
+      <volume>61</volume>
+      <issue>4</issue>
+      <fpage>344</fpage>
+      <lpage>349</lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-collinsTransparentReportingMultivariable2015">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Collins</surname><given-names>Gary S.</given-names></name>
+        <name><surname>Reitsma</surname><given-names>Johannes B.</given-names></name>
+        <name><surname>Altman</surname><given-names>Douglas G.</given-names></name>
+        <name><surname>Moons</surname><given-names>Karel GM</given-names></name>
+      </person-group>
+      <article-title>Transparent reporting of a multivariable prediction model for individual prognosis or diagnosis (TRIPOD): The TRIPOD Statement</article-title>
+      <source>BMC Medicine</source>
+      <year iso-8601-date="2015-01-06">2015</year><month>01</month><day>06</day>
+      <volume>13</volume>
+      <issue>1</issue>
+      <issn>1741-7015</issn>
+      <pub-id pub-id-type="doi">10.1186/s12916-014-0241-z</pub-id>
+      <fpage>1</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-wickhamWelcomeTidyverse2019a">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Wickham</surname><given-names>Hadley</given-names></name>
+        <name><surname>Averick</surname><given-names>Mara</given-names></name>
+        <name><surname>Bryan</surname><given-names>Jennifer</given-names></name>
+        <name><surname>Chang</surname><given-names>Winston</given-names></name>
+        <name><surname>McGowan</surname><given-names>Lucy D’Agostino</given-names></name>
+        <name><surname>François</surname><given-names>Romain</given-names></name>
+        <name><surname>Grolemund</surname><given-names>Garrett</given-names></name>
+        <name><surname>Hayes</surname><given-names>Alex</given-names></name>
+        <name><surname>Henry</surname><given-names>Lionel</given-names></name>
+        <name><surname>Hester</surname><given-names>Jim</given-names></name>
+        <name><surname>Kuhn</surname><given-names>Max</given-names></name>
+        <name><surname>Pedersen</surname><given-names>Thomas Lin</given-names></name>
+        <name><surname>Miller</surname><given-names>Evan</given-names></name>
+        <name><surname>Bache</surname><given-names>Stephan Milton</given-names></name>
+        <name><surname>Müller</surname><given-names>Kirill</given-names></name>
+        <name><surname>Ooms</surname><given-names>Jeroen</given-names></name>
+        <name><surname>Robinson</surname><given-names>David</given-names></name>
+        <name><surname>Seidel</surname><given-names>Dana Paige</given-names></name>
+        <name><surname>Spinu</surname><given-names>Vitalie</given-names></name>
+        <name><surname>Takahashi</surname><given-names>Kohske</given-names></name>
+        <name><surname>Vaughan</surname><given-names>Davis</given-names></name>
+        <name><surname>Wilke</surname><given-names>Claus</given-names></name>
+        <name><surname>Woo</surname><given-names>Kara</given-names></name>
+        <name><surname>Yutani</surname><given-names>Hiroaki</given-names></name>
+      </person-group>
+      <article-title>Welcome to the Tidyverse</article-title>
+      <source>Journal of Open Source Software</source>
+      <year iso-8601-date="2019-11-21">2019</year><month>11</month><day>21</day>
+      <volume>4</volume>
+      <issue>43</issue>
+      <issn>2475-9066</issn>
+      <pub-id pub-id-type="doi">10.21105/joss.01686</pub-id>
+      <fpage>1686</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-challenRiskMortalityPatients2021">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Challen</surname><given-names>Robert</given-names></name>
+        <name><surname>Brooks-Pollock</surname><given-names>Ellen</given-names></name>
+        <name><surname>Read</surname><given-names>Jonathan M.</given-names></name>
+        <name><surname>Dyson</surname><given-names>Louise</given-names></name>
+        <name><surname>Tsaneva-Atanasova</surname><given-names>Krasimira</given-names></name>
+        <name><surname>Danon</surname><given-names>Leon</given-names></name>
+      </person-group>
+      <article-title>Risk of mortality in patients infected with SARS-CoV-2 variant of concern 202012/1: Matched cohort study</article-title>
+      <source>BMJ</source>
+      <publisher-name>British Medical Journal Publishing Group</publisher-name>
+      <year iso-8601-date="2021-03-10">2021</year><month>03</month><day>10</day>
+      <volume>372</volume>
+      <issn>1756-1833</issn>
+      <pub-id pub-id-type="doi">10.1136/bmj.n579</pub-id>
+      <fpage>n579</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-hyamsIncidenceCommunityAcquired2022">
+    <element-citation>
+      <person-group person-group-type="author">
+        <name><surname>Hyams</surname><given-names>Catherine</given-names></name>
+        <name><surname>Challen</surname><given-names>Robert</given-names></name>
+        <name><surname>Begier</surname><given-names>Elizabeth</given-names></name>
+        <name><surname>Southern</surname><given-names>Jo</given-names></name>
+        <name><surname>King</surname><given-names>Jade</given-names></name>
+        <name><surname>Morley</surname><given-names>Anna</given-names></name>
+        <name><surname>Szasz-Benczur</surname><given-names>Zsuzsa</given-names></name>
+        <name><surname>Garcia Gonzalez</surname><given-names>Maria</given-names></name>
+        <name><surname>Kinney</surname><given-names>Jane</given-names></name>
+        <name><surname>Campling</surname><given-names>James</given-names></name>
+        <name><surname>Gray</surname><given-names>Sharon</given-names></name>
+        <name><surname>Oliver</surname><given-names>Jennifer</given-names></name>
+        <name><surname>Hubler</surname><given-names>Robin</given-names></name>
+        <name><surname>Valluri</surname><given-names>Srinivas R.</given-names></name>
+        <name><surname>Vyse</surname><given-names>Andrew</given-names></name>
+        <name><surname>McLaughlin</surname><given-names>John M.</given-names></name>
+        <name><surname>Ellsbury</surname><given-names>Gillian</given-names></name>
+        <name><surname>Maskell</surname><given-names>Nick</given-names></name>
+        <name><surname>Gessner</surname><given-names>Bradford</given-names></name>
+        <name><surname>Danon</surname><given-names>Leon</given-names></name>
+        <name><surname>Finn</surname><given-names>Adam</given-names></name>
+      </person-group>
+      <article-title>Incidence of Community Acquired Lower Respiratory Tract Disease in Bristol, UK During the COVID-19 Pandemic</article-title>
+      <publisher-loc>Rochester, NY</publisher-loc>
+      <year iso-8601-date="2022-04-19">2022</year><month>04</month><day>19</day>
+      <pub-id pub-id-type="doi">10.2139/ssrn.4087373</pub-id>
+    </element-citation>
+  </ref>
+  <ref id="ref-gansnerOpenGraphVisualization2000">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Gansner</surname><given-names>Emden R.</given-names></name>
+        <name><surname>North</surname><given-names>Stephen C.</given-names></name>
+      </person-group>
+      <article-title>An open graph visualization system and its applications to software engineering</article-title>
+      <source>Software - Practice and Experience</source>
+      <year iso-8601-date="2000">2000</year>
+      <volume>30</volume>
+      <issue>11</issue>
+      <pub-id pub-id-type="doi">10.1002/1097-024X(200009)30:11&lt;1203::AID-SPE338&gt;3.0.CO;2-N</pub-id>
+      <fpage>1203</fpage>
+      <lpage>1233</lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-alterCapturingDataProvenance2021">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Alter</surname><given-names>George Charles</given-names></name>
+        <name><surname>Gager</surname><given-names>Jack</given-names></name>
+        <name><surname>Heus</surname><given-names>Pascal</given-names></name>
+        <name><surname>Hunter</surname><given-names>Carson</given-names></name>
+        <name><surname>Ionescu</surname><given-names>Sanda</given-names></name>
+        <name><surname>Iverson</surname><given-names>Jeremy</given-names></name>
+        <name><surname>Jagadish</surname><given-names>H. V.</given-names></name>
+        <name><surname>Lyle</surname><given-names>Jared</given-names></name>
+        <name><surname>Mueller</surname><given-names>Alexander</given-names></name>
+        <name><surname>Nordgaard</surname><given-names>Sigve</given-names></name>
+        <name><surname>Risnes</surname><given-names>Ornulf</given-names></name>
+        <name><surname>Smith</surname><given-names>Dan</given-names></name>
+        <name><surname>Song</surname><given-names>Jie</given-names></name>
+      </person-group>
+      <article-title>Capturing Data Provenance from Statistical Software</article-title>
+      <source>International Journal of Digital Curation</source>
+      <year iso-8601-date="2021">2021</year>
+      <volume>16</volume>
+      <issue>1</issue>
+      <issn>1746-8256</issn>
+      <pub-id pub-id-type="doi">10.2218/ijdc.v16i1.763</pub-id>
+      <fpage>14</fpage>
+      <lpage>14</lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-DoltGitData2022">
+    <element-citation publication-type="book">
+      <source>Dolt is Git for Data!</source>
+      <publisher-name>DoltHub</publisher-name>
+      <year iso-8601-date="2022-10-04">2022</year><month>10</month><day>04</day>
+      <date-in-citation content-type="access-date"><year iso-8601-date="2022-10-04">2022</year><month>10</month><day>04</day></date-in-citation>
+      <uri>https://github.com/dolthub/dolt</uri>
+    </element-citation>
+  </ref>
+  <ref id="ref-hyamsSeverityOmicron5292022">
+    <element-citation>
+      <person-group person-group-type="author">
+        <name><surname>Hyams</surname><given-names>Catherine</given-names></name>
+        <name><surname>Challen</surname><given-names>Robert</given-names></name>
+        <name><surname>Marlow</surname><given-names>Robin</given-names></name>
+        <name><surname>Nguyen</surname><given-names>Jennifer</given-names></name>
+        <name><surname>Begier</surname><given-names>Elizabeth</given-names></name>
+        <name><surname>Southern</surname><given-names>Jo</given-names></name>
+        <name><surname>King</surname><given-names>Jade</given-names></name>
+        <name><surname>Morley</surname><given-names>Anna</given-names></name>
+        <name><surname>Kinney</surname><given-names>Jane</given-names></name>
+        <name><surname>Clout</surname><given-names>Madeleine</given-names></name>
+        <name><surname>Oliver</surname><given-names>Jennifer</given-names></name>
+        <name><surname>Ellsbury</surname><given-names>Gillian</given-names></name>
+        <name><surname>Maskell</surname><given-names>Nick</given-names></name>
+        <name><surname>Jodar</surname><given-names>Luis</given-names></name>
+        <name><surname>Gessner</surname><given-names>Bradford</given-names></name>
+        <name><surname>McLaughlin</surname><given-names>John</given-names></name>
+        <name><surname>Danon</surname><given-names>Leon</given-names></name>
+        <name><surname>Finn</surname><given-names>Adam</given-names></name>
+        <string-name>The Avon CAP Research Group</string-name>
+      </person-group>
+      <article-title>Severity of Omicron (B.1.1.529) and Delta (B.1.1.617.2) SARS-CoV-2 infection among hospitalised adults: A prospective cohort study</article-title>
+      <publisher-name>medRxiv</publisher-name>
+      <year iso-8601-date="2022-06-30">2022</year><month>06</month><day>30</day>
+      <pub-id pub-id-type="doi">10.1101/2022.06.29.22277044</pub-id>
+      <fpage>2022.06.29.22277044</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-landauTargetsPackageDynamic2021">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Landau</surname><given-names>William Michael</given-names></name>
+      </person-group>
+      <article-title>The targets R package: A dynamic Make-like function-oriented pipeline toolkit for reproducibility and high-performance computing</article-title>
+      <source>Journal of Open Source Software</source>
+      <year iso-8601-date="2021-01-15">2021</year><month>01</month><day>15</day>
+      <volume>6</volume>
+      <issue>57</issue>
+      <issn>2475-9066</issn>
+      <pub-id pub-id-type="doi">10.21105/joss.02959</pub-id>
+      <fpage>2959</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-pimentelSurveyCollectingManaging2019">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Pimentel</surname><given-names>João Felipe</given-names></name>
+        <name><surname>Freire</surname><given-names>Juliana</given-names></name>
+        <name><surname>Murta</surname><given-names>Leonardo</given-names></name>
+        <name><surname>Braganholo</surname><given-names>Vanessa</given-names></name>
+      </person-group>
+      <article-title>A Survey on Collecting, Managing, and Analyzing Provenance from Scripts</article-title>
+      <source>ACM Computing Surveys</source>
+      <year iso-8601-date="2019-06-18">2019</year><month>06</month><day>18</day>
+      <volume>52</volume>
+      <issue>3</issue>
+      <issn>0360-0300</issn>
+      <pub-id pub-id-type="doi">10.1145/3311955</pub-id>
+      <fpage>47:1</fpage>
+      <lpage>47:38</lpage>
+    </element-citation>
+  </ref>
+  <ref id="ref-rossDoltrClientDolt2022">
+    <element-citation publication-type="book">
+      <person-group person-group-type="author">
+        <name><surname>Ross</surname><given-names>Noam</given-names></name>
+      </person-group>
+      <source>doltr: A client for the Dolt database</source>
+      <year iso-8601-date="2022">2022</year>
+    </element-citation>
+  </ref>
+  <ref id="ref-survival-package">
+    <element-citation publication-type="book">
+      <person-group person-group-type="author">
+        <name><surname>Therneau</surname><given-names>Terry M</given-names></name>
+      </person-group>
+      <source>A package for survival analysis in R</source>
+      <year iso-8601-date="2022">2022</year>
+      <uri>https://CRAN.R-project.org/package=survival</uri>
+    </element-citation>
+  </ref>
+  <ref id="ref-survival-book">
+    <element-citation publication-type="book">
+      <person-group person-group-type="author">
+        <name><surname>Therneau</surname><given-names>Terry M.</given-names></name>
+        <name><surname>Grambsch</surname><given-names>Patricia M.</given-names></name>
+      </person-group>
+      <source>Modeling survival data: Extending the Cox model</source>
+      <publisher-name>Springer</publisher-name>
+      <publisher-loc>New York</publisher-loc>
+      <year iso-8601-date="2000">2000</year>
+      <isbn>0-387-98784-3</isbn>
+    </element-citation>
+  </ref>
+  <ref id="ref-lerner_using_2018">
+    <element-citation publication-type="article-journal">
+      <person-group person-group-type="author">
+        <name><surname>Lerner</surname><given-names>Barbara</given-names></name>
+        <name><surname>Boose</surname><given-names>Emery</given-names></name>
+        <name><surname>Perez</surname><given-names>Luis</given-names></name>
+      </person-group>
+      <article-title>Using Introspection to Collect Provenance in R</article-title>
+      <source>Informatics</source>
+      <year iso-8601-date="2018-03">2018</year><month>03</month>
+      <date-in-citation content-type="access-date"><year iso-8601-date="2022-11-03">2022</year><month>11</month><day>03</day></date-in-citation>
+      <volume>5</volume>
+      <issue>1</issue>
+      <issn>2227-9709</issn>
+      <uri>https://www.mdpi.com/2227-9709/5/1/12</uri>
+      <pub-id pub-id-type="doi">10.3390/informatics5010012</pub-id>
+      <fpage>12</fpage>
+      <lpage></lpage>
+    </element-citation>
+  </ref>
+</ref-list>
+</back>
+</article>
diff --git a/joss.04707/10.21105.joss.04707.pdf b/joss.04707/10.21105.joss.04707.pdf
new file mode 100644
index 0000000000..79a6ccbad7
Binary files /dev/null and b/joss.04707/10.21105.joss.04707.pdf differ
diff --git a/joss.04707/media/figure1-consort.pdf b/joss.04707/media/figure1-consort.pdf
new file mode 100644
index 0000000000..28ac7c6f57
Binary files /dev/null and b/joss.04707/media/figure1-consort.pdf differ