From 768b1f9682bf8f39f5a463fe8bba99ed086e3efb Mon Sep 17 00:00:00 2001 From: Feshuk Date: Wed, 4 Sep 2024 11:13:35 -0400 Subject: [PATCH] additional edits --- vignettes/Introduction_Appendices.Rmd | 54 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/vignettes/Introduction_Appendices.Rmd b/vignettes/Introduction_Appendices.Rmd index bc00110..55b26cd 100644 --- a/vignettes/Introduction_Appendices.Rmd +++ b/vignettes/Introduction_Appendices.Rmd @@ -61,7 +61,7 @@ The tcpl package is a flexible analysis pipeline is -The original tcplFit() functions performed basic concentration response curve fitting. Processing with tcpl v3 and beyond depends on the stand-alone tcplFit2 package to allow a wider variety of concentration-response models when using invitrodb in the 4.0 schema and beyond. Using tcpl_v3 with the schema from invitrodb versions 2.0-3.5 will still default to tcplFit() modeling with constant, Hill, and gain-loss. The main improvement provided by updating to using tcplFit2 is inclusion of concentration-response models like those contained in the program [BMDExpress2](https://github.com/auerbachs/BMDExpress-2). These models include polynomial, exponential, and power functions in addition to the original Hill, gain-loss, and constant models. Similar to the program BMDExpress, tcplFit2 curve-fitting uses a defined Benchmark Response (BMR) level to estimate a benchmark dose (BMD), which is the concentration where the curve-fit intersects with this BMR threshold. One final addition was to let the hit call value be a continuous number ranging from 0 to 1 (in contrast to binary hit call values from tcplFit() ). While developed primarily for ToxCast, the tcpl package is written to be generally applicable to the chemical-screening community. +The original tcplFit() functions performed basic concentration response curve fitting. 
Processing with tcpl v3 and beyond depends on the stand-alone tcplFit2 package to allow a wider variety of concentration-response models when using invitrodb in the 4.0 schema and beyond. Using tcpl_v3 with the schema from invitrodb versions 2.0-3.5 will still default to tcplFit() modeling with constant, Hill, and gain-loss. The main improvement provided by updating to using tcplFit2 is inclusion of concentration-response models like those contained in the program [BMDExpress2](https://github.com/auerbachs/BMDExpress-2). These models include polynomial, exponential, and power functions in addition to the original Hill, gain-loss, and constant models. Similar to the program [BMDExpress](https://www.sciome.com/bmdexpress/), tcplFit2 curve-fitting uses a defined Benchmark Response (BMR) level to estimate a benchmark dose (BMD), which is the concentration where the curve-fit intersects with this BMR threshold. One final addition was to let the hit call value be a continuous number ranging from 0 to 1 (in contrast to binary hit call values from tcplFit() ). While developed primarily for ToxCast, the tcpl package is written to be generally applicable to the chemical-screening community. The tcpl package includes processing functionality for two screening paradigms: (1) single-concentration (SC) and (2) multiple-concentration (MC) screening. SC screening consists of testing chemicals at one to three concentrations, often for the purpose of identifying potentially active chemicals to test in the multiple-concentration format. MC screening consists of testing chemicals across a concentration range, such that the modeled activity can give an estimate of potency, efficacy, etc. @@ -242,6 +242,8 @@ kable(output)%>% kable_styling("striped") ``` +See the [Data Interpretation>Representative Samples section](#chid) for more details. 
+ ## MC Data-containing Tables ## - Level 1 @@ -420,6 +422,8 @@ kable(output)%>% kable_styling("striped") ``` +See the [Data Interpretation>Representative Samples section](#chid) for more details. + ## - Level 6 ```{r warning = FALSE, echo = FALSE} Field <- c("m6id", "m5id", "m4id", "aeid", "mc6_mthd_id", "flag") @@ -466,7 +470,7 @@ See the [Data Interpretation>Adminstered Equivalent Doses](#aed) section for mor The fields pertinent to the tcpl package are listed in the tables below. More specifics on assay and auxiliary annotations will be provided in later sections. ```{r warning = FALSE, echo = FALSE} -Field <- c("assay_source", "assay", "assay_component", "assay_component_endpoint", "assay_component_map", "assay_descriptions**", "assay_reagent**", "assay_reference**", "chemical", "chemical_analytical_qc**", "chemical_lists", "citations**", "gene**", "intended_target**", "organism**", "sample") +Table <- c("assay_source", "assay", "assay_component", "assay_component_endpoint", "assay_component_map", "assay_descriptions**", "assay_reagent**", "assay_reference**", "chemical", "chemical_analytical_qc**", "chemical_lists", "citations**", "gene**", "intended_target**", "organism**", "sample") Description <- c("Assay source-level annotation", "Assay-level annotation", "Assay component-level annotation", @@ -484,13 +488,13 @@ Description <- c("Assay source-level annotation", "Assay-level annotation", "Organism identifiers and descriptions", "Sample identifiers and chemical provenance information") -output <- data.frame(Field, Description) +output <- data.frame(Table, Description) kable(output)%>% kable_styling("striped") ``` -** indicates tables not currently used by the tcpl package +** indicates tables that may have limited tcpl functionality; their data is still retrievable via tcplQuery. 
## - Assay Source {#asid} ```{r warning = FALSE, echo = FALSE} @@ -728,6 +732,7 @@ tcplRegister(what = "asid", flds = list(asid = 1, asnm = "Tox21")) The **tcplRegister** function takes the abbreviation for $\mathit{assay\_source\_name}$, but the function will also take the unabbreviated form. The same is true of the **tcplLoadA-** functions, which load the information for the assay annotations stored in the database. ## Assay + [Assay](#aid) refers to the procedure, conducted by some vendor, to generate the component data. **To register an assay, an $\mathit{asid}$ must be provided to map the assay to the correct assay source.** One source may have many assays. To ensure consistency of the naming convention, first check how other registered assays within the assay source were conducted and named. The assay names follow an abbreviated and flexible naming convention of *Source_Assay*. Notable assay design features to describe the assay include: * Technology (i.e., detection technology), @@ -743,6 +748,7 @@ tcplRegister(what = "aid", flds = list(asid = 1, anm = "TOX21_ERa_BLA_Agonist", When registering an assay ($\mathit{aid}$), the user must give an $\mathit{asid}$ to map the assay to the correct assay source. Registering an assay, in addition to an assay\_name ($\mathit{anm}$) and $\mathit{asid}$, requires $\mathit{assay\_footprint}$. The $\mathit{assay\_footprint}$ field is used in the assay plate visualization functions (discussed later) to define the appropriate plate size. The $\mathit{assay\_footprint}$ field can take most string values, but only the numeric value will be extracted, e.g. the text string "hello 384" would indicate to draw a 384-well microtitier plate. Values containing multiple numeric values in $\mathit{assay\_footprint}$ may cause errors in plotting plate diagrams. ## Assay Component + [Assay component](#acid), or “component” for short, describes the raw data readouts. Like the previous level, one assay may have many components. 
**To register an assay component and create an $\mathit{acid}$, an $\mathit{aid}$ must be provided to map the component to the correct assay.** The assay component name will build on its respective assay name, to describe the specific feature being measured in each component. If there is only one component, the component name can be the same as the assay name. If there are multiple components measured in an assay, understanding the differences, and how one component may relate to another within an assay, are important naming considerations to prevent confusion. Assay component names will usually follow the naming convention of *Source_Assay_Component*, where “Component” is a brief description of what is being measured. ```{r eval = FALSE, message = FALSE} tcplLoadAcid(what = "asid", val = 1, add.fld = c("aid", "anm")) @@ -754,6 +760,7 @@ tcplRegister(what = "acsn", flds = list(acid = 1, acsn = "TCPL-MC-Demo")) ``` ## Assay Component Endpoint + [Assay component endpoint](#aeid), or “endpoint” for short, represents the normalized component data. **To register an endpoint and create an $\mathit{aeid}$, an $\mathit{acid}$ must be provided to map the endpoint to the correct component.** In past tcpl versions, each component could have up to two endpoints therefore endpoint names would express directionality (*_up/_down*). tcpl v3+ allows bidirectional fitting to capture both the gain and loss of signal. Therefore with tcpl v3+ , the endpoint name will usually be the same as the component name. ```{r eval = FALSE, message = FALSE} tcplLoadAeid(fld = "asid", val = 1, add.fld = c("aid", "anm", "acid", "acnm")) @@ -989,27 +996,29 @@ $$ resp.fc = \frac{cval}{bval} $$ **Order matters when assigning normalization methods.** The $\mathit{bval}$, and $\mathit{pval}$ if normalizing as a percent of control, need to be calculated prior to calculating the response value. 
Examples of normalization schemes are presented below: ```{r warning = FALSE, echo = FALSE} -output <- - matrix(c("1. bval.apid.nwlls.med", "2. resp.fc", "1. bval.apid.lowconc.med", "2. bval.apid.pwlls.med", -"3. resp.log2", "4. resp.mult.neg1", "3. resp.pc", "4. resp.multneg1 ", -"1. bval.apid.lowconc.med", "2. resp.fc", "1. bval.spid.lowconc.med", "2. pval.apid.mwlls.med", -"3. resp.log2", "4. \t", "3. resp.pc", "4. \t" , -"1. none", "2. resp.log10", "1. none", "2. resp.multneg1", -"3. resp.blineshift.50.spid", "4. \t", "3. \t", "4. \t"), - ncol=4, byrow = TRUE) +Normalization <- c('', 'Fold Change', '%Control') +Scheme_1 <- c('Scheme 1', '1. bval.apid.nwlls.med
2. resp.fc
3. resp.log2
4. resp.multneg1', + '1. bval.apid.lowconc.med
2. bval.apid.pwlls.med
3. resp.pc
4. resp.multneg1') +Scheme_2 <- c('Scheme 2', '1. bval.apid.lowconc.med
2. resp.fc
3. resp.log2', + '1. bval.spid.lowconc.med
2. pval.apid.mwlls.med
3. resp.pc') +Scheme_3 <- c('Scheme 3', '1. none
2. resp.log10
3. resp.blineshift.50.spid', + '1. none
2. resp.multneg1') +output <- t(data.frame(Normalization, Scheme_1, Scheme_2, Scheme_3)) + +# Export/print the table to an html rendered table. htmlTable(output, - rnames = FALSE, - rgroup = c("Scheme 1", - "Scheme 2", "Scheme 3"), - n.rgroup = c(2,2), - cgroup = c("Fold-Change", "\\%Control"), - n.cgroup = c(2,2)) + align = 'l', + align.header = 'l', + rnames = FALSE , + css.cell = ' padding-bottom: 5px; vertical-align:top; padding-right: 10px;min-width: 5em ', + caption = "Examples of Normalization Schemes" + ) ``` If the data does not require any normalization, the "none" method will be assigned. The "none" method simply copies the input data to the response field. Without assigning "none", the response field will not get generated and processing will fail. -With tcpl v2 , responses were only fit in the positive analysis direction. Therefore, a signal in the negative direction needed to be "flipped" to the positive direction during normalization. Multiple endpoints of one component were created to enable multiple normalization approaches when the assay measured gain and loss of signal. Negative direction data was inverted by multiplying the final response values by ${-1}$ via the "resp.mult.neg" methods. For tcpl v3 onward, the tcplFit2 package is utilized which allows for bidirectional fitting, meaning the "resp.mult.neg" method is now only required in special cases. +With tcpl v2 , responses were only fit in the positive analysis direction. Therefore, a signal in the negative direction needed to be "flipped" to the positive direction during normalization. Multiple endpoints stemming from one component were created to enable multiple normalization approaches when the assay measured gain and loss of signal. Negative direction data was inverted by multiplying the final response values by ${-1}$ via the "resp.multneg1" methods. 
For tcpl v3 onward, the tcplFit2 package is utilized which allows for bidirectional fitting, meaning the "resp.multneg1" method is now only required in special cases. In addition to the required normalization methods, the user can apply additional methods to transform the normalized values. For example, "resp.blineshift.50.spid" corrects for baseline deviations by $\mathit{spid}$. A complete list of available methods, by processing type and level, can be accessed with tcplMthdList. More information is also available in package documentation, `??tcpl::Methods`. @@ -1564,7 +1573,7 @@ htmlTable(output, ``` -Most models in tcplfit2 assume the background response is zero and the absolute response (or initial response) is increasing. In other words, these models fit a monotonic curve in either direction. The polynomial 2 (poly2) model is an exception with two parameterization options. The biphasic parameterization is what is used in tcpl . A biphasic poly2 model fits responses that are increasing first and then decreasing, and vice versa (assuming the background response is zero). If biphasic responses are not reasonable, data can be fit using the monotonic-only parameterization in a standalone application of tcplfit2_core with the parameter biphasic=FALSE assigned. +Most models in tcplfit2 assume the background response is zero and the absolute response (or initial response) is increasing. In other words, these models fit a monotonic curve in either direction. The polynomial 2 (poly2) model is an exception with two parameterization options. The biphasic parameterization is what is used in tcpl . A biphasic poly2 model fits responses that are increasing first and then decreasing, and vice versa (assuming the background response is zero). *If biphasic responses are not reasonable, data can be fit using the monotonic-only parameterization in a standalone application of tcplfit2_core with the parameter biphasic=FALSE assigned. 
This argument is not available in tcpl.* All data is fit bidirectionally then responses in unintended direction may be indicated with negative hit calls if ["overwrite" MC5 methods](#mc5) are applied. Upon completion of model fitting, each model gets a success designation: 1 if the model optimization converges, 0 if the optimization fails, and NA if 'nofit' was set to TRUE within tcplFit2::tcplfit2_core function. Similarly, if the Hessian matrix was successfully inverted then 1 indicates a successful covariance calculation (cov); otherwise 0 is returned. Finally, in cases where 'nofit' was set to TRUE (within tcplFit2::tcplfit2_core ) or the model fit failed the Akaike information criterion (aic), root mean squared error (rme), model estimated responses (modl), model parameters (parameters), and the standard deviation of model parameters (parameter sds) are set to NA. A complete list of model output parameters is provided below: @@ -3024,13 +3033,14 @@ aeid <- tcplLoadAeid(fld = "acid", val = 400) print(aeid) ``` -Users may subset on as many fields as desired. tcplLoadAeid joins the criteria with multiple `fld` and `val` as an “AND” rather than “OR”, meaning the subset returns rows where all are TRUE. `val` has the same length that `fld`. To combine fields of different types (i.e. numeric and string), or of different element lengths (list(“protein”, c(“Colorimetric”, “Fluorescence”))), ensure all values are provided in appropriate length lists. +Users may subset on as many fields as desired. tcplLoadAeid joins the criteria with multiple `fld` and `val` as an “AND” rather than “OR”, meaning the subset returns rows where all are TRUE. `val` must have the same length as `fld`. To combine fields of different types (e.g., numeric and string), or of different element lengths, ensure all values are provided in appropriate length lists. ```{r load_aeid_plus} # subset all aeids by using multiple fields -- val must be same length in list form! 
aeids <- tcplLoadAeid(fld = c("intended_target_type", "detection_technology_type"), val = list("protein", c("Colorimetric", "Fluorescence"))) # list length == 2! ``` +The above example subsets to endpoints where intended_target_type is "protein" and detection_technology_type is "Colorimetric" or "Fluorescence". ### Load acid @@ -3169,7 +3179,7 @@ tcplPlot(fld = "aeid", val = 704, output = "pdf", verbose = TRUE, This section will explore how one can compare in vivo Points of Departure (PODs) from the [Toxicity Reference Database (ToxRefDB)](https://www.epa.gov/comptox-tools/downloadable-computational-toxicology-data#AT) with administered equivalent doses (AEDs) from ToxCast *in vitro* bioactivity data from invitrodb. The process can be adapted for any given chemical and target depending on available data in either database. -The following example will consider "Pentachlorophenol" and "liver toxicity" +The following example will consider "[Pentachlorophenol (PCP, DTXSID7021106)](https://comptox.epa.gov/dashboard/chemical/details/DTXSID7021106)" and "liver toxicity". This pesticide was selected at random to showcase the workflow, but the process can be adapted for any given chemical and target depending on available data in either database. ### Consider ToxRefDB *in vivo* toxicity benchmarks as POD-Traditional