From 6c68028bbcf45a01962a623b5d165c57f4995dbb Mon Sep 17 00:00:00 2001
From: Jayoung Ryu <jayoung_ryu@g.harvard.edu>
Date: Tue, 7 May 2024 17:35:44 -0400
Subject: [PATCH] update tutorial docs

---
 docs/_create-screen.md        |  4 +--
 docs/_tutorial_cds.md         | 54 +++++++++++++++++++++--------------
 docs/_tutorial_gwas.md        | 38 +++++++++++++++---------
 docs/_tutorial_no_edit.md     | 18 ++++++------
 docs/_tutorial_prolif_gwas.md | 37 +++++++++++++++---------
 5 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/docs/_create-screen.md b/docs/_create-screen.md
index ee897b3..be470f2 100755
--- a/docs/_create-screen.md
+++ b/docs/_create-screen.md
@@ -3,7 +3,7 @@
 bean create-screen gRNA_library.csv sample_list.csv gRNA_counts_table.csv
 ```
 ## Input
-  * gRNA_library.csv ([`variant` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_guides.csv), [`tiling` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/test_guide_info_tiling_chrom.csv))
-  * sample_list.csv ([`sorting` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/sample_list_survival.csv), [`variant` screen example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_samples.csv))
+  * gRNA_library.csv (`variant` screen [example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_guides.csv), `tiling` screen [example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/test_guide_info_tiling_chrom.csv))
+  * sample_list.csv (`sorting` screen [example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/sample_list_survival.csv), `variant` screen [example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_samples.csv))
   * gRNA_counts_table.csv: Table with gRNA ID in the first column and sample IDs as the column names (first row)
 `gRNA_library.csv` and `sample_list.csv` should be formatted as :ref:`input`. ([example](https://github.com/pinellolab/crispr-bean/blob/main/tests/data/var_mini_counts.csv))
\ No newline at end of file
diff --git a/docs/_tutorial_cds.md b/docs/_tutorial_cds.md
index 3913879..046da1d 100755
--- a/docs/_tutorial_cds.md
+++ b/docs/_tutorial_cds.md
@@ -16,28 +16,32 @@ Tiling screen that tiles gRNA densely across locus or multiple loci, selected ba
 
 ## Example workflow
 ```bash
-screen_id=my_sorting_tiling_screen
-working_dir=my_workdir
+screen_id=tiling_mini_screen
+working_dir=tests/workdir
 
 # 1. Count gRNA & reporter
 bean count-samples \
---input ${working_dir}/sample_list_tiling.csv          `# Contains fastq file path; see test file for example.`\
--b A                                               `# Base A is edited (into G)` \
--f ${working_dir}/test_guide_info_tiling_chrom.csv     `# Contains gRNA metadata; see test file for example.`\
--o $working_dir                                              `# Output directory` \
--r                                                 `# Quantify reporter edits` \
--n ${screen_id}                                       `# ID of the screen` \
+--input ${working_dir}/sample_list_tiling.csv           `# Contains fastq file path; see test file for example.`\
+-b A                                                    `# Base A is edited (into G)` \
+-f ${working_dir}/test_guide_info_tiling_chrom.csv      `# Contains gRNA metadata; see test file for example.`\
+-o $working_dir                                         `# Output directory` \
+-r                                                      `# Quantify reporter edits` \
+-n ${screen_id}                                         `# ID of the screen` \
 --tiling
+# NOTE: the count-samples output from the test run above has too low a read depth for downstream analysis. The downstream steps can instead be run with the test file included in the GitHub repo.
+
+# (Optional) Profile editing patterns
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
 
 # 2. QC samples & guides
 bean qc \
-  ${working_dir}/bean_count_${screen_id}.h5ad           `# Input ReporterScreen .h5ad file path` \
-  -o ${working_dir}/bean_count_${screen_id}_masked.h5ad `# Output ReporterScreen .h5ad file path` \
+  ${working_dir}/${screen_id}.h5ad           `# Input ReporterScreen .h5ad file path` \
+  -o ${working_dir}/${screen_id}_masked.h5ad `# Output ReporterScreen .h5ad file path` \
   -r ${working_dir}/qc_report_${screen_id}              `# Prefix for QC report` \
 
 # 3. Filter & translate alleles
-bean filter ${working_dir}/bean_count_${screen_id}_masked.h5ad \
--o ${working_dir}/bean_count_${screen_id}_alleleFiltered \
+bean filter ${working_dir}/${screen_id}_masked.h5ad \
+-o ${working_dir}/${screen_id}_alleleFiltered \
 --filter-target-basechange                             `# Filter based on intended base changes. If -b A was provided in bean count, filters for A>G edit. If -b C was provided, filters for C>T edit.`\
 --filter-window --edit-start-pos 0 --edit-end-pos 19   `# Filter based on editing window in spacer position within reporter.`\
 --filter-allele-proportion 0.1 --filter-sample-proportion 0.3 `#Filter based on allele proportion larger than 0.1 in at least 0.3 (30%) of the control samples.` \
@@ -45,7 +49,7 @@ bean filter ${working_dir}/bean_count_${screen_id}_masked.h5ad \
 
 # 4. Quantify variant effect
 bean run sorting tiling \
-    ${working_dir}/bean_count_${screen_id}_alleleFiltered.h5ad \
+    ${working_dir}/${screen_id}_alleleFiltered.h5ad \
     -o $working_dir \
     --fit-negctrl \
     --scale-by-acc \
@@ -71,13 +75,21 @@ bean count-samples \
 
 Make sure you follow the [input file format](https://pinellolab.github.io/crispr-bean/input.html) for seamless downstream steps. This will produce `./bean_count_${screen_id}.h5ad`. 
 
+## (Optional) Profile editing pattern (:ref:`profile`)
+You can profile the pattern of base editing based on the allele counts. 
+```bash
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
+```
+Check the editing window, and consider passing the start and end positions of the window with the maximal editing rate to `bean qc` via the `--edit-start-pos` and `--edit-end-pos` arguments.
+
+
 ## 2. QC (:ref:`qc`)
 Base editing data will include QC about editing efficiency. As QC uses predefined column names and values, beware to follow the [input file guideline](https://pinellolab.github.io/crispr-bean/input.html), but you can change the parameters with the full argument list of [bean qc](https://pinellolab.github.io/crispr-bean/qc.html). (Common factors you may want to tweak is `--ctrl-cond=bulk` and `--lfc-conds=top,bot` if you have different sample condition labels.)
 
 ```bash
 bean qc \
-  ${working_dir}/bean_count_${screen_id}.h5ad           `# Input ReporterScreen .h5ad file path` \
-  -o ${working_dir}/bean_count_${screen_id}_masked.h5ad `# Output ReporterScreen .h5ad file path` \
+  ${working_dir}/${screen_id}.h5ad           `# Input ReporterScreen .h5ad file path` \
+  -o ${working_dir}/${screen_id}_masked.h5ad `# Output ReporterScreen .h5ad file path` \
   -r ${working_dir}/qc_report_${screen_id}              `# Prefix for QC report` \
   [--tiling]                          `# Not required if you have passed --tiling in counting step`
 ```
@@ -103,8 +115,8 @@ where `path_to_gene_names_file.txt` has one gene symbol per line, and gene symbo
 Example allele filtering given we're translating based on MANE transcript exons of multiple gene symbols:
 
 ```bash
-bean filter ${working_dir}/bean_count_${screen_id}_masked.h5ad \
--o ${working_dir}/bean_count_${screen_id}_alleleFiltered \
+bean filter ${working_dir}/${screen_id}_masked.h5ad \
+-o ${working_dir}/${screen_id}_alleleFiltered \
 --filter-target-basechange                             `# Filter based on intended base changes. If -b A was provided in bean count, filters for A>G edit. If -b C was provided, filters for C>T edit.`\
 --filter-window --edit-start-pos 0 --edit-end-pos 19   `# Filter based on editing window in spacer position within reporter.`\
 --filter-allele-proportion 0.1 --filter-sample-proportion 0.3 `#Filter based on allele proportion larger than 0.1 in at least 0.3 (30%) of the control samples.` \
@@ -122,7 +134,7 @@ By default, `bean run [sorting,survival] tiling` uses most filtered allele count
   
     ```bash
     bean run sorting tiling \
-    ${working_dir}/bean_count_${screen_id}_alleleFiltered.h5ad \
+    ${working_dir}/${screen_id}_alleleFiltered.h5ad \
     -o $working_dir \
     --fit-negctrl \
     --scale-by-acc \
@@ -133,7 +145,7 @@ By default, `bean run [sorting,survival] tiling` uses most filtered allele count
 
     ```bash
     bean run sorting tiling \
-    ${working_dir}/bean_count_${screen_id}_alleleFiltered.h5ad \
+    ${working_dir}/${screen_id}_alleleFiltered.h5ad \
     -o $working_dir \
     --fit-negctrl \
     --scale-by-acc \
@@ -144,7 +156,7 @@ By default, `bean run [sorting,survival] tiling` uses most filtered allele count
 
     ```bash
     bean run sorting tiling \
-    ${working_dir}/bean_count_${screen_id}_alleleFiltered.h5ad \
+    ${working_dir}/${screen_id}_alleleFiltered.h5ad \
     -o $working_dir \
     --fit-negctrl 
     ```
@@ -154,7 +166,7 @@ By default, `bean run [sorting,survival] tiling` uses most filtered allele count
 
     ```bash
     bean run sorting tiling \
-    ${working_dir}/bean_count_${screen_id}_alleleFiltered.h5ad \
+    ${working_dir}/${screen_id}_alleleFiltered.h5ad \
     -o $working_dir \
     --fit-negctrl \
     --uniform-edit
diff --git a/docs/_tutorial_gwas.md b/docs/_tutorial_gwas.md
index c29d52f..d933382 100755
--- a/docs/_tutorial_gwas.md
+++ b/docs/_tutorial_gwas.md
@@ -16,32 +16,36 @@ GWAS variant screen with per-variant gRNA tiling design, selected based on FACS
 
 ## Example workflow
 ```bash
-screen_id=my_sorting_tiling_screen
-working_dir=my_workdir
+screen_id=var_mini_screen
+working_dir=tests/data/
 
 # 1. Count gRNA & reporter
 bean count-samples \
---input ${working_dir}//sample_list.csv    `# Contains fastq file path; see test file for example.`\
+--input ${working_dir}/sample_list.csv    `# Contains fastq file path; see test file for example.`\
 -b A                                  `# Base A is edited (into G)` \
 -f ${working_dir}/test_guide_info.csv     `# Contains gRNA metadata; see test file for example.`\
 -o ./                                 `# Output directory` \
 -r                                    `# Quantify reporter edits` \
 -n ${screen_id}                          `# ID of the screen to be counted`   
+# NOTE: the count-samples output from the test run above has too low a read depth for downstream analysis. The downstream steps can instead be run with the test file included in the GitHub repo.
+
+# (Optional) Profile editing patterns
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
 
 # 2. QC samples & guides
 bean qc \
   ${working_dir}/bean_count_${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
   -o ${working_dir}/bean_count_${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
   -r ${working_dir}/qc_report_${screen_id}                `# Prefix for QC report` \
-  -b                                       ` # Remove replicates with no good samples.
+  -b                                                      `# Remove replicates with no good samples.`
 
 # 3. Quantify variant effect
 bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl \
     --scale-by-acc \
-    --accessibility-col accessibility
+    --acc-bw-path tests/data/accessibility_signal_chr6.bw
 ```
 
 See more details below.
@@ -62,18 +66,26 @@ bean count-samples \
 
 Make sure you follow the [input file format](https://pinellolab.github.io/crispr-bean/input.html) for seamless downstream steps. This will produce `./bean_count_${screen_id}.h5ad`. 
 
+
+## (Optional) Profile editing pattern (:ref:`profile`)
+You can profile the pattern of base editing based on the allele counts. 
+
+```bash
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
+```
+
+
 ## 2. QC samples & guides (:ref:`qc`)
 Base editing data will include QC about editing efficiency. As QC uses predefined column names and values, beware to follow the [input file guideline](https://pinellolab.github.io/crispr-bean/input.html), but you can change the parameters with the full argument list of [bean qc](https://pinellolab.github.io/crispr-bean/qc.html). (Common factors you may want to tweak is `--ctrl-cond=bulk` and `--lfc-conds=top,bot` if you have different sample condition labels.)
 
 ```bash
 bean qc \
-  bean_count_${screen_id}.h5ad    `# Input ReporterScreen .h5ad file path` \
-  -o bean_count_${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
+  ${screen_id}.h5ad    `# Input ReporterScreen .h5ad file path` \
+  -o ${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
   -r qc_report_${screen_id}   `# Prefix for QC report` 
 ```
 
 
-
 If the data does not include reporter editing data, you can provide `--no-editing` flag to omit the editing rate QC.
 
 
@@ -84,7 +96,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
   If your gRNA metadata table (`${working_dir}/test_guide_info.csv` above) included per-gRNA accessibility score, 
     ```bash
     bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl \
     --scale-by-acc \
@@ -95,7 +107,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
     
     ```bash
     bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl \
     --scale-by-acc \
@@ -108,7 +120,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
 
     ```bash
     bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl 
     ```
@@ -118,7 +130,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
 
     ```bash
     bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl \
     --uniform-edit
diff --git a/docs/_tutorial_no_edit.md b/docs/_tutorial_no_edit.md
index cb499ae..da71b3c 100644
--- a/docs/_tutorial_no_edit.md
+++ b/docs/_tutorial_no_edit.md
@@ -17,25 +17,25 @@ Here, we consider an example where BEAN uses the **external count** without repo
 
 ## Example workflow
 ```bash
-screen_id=my_sorting_screen
-working_dir=my_workdir
+screen_id=var_mini_screen_noedit
+working_dir=tests/data/
 
 # 1. Given that we have gRNA count for each sample, generate ReporterScreen (.h5ad) object for downstream analysis.
-bean create-screen ${working_dir}/gRNA_info.csv ${working_dir}/sample_list.csv ${working_dir}/gRNA_counts.csv -o ${working_dir}/bean_count_${screen_id}
+bean create-screen ${working_dir}/gRNA_info.csv ${working_dir}/sample_list.csv ${working_dir}/var_mini_counts.csv -o ${working_dir}/${screen_id}
 
 # 2. QC samples & guides
 bean qc \
-  ${working_dir}/bean_count_${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
-  -o ${working_dir}/bean_count_${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
-  -r ${working_dir}/qc_report_${screen_id}                `# Prefix for QC report` \
-  -b                                       ` # Remove replicates with no good samples.
+  ${working_dir}/${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
+  -o ${working_dir}/${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
+  -r ${working_dir}/qc_report_${screen_id}     `# Prefix for QC report` \
+  -b                                           `# Remove replicates with no good samples.`
 
 # 3. Quantify variant effect
 bean run sorting variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --uniform-edit --ignore-bcmatch            `# As we have no edit/reporter information.` \
-    [--fit-negctrl [--negctrl-col target_group --negctrl-col-value NegCtrl]]    `# If you have the negative control gRNAs.`
+    [--fit-negctrl [--negctrl-col target_group --negctrl-col-value NegCtrl]]                                      `# If you have the negative control gRNAs.`
 ```
 
 ## Input file spec
diff --git a/docs/_tutorial_prolif_gwas.md b/docs/_tutorial_prolif_gwas.md
index fd3ef0f..032d8db 100644
--- a/docs/_tutorial_prolif_gwas.md
+++ b/docs/_tutorial_prolif_gwas.md
@@ -31,29 +31,33 @@ Note that `time` column should be numeric, and `condition` and `time` should mat
 
 ## Example workflow
 ```bash
-screen_id=my_sorting_tiling_screen
-working_dir=my_workdir
+screen_id=survival_var_mini_screen
+working_dir=tests/data
 
 # 1. Count gRNA & reporter
 bean count-samples \
---input ${working_dir}/sample_list.csv    `# Contains fastq file path; see test file for example.`\
+--input ${working_dir}/sample_list_survival.csv    `# Contains fastq file path; see test file for example.`\
 -b A                                  `# Base A is edited (into G)` \
 -f ${working_dir}/test_guide_info.csv     `# Contains gRNA metadata; see test file for example.`\
 -o ${working_dir}                                 `# Output directory` \
 -r                                    `# Quantify reporter edits` \
 -n ${screen_id}                          `# ID of the screen to be counted`   
+# NOTE: the count-samples output from the test run above has too low a read depth for downstream analysis. The downstream steps can instead be run with the test file included in the GitHub repo.
+
+# (Optional) Profile editing patterns
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
 
 # 2. QC samples & guides
 bean qc \
-  ${working_dir}/bean_count_${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
-  -o ${working_dir}/bean_count_${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
+  ${working_dir}/${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
+  -o ${working_dir}/${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
   -r ${working_dir}/qc_report_${screen_id}                `# Prefix for QC report` \
   --lfc-conds D0,D14                `# Conditions to calculate LFC of positive controls` \
   -b                                       ` # Remove replicates with no good samples.
 
 # 3. Quantify variant effect
 bean run survival variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     -o ${working_dir}/ \
     --fit-negctrl \
     --scale-by-acc \
@@ -64,7 +68,7 @@ See more details below.
 ## 1. Count gRNA & reporter (:ref:`count_samples`)
 ```bash
 bean count-samples \
---input ${working_dir}/sample_list.csv    `# Contains fastq file path; see test file for example.`\
+--input ${working_dir}/sample_list_survival.csv    `# Contains fastq file path; see test file for example.`\
 -b A                                  `# Base A is edited (into G)` \
 -f ${working_dir}/test_guide_info.csv     `# Contains gRNA metadata; see test file for example.`\
 -o ${working_dir}                                 `# Output directory` \
@@ -73,12 +77,19 @@ bean count-samples \
 ```
 Make sure you follow the [input file format](https://pinellolab.github.io/crispr-bean/input.html) for seamless downstream steps. This will produce `./bean_count_${screen_id}.h5ad`. 
 
+## (Optional) Profile editing pattern (:ref:`profile`)
+You can profile the pattern of base editing based on the allele counts. 
+
+```bash
+bean profile tests/data/${screen_id}.h5ad --pam-col '5-nt PAM'
+```
+
 ## 2. QC samples & guides (:ref:`qc`)
 Base editing data will include QC about editing efficiency. As QC uses predefined column names and values, beware to follow the [input file guideline](https://pinellolab.github.io/crispr-bean/input.html), but you can change the parameters with the full argument list of [bean qc](https://pinellolab.github.io/crispr-bean/qc.html). (Common factors you may want to tweak is `--ctrl-cond=bulk` and `--lfc-conds=top,bot` if you have different sample condition labels.)
 ```bash
 bean qc \
-  ${working_dir}/bean_count_${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
-  -o ${working_dir}/bean_count_${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
+  ${working_dir}/${screen_id}.h5ad             `# Input ReporterScreen .h5ad file path` \
+  -o ${working_dir}/${screen_id}_masked.h5ad   `# Output ReporterScreen .h5ad file path` \
   -r ${working_dir}/qc_report_${screen_id}                `# Prefix for QC report` \
   --lfc-conds D0,D14                `# Conditions to calculate LFC of positive controls` \
   -b                                       ` # Remove replicates with no good samples.
@@ -97,7 +108,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
   If your gRNA metadata table (`${working_dir}/test_guide_info.csv` above) included per-gRNA accessibility score, 
     ```bash
     bean run survival variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     --control-condition D7 \    # This allows taking editing pattern from D7 (time=7) to infer unbiased editing pattern in time=0.
     -o $working_dir \
     --fit-negctrl \
@@ -109,7 +120,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
 
     ```bash
     bean run survival variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     --control-condition D7 \
     -o $working_dir \
     --fit-negctrl \
@@ -123,7 +134,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
 
     ```bash
     bean run survival variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     --control-condition D7 \
     -o $working_dir \
     --fit-negctrl 
@@ -134,7 +145,7 @@ If the data does not include reporter editing data, you can provide `--no-editin
   
     ```bash
     bean run survival variant \
-    ${working_dir}/bean_count_${screen_id}_masked.h5ad \
+    ${working_dir}/${screen_id}_masked.h5ad \
     --control-condition D7 \
     -o $working_dir \
     --fit-negctrl \