From 0475d2df6684eb7f497ccff1d4caa80c18f69cb5 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 2 Dec 2022 18:18:52 -0300
Subject: [PATCH 01/34] add infra to init-functions, draft subset step, improve
 data handling

---
 .../R/gem2s-6-prepare_experiment.R            |   5 +-
 pipeline-runner/R/gem2s-X-subset_experiment.R |  55 ++++++++++++++++++
 pipeline-runner/R/handle_data.R               |  12 ++--
 pipeline-runner/R/init-functions.R            |  38 +++++++++++-
 pipeline-runner/R/sysdata.rda                 | Bin 3116 -> 3123 bytes
 pipeline-runner/data-raw/sysdata.R            |   7 ++-
 6 files changed, 106 insertions(+), 11 deletions(-)
 create mode 100644 pipeline-runner/R/gem2s-X-subset_experiment.R

diff --git a/pipeline-runner/R/gem2s-6-prepare_experiment.R b/pipeline-runner/R/gem2s-6-prepare_experiment.R
index 5688a94d..8c636c6b 100644
--- a/pipeline-runner/R/gem2s-6-prepare_experiment.R
+++ b/pipeline-runner/R/gem2s-6-prepare_experiment.R
@@ -1,8 +1,7 @@
 #' Prepare experiment for upload to AWS
 #'
-#'  1) Merges the samples for the current experiment
-#'  2) Adds metadata: cellsId, color_pool, and gene annotation
-#'  3) Preparing QC configuration
+#'  1) Adds metadata: cellsId, color_pool, and gene annotation
+#'  2) Prepares QC configuration
 #'
 #' @inheritParams download_user_files
 #' @param prev_out  'output' slot from call to \code{create_seurat}
diff --git a/pipeline-runner/R/gem2s-X-subset_experiment.R b/pipeline-runner/R/gem2s-X-subset_experiment.R
new file mode 100644
index 00000000..95b82c74
--- /dev/null
+++ b/pipeline-runner/R/gem2s-X-subset_experiment.R
@@ -0,0 +1,55 @@
+
+download_cellsets_file <- function(parent_experiment_id) {
+  # download parent experiment cellsets file from S3
+}
+
+parse_cellsets <- function(cellset_path, cellset_type) {
+  cellsets <- jsonlite::fromJSON(cellset_path, flatten = T)
+
+  cellsets$cellSets |>
+    filter(key == cellset_type) %>%
+    .$children |>
+    as.data.frame() |>
+    as_tibble() |>
+    select(key, name, cellIds)
+
+}
+
+create_subset_experiment <- function(input, pipeline_config) {
+
+  parent_experiment_id <- input$parentExperimentId
+  subset_experiment_id <- input$subsetExperimentId
+  cellset_keys <- input$cellSetKeys
+
+  # load parent processed scdata and cellsets
+  s3 <- paws::s3(config = pipeline_config$aws_config)
+  parent_scdata <- load_processed_scdata(s3, pipeline_config, parent_experiment_id)
+  parent_cellsets <- load_cellsets(s3, pipeline_config, parent_experiment_id)
+
+  cell_ids_to_keep <- get_cell_sets(parent_cellsets, cellset_keys)
+
+  sample_id_mapping <- input$sampleIdMapping
+
+  # subset seurat object
+  scdata <- subset_ids(scdata, cell_ids_to_keep)
+
+  # add subset experiment name to the subset seurat object
+  scdata$project <- input$name
+
+  # add new sample_ids, keep originals in a new variable
+  scdata$parent_samples <- scdata$samples
+  scdata$samples <- sample_id_mapping[match(parent_samples, sample_id_mapping)]
+
+  # split by sample
+  scdata_list <- Seurat::SplitObject(scdata, split.by = "samples")
+
+  prev_out$scdata_list <- scdata_list
+  prev_out$annot <- scdata@misc
+  res <- list(
+    data = list(),
+    output = prev_out
+  )
+
+  message("\nSubsetting of Seurat object step complete.")
+  return(res)
+}
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 44018335..8082dd6e 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -25,7 +25,7 @@ upload_cells_id <- function(pipeline_config, object_key, cells_id) {
   return(object_key)
 }
 
-reload_scdata_from_s3 <- function (s3, pipeline_config, experiment_id) {
+load_processed_scdata <- function (s3, pipeline_config, experiment_id) {
   bucket <- pipeline_config$processed_bucket
   message(bucket)
   message(paste(experiment_id, "r.rds", sep = "/"))
@@ -45,10 +45,10 @@ get_nnzero <- function (x) {
 }
 
 order_by_size <- function(scdata_list) {
-    return(scdata_list <- scdata_list[ order( sapply(scdata_list, get_nnzero)) ])
+    return(scdata_list[order(sapply(scdata_list, get_nnzero))])
 }
 
-reload_scdata_list_from_s3 <- function (s3, pipeline_config, experiment_id) {
+load_source_scdata_list <- function (s3, pipeline_config, experiment_id) {
   bucket <- pipeline_config$source_bucket
   objects <- s3$list_objects(
     Bucket = bucket,
@@ -83,11 +83,11 @@ reload_data_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks
 
   # If the task is after data integration, we need to get scdata from processed_matrix
   if (match(task_name, task_names) > integration_index) {
-    return(reload_scdata_from_s3(s3, pipeline_config, experiment_id))
+    return(load_processed_scdata(s3, pipeline_config, experiment_id))
   }
 
   # Otherwise, return scdata_list
-  return(reload_scdata_list_from_s3(s3, pipeline_config, experiment_id))
+  return(load_source_scdata_list(s3, pipeline_config, experiment_id))
 
 }
 
@@ -200,7 +200,7 @@ send_gem2s_update_to_api <- function(pipeline_config, experiment_id, task_name,
   sns <- paws::sns(config = pipeline_config$aws_config)
   # TODO -REMOVE DUPLICATE AUTHJWT IN RESPONSE
   msg <- c(
-    data, 
+    data,
     taskName = list(task_name),
     experimentId = list(experiment_id),
     authJWT = list(input$auth_JWT),
diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 25df328b..340595d6 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -249,6 +249,40 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 }
 
 
+#' Call subset gem2s
+#'
+#' Runs step `task_name` of the subset GEM2S pipeline, sends output message to the API
+#'
+#' @param task_name character name of the step
+#' @param input list containing
+#'   - parentExperimentId
+#'   - childExperimentId
+#'   - sample IDs, and names
+#' @param pipeline_config list as defined by load_config
+#'
+#' @return character message id
+#'
+call_subset_gem2s <- function(task_name, input, pipeline_config) {
+  experiment_id <- input$experimentId
+  # remove when it's added to the input
+  input$subset_experiment <- TRUE
+  # no prev_out visible yet (presumably the first step of a run): seed it as NULL
+  if (!exists("prev_out")) {
+    remove_cell_ids(pipeline_config, experiment_id)
+    assign("prev_out", NULL, pos = ".GlobalEnv")
+  }
+  # validate the input, then resolve the task function names to the functions
+  check_input(input)
+  tasks <- lapply(SUBSET_GEM2S_TASK_LIST, get)
+  # run the requested step; store its output globally as the next prev_out
+  c(data, task_out) %<-% run_gem2s_step(prev_out, input, pipeline_config, tasks, task_name)
+  assign("prev_out", task_out, pos = ".GlobalEnv")
+  # send the step data to the API; the message id it returns is our result
+  message_id <- send_gem2s_update_to_api(pipeline_config, experiment_id, task_name, data, input)
+
+  return(message_id)
+}
+
 #' Call QC pipeline
 #'
 #' Runs step `task_name` of the data processing pipeline, sends plot data to s3
@@ -258,7 +292,7 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 #' @param input list containing:
 #'   - step parameters for all samples
 #'   - current sample UUID
-#'   - uploadCountMatrix (wether or not to upload matrix after step)
+#'   - uploadCountMatrix (whether or not to upload matrix after step)
 #' @param pipeline_config list as defined by load_config
 #'
 #' @return character message id
@@ -450,6 +484,8 @@ wrapper <- function(input, pipeline_config) {
     message_id <- call_qc(task_name, input, pipeline_config)
   } else if (process_name == "gem2s") {
     message_id <- call_gem2s(task_name, input, pipeline_config)
+  } else if (process_name == "subsetGem2s") {
+    message_id <- call_subset_gem2s(task_name, input, pipeline_config)
   } else {
     stop("Process name not recognized.")
   }
diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
index 7ddc51f6298c99a3304cba1eff0aa729041a1bd8..58e71f4714dfaaad1c80ce19cd060f1c220a1bd5 100644
GIT binary patch
delta 3122
zcmV-249)Yb7_%4-LRx4!F+o`-Q&}>=HW85yB7gYCYklbYUwmBjeF_`u8cloMPrGdP
z?(r=(v;-2SiK3oODu0yP0X(XIrh=Z*1w1L?q{=-{Ow}<xM%hGqfB*mh0kr@$B58z7
zXcH1LH7AKZPgB(LdZWr{(?_TPGynhq00002Dt-WwPfaQMh-i9-fB-ZA0000000000
z0DlcnAq3G(N_tJ{Pg74M%xI>MQ1u3Y15E~i0000013&;ssMS11gq~5S>Q7K;0004@
zpaz3Ss0X7+&}h@j84N=p03?V_MofXJlL<lT>YtP*(jHLLLqGr;03M(K003w-{V4T1
zoqUd$_`9cOn!y+_ZLy4FZLk|{4Wu^8jDKZhh?eChL?WmKfP^0b5D;M<(m=5cBoB<4
zG9axX2q4@U9sJxyV*!+4$NJ_kkrRGCcl~K0WEe>c(e}9gc5L<NYxY71_ttd{G3!U`
zM7i`#`ZeS~KhmfSq?-?dJ4y-!QK}%JN>e*p!wf<i;%o?PiRB5jZHwGDPb}Uy&VK`h
z%5fdDjO0AS%{fmv<~xTu&}KaIx_6HG=p80w(LE!;Yn?`OLE$_{rf5Av*^XK<%!1))
zxMZF)$ahSm%6X4DY_@s~v(kEu!!Y%iOUJPGp5rviGmm-jGRxl*;gg@Cqw@LvGw$8e
z&b^zb^|AJ_ex&Q$&8IAO?LP#BAAfXcT>_aTOO?Sm3>&b$x;??KLTp$QGIwavqMb?R
zLPAQb1d?n42+|^$#y7Yp&@?&e&()#cg|QhHb2=hVUD6T&@9E0cw{5?BTHQYL@%C$6
zYcD?8nVUb3bLO^$swLo&Q<q6`9D2F6TNYu^j*a9~dWkbs<|u4Vmt>9I-G8?mq2Q}c
zw4>p2CcQ@99Gy;BE-JD2so2}rZM3JQmg1IzGiB2o$w|e$WVz``*b%SD(7-$sy{a=A
z&$|tX*+>q=j~Q2gn;am3Nc_m;^g@?RxsF=rBoHW$B&DmY>NiIdZuW_|Td(D9H0<+)
zO)Wb#=Fm)#gvjM7GAyFvMt`Vf4km^WYLP1{21f90X}XzrX0+nesJO>j08$(MG{l(^
zJL6k63=@+E7EbfZaS^RG6sn*h^@XWAy@8>vRa3NC1t91PBEU{S%&h>TOk~i|uFi5o
zrcVN<@~qftdS@e5YWujjL>k!$0LX-rOfUd}hyX=YBn>E%2Ag6UM1Mj67)HcIgv63S
zM$nrOA|`|(B8H}7LHlA!AdsMOFkk@!wM~}LY}jA#BeO`Q$1oHw9svyv)QpRVAjQ)9
z-Os0c95l*};&PN>K>5-_1aZhpQWBt~5jzRs^>;ztkFcW%r06D%S)R^?V5by=2s=F9
z98}cN(43`+;@#oi;D6v|;EC^N2fFO%qF&dFpP3m$DA^Iqs$N~>*zJsNoOJDNjUyD2
zMBObxypA!%YGqB~tFS0->Calp5rGWI48%3gIpZwndbzMd=>lZ1s|gcw1Hu9G;akpf
zCQNaAL&8GFq}G?SPItN%GJ-2bawmeJt%8#mW(|%5#C$5JeSdT?5P)q=yj6y6NnF)1
z<iRwx9z?-`cj&)=Lo+iLC>LRp88Ucc?YKK#Jeju^&s)399JU+@^*cG>Q3$ysn0M&V
z9=X*uV~sw&LJbVIbk8uyDT9b;&zkK?2V`$*z3(=dF+v^lz%Ih>ChHGol^H0+(zrw5
z`Sas9oy@IgcYhLMI7>%7<j%4~BQC&*o)|eA?`)JrBbQsdv3@pvT*~r!ysgE|#|&`8
z3)H^uvB_MfoH0cU^ih^umL*)yAj>h#?c__#N7AiE+rWn+rp(o5cRiCKbgrQq#@^KM
zbvlN2SQ6^_Y0XL*P#mzcfZ?3O2qFU_Un@Km`y;H{*nilIXO)%1Cs}1E*mrGe)N4bz
zwun}YESiikgyuy}juARS#~X$f!sWSnD0~n&-xqkv2C72p({{c|4iYjATO7cx`~xB<
z5sYJY`QN~BbrV)8wyT!ILzfi+q8yY2A`2TC*A5yj8nHlTqFzRMtrlHCYRjxS7{$vV
zbn-pOhJSB2xfZiS7l;dOXukU0m{O3Mg}D(*wzbq1TrPNIL{^A3v>pgVLLMbB4Gn2!
zRSK}UL3B8$Vl5uRS!)i&gjcNOA+Ib;+!+}b-C$(o*?$x*6B|(MRKU%A8Qt(E0}9>D
z&W2A)8a*LmDCVkCFe-*(aYQnTJr!7x$rgrU=YJK~Q`E%(V<cM2CeVfHL073@n?TW{
zR{iu)R}3!@rVC>1>Cx12@!apsQh?J!TU-=DhFu6{3MC#`?u*RAD*-G7!!A%%nHM!+
zMp9ZvZwy=yiHI&GQ%aj4gu|nZ8f0qItbsVOgFxz*V8bBz5Z2Q)D^^%DQLNBhU6ZC~
zn}5=BrXWj%5M)Mf-rRNFjet85v^j7=-S;%{@y9Sjf<RKP+cA!5rcs$zA3SrQ@3T+D
zpzN*;*>+kQW%Y)gvp!CqaBWG*mTvbAt^B2=xd4zhnGzYPMvKNNdbK>&#&!l^c4j;{
zKMT&%y0W==Ffej)uB)Ccv&cg2t_QiDZ+~25a^0&vfyq~8*q+Bk_5=7(pjYkyh@LGo
zq)ALJ$1Q3q@MHtUajbz60$4}OxmFB0<mcaUsfMWxjw!Oh#WGinmy80!y|eI|mm|GJ
zvwo=+^jP!S9cdm`Vqs%AMHmhnPdfYgaYxUbX0L6QQtwPq7adGTp>7=(kh<HG3xDmJ
zN*dq+5hX^V50N{ng-*W8BU6p=jBvt?uH^~FgpjYMZfK5lb&9;!tl&VWtihtQaB7CN
zh`h7kluIsAJXlFF$1=}uu8ih07!9?#Rvhh)cX>NXpV)rY8RpO~2DTWUd6WXP3P&8Z
zZ{B!J-1xAFhy|E5gofJ$#=sbq(ti~Q0FsIhm3>h$eokugP7GW&8f`O`^CyDB4-0Ij
z4X<VBTOs%yc|3+;hJ}TCYNA<V3zeJl%mPYxW;+}gAyKV~QqFYFE4b|EcGMb1xd;hu
zGiclzQyrD3mCy2d0nZZLZj*GgM?s1*@Xs_&gyn~3TvscSUR`RJ!yN?d>VMeAeFt2T
zRgkU`t1RG}vK7<?Zx#b7SHj+{*7Q~o=$yKxP}||#v_w&eEL3L@9Yxl275v^K5YX=3
zTsG!(ws<`<fH+GJ<F4CaYoHv<!=)X7Dp?E{T((=YnK4l;-6%4$+(C#F;j_lKPJbU?
zRPsb?T3pHw9tU?6VKoSq-G9^FGxU17XfKtEyJ<=T2R0b1)ynq~vIOW4>U^4dT8g3R
z3Wh$+S)$5Zj=)t{U5kXYnwcTp2|p1VyXMNA?4UHr7}j8vDW#hfB6b!P2b9N42X#?N
zzohEWPq7IZ3ooqB-NToL3<t41sj{`zk(rtlV;pJ_0Pu)Fs%v&iKYs)9QoWWa@_8;}
z?G2LGjxpK0H!^HOEGSYFfk>d3WoR6<2$O;I-~=>l-Nc=sbB!~*RV@u8q^+ym6&EYR
zicpHjy^2Be9`3E}kgEWdAUV1uhcd@O>vp!1q<5Z~NU4+=Sf{TWFT}do8C*wTB3=Qg
zAV~y7?3U)Dv!3+8p?~YKnT$1yS%VjHSv|?n<Xk3DQV&l5Y~`LvIFNw3ScpX=)pe|K
zxr1Lz3@R|xtFdP7*2^_VNGDx-14JmTZuJ`~-i8#Gv&Hbmd`f5%;AlYnr_-`GTUK@6
zM=?FCwNd7s6GLDE+3cvP5L-27ymSIq8TYQyK^t(&S~MYnoPQl=6q0k~BwV8LxTRNH
zvI*CkogMDU=xC^f)R&U-6pU?gow8h^deovVg2Av^dWK>%r1A}$aHI?*j~1cORBLG0
z0m!f*Qf?H)(&auzW3n0PL3SC9QyP%MVx-t}jr-cRVvY(xT*8A^Ukr?CqCiH@oK-AD
z;X^|OvTz(ICVzJn@tm7UP&2cUGbsfkfxtTgF3MV;7A%O_Bz4FasSp*Mf?hS14<*<M
zOP_-j-nE9=K%6+=nHAK$Ftt`f7VfKhAPIo!5FG{uB9%rMOV`z4Q0(-UMeV{zjPa#*
zVJLV8tg!P<4(!0tRz*`zPWKZw!y8;wF?>^b1=Eb$avSscd2pcTP;|S`SO@_li2udh
MkxmpO46qG^K$4%PF8}}l

delta 3115
zcmV+`4Ak?p7_1l$LRx4!F+o`-Q&~#NT7QuaB7Z)_&G)T+FCTrq^hlxKX`z{Rqh6i2
z+}2E>^w$YfMA0!5$)E<4&?X25L7<~*dSafD=`$iU0000QWEub(2qQoNG-#&Lq}o#+
zqct@Lq%_k*AQ}Jw00000&=FJc1WJ0RwNFGeGynhqG&BGJ000008UO$Wr;vhZflpM=
zQGck}A+%_in1`qu8Vvvd000008UO-CCQT{m)f-dFKS}@^00000000000000qG!lq}
zo>Xn5^)%Y2=`=tY01Td{ng9ScpdOD%5u-911ZebzY95>}H(P;_d4Ac7))9jS+Ze_+
z+X1%F+Cz4t`C%d^3rUd(szG9uQ4CTbAb-3<2_RWPi3Bf`CPWqjiYP&q8aFe5iv|Uw
z3;S{qjtILxpZxLBC<-DVp1i4kDzt9z))}p7{ItYvCoMVlqFDN^zKwYgCzgN=##_PO
zj<SO2iZX~OQk2X(qYXg~VKu}y1lbd4+ZUM5ZX2fYzGyfO6NuwH<Ig)sA?O^ZY=7oC
z&V$i9PT}bunbdS0)6_f0vEMq*!@zh?RO~%ssghJOOrlbuDOk%h4`G^T8Ry8VRX%2c
z^ggDEqG=V&2AQa8mQ2%F*ES6T#>pvLYwh>j{C;+;*R`!;&gbec+(oy)STwcU9DH-y
zkb~D6R-~>;63Bok0K*LzaFD2XV}DmJB}};ahsN6O+^Go(D;Ok`U<gOlg-0me-<z;#
zb7So5rtdP?jEk)6L`^zwlgI<PM$=l|rv5E!X!O5Bp0y0i&$f1E(c+yM?WHP-bjYd8
z$6V8oTQ;j=%sd$CU8@$<mL-kKI%&VnNVm7H&3v93?p;bkOVKrHIJwxzPJcOBTH6cM
z>)XuRX-{J<#VsUf%c?h0lZ$wXZ>1w(My|6<0PRG>jj<!H@eH0=TEk@ymCnV1xkLm*
z%7t!$YW-2fZy9z;1PUXZl({@!2OM*okLf1mH#4r<Y1!upnp$>g&7hef36aWDWLZVT
zjZn%QO$;E_B34ukjo{kTb$>GM&1uD{QE`rB0Z4K>sfjWocgD7C7$+tSES=|+;v+QF
zQmTN5%q>aP><tZSs-2?9DF;AV76NhxWoQKwV<v`$c5{*yGI$j?m1lamS2-BgESw-M
zc!NgJfDEz{Nil#37=Q#=gn^|5fu`7okpTb<C}JRl#F9Wz(3=n<4SxtiMXglCgZiQ(
zA_K%?u|Yrq>CZ*Q<BFyXJl4^Gf2>B1L^`2yKK+dm4%jh3#Is9Qj8{8dpgpZ(xFzHW
zh!U8DkhTa(3S{YIu?lPK$>K^-B`O<@ah$0sgR3*y(5w{VkYNXBo5PBlnmQAdu^jF{
zD|dT&n|ZF`j7}`h%76C8zK!*}6R~bcX1MljE8Kr^+0$8?-kC9FZEEi-a@-c@;72gC
zD(;R?G@-Y%9ot$=MaX7AXd$sVX^h%UGuaFh1j$>eR}v=V2$}))VQbEECQNaAL&8GF
zq}G*D7r_ddK@_6EiP=!r!AXm=2FC!$-dkXiW8x2ugM)@bbAOvushLVRGGDhX3CNf4
zuJzm3yBW~UO-L76l+2kt5pQJOuUXb?*we#+yu%vY#^W4Ii)&VnA{WyW%>nD8m%RSI
zK;AYo;q0Dqj#CE^(W5=uk`Bn;)qCD;Fk*x|=zv{?+)dUKpv67R>1caCeD|Bq=2o-2
zi7^|+qkQDfvVTJ(E?E&gFmg1#vQZF@TyDpKeuEsvdae0pT&!@z3^2Wk_i~(c%vi6*
z6!-4k(JgxtuG@$*tajbpCvh7WtJAZM%aH_7;LT=da)`QDPK_gPYIb!xhIUpZ)){Qi
zN*Pcbu(N>SoWlqr10r85JQVyf#oE`dEuvOe44q|^qkmhwX01kOcQ(@s(S?&yh7g>{
zsj<Q*NLb@=!nj<w8zm2d2OHw=86eRj2@*FlC^fP*(nQ9_;Jg__s}l<AuDkdTCbN<{
zC9qR<*<@iMd^84#a!?S6EKEmSY-%kU!9ZrBUPgJb7HI(AWLC;^dO)^b%<qCk+}{N5
zQ5yy$gMY!O*|S}<g~eMhq>?FE*E*`WVZ1UTD?|~k@OU8+2#1uoXlqMrQC1fSE{_=P
zMWfnFEos`2iuN3YHRXwX0lmFVla0G)qR~mI(#4&usm`D|?|n?EM6Y_P!BOynvFQsD
zM>SHBfmAaSiXoIy=&HnqNVGE-II_yUSfC7KLVskHjYul)5RfJWcLmpJ^wqh-DN3r7
z0G)_vNRGI;u9-=SP#S1UeZfQ+WzdFDqEY3K1YTwlSP5Vz8FGT8$hoTmGLq6WxyZm6
zSVF>8TEk%EOgcEhrbewg$P<egG!Cg|3^EUa4Q-P`wPk}f8qEd8**a!<y(cPS1i4^?
zB7ZaQ+oN69+6NHCZOVhb<)@2^xxyTYAf<TcG2P25nU#VQSAq`)dAcei#_+2)+Lgx1
z#v6w0m~0Eej4)JnVUD?r17Uz>2Gb%#93(ce8l%v<q^eN>kP#$w*>qJkjIY8&5RC)-
z<6Q?lJ7$s=YiN6w*7&0%mdy#O4obSN#DDfHvs4e=K>%N{0EnP2Go&_|FU^jnD>Ax4
z0cCa&A|Ol&2kz0Z>QHY6^Nmat3G|#(Wr2!huNf{E#f3WO<!>92+`C-fpi#VN9{lm#
zM~%6RFklf(2*!vVo)hyVCcsZ;G1k)`s3;7tOOgm%r$xjsy7bGE>03lmfCz~yHGd%n
zr+ZMUJeJX2QJf^LS-7I3coG93L_nJnoJcy$IGeyS&2t1Y;gc~gWx_hhbsS9a<27eZ
zT0{}+P-gY-&7}b&K>o5q`g0wRkGIXL9jIud33H3VvBoBvRRL!dj+5%wkn+nG8Jt8&
zBF2!A+hCa30}`r|=@1f7i^RQvM1Lp}4gSd4UfITbMqxIm(>PRHXM&#wxmMPM*nt|2
z24KOg;%xFcxfrobH=C3IL{E+33~Y&H90nT$LCR--mX=MIisXna6={s4aIUr`h1b~8
z-KaR5OSsOabwu8FBpTqnVyx?&c@yF}x@Wk_U699VahyFjCGVZ$m1QO1pntJ3#xEgd
zKv@B3GJ84^VSU|u0v!_<P^uixe=^}%qcAEhRfrglF({EkCw)fWv5S#;K1L#8cQJ>6
z>Jv@P)wa_Mpt81!(bNJOp?=Gi%Jq5^QtY<+kYtN-1|Us>uNvJszmFGO)S@+OF4}`e
z`?-{0EfFiGj&r}y>+oGK5`PwU(;5Q<HYsVeTa?>i8-@gU{B=c%Zoa`F*Zp&Q6mpET
z+yuqx-IPm=wUT~tYV(dic*P3Kj1l35C0GWs2oWS>hMtiKc~qx67l1%<sTmIwIAG~O
zn+iz9>9F966N<AVMU7ud3aYJo^l{Z70${2@m~n4XAA$IA+0bejJ%6b%xkI<H?#>;z
z{N~B8Jf^5)7mbTxyB4YjXo(c~nIQ;ezk14_l+aB<`;pEkDC#83+Z7ih!-`Of#k;9U
ze8rzF&48f*kic^D(x2Es;2(xgBa)pj&_-Ga4pk@V9T@0J^(~;?0YYjGFko2<2H}xi
zz9t;xf`=s2nV70zTz>+Z`2_E|?m)x=<D>w*d)!AP)S%%)0>p$uDJ{^|smA5~Y1k*I
z!3_<1wrsXt5;#L~EY2blQDY<ZVtXKl6)ty;*%oYAfPPa#2j#+^O7FU?8IXpeOG}<i
z;T;i)(g3yk<Ps24*T0rg&;hfI+9Pv=Evc`gIv5xOu)?Ez%zr~z@sMt+*<^Tx@Fk}|
zTas!b6%dw^@}NSIn&`~#ZR#>HREv;WIR&{CPDXT|A}M!Cp~RK!(mLfvsD`Nr8vu}-
zx>A%ywD{edl1wQ<ep$_BHbgEXF3{vT94fr{iouYU_|98v;)&FyajRbTrlz5gQJuo(
z@SKzryQ+Fd-G2#CGq;T=R1}H_2<-~v#q~nBFo@jR^)xd|AS-yo`b;VyOtccRZXC(M
zbB-s0FpS|kFHW%Pu2NAAOU%O{6T<?7l>#z0Nezd)S1F7#%&SWG;cNyK(#vWNMFW2#
z`2!R~Fbzs}6d`|7vpr(Nh@4`8lNjheN?R^-E?m4o<rKVA>-+#{4xLkcUC9*TLP0Aj
FY5epZls^Cf

diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R
index abd14a09..884288ec 100644
--- a/pipeline-runner/data-raw/sysdata.R
+++ b/pipeline-runner/data-raw/sysdata.R
@@ -24,6 +24,12 @@ GEM2S_TASK_LIST <- list(
   "uploadToAWS" = "upload_to_aws"
 )
 
+SUBSET_GEM2S_TASK_LIST <- list( # task name -> name of the function run for it
+  "subsetSeurat" = "create_subset_experiment",
+  "prepareExperiment" = "prepare_experiment",
+  "uploadToAWS" = "upload_to_aws"
+)
+
 # vector of task functions named by task name
 QC_TASK_LIST <- list(
   "classifier" = "filter_emptydrops",
@@ -38,7 +44,6 @@ QC_TASK_LIST <- list(
 # directory where download_user_files downloads user files
 INPUT_DIR <- "/input"
 
-
 # constants used in GEM2S
 gem2s <- list(
   max.edrops.fdr = 0.001,

From 7321a3898baa78ccd2b1350c6c9fde94d574f3f7 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 5 Dec 2022 11:05:58 -0300
Subject: [PATCH 02/34] add func to download and parse cellsets, start tests

---
 pipeline-runner/R/gem2s-X-subset_experiment.R | 45 ++++++++++++++-----
 pipeline-runner/R/handle_data.R               | 17 +++++++
 .../testthat/test-gem2s-X-subset_experiment.R | 13 ++++++
 3 files changed, 65 insertions(+), 10 deletions(-)
 create mode 100644 pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R

diff --git a/pipeline-runner/R/gem2s-X-subset_experiment.R b/pipeline-runner/R/gem2s-X-subset_experiment.R
index 95b82c74..cf116a37 100644
--- a/pipeline-runner/R/gem2s-X-subset_experiment.R
+++ b/pipeline-runner/R/gem2s-X-subset_experiment.R
@@ -1,20 +1,45 @@
-
-download_cellsets_file <- function(parent_experiment_id) {
-  # download parent experiment cellsets file from S3
-}
-
-parse_cellsets <- function(cellset_path, cellset_type) {
-  cellsets <- jsonlite::fromJSON(cellset_path, flatten = T)
+#' Extract cellset type as data.frame
+#'
+#' Gets the cellsets list and converts it to tidy tibble, keeping only the
+#' the required cellset type
+#'
+#' @param cellsets
+#' @param cellset_type
+#'
+#' @return
+#' @export
+#'
+parse_cellsets <- function(cellsets, cellset_type) {
 
   cellsets$cellSets |>
-    filter(key == cellset_type) %>%
+    dplyr::filter(key == cellset_type) %>%
     .$children |>
     as.data.frame() |>
-    as_tibble() |>
-    select(key, name, cellIds)
+    tibble::as_tibble() |>
+    dplyr::select(key, name, cellIds)
+}
+
+
+#' Filters cellsets, getting vector of cell_ids to keep
+#'
+#' @param cellsets_df data.frame
+#' @param cellset_keys character
+#'
+#' @return
+#' @export
+#'
+get_cell_ids <- function(cellsets_df, cellset_keys) {
+
+  cellsets_df |>
+    dplyr::filter(key %in% cellset_keys) |>
+    tidyr::unnest(cellIds) |>
+    dplyr::pull(cellIds) |>
+    unique()
 
 }
 
+
+
 create_subset_experiment <- function(input, pipeline_config) {
 
   parent_experiment_id <- input$parentExperimentId
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 8082dd6e..3f0f7fda 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -398,3 +398,20 @@ upload_multipart_parts <- function(s3, bucket, object, key, upload_id) {
 
   return(parts)
 }
+
+
+load_cellsets_file <- function(pipeline_config, experiment_id) {
+  message("loading cellsets file")
+  s3 <- paws::s3(config = pipeline_config$aws_config)
+
+  bucket <- pipeline_config$cell_sets_bucket
+
+  c(body, ...rest) %<-% s3$get_object( # first field -> body, the others -> rest
+    Bucket = bucket,
+    Key = experiment_id
+  )
+  # NOTE(review): this rawConnection is never closed explicitly - confirm
+  obj <- jsonlite::fromJSON(rawConnection(body), flatten = TRUE)
+  return(obj)
+
+}
diff --git a/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R b/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
new file mode 100644
index 00000000..c0292edd
--- /dev/null
+++ b/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
@@ -0,0 +1,13 @@
+mock_scdata <- function(){
+  processed_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/processed_r.rds"
+
+  readRDS(processed_path)
+}
+
+mock_cellsets <- function(){
+  cellsets_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/cellsets.json"
+
+}
+
+mock_sample_id_mapping <- function(){}
+

From 69fb01cac00476465bd0e393002aca26f186d72c Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 12 Dec 2022 20:13:52 -0300
Subject: [PATCH 03/34] migrate to data.table

---
 pipeline-runner/R/gem2s-X-subset_experiment.R | 46 +------------------
 pipeline-runner/R/handle_data.R               | 32 ++++++++++++-
 2 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/pipeline-runner/R/gem2s-X-subset_experiment.R b/pipeline-runner/R/gem2s-X-subset_experiment.R
index cf116a37..52c900f2 100644
--- a/pipeline-runner/R/gem2s-X-subset_experiment.R
+++ b/pipeline-runner/R/gem2s-X-subset_experiment.R
@@ -1,45 +1,3 @@
-#' Extract cellset type as data.frame
-#'
-#' Gets the cellsets list and converts it to tidy tibble, keeping only the
-#' the required cellset type
-#'
-#' @param cellsets
-#' @param cellset_type
-#'
-#' @return
-#' @export
-#'
-parse_cellsets <- function(cellsets, cellset_type) {
-
-  cellsets$cellSets |>
-    dplyr::filter(key == cellset_type) %>%
-    .$children |>
-    as.data.frame() |>
-    tibble::as_tibble() |>
-    dplyr::select(key, name, cellIds)
-}
-
-
-#' Filters cellsets, getting vector of cell_ids to keep
-#'
-#' @param cellsets_df data.frame
-#' @param cellset_keys character
-#'
-#' @return
-#' @export
-#'
-get_cell_ids <- function(cellsets_df, cellset_keys) {
-
-  cellsets_df |>
-    dplyr::filter(key %in% cellset_keys) |>
-    tidyr::unnest(cellIds) |>
-    dplyr::pull(cellIds) |>
-    unique()
-
-}
-
-
-
 create_subset_experiment <- function(input, pipeline_config) {
 
   parent_experiment_id <- input$parentExperimentId
@@ -49,9 +7,9 @@ create_subset_experiment <- function(input, pipeline_config) {
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
   parent_scdata <- load_processed_scdata(s3, pipeline_config, parent_experiment_id)
-  parent_cellsets <- load_cellsets(s3, pipeline_config, parent_experiment_id)
+  parent_cellsets <- parse_cellsets(load_cellsets(s3, pipeline_config, parent_experiment_id))
 
-  cell_ids_to_keep <- get_cell_sets(parent_cellsets, cellset_keys)
+  cell_ids_to_keep <- parent_cellsets[key %in% cellset_keys, cell_id]
 
   sample_id_mapping <- input$sampleIdMapping
 
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index cad76407..c5253e9c 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -403,9 +403,17 @@ upload_multipart_parts <- function(s3, bucket, object, key, upload_id) {
 }
 
 
-load_cellsets_file <- function(pipeline_config, experiment_id) {
+#' Load cellsets object from s3
+#'
+#' @param s3
+#' @param pipeline_config
+#' @param experiment_id
+#'
+#' @return
+#' @export
+#'
+load_cellsets <- function(s3, pipeline_config, experiment_id) {
   message("loading cellsets file")
-  s3 <- paws::s3(config = pipeline_config$aws_config)
 
   bucket <- pipeline_config$cell_sets_bucket
 
@@ -418,3 +426,23 @@ load_cellsets_file <- function(pipeline_config, experiment_id) {
   return(obj)
 
 }
+
+
+#' Parse cellsets object to data.table
+#'
+#' Gets the cellsets list and converts it to a tidy data.table
+#'
+#' @param cellsets list
+#'
+#' @return
+#' @export
+#'
+parse_cellsets <- function(cellsets) {
+  # setDT converts the cellSets table to a data.table by reference
+  data.table::setDT(cellsets$cellSets)
+  # fill columns in case there are empty cellset classes
+  dt <- data.table::rbindlist(cellsets$cellSets$children, fill = TRUE)
+  # unnest cellIds: one row per cell id, in a column named cell_id
+  dt[, setNames(.(unlist(cellIds)), "cell_id"), by = .(key, name)]
+  # the data.table above (key, name, cell_id) is the value returned
+}

From 7216814aec3cfd5d886a6b50e3f332fd584e3320 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 15 Dec 2022 19:11:23 -0300
Subject: [PATCH 04/34] finish subset step, add docs. init tests

---
 pipeline-runner/NAMESPACE                     |   6 +
 pipeline-runner/R/gem2s-X-subset_experiment.R | 107 +++++++++++++++---
 pipeline-runner/R/handle_data.R               |  10 +-
 pipeline-runner/man/add_new_sample_ids.Rd     |  19 ++++
 pipeline-runner/man/call_qc.Rd                |   2 +-
 pipeline-runner/man/call_subset_gem2s.Rd      |  26 +++++
 pipeline-runner/man/construct_qc_config.Rd    |   5 +-
 pipeline-runner/man/create_sample_id_map.Rd   |  20 ++++
 .../man/create_subset_experiment.Rd           |  27 +++++
 pipeline-runner/man/diet_scdata.Rd            |  18 +++
 pipeline-runner/man/load_cellsets.Rd          |  21 ++++
 pipeline-runner/man/parse_cellsets.Rd         |  17 +++
 pipeline-runner/man/prepare_experiment.Rd     |   3 +-
 .../testthat/test-gem2s-X-subset_experiment.R |  18 ++-
 14 files changed, 270 insertions(+), 29 deletions(-)
 create mode 100644 pipeline-runner/man/add_new_sample_ids.Rd
 create mode 100644 pipeline-runner/man/call_subset_gem2s.Rd
 create mode 100644 pipeline-runner/man/create_sample_id_map.Rd
 create mode 100644 pipeline-runner/man/create_subset_experiment.Rd
 create mode 100644 pipeline-runner/man/diet_scdata.Rd
 create mode 100644 pipeline-runner/man/load_cellsets.Rd
 create mode 100644 pipeline-runner/man/parse_cellsets.Rd

diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
index 8173b447..fc511d60 100644
--- a/pipeline-runner/NAMESPACE
+++ b/pipeline-runner/NAMESPACE
@@ -1,10 +1,14 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_metadata)
+export(add_new_sample_ids)
 export(build_cc_gene_list)
 export(build_metadata_cellsets)
+export(create_sample_id_map)
 export(create_scdata)
 export(create_seurat)
+export(create_subset_experiment)
+export(diet_scdata)
 export(download_user_files)
 export(embed_and_cluster)
 export(filter_doublets)
@@ -25,12 +29,14 @@ export(integrate_from_sketch)
 export(integrate_scdata)
 export(learn_from_sketches)
 export(list_exclude_genes)
+export(load_cellsets)
 export(load_user_files)
 export(log_normalize)
 export(make_annot_with_ids)
 export(merge_scdata_list)
 export(normalize_annotation_types)
 export(normalize_data)
+export(parse_cellsets)
 export(prepare_experiment)
 export(prepare_sct_integration)
 export(read_10x_annotations)
diff --git a/pipeline-runner/R/gem2s-X-subset_experiment.R b/pipeline-runner/R/gem2s-X-subset_experiment.R
index 52c900f2..81b84169 100644
--- a/pipeline-runner/R/gem2s-X-subset_experiment.R
+++ b/pipeline-runner/R/gem2s-X-subset_experiment.R
@@ -1,38 +1,111 @@
+#' create a subset experiment
+#'
+#' This is the first step of a subset pipeline, which basically takes the parent
+#' experiment ID and cellset keys to keep as input, extracts the cell ids to keep
+#' and subsets and slims down the parent seurat object.
+#'
+#' @param input list containing:
+#'   - parentExperimentId character
+#'   - subsetExperimentId character
+#'   - cellSetKeys character vector of cellset keys to subset
+#'   - experimentName character
+#' @param pipeline_config list
+#'
+#' @return list containing scdata_list, annotations and sample_id_map
+#' @export
+#'
 create_subset_experiment <- function(input, pipeline_config) {
 
-  parent_experiment_id <- input$parentExperimentId
-  subset_experiment_id <- input$subsetExperimentId
-  cellset_keys <- input$cellSetKeys
-
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
-  parent_scdata <- load_processed_scdata(s3, pipeline_config, parent_experiment_id)
-  parent_cellsets <- parse_cellsets(load_cellsets(s3, pipeline_config, parent_experiment_id))
-
-  cell_ids_to_keep <- parent_cellsets[key %in% cellset_keys, cell_id]
+  parent_scdata <- load_processed_scdata(s3, pipeline_config, input$parentExperimentId)
+  parent_cellsets <- parse_cellsets(load_cellsets(s3, pipeline_config, input$parentExperimentId))
 
-  sample_id_mapping <- input$sampleIdMapping
+  cell_ids_to_keep <- parent_cellsets[key %in% input$cellSetKeys, cell_id]
 
-  # subset seurat object
-  scdata <- subset_ids(scdata, cell_ids_to_keep)
+  # subset seurat object, remove unnecessary data
+  scdata <- subset_ids(parent_scdata, cell_ids_to_keep)
+  scdata <- diet_scdata(scdata)
+  scdata@misc$experimentId <- input$subsetExperimentId
 
-  # add subset experiment name to the subset seurat object
-  scdata$project <- input$name
+  # delete parent_scdata to free memory
+  rm(parent_scdata)
 
   # add new sample_ids, keep originals in a new variable
   scdata$parent_samples <- scdata$samples
-  scdata$samples <- sample_id_mapping[match(parent_samples, sample_id_mapping)]
+  sample_id_map <- create_sample_id_map(unique(scdata$parent_samples))
+  scdata <- add_new_sample_ids(scdata, sample_id_map)
 
   # split by sample
   scdata_list <- Seurat::SplitObject(scdata, split.by = "samples")
 
-  prev_out$scdata_list <- scdata_list
-  prev_out$annot <- scdata@misc
+  # structure step output
   res <- list(
     data = list(),
-    output = prev_out
+    output = list(scdata_list = scdata_list,
+                  annot = scdata@misc$gene_annotations,
+                  sample_id_map = sample_id_map)
   )
 
   message("\nSubsetting of Seurat object step complete.")
   return(res)
 }
+
+
+#' generate a sample id mapping for remaining samples after subset
+#'
+#' New sample ids must be created, but the number of samples depends on which
+#' cells have been subset by the user. Sample Ids that belong to the parent
+#' experiment are also kept, which is useful for the addition of the new subclusters
+#' to the parent experiment.
+#'
+#' @param parent_sample_id character vector of unique parent sample ids
+#'
+#' @return data.table with sample id map
+#' @export
+#'
+create_sample_id_map <- function(parent_sample_id) {
+  # one freshly generated UUID per parent sample id
+  data.table::data.table(
+    parent_sample_id = parent_sample_id,
+    subset_sample_id = uuid::UUIDgenerate(n = length(parent_sample_id))
+  )
+}
+
+
+#' Add new sample ids to the subset Seurat Object
+#'
+#' @param scdata Seurat Object
+#' @param sample_id_map data.table of parent/subset sample id map
+#'
+#' @return SeuratObject with new sample ids
+#' @export
+#'
+add_new_sample_ids <- function(scdata, sample_id_map) {
+  map_rows <- match(scdata$parent_samples, sample_id_map$parent_sample_id)
+  scdata$samples <- sample_id_map$subset_sample_id[map_rows]
+  scdata
+}
+
+
+#' Remove all unnecessary data from the parent seurat object
+#'
+#' Seurat::DietSeurat is not able to remove certain slots from a seurat object.
+#' This function also removes elements from the misc slot which are not necessary
+#'
+#' @param scdata SeuratObject
+#'
+#' @return leaner SeuratObject
+#' @export
+#'
+diet_scdata <- function(scdata) {
+  kept_misc <- list(gene_annotations = scdata@misc$gene_annotations,
+                    parent_experimentId = scdata@misc$experimentId)
+  lean_scdata <- Seurat::CreateSeuratObject(
+    counts = scdata@assays$RNA@counts,
+    meta.data = scdata@meta.data,
+    min.cells = 0, min.features = 0
+  )
+  lean_scdata@misc <- kept_misc
+  lean_scdata
+}
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index c5253e9c..73fed0ee 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -405,11 +405,11 @@ upload_multipart_parts <- function(s3, bucket, object, key, upload_id) {
 
 #' Load cellsets object from s3
 #'
-#' @param s3
-#' @param pipeline_config
-#' @param experiment_id
+#' @param s3 paws::s3 object
+#' @param pipeline_config list
+#' @param experiment_id character
 #'
-#' @return
+#' @return cellsets list
 #' @export
 #'
 load_cellsets <- function(s3, pipeline_config, experiment_id) {
@@ -434,7 +434,7 @@ load_cellsets <- function(s3, pipeline_config, experiment_id) {
 #'
 #' @param cellsets list
 #'
-#' @return
+#' @return data.table of cellset keys, names and corresponding cell_ids
 #' @export
 #'
 parse_cellsets <- function(cellsets) {
diff --git a/pipeline-runner/man/add_new_sample_ids.Rd b/pipeline-runner/man/add_new_sample_ids.Rd
new file mode 100644
index 00000000..d7af121f
--- /dev/null
+++ b/pipeline-runner/man/add_new_sample_ids.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gem2s-X-subset_experiment.R
+\name{add_new_sample_ids}
+\alias{add_new_sample_ids}
+\title{Add new sample ids to the subset Seurat Object}
+\usage{
+add_new_sample_ids(scdata, sample_id_map)
+}
+\arguments{
+\item{scdata}{Seurat Object}
+
+\item{sample_id_map}{data.table of parent/subset sample id map}
+}
+\value{
+SeuratObject with new sample ids
+}
+\description{
+Add new sample ids to the subset Seurat Object
+}
diff --git a/pipeline-runner/man/call_qc.Rd b/pipeline-runner/man/call_qc.Rd
index 0daaef32..ccce56a7 100644
--- a/pipeline-runner/man/call_qc.Rd
+++ b/pipeline-runner/man/call_qc.Rd
@@ -13,7 +13,7 @@ call_qc(task_name, input, pipeline_config)
 \itemize{
 \item step parameters for all samples
 \item current sample UUID
-\item uploadCountMatrix (wether or not to upload matrix after step)
+\item uploadCountMatrix (whether or not to upload matrix after step)
 }}
 
 \item{pipeline_config}{list as defined by load_config}
diff --git a/pipeline-runner/man/call_subset_gem2s.Rd b/pipeline-runner/man/call_subset_gem2s.Rd
new file mode 100644
index 00000000..60350a77
--- /dev/null
+++ b/pipeline-runner/man/call_subset_gem2s.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/init-functions.R
+\name{call_subset_gem2s}
+\alias{call_subset_gem2s}
+\title{Call subset gem2s}
+\usage{
+call_subset_gem2s(task_name, input, pipeline_config)
+}
+\arguments{
+\item{task_name}{character name of the step}
+
+\item{input}{list containing
+\itemize{
+\item parentExperimentId
+\item childExperimentId
+\item sample IDs, and names
+}}
+
+\item{pipeline_config}{list as defined by load_config}
+}
+\value{
+character message id
+}
+\description{
+Runs step \code{task_name} of the subset GEM2S pipeline, sends output message to the API
+}
diff --git a/pipeline-runner/man/construct_qc_config.Rd b/pipeline-runner/man/construct_qc_config.Rd
index e5306805..ae11ef07 100644
--- a/pipeline-runner/man/construct_qc_config.Rd
+++ b/pipeline-runner/man/construct_qc_config.Rd
@@ -4,12 +4,15 @@
 \alias{construct_qc_config}
 \title{Constructs default QC configuration}
 \usage{
-construct_qc_config(scdata_list, any_filtered)
+construct_qc_config(scdata_list, any_filtered, disable_qc_filters)
 }
 \arguments{
 \item{scdata_list}{list of seurat objects}
 
 \item{any_filtered}{bool indicating if barcodes were filtered by emptyDrops}
+
+\item{disable_qc_filters}{bool indicating if the data derives from the
+subsetting of another experiment}
 }
 \value{
 list of QC configuration parameters
diff --git a/pipeline-runner/man/create_sample_id_map.Rd b/pipeline-runner/man/create_sample_id_map.Rd
new file mode 100644
index 00000000..9f3c1382
--- /dev/null
+++ b/pipeline-runner/man/create_sample_id_map.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gem2s-X-subset_experiment.R
+\name{create_sample_id_map}
+\alias{create_sample_id_map}
+\title{generate a sample id mapping for remaining samples after subset}
+\usage{
+create_sample_id_map(parent_sample_id)
+}
+\arguments{
+\item{parent_sample_id}{character vector of unique parent sample ids}
+}
+\value{
+data.table with sample id map
+}
+\description{
+New sample ids must be created, but the number of samples depends on which
+cells have been subset by the user. Sample Ids that belong to the parent
+experiment are also kept, which is useful for the addition of the new subclusters
+to the parent experiment.
+}
diff --git a/pipeline-runner/man/create_subset_experiment.Rd b/pipeline-runner/man/create_subset_experiment.Rd
new file mode 100644
index 00000000..3a679b7b
--- /dev/null
+++ b/pipeline-runner/man/create_subset_experiment.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gem2s-X-subset_experiment.R
+\name{create_subset_experiment}
+\alias{create_subset_experiment}
+\title{create a subset experiment}
+\usage{
+create_subset_experiment(input, pipeline_config)
+}
+\arguments{
+\item{input}{list containing:
+\itemize{
+\item parentExperimentId character
+\item subsetExperimentId character
+\item cellSetKeys character vector of cellset keys to subset
+\item experimentName character
+}}
+
+\item{pipeline_config}{list}
+}
+\value{
+list containing scdata_list, annotations and sample_id_map
+}
+\description{
+This is the first step of a subset pipeline, which basically takes the parent
+experiment ID and cellset keys to keep as input, extracts the cell ids to keep
+and subsets and slims down the parent seurat object.
+}
diff --git a/pipeline-runner/man/diet_scdata.Rd b/pipeline-runner/man/diet_scdata.Rd
new file mode 100644
index 00000000..5039fc32
--- /dev/null
+++ b/pipeline-runner/man/diet_scdata.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gem2s-X-subset_experiment.R
+\name{diet_scdata}
+\alias{diet_scdata}
+\title{Remove all unnecessary data from the parent seurat object}
+\usage{
+diet_scdata(scdata)
+}
+\arguments{
+\item{scdata}{SeuratObject}
+}
+\value{
+leaner SeuratObject
+}
+\description{
+Seurat::DietSeurat is not able to remove certain slots from a seurat object.
+This function also removes elements from the misc slot which are not necessary
+}
diff --git a/pipeline-runner/man/load_cellsets.Rd b/pipeline-runner/man/load_cellsets.Rd
new file mode 100644
index 00000000..27a970ce
--- /dev/null
+++ b/pipeline-runner/man/load_cellsets.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/handle_data.R
+\name{load_cellsets}
+\alias{load_cellsets}
+\title{Load cellsets object from s3}
+\usage{
+load_cellsets(s3, pipeline_config, experiment_id)
+}
+\arguments{
+\item{s3}{paws::s3 object}
+
+\item{pipeline_config}{list}
+
+\item{experiment_id}{character}
+}
+\value{
+cellsets list
+}
+\description{
+Load cellsets object from s3
+}
diff --git a/pipeline-runner/man/parse_cellsets.Rd b/pipeline-runner/man/parse_cellsets.Rd
new file mode 100644
index 00000000..1bf0ed9b
--- /dev/null
+++ b/pipeline-runner/man/parse_cellsets.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/handle_data.R
+\name{parse_cellsets}
+\alias{parse_cellsets}
+\title{Parse cellsets object to data.table}
+\usage{
+parse_cellsets(cellsets)
+}
+\arguments{
+\item{cellsets}{list}
+}
+\value{
+data.table of cellset keys, names and corresponding cell_ids
+}
+\description{
+Gets the cellsets list and converts it to a tidy data.table
+}
diff --git a/pipeline-runner/man/prepare_experiment.Rd b/pipeline-runner/man/prepare_experiment.Rd
index cc8860d0..01cfb4d9 100644
--- a/pipeline-runner/man/prepare_experiment.Rd
+++ b/pipeline-runner/man/prepare_experiment.Rd
@@ -19,8 +19,7 @@ prev_out \code{prev_out} with added slots 'scdata' containing merged
 }
 \description{
 \enumerate{
-\item Merges the samples for the current experiment
 \item Adds metadata: cellsId, color_pool, and gene annotation
-\item Preparing QC configuration
+\item Prepares QC configuration
 }
 }
diff --git a/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R b/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
index c0292edd..fcae2442 100644
--- a/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
+++ b/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
@@ -1,13 +1,25 @@
 mock_scdata <- function(){
   processed_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/processed_r.rds"
-
   readRDS(processed_path)
 }
 
 mock_cellsets <- function(){
   cellsets_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/cellsets.json"
-
+  jsonlite::fromJSON(cellsets_path, flatten = TRUE)
 }
 
-mock_sample_id_mapping <- function(){}
+mock_input <- function() {  # mock `input` list for the subset step
+  input <- list(
+    name = "mock_subset_experiment_name",
+    parentExperimentId = "mock_parent_experiment_id",
+    subsetExperimentId = "mock_subset_experiment_id",
+    cellSetKeys =  c("louvain-0", "louvain-1")  # cellset keys to subset
+  )
+
+  return(input)
+}
 
+parent_scdata <- mock_scdata()
+parent_cellsets <- parse_cellsets(mock_cellsets())
+sample_mapping <- mock_sample_id_mapping()  # NOTE(review): mock_sample_id_mapping is deleted earlier in this hunk; this call fails unless it is defined elsewhere - confirm
+input <- mock_input()

From dd92e9879fc6877cbd50c327d39a6487c5ef13cf Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 16 Dec 2022 09:28:07 -0300
Subject: [PATCH 05/34] rename step

---
 .../R/{gem2s-X-subset_experiment.R => subset-1-subset_seurat.R}   | 0
 ...-gem2s-X-subset_experiment.R => test-subset-1-subset_seurat.R} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename pipeline-runner/R/{gem2s-X-subset_experiment.R => subset-1-subset_seurat.R} (100%)
 rename pipeline-runner/tests/testthat/{test-gem2s-X-subset_experiment.R => test-subset-1-subset_seurat.R} (100%)

diff --git a/pipeline-runner/R/gem2s-X-subset_experiment.R b/pipeline-runner/R/subset-1-subset_seurat.R
similarity index 100%
rename from pipeline-runner/R/gem2s-X-subset_experiment.R
rename to pipeline-runner/R/subset-1-subset_seurat.R
diff --git a/pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R b/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
similarity index 100%
rename from pipeline-runner/tests/testthat/test-gem2s-X-subset_experiment.R
rename to pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R

From 018ae4c91aff066766c054056f98c31677e3f226 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 16 Dec 2022 09:35:53 -0300
Subject: [PATCH 06/34] rename subset pipeline

---
 pipeline-runner/R/init-functions.R                | 15 +++++++--------
 pipeline-runner/man/add_new_sample_ids.Rd         |  2 +-
 ...call_subset_gem2s.Rd => call_subset_seurat.Rd} | 10 +++++-----
 pipeline-runner/man/create_sample_id_map.Rd       |  2 +-
 pipeline-runner/man/create_subset_experiment.Rd   |  2 +-
 pipeline-runner/man/diet_scdata.Rd                |  2 +-
 6 files changed, 16 insertions(+), 17 deletions(-)
 rename pipeline-runner/man/{call_subset_gem2s.Rd => call_subset_seurat.Rd} (63%)

diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 5c2528f4..86ee1cb6 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -94,7 +94,7 @@ load_config <- function(development_aws_server) {
   }
 
   # batch does not have access to the internal EKS cluster api URL, use the public one
-  if(running_in_batch == "true" && domain_name != "") {
+  if (running_in_batch == "true" && domain_name != "") {
     config$api_url <- config$public_api_url
   }
 
@@ -205,7 +205,6 @@ run_qc_step <- function(scdata, config, tasks, task_name, cells_id, sample_id, d
 #' @return list of task results
 #'
 run_gem2s_step <- function(prev_out, input, pipeline_config, tasks, task_name) {
-
   if (!task_name %in% names(tasks)) {
     stop("Invalid task name given: ", task_name)
   }
@@ -252,9 +251,9 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 }
 
 
-#' Call subset gem2s
+#' Call subset seurat
 #'
-#' Runs step `task_name` of the subset GEM2S pipeline, sends output message to the API
+#' Runs step `task_name` of the subset seurat pipeline, sends output message to the API
 #'
 #' @param task_name character name of the step
 #' @param input list containing
@@ -265,7 +264,7 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 #'
 #' @return character message id
 #'
-call_subset_gem2s <- function(task_name, input, pipeline_config) {
+call_subset_seurat <- function(task_name, input, pipeline_config) {
   experiment_id <- input$experimentId
   # remove when it's added to the input
   input$subset_experiment <- TRUE
@@ -276,7 +275,7 @@ call_subset_gem2s <- function(task_name, input, pipeline_config) {
   }
 
   check_input(input)
-  tasks <- lapply(SUBSET_GEM2S_TASK_LIST, get)
+  tasks <- lapply(SUBSET_SEURAT_TASK_LIST, get)
 
   c(data, task_out) %<-% run_gem2s_step(prev_out, input, pipeline_config, tasks, task_name)
   assign("prev_out", task_out, pos = ".GlobalEnv")
@@ -449,14 +448,14 @@ pipeline_heartbeat <- function(task_token, aws_config) {
 start_heartbeat <- function(task_token, aws_config) {
   message("Starting heartbeat")
 
-    heartbeat_proc <- callr::r_bg(
+  heartbeat_proc <- callr::r_bg(
     func = pipeline_heartbeat, args = list(
       task_token, aws_config
     ),
     stdout = "/tmp/out",
     stderr = "/tmp/err"
   )
-    return(heartbeat_proc)
+  return(heartbeat_proc)
 }
 
 
diff --git a/pipeline-runner/man/add_new_sample_ids.Rd b/pipeline-runner/man/add_new_sample_ids.Rd
index d7af121f..c1f8875b 100644
--- a/pipeline-runner/man/add_new_sample_ids.Rd
+++ b/pipeline-runner/man/add_new_sample_ids.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gem2s-X-subset_experiment.R
+% Please edit documentation in R/subset-1-subset_seurat.R
 \name{add_new_sample_ids}
 \alias{add_new_sample_ids}
 \title{Add new sample ids to the subset Seurat Object}
diff --git a/pipeline-runner/man/call_subset_gem2s.Rd b/pipeline-runner/man/call_subset_seurat.Rd
similarity index 63%
rename from pipeline-runner/man/call_subset_gem2s.Rd
rename to pipeline-runner/man/call_subset_seurat.Rd
index 60350a77..3a194eb5 100644
--- a/pipeline-runner/man/call_subset_gem2s.Rd
+++ b/pipeline-runner/man/call_subset_seurat.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/init-functions.R
-\name{call_subset_gem2s}
-\alias{call_subset_gem2s}
-\title{Call subset gem2s}
+\name{call_subset_seurat}
+\alias{call_subset_seurat}
+\title{Call subset seurat}
 \usage{
-call_subset_gem2s(task_name, input, pipeline_config)
+call_subset_seurat(task_name, input, pipeline_config)
 }
 \arguments{
 \item{task_name}{character name of the step}
@@ -22,5 +22,5 @@ call_subset_gem2s(task_name, input, pipeline_config)
 character message id
 }
 \description{
-Runs step \code{task_name} of the subset GEM2S pipeline, sends output message to the API
+Runs step \code{task_name} of the subset seurat pipeline, sends output message to the API
 }
diff --git a/pipeline-runner/man/create_sample_id_map.Rd b/pipeline-runner/man/create_sample_id_map.Rd
index 9f3c1382..4926d873 100644
--- a/pipeline-runner/man/create_sample_id_map.Rd
+++ b/pipeline-runner/man/create_sample_id_map.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gem2s-X-subset_experiment.R
+% Please edit documentation in R/subset-1-subset_seurat.R
 \name{create_sample_id_map}
 \alias{create_sample_id_map}
 \title{generate a sample id mapping for remaining samples after subset}
diff --git a/pipeline-runner/man/create_subset_experiment.Rd b/pipeline-runner/man/create_subset_experiment.Rd
index 3a679b7b..964e1a47 100644
--- a/pipeline-runner/man/create_subset_experiment.Rd
+++ b/pipeline-runner/man/create_subset_experiment.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gem2s-X-subset_experiment.R
+% Please edit documentation in R/subset-1-subset_seurat.R
 \name{create_subset_experiment}
 \alias{create_subset_experiment}
 \title{create a subset experiment}
diff --git a/pipeline-runner/man/diet_scdata.Rd b/pipeline-runner/man/diet_scdata.Rd
index 5039fc32..0d3b5a6c 100644
--- a/pipeline-runner/man/diet_scdata.Rd
+++ b/pipeline-runner/man/diet_scdata.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gem2s-X-subset_experiment.R
+% Please edit documentation in R/subset-1-subset_seurat.R
 \name{diet_scdata}
 \alias{diet_scdata}
 \title{Remove all unnecessary data from the parent seurat object}

From 2ebeead7c23a5c2a9cbd194fdc69f2bfe98d3b33 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 16 Dec 2022 09:50:58 -0300
Subject: [PATCH 07/34] update sysdata

---
 pipeline-runner/R/init-functions.R        |   6 +++---
 pipeline-runner/R/sysdata.rda             | Bin 3123 -> 3125 bytes
 pipeline-runner/man/call_subset_seurat.Rd |   6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 86ee1cb6..ed79bd8f 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -264,7 +264,7 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 #'
 #' @return character message id
 #'
-call_subset_seurat <- function(task_name, input, pipeline_config) {
+call_subset <- function(task_name, input, pipeline_config) {
   experiment_id <- input$experimentId
   # remove when it's added to the input
   input$subset_experiment <- TRUE
@@ -486,8 +486,8 @@ wrapper <- function(input, pipeline_config) {
     message_id <- call_qc(task_name, input, pipeline_config)
   } else if (process_name == "gem2s") {
     message_id <- call_gem2s(task_name, input, pipeline_config)
-  } else if (process_name == "subsetGem2s") {
-    message_id <- call_subset_gem2s(task_name, input, pipeline_config)
+  } else if (process_name == "subset") {
+    message_id <- call_subset(task_name, input, pipeline_config)
   } else {
     stop("Process name not recognized.")
   }
diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
index 58e71f4714dfaaad1c80ce19cd060f1c220a1bd5..1a20befed27612d8290a1b9312994767a950807d 100644
GIT binary patch
delta 3124
zcmV-449oMg7_}G<LRx4!F+o`-Q(1En)Q^!4B7ga6X5RDWmxtW+eH&rE(InTi)9%|l
zcX4e^Z39?Jnjoi$ss2oa^qHx&XqZWq^q!N_evzot(;AwODdtfhr8Er#AOK+u)Bw<e
zG62v<Dm+ao;zrc<KSrstjWihm4^RyN000000E(Z0BuzA@s2-u}8UO+60000000000
z0Dk}jQ_vtp6+KNqB-F(9Jx^(pnm<%%Xfyx-00E!?007Vc5-3r%ObU99JWU~>00u)w
zjWPfY00w{n05lI!&;XPnH5oFVnrcr|#6o&$h%^sTG{Ryr1^@w|0Aw1PU^NW%lhcdI
z&Efx#H`et<bYQ`@#xaeyz-_cPkk;xK34h2BE4oB@JtPFAbDELhAX^!c5E7w8Kx>3Z
zj|8l9oPmxi97wQ);(>Hfx$?p+k`wple{f1dwlO3wes5!{-Rk&m&4|R$zXcs6ugma6
zvG&aJHtHTvEif5`n)ftXY72rW#vq|eQ#G~57shPIv>CM?*n4vnhB3G|Cg{gEW`FR=
zH>UW>H|9~6a1UhPJ7)pHaT!kG#(PJ$avVn~?i|NZ=QA13>E1olIqw<{gP7<&^S*N(
z^WZ)c;xiu^@*V@_G#U?>`Ol#GPYK~XMv0_)4@v4hXL!>;qu6-OFYM4ee&2V$;qm!<
zbaZ}tdS1Ox@~`%z6rW1-rMmoBkAGqyfu{Kra7zS28q5g5C~MtoUNwPjN_^=PWmgm9
zZvw4Jp&=z%f=M<2gkK>uFzaq==Nk^*4BYwL;9C)p<})HDtyc;|;m1<TZMU@F=-X}7
z?k2YxWP-xyP2)4Sh&JAv<WLtkf)F=&#i|<fOmKCXhW&UoF;VfPv8l~a<bO@x(Ied4
zw+p23RifHa@?43pNypma>T%Ax)nnw0?9n>3A~!OVN)<*ZoAevVgs|@w6O|B+z{Pgd
zpcdrn*7;0FKrRDy5DlbwsZK7j_`v{?`H`c;2`-Uh`{~U@2nrFk5?c9toFi)R`e?js
z*K)U(D$wXm>lIjqa%6-iM}H|vk!2SWHA5(IG%$lyiCIuGH-l=!Bo<{_!`5^c7-T>S
zAmZsFl4L|*3RP^O7?&=XvqmuP`5_*3NrD?7tm2JOW@$vCGQChFgP<&n0XYLQv;v7S
zlS4wgImrr{JPMo2v$awcD5PrZt1y7Egc{a_09!&yCNKd55CDqkNPikoNE&U3X%G+q
z!Yo7(n36~cv?jy|gF+CIajCH&{jnsFQ!Wk+7ywADYBfV<n#RgErd`J^saQCpurKEL
z77bBMV)|2qCX5cqgqb8^2f9J8q9A~WWg#g@N@8wAR7IMM$QhKDt3$wqGgWFb3OPm-
za_AXa8Q4(M!b&;`%70i4Ht_N4*|Aq+#;%wihdH@L+wk*pqiAg#U^w)}$Gn?ewXy2Q
zS4T+HFh(&tC8S5O#yCw?C#Y-d$_{3n)TEG$5UPM>A#QNY_~tztWwHyR0%Vq=i4$@I
zIsn<NOT<W-DZ#4j=z^w%(sW0ASp^K7i_(j7CuE_mf=Np%g@3}~YczXjLD76z_~aYC
zh{knfs$(x=qH9n(iA4*$a_!#hGP3>m*IAMnv3IieyE*!FLhaq))xbB@Y)00~$6`C7
zVn+ejHE`6;HNE5O;v8lSw{*r)hA<8RUp8B;<QY)jv}@iL0eO(knZacjVGNduj1jJ4
zb#|*|%XnzHm49l?!b(QMrgI$1T1<jtE)xuE-m?`g(<2zfIkj%WPDL+&a_xC&OqkKF
z1Q0uc=P{erOc*GJ8c$rglGL^>E@qHrc&2i6N!E`hlP0{&5X4Z`R*I~}2(l%Skj?pg
zn5!gED=Ji$OJSoV=4CK(wO9rU#nd?t6$xx=&QY|ESAQpSaqF5@ONB;EaRK)kmnA4N
zJ29?zO1-L!H5PPZB9Nxf*iMj`<A70H;8%YlJ6Qq7__IDl3q(P5tTi6SNXE%KnAgM=
zy__}zaR@>e?;dUQk<?3Irs}U-O%5wi8X?I*LLjoaM_gE_v}(lxnu&24=CoOL0jmnc
zq(e=`aesSN5StrS8Hy30Tyi-S8Z>TKAk3G;D^(dvwzbt&!vn)2BD6uRpzuN>5b-H|
zXlqL<s8xl+3!%k35oq=j%UE_KBE4rJ4RK;!Fa}0NxGW5K?7fo<M8>o`6)&@126ubu
ze8RVLGoh2xhL1>CiaD#LO$)NAT3MATY1!Tpm46xRs%Esq3(oMA3Tj1aj|%87If^w3
zgk0l3eSBBXF-uzR=>V<_tc=dMaJpWYr2(ddwzw#R47w1?6iPg?;ERmHD*-G7!!A%%
znHM!+Mp9ZvZxmn)w?%}iSW}`1Fd`fhA#kQ8+AAPVDqPSqPRbNiHHcd2gcPEsm1eb9
zEPpPj$g0rOiBA9}#t1SaFFw5LYJQvzqj+r2gS_sk;^MAwhayNRS8+Lxq_U{YpdU4P
zka!c#!l3MK3b^(;)(d1Bd4@D;=7rXbM6&mG-r~kuOArYIc~Fq6Di|#lL#s1IEGsA~
zgDR=|Ik2>ACz|UPf|U(k^(J4TuufwO`F~>TwVj$pIbk-7Lh?(SL|gi_S_kYvfG^Yl
zL{Ao((iu#bVUD#FGqynS3gAG9fh;59)mC<#%;wx_w7pEFBT8=5>{Bf|Vw|8@Rh0c^
z!qijTl-#C6brw4APPC63Ffg&0B8&$PgU%g@ut&|9PY#Q0rQVpJF1;;Bp=vrUA%AtZ
zCVTCblt$nI5hX^V4`Ov{6*~JUuBRL1t}?`lY0wo$gpjFCxxmYfg1%E%v~nt8UAbYo
zHF0ZE?F;zOc}0mMRk8|bM5DJh<}npYOjEHnyX$f`b#|3Mo&5?kOOUu4+G1&?Pzsz<
zIHIe5^Uo}KuxSwx3ovO34Ymo5fPXP5(<>4IB?&wWSR~6OQ!n;2>fX>~R$nLC#+nB+
ztD;vfW^Jce0kha>GzC#bbp-hnNei!aiOi^80TCl+BPpL@(;13FRbzr-vvQ5D>C#d(
z77DV)RGgJ;OAD8GCoN&5+Fb19XCy7~u@;KpOT_UV66I0HbUJio*sVxoxPMh=8)3_Q
zCupXt(=7y68!<~{Ay5!_uo*<YoPArZ>_8#WF?C9zqo<<LSfc<~D9*4M9?5*G8uuYb
zU}j!@y6-Y3cOQgE1`>l)=(wC_NDN)cr4?Wxt`urDDJ^`de&wmT5M*m<1|VC#UwLjE
zK8+mq<b>DLT*?fdvzux!7Jo=p$5X6l;^^JDT_ZI*sf7W9TMSh7O12TX0@?%Cd{s28
zs^hW{7&>p|haqAr!A)8#SS?{J;-$6_Y$NRD8M866aIqX*OTh<ADy>d=u%MVfl{ggd
z<_{a04t9vN0Ksscg3gDr1famAc4EAa^Ko2eW~RVtv>yS{5Peu<>3_65`_SQD%M@zw
zSi!6}CZD1q$+~?^TNH(bB_TKziV2?^K;xiRP6yJE2{PBYfjc40G)(O@#v>z?j~@W2
zdrplhLMsQWenH~~-Z}O{o8mzEzr`gis~iVq)7nTPo!OL%nL&|?c5`LuR~-zhBae`-
zfwT}LeiGWEW95^ZW`C8$jXesgTbxY7nzzLf$Bc|Uf+U2X*mpYzYTS?Rp!s4l5fqbC
z*R9!Vj;59BN-sfAV!o!McBX;B`)h_mF(nXNy-4C)(87||`2HY=3Znun=7bNO^zE+e
zYRtRH*`rpQbbfKUFfAZg9NCGGa%Sb;v~B_v7xHY$Ao}gXDt|IV128xkQZt(n#l$Zf
zl&C@2gzu%NF8^%QMk*mKCFQ(@BMV$-Yc5c|YEc%^VA?FbLUJ>t@e#|Ym>Nl64w2U?
zHO4i_HY^DVnu=14xX)v7+a_vIU7mAPrlc^KDK?zLe)!hh5y40clu&BN;bUVoNd<Ov
zt5j8qf`&~Nvwwi(pqtdEh}qCeft{NYbs(frI0u+3b}>}?+Oi{SE7rihD1fZkBH~$4
z@?C(GxwtS*>soQd1i{An(Oj2?mY5L*Ypm9c3PE{6zJU=FBZ9-e#PvoP?W)SW5H|TN
zaWz(;)KD9@tH>Cl8Gvdj)=z!i-DZ2@2N^WQ0HQ_!;~hZ6Yi+f)t)OjWI3Jf>2LS^K
O_ZM<SI8cyt5!8>@Erogj

delta 3122
zcmV-249)Yk7_%4-LRx4!F+o`-Q&}>=HW85yB7gYCYklbYUwmBjeF_`u8cloMPrGdP
z?(r=(v;-2SiK3oODu0yP0X(XIrh=Z*1w1L?q{=-{Ow}<xM%hGqfB*mh0kr@$B58z7
zXcH1LH7AKZPgB(LdZWr{(?_TPGynhq00002Dt-WwPfaQMh-i9-fB-ZA0000000000
z0DlcnAq3G(N_tJ{Pg74M%xI>MQ1u3Y15E~i0000013&;ssMS11gq~5S>Q7K;0004@
zpaz3Ss0X7+&}h@j84N=p03?V_MofXJlL<lT>YtP*(jHLLLqGr;03M(K003w-{V4T1
zoqUd$_`9cOn!y+_ZLy4FZLk|{4Wu^8jDKZhh?eChL?WmKfP^0b5D;M<(m=5cBoB<4
zG9axX2q4@U9sJxyV*!+4$NJ_kkrRGCcl~K0WEe>c(e}9gc5L<NYxY71_ttd{G3!U`
zM7i`#`ZeS~KhmfSq?-?dJ4y-!QK}%JN>e*p!wf<i;%o?PiRB5jZHwGDPb}Uy&VK`h
z%5fdDjO0AS%{fmv<~xTu&}KaIx_6HG=p80w(LE!;Yn?`OLE$_{rf5Av*^XK<%!1))
zxMZF)$ahSm%6X4DY_@s~v(kEu!!Y%iOUJPGp5rviGmm-jGRxl*;gg@Cqw@LvGw$8e
z&b^zb^|AJ_ex&Q$&8IAO?LP#BAAfXcT>_aTOO?Sm3>&b$x;??KLTp$QGIwavqMb?R
zLPAQb1d?n42+|^$#y7Yp&@?&e&()#cg|QhHb2=hVUD6T&@9E0cw{5?BTHQYL@%C$6
zYcD?8nVUb3bLO^$swLo&Q<q6`9D2F6TNYu^j*a9~dWkbs<|u4Vmt>9I-G8?mq2Q}c
zw4>p2CcQ@99Gy;BE-JD2so2}rZM3JQmg1IzGiB2o$w|e$WVz``*b%SD(7-$sy{a=A
z&$|tX*+>q=j~Q2gn;am3Nc_m;^g@?RxsF=rBoHW$B&DmY>NiIdZuW_|Td(D9H0<+)
zO)Wb#=Fm)#gvjM7GAyFvMt`Vf4km^WYLP1{21f90X}XzrX0+nesJO>j08$(MG{l(^
zJL6k63=@+E7EbfZaS^RG6sn*h^@XWAy@8>vRa3NC1t91PBEU{S%&h>TOk~i|uFi5o
zrcVN<@~qftdS@e5YWujjL>k!$0LX-rOfUd}hyX=YBn>E%2Ag6UM1Mj67)HcIgv63S
zM$nrOA|`|(B8H}7LHlA!AdsMOFkk@!wM~}LY}jA#BeO`Q$1oHw9svyv)QpRVAjQ)9
z-Os0c95l*};&PN>K>5-_1aZhpQWBt~5jzRs^>;ztkFcW%r06D%S)R^?V5by=2s=F9
z98}cN(43`+;@#oi;D6v|;EC^N2fFO%qF&dFpP3m$DA^Iqs$N~>*zJsNoOJDNjUyD2
zMBObxypA!%YGqB~tFS0->Calp5rGWI48%3gIpZwndbzMd=>lZ1s|gcw1Hu9G;akpf
zCQNaAL&8GFq}G?SPItN%GJ-2bawmeJt%8#mW(|%5#C$5JeSdT?5P)q=yj6y6NnF)1
z<iRwx9z?-`cj&)=Lo+iLC>LRp88Ucc?YKK#Jeju^&s)399JU+@^*cG>Q3$ysn0M&V
z9=X*uV~sw&LJbVIbk8uyDT9b;&zkK?2V`$*z3(=dF+v^lz%Ih>ChHGol^H0+(zrw5
z`Sas9oy@IgcYhLMI7>%7<j%4~BQC&*o)|eA?`)JrBbQsdv3@pvT*~r!ysgE|#|&`8
z3)H^uvB_MfoH0cU^ih^umL*)yAj>h#?c__#N7AiE+rWn+rp(o5cRiCKbgrQq#@^KM
zbvlN2SQ6^_Y0XL*P#mzcfZ?3O2qFU_Un@Km`y;H{*nilIXO)%1Cs}1E*mrGe)N4bz
zwun}YESiikgyuy}juARS#~X$f!sWSnD0~n&-xqkv2C72p({{c|4iYjATO7cx`~xB<
z5sYJY`QN~BbrV)8wyT!ILzfi+q8yY2A`2TC*A5yj8nHlTqFzRMtrlHCYRjxS7{$vV
zbn-pOhJSB2xfZiS7l;dOXukU0m{O3Mg}D(*wzbq1TrPNIL{^A3v>pgVLLMbB4Gn2!
zRSK}UL3B8$Vl5uRS!)i&gjcNOA+Ib;+!+}b-C$(o*?$x*6B|(MRKU%A8Qt(E0}9>D
z&W2A)8a*LmDCVkCFe-*(aYQnTJr!7x$rgrU=YJK~Q`E%(V<cM2CeVfHL073@n?TW{
zR{iu)R}3!@rVC>1>Cx12@!apsQh?J!TU-=DhFu6{3MC#`?u*RAD*-G7!!A%%nHM!+
zMp9ZvZwy=yiHI&GQ%aj4gu|nZ8f0qItbsVOgFxz*V8bBz5Z2Q)D^^%DQLNBhU6ZC~
zn}5=BrXWj%5M)Mf-rRNFjet85v^j7=-S;%{@y9Sjf<RKP+cA!5rcs$zA3SrQ@3T+D
zpzN*;*>+kQW%Y)gvp!CqaBWG*mTvbAt^B2=xd4zhnGzYPMvKNNdbK>&#&!l^c4j;{
zKMT&%y0W==Ffej)uB)Ccv&cg2t_QiDZ+~25a^0&vfyq~8*q+Bk_5=7(pjYkyh@LGo
zq)ALJ$1Q3q@MHtUajbz60$4}OxmFB0<mcaUsfMWxjw!Oh#WGinmy80!y|eI|mm|GJ
zvwo=+^jP!S9cdm`Vqs%AMHmhnPdfYgaYxUbX0L6QQtwPq7adGTp>7=(kh<HG3xDmJ
zN*dq+5hX^V50N{ng-*W8BU6p=jBvt?uH^~FgpjYMZfK5lb&9;!tl&VWtihtQaB7CN
zh`h7kluIsAJXlFF$1=}uu8ih07!9?#Rvhh)cX>NXpV)rY8RpO~2DTWUd6WXP3P&8Z
zZ{B!J-1xAFhy|E5gofJ$#=sbq(ti~Q0FsIhm3>h$eokugP7GW&8f`O`^CyDB4-0Ij
z4X<VBTOs%yc|3+;hJ}TCYNA<V3zeJl%mPYxW;+}gAyKV~QqFYFE4b|EcGMb1xd;hu
zGiclzQyrD3mCy2d0nZZLZj*GgM?s1*@Xs_&gyn~3TvscSUR`RJ!yN?d>VMeAeFt2T
zRgkU`t1RG}vK7<?Zx#b7SHj+{*7Q~o=$yKxP}||#v_w&eEL3L@9Yxl275v^K5YX=3
zTsG!(ws<`<fH+GJ<F4CaYoHv<!=)X7Dp?E{T((=YnK4l;-6%4$+(C#F;j_lKPJbU?
zRPsb?T3pHw9tU?6VKoSq-G9^FGxU17XfKtEyJ<=T2R0b1)ynq~vIOW4>U^4dT8g3R
z3Wh$+S)$5Zj=)t{U5kXYnwcTp2|p1VyXMNA?4UHr7}j8vDW#hfB6b!P2b9N42X#?N
zzohEWPq7IZ3ooqB-NToL3<t41sj{`zk(rtlV;pJ_0Pu)Fs%v&iKYs)9QoWWa@_8;}
z?G2LGjxpK0H!^HOEGSYFfk>d3WoR6<2$O;I-~=>l-Nc=sbB!~*RV@u8q^+ym6&EYR
zicpHjy^2Be9`3E}kgEWdAUV1uhcd@O>vp!1q<5Z~NU4+=Sf{TWFT}do8C*wTB3=Qg
zAV~y7?3U)Dv!3+8p?~YKnT$1yS%VjHSv|?n<Xk3DQV&l5Y~`LvIFNw3ScpX=)pe|K
zxr1Lz3@R|xtFdP7*2^_VNGDx-14JmTZuJ`~-i8#Gv&Hbmd`f5%;AlYnr_-`GTUK@6
zM=?FCwNd7s6GLDE+3cvP5L-27ymSIq8TYQyK^t(&S~MYnoPQl=6q0k~BwV8LxTRNH
zvI*CkogMDU=xC^f)R&U-6pU?gow8h^deovVg2Av^dWK>%r1A}$aHI?*j~1cORBLG0
z0m!f*Qf?H)(&auzW3n0PL3SC9QyP%MVx-t}jr-cRVvY(xT*8A^Ukr?CqCiH@oK-AD
z;X^|OvTz(ICVzJn@tm7UP&2cUGbsfkfxtTgF3MV;7A%O_Bz4FasSp*Mf?hS14<*<M
zOP_-j-nE9=K%6+=nHAK$Ftt`f7VfKhAPIo!5FG{uB9%rMOV`z4Q0(-UMeV{zjPa#*
zVJLV8tg!P<4(!0tRz*`zPWKZw!y8;wF?>^b1=Eb$avSscd2pcTP;|S`SO@_li2udh
MkxmpO46qG^Kmc&2H~;_u

diff --git a/pipeline-runner/man/call_subset_seurat.Rd b/pipeline-runner/man/call_subset_seurat.Rd
index 3a194eb5..02ca6a98 100644
--- a/pipeline-runner/man/call_subset_seurat.Rd
+++ b/pipeline-runner/man/call_subset_seurat.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/init-functions.R
-\name{call_subset_seurat}
-\alias{call_subset_seurat}
+\name{call_subset}
+\alias{call_subset}
 \title{Call subset seurat}
 \usage{
-call_subset_seurat(task_name, input, pipeline_config)
+call_subset(task_name, input, pipeline_config)
 }
 \arguments{
 \item{task_name}{character name of the step}

From 34b8f9a7d8ff60b780b86dc47fa212aa49e8764c Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 16 Dec 2022 09:53:45 -0300
Subject: [PATCH 08/34] fix sysdata

---
 pipeline-runner/R/sysdata.rda      | Bin 3125 -> 3169 bytes
 pipeline-runner/data-raw/sysdata.R |   1 +
 2 files changed, 1 insertion(+)

diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
index 1a20befed27612d8290a1b9312994767a950807d..c750a90d5513b4f53e06689742df642eb0dbfa12 100644
GIT binary patch
literal 3169
zcmV-n44(5sT4*^jL0KkKSycq{E&vmA|MLI;|NsC0|NsC0|NsC0|L{OS03ZM$5C9+r
z;0zx5G8cT%d&h3kB8R@qZBr}HVfT8jyhxYZb?wRtQ$)}bQ~eDQn3FUnX#qBx8VRC$
zYIsQX4=4=)00006fEoxOX_HeXjU=C{WM+|*!&LP>Lp3z?fEoY*0AgqW0MO78Q}Qr{
zC#dy3Q`9tIjRu-z3?n8G8X7biG+<3M3?m_<Oqv)>8mHtFLMfD!@;0ZcndDQ$RP`~b
zqerO8pfgiJpa1{>00HU%0!2YhsLfMNsiX8L(9jwH01W^DJwR<TLqVpVQ%9hWMvy&2
zK_rCAWMXNEAg77tPc){FQ%@ApwHgM100000000JusoCz=>+2SEYG5iu1`W0`jBU1o
zw$R!`VVg27DJCFYn2`iPA`2MzyBPowSUJfcSq2gZT1b%qR}Xv8UNSqmI7K5M%rtB+
zQ!q%9dOi>K85p)h5<>CUv5?(Kc5eq=3@7I40*7f8*sZruI}-eH&SsqbNl-<MVs5J8
zOoJ3+-;E(8%v!?@K@D*>1U5uAggb`Vyr$up-8YT%Ilyq7M;YBY4)NbSla%lr$CPuO
z2T9a;Pb~8ebFA}EIM+P$u<bm@ob(=3AgpASOp>`^s#r3QN$Na@tjj%zw5(P=$Gprw
z25FjnOC_V^XCE`@WtPnIK+G*@mJF>guiECGrj@HVbM0Hz`_cSSFJFGJ@rkF!dhwu=
zL2f9{3uXjpQR_eo1Pm!dLXiZ6p%k7di5i~jl3bb!6XXyif>tKbg$h3rZ=GiD{oc``
z%};M-hk2I7WPOKXB5Bxks6ri8rfXZg-@UDE-cvSjq{XS~Qx#R0mU4|ZB2WvaVuOcK
zHxA9GEjFyf)sG%wtK2PVuPi;v?srvo`+VJ=zI|5c=H9WMx?a<y*Mp7s@!<Yba^p04
zdN~`RDd%O<wCfSOZy@g_ql^0Kc#_cELcN;I2O5crmd1{|#4@36YYn72R{lIK)esR6
z%?`(mEj)$Hc7pR#@RkLRjFz2VXR_{(3wM5!Zv%PWUlyCZ!Y0?7w97)5r&9J;d)&+$
z6wvId#hryTJzg&XlUkZ@E~LA&USjs4k*^{Eq&xmZ#GMY4ooeld3CV*CXL;IUBa&2F
zm4Oe)FKMT^HZ@9<v|R;pXAyHDSqYe$5LB|7nu|@E=*;&^15Ei(G+N3XM5Z-T6}iB1
zq#Nuc0A)f+CNz*BVgv}42?I(1Lrt`X3IPBNAz+|FVn85;q?-^T1_>b|W~68l{AmCp
z_?U1sG=L%~%Zs@iG~?D}un=z=Dsl6WWXaueqv!S^0V<=KwaJ+~UAR2fxcH9PNeB_h
zgpj04W+OQoA%YNb2tAq;!ZSolHfDNsDhhFEFoUzp!-_TQDugYjnvV~6!QRP{jL6q<
zRuhXWv%RyipM7q|?0Ku1Z#|b1^Ff){?@M~+((x+W3%938uR`Q`#hF{YI=u3SM?U(k
z5f~wv0l0?A*L-cf{fga$GLR=lieaHSAUq%+ek(IN6Qdno@bHkiqH5HlF@z}T1ZJtS
zMX;#f0cge9gH(uQ4=u3PSpb9sVc_AcGr3+vAm$2h&oob>U%t98QAcYz*@p}(vdaZD
z`VsF_yK70-Y}>!7fT4vea~q6tDO*~!aS)+9_Gk}V?A==HZ`afy$<v2C?i%eN>KZrh
z%UeL~j&Dz#%SbU|9rwU4!!9M(Cc{+uGi$@$_ioI*<mr36xY3CArK7}lXImk$)6F6$
z7;-w#eUwBaw`&ZMWn-A{=-{5}=VOK_qKWKqJF0QmG2_7;bs(aQ^V+vM?l^-w`~11-
zC#4&k-Rb%uha!#HshQmPSq!C>2-I|OoI08e?66C$?a#W_I#?aCcLBpWh7d#sM84N}
zDfkaFk1s4=ewSGcoo%*}%I@5iLy{fM`p~Z!x@$1RCpr}@aT91->yW~+lHIM8J`g(R
zS9pR2p%7M-xs+7a*4I}<8%u)oG>1$;PII1qVdJ)S(;FxX%9HV4f<nAUf-@-?2s0H*
zigp$eIHfUoB1%U{)v>GboYaH22)-20*@}^!XIvysPHvsbBY@O)I}|u@=D#4`v##g5
zcv{Z;L`8=+s$JeDsiSzcB1olZK5>hVGfF~eDhyc|j+y5bn2rfErh7cro1hGoj*4E)
z!<EXGYevnM#_=Olv5RY2bDa?9z4tQZ7Cq{#QmbPO_Qge3EYS+)j5{%E!?LVf5fYt~
z#u<vasne3l34y}_2tbV-WD->ddkcV_I8Q%9)>Ei7ahQc5wyw^d?Ho8+_g=AqH^Pnf
z5Ja2V(YvaH)(Gkvk|2q&K>(0oq{SFORmOnyGB~xJlTZ~^MoP?7HXMl#j<YJ;7ip~l
zI<bRb>epbyVE7@g&nQ(g!!?@i1;$xAW_f!~medVBupsC}cxmrw+il=)4MS#>99|5!
zx2I@Btt%-m{k9<C5Q3SOVO_J(gQpvJR8y-sRXA*$9Bi~`)^5$Ca4$6&V58EjR@@7m
zxd4zjvLrFagN~!hkD--qD53!%B1rg5y%c@o72<-xBw%*BNd|hm;?o&p;6KE3eTfNe
zWes=+Z6<d{TpQb$hwy|8_W>n<puL#JM;<J@8#-RC(HtcOw$y}>pgM_fS)wgW+}$`#
z+Nm*H1vb?WPuh{gx@jC2XFmIsxmNWSmA#-xeHJ|Q@L5t#VpJS0DQrN1xe7_(dR4;>
z1#WKH*UFz{L^6H5lB6z4#{wDf&YE$;#>R?31Von_kg2KBxLr;+#&=S6S2bp}7aPE=
z83G~%+6?GH#<qxYnPr0oT6AM+n+&l#%rP63TET4&eC`b~?Yvv=SGf0sF(LiI1$Sh5
zpHFL-l<V{><V#E_8Nso}Aqx}*v`B7~u(z{1bR?xyUgso`M#hkk+hCa30}{WIq<{%0
zL8!9j`p#HVc0MV@*K$^E<u+AXrDaL(ep;}{ztyXAcduanctKfM5(q80a&~r@-@+dH
z>N}If0Fs{$0nBku`b}(2K+ZXiS?{jCl#2|EI2Oa-X7bLR(%{_wpQJulFmvcGmi8Oi
z-5xe5GSk*Q!CeL)<lEOaB4yR95irAMFtOrr`Aybq2&!UO4_VmbGNo;SZ3V%T+0Ubk
zJ{~q;hZq+TTn4^=yadWQ0?AAqz(Qa`$WFZ;3lG_Io6NH94ohEu-4O$jsQtgD>%8BH
zIL*z29L3;8q8KG|(Q8MgH7K6vS`3kHAjAu}ba@u(&;0tmmt>6A(p|L&26uN9VJ0J2
zhPGp~qs9DjUoa9AWlV^ngPSZ0ES6<P*bB%IPx9A(bzgeH5l@d7RD2}0Y=(loR&AFu
z6Kd4V4PPk<^sRVYw_ch<)vG{E`Y2||_#K5t5b_xAarE$%9Sx(-i7mjG%8IK-hc_FH
zLk2W@(#)GPPO`HzJ~oau2tas*TE(j&U0ylJA%Z$rw8a{Hmhs{nCYHTX+nsrGY*Z{N
zDx&FP*e=}GLo8)=Kqg3nDezyFP)=tq=s|j`<LWxf`n<xU@4Psr2rC}$q=W`N&0E_c
zp$L&ea`VYkp=3f?^K_dIJ=txhTWC6uQr$SGt=Tlb;iExYw39HwZ55l0mHLW9vULw4
z>S|_U%0<3o$;wqL<rffeNQnq8o$e#Qc1U<ogtuIcLMbk2YgyxS2D|ubD8m+p#*DJI
zdy+tnkx^a@604!`%4K{=LB&xqJOM?+m;q~qfB<~ath%kseVND^lG2@W=%{Nrjh4m*
z(%`~jL)Qka7>>*)sMxs!u8vr&*Q1?i7KD)oExG0fxbdW-hB8SYm>1ruR&mBSAsPk5
zERlKj5Rl6XOz>`X7_n4~fL>ez{8U;-WmG0p^(au$)h@op<CRh1e8Ccpw*n<Lg;Z1O
z%A0P>nl63lb@*}?)UqLVS$Zd7-yMp1?5l=DPY~Q&kK&1<m11}IQ&-GDD9kpVFn2WU
zkvoR%({CXr4DjDbuR1If2ymWha-(ipFxwIdG)b-1GKTYq3jWxaYXGi=3&i+S!?f-U
zu^R+}l6In}Ka)8X(6tzE)cy>BVhWAa9s>Y^OFHzw!yJ)$DCcV}Mex?zhKb3S+#HGr
zFA%*!sSwy1>ZmFx%gvE6+RIx4ixrAam?u4Ru5<Kx`RozH6Rp(r&^!r_Q~wuoML1B9
HRRr@c;up=c

literal 3125
zcmV-549fFDT4*^jL0KkKS#uH8j{p-jfAatT|NsC0|NsC0|NsC0|L{OS03ZM$5C9+r
z;0xaQYG&T^=9h=u^nDv)zR@Jtv(xU|J9lwyO>F~MN}3?2h^hWeg!GxIv}l+~l=PmH
z(teSs)6*K7k16I+AEh)610Vol4AcP7f-(TmMk+i_DdI-d^*=_bvW+wu01r?N00000
z004@gfFw;cr>GvG>KXt6>Hq)$0000000003Q_vtp6+KNqB-F(9Jx^(pnm<%%Xfyx-
z00E!?007Vc5-3r%ObU99JWU~>00u)wjWPfY00w{n05lI!&;XPnH5oFVnrcr|#6o&$
zh%^sTG{Ryr1^@w|0Aw1PU^NW%lhcdI&Efx#H`et<bYQ`@#xaeyz-_cPkk;xK3CIsC
zx<q(ABm|^$nvvijTN#lM5}`yuYlKLT1gvwMfsQI1NU((Bfpk#0^1>{V6ZhtSa7se9
zF(fX2Z)2+6>iBNWh{Vso1sx=>%kV_8_RR7&>K;!mFd2lJ_cU5+3xX)dAfZZAHMPbU
z#%#y58MPkRdvg?qF}OD-=*Kr^@W?l&_{lfsQI>EIWZyey0m5+^PT|ITN49btM=9<c
z$57`p8P4h6J<~bw8V-Y)=sokka~<>GJ`>_I9~tr<1LZUt519GSp!!b<;XFo(q<RlY
z>OE(8(?6rwc+D^D&^vzLcfaBB`FnJ9etLRdy-)J5_M;S^O7o?<{8*1-Ac3a&6L3oe
zLK@5nz$k0oYhE>hZAyIU6J=Ku<8K13NuePnS%OJ60EAy5GcfCJYUdjc-VEIN+~8Xg
zkmfTYCaqTrL*d6#%Wb!`-{{+I)b1v?8DxUO=S|}?w}>|0o8(XzH-Zp1c*Uw3^GtAc
znTGv%H8D}~q_L^ZP~=VC(Ied4w+p23RifHa@?43pNypma>T%Ax)nnw0?9n>3A~!OV
zN)<*ZoAevVgs|@w6O|B+z{Pgdpcdrn*7;0FKrRDy5DlbwsZK7j_`v{?`H`c;2`-Uh
z`{~U@2nrFk5?c9toFi)R`e?js*K)U(D$wXm>lIjqa%6-iM=42>Wfu}PLnv`HFoRTy
zSx_=JgKEPh7G+w))^rybWIzfa;^`uiWJF&IRcxUcmoAvIMlkOAAs%!|f*T;L;*C&d
zX+)wjy-*~Bpe&04IRi4Z0*Nt`LqfYb$qJb~3Y*HawNe!*q-yJ{Fo3ay8rFmWTS7@D
zFaZM)0E*~H8c;|YZHQ?Q5CFm~L=c#gNC>nh#0Y~z5Rq}Iu^|1iB#=`s4h$FoNUUl#
zLuQ)B$~UH6$1SN?IHRyH=J*y3QA}d`Q-daq4#<R=Bw+`-L9e19fQMxvDM(6UZbVc?
znvBR9l$NVQz=ShZYBCBrMiX-A8Cn_GP}9OnItj{H3^wrb>e;bZW5%wS9)~%(MceT6
za-(Q%8(=u}#mBsxUA3|5$5%&4)G$UdIwhn>vBo$}RVS!x>&gyhoYbU{ix8@SWg%{G
z%=qR#8fCHzq5@==qKOl713CcNtxLp6nJK}l?C64~gwk|JdszhxoQu+nawlY=t%6BQ
zDuu$~YczXjLD76z_~aYCh{knfs$(x=qH9n(iA4*$a_!#hGP3>m*IAMnv3IieyE*!F
zLhaq))xbB@Y)00~$6`C7Vn+ejHE`6;HNE5O;v8lSw{*r)hA<8RUp8B;<QY)jv}@iL
z0eO(knZacjVGNduj1jJ4b#|*|%XnzHm1@kwN=Cw_a~#TAOoC%B6AWwKvlT7VBN)Uv
zwQj;rMK6DH?RjWSn9;2S5IceAF`Ly)7$}AsPh7c@)V3`yW{_ogrgC&i){iEWCcMiK
z#8A~%imb&5vL%s_&G~$ot0Yh>DpZzBVWTAGWiWBISOyBk)Hx0n32bW4QM8U%Cv$P@
znpI1MMoe)5_ZgQZC^9=Su6Ihks*5!ibYmirrq9?;keK6uQCr|we<C|s0mk^VK12&d
zL3FG&9>z$<$vc?W#1*}qHUe=7LKp8IZS#@TOJJtzuUkzHD^MCC$v{FNvbjfGSg5pW
z#Q~a$aT(^cS#<%c3dE#CO~rA0RS=sSRT+vApj>h}6dE*cS0K!n!z)!8O18DtRl@_r
zA|kXwt)TEiA`tN@d}wP+DyUV3!V96rI}vF163bY2BqF_MAq{b2T`&elMYt>sc<jBC
z3q;1WI~6apUIur2>3qVsb2Fin(uR*nSc*BTrA-U6sajc;DQVf>5tSM2s%Esq3(oMA
z3Tj1aj|%87If^w3gk0l3eSBBXF-uzR=>V<_tc=dMaJpWYr2(ddwzw#R47w1?6iPg?
z;ERmHD*-G7!!A%%nHM!+Mp9ZvZxmn)w?%}iSW}`1Fd`fhA#kQ8+AAPVDqPSqPRbNi
zHHcd2gcPEsm1eb9EH0?Xs?gJkPXHyx2r?rtKD_E`ew+=Xcx=vtyzZ&u;;wLqB1kD$
zaXF5pvZ&0UA2oT9coWUSpzLl6xb`{L3uGF3hBRsBh1QHjviEl0;>KD_5D5c$P>`!C
z7%de;t20F`D<~?1DyjN8u(WF@n(G#Vl?`6?CSRejPGbxCV(Yb?nngKbHj6^?OPfSn
z`m|aH>_C7o)Br?J7Mao+OqXGfwG=b9K=BITK!|}XBjVLocAU)S+-bDEOr;}AZqw{j
zEjnVHpjcIu{bs_{Q{0r?rbBfWI_^%ij~g&Bv6v!^2M&YI9f+_;&6rOPi)^Ldn4m7b
zEk~hhIxQh}w<de-m6S%{0TCreq7PzqY85*BD6Xd)<gPNriD}RkMud>5O}W6!je@>Y
zSF~~}VO_alxixWXQSA%((0N6PBUQ2rXhfs8Hs&!EN=#F+HM{F_Hg$HDKb`#wGfR-T
z8rouMrBDi-QaGZke)G>Pd9Y~_5DPGA2@SRhjes#J(<>4IB?&wWSR~6OQ!n;2>fX>~
zR$nLC#+nB+tD;vfW^Jce0kha>GzC#bbp-hnNei!aiOi^80TCl+BPpL@(;13FRbzr-
zvvQ5D>C#d(77DV)RGgJ;OAD8GCoN&5+Fb19XCy7~u@;KpOT_UV66I0HbUJio*sVxo
zxK(ExVat3cXr`;vEd*8@F-v42P!M>q8AQIEeOs;UKq1jFbxNV5r=rnVqX1Ya&afFC
z$$YCC_aR4MW?p@|?=mKLAB0E-5`$CdxSVE43|+~k6<{E)6lygoEqtkd<*B(4WNT>#
zAX~j(d2Sp&jU4vmgxAwt$_$>fn`$o>NL9yEtY_lr-MC#NH9Dz<0fSo%RP{=>5xD}|
z1J-<1G_0!QvJe<LZ{>#}Vk*H+S}RyBVJzaMwh(M1?ByA=F|u&699&Dm2TLlgPI<7P
zm_L;`6z}E_8<`Gvh_nE~aGrwBhp_~pz@&C!ypHp6TxMpbz-hD}0nreBSYzq5Jp0h$
zUdt3}?^waCHzuE=A<4RZOj{I%g(V?46p9I-8$jcrR!#@fkO?x^xq&+&%rs2xG{z$%
zl#d?(sC!P0DMBj;tA0V_2HrXLLYv}1`M<>_EUO#`Wz*V7BAwZkikU%?igt5l=vN&K
zsw0n(u7R`=Bz_XwqGRQgn`V{7jXesgTbxY7nzzLf$Bc|Uf+U2X*mpYzYTS?Rp!s4l
z5fqbC*R9!Vj;59BN-sfAV!o!McBX;B`)h_mF(nXNy-4C)(87||`2HY=3Znun=7bNO
z^zE+eYRtRH*`rpQbbfKUFfAZg9NCGGa%Sb;v~B_v7xHY$Ao}gXDl$U@FgO@eGn){_
z#4j3@s6p6-@1>_M|7_GoDj_W;<-CO>3tVSwE>OK{Q5Ml)+AO_7ax<jy5zDBU8cAOc
zk=H6U#x=+`EC~skic*ZY&tq`gCTdV!o^w>Dq%fE%Hk`wL_}1JJ!AJ{~P-@5FV`DT)
z1$K0+R8@(BhD{c;faIW?)TfBq&`N=wn-X;(q)<2qm@9TMRQlSoBWo+xz`ZDdtk@#r
zSy1v_fRwqoFiq=Pal{0{#`)1)mxh*@5d~|k){F{4c|pE`5fdYV!@k7zMi}j?%DfOZ
z`7CiYR-n{S8@H>-7@`?~YAM!Fecjz=d*TNfG{pd-MgZeL#A|J}wXL9SWH=v}Tn7OI
P3HKLrML1B9a}m^!3Z9E*

diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R
index 51bc8ae7..9c6e8f50 100644
--- a/pipeline-runner/data-raw/sysdata.R
+++ b/pipeline-runner/data-raw/sysdata.R
@@ -90,6 +90,7 @@ usethis::use_data(
   debug_timestamp,
   bucket_list,
   gem2s,
+  SUBSET_SEURAT_TASK_LIST,
   GEM2S_TASK_LIST,
   QC_TASK_LIST,
   INPUT_DIR,

From 20a3e5b876b80c5075549cc11a57abfa230980b9 Mon Sep 17 00:00:00 2001
From: cosa65 <martin@biomage.net>
Date: Fri, 16 Dec 2022 10:17:35 -0300
Subject: [PATCH 09/34] Some small fixes

---
 pipeline-runner/R/handle_data.R            | 1 +
 pipeline-runner/R/init-functions.R         | 6 +++---
 pipeline-runner/R/subset-1-subset_seurat.R | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 115c6c4e..18398920 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -27,6 +27,7 @@ upload_cells_id <- function(pipeline_config, object_key, cells_id) {
 
 load_processed_scdata <- function (s3, pipeline_config, experiment_id) {
   bucket <- pipeline_config$processed_bucket
+  message("Loading processed scdata")
   message(bucket)
   message(paste(experiment_id, "r.rds", sep = "/"))
 
diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index ed79bd8f..56719f26 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -204,7 +204,7 @@ run_qc_step <- function(scdata, config, tasks, task_name, cells_id, sample_id, d
 #'
 #' @return list of task results
 #'
-run_gem2s_step <- function(prev_out, input, pipeline_config, tasks, task_name) {
+run_pipeline_step <- function(prev_out, input, pipeline_config, tasks, task_name) {
   if (!task_name %in% names(tasks)) {
     stop("Invalid task name given: ", task_name)
   }
@@ -242,7 +242,7 @@ call_gem2s <- function(task_name, input, pipeline_config) {
   check_input(input)
   tasks <- lapply(GEM2S_TASK_LIST, get)
 
-  c(data, task_out) %<-% run_gem2s_step(prev_out, input, pipeline_config, tasks, task_name)
+  c(data, task_out) %<-% run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
   assign("prev_out", task_out, pos = ".GlobalEnv")
 
   message_id <- send_gem2s_update_to_api(pipeline_config, experiment_id, task_name, data, input)
@@ -277,7 +277,7 @@ call_subset <- function(task_name, input, pipeline_config) {
   check_input(input)
   tasks <- lapply(SUBSET_SEURAT_TASK_LIST, get)
 
-  c(data, task_out) %<-% run_gem2s_step(prev_out, input, pipeline_config, tasks, task_name)
+  c(data, task_out) %<-% run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
   assign("prev_out", task_out, pos = ".GlobalEnv")
 
   message_id <- send_gem2s_update_to_api(pipeline_config, experiment_id, task_name, data, input)
diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 81b84169..deadf5e8 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -10,12 +10,12 @@
 #'   - cellSetKeys character vector of cellset keys to subset
 #'   - experimentName character
 #' @param pipeline_config list
+#' @param prev_out list, ignored because this is the first step in the subset pipeline
 #'
 #' @return list containing scdata_list, annotations and sample_id_map
 #' @export
 #'
-create_subset_experiment <- function(input, pipeline_config) {
-
+create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
   parent_scdata <- load_processed_scdata(s3, pipeline_config, input$parentExperimentId)

From e04502736f68a74d07cc854d1b657183c15a8995 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 16 Dec 2022 14:06:26 -0300
Subject: [PATCH 10/34] add config, format

---
 pipeline-runner/R/subset-1-subset_seurat.R | 47 +++++++++++++++-------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 81b84169..a3821e52 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -14,8 +14,7 @@
 #' @return list containing scdata_list, annotations and sample_id_map
 #' @export
 #'
-create_subset_experiment <- function(input, pipeline_config) {
-
+create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
   parent_scdata <- load_processed_scdata(s3, pipeline_config, input$parentExperimentId)
@@ -26,7 +25,7 @@ create_subset_experiment <- function(input, pipeline_config) {
   # subset seurat object, remove unnecesary data
   scdata <- subset_ids(parent_scdata, cell_ids_to_keep)
   scdata <- diet_scdata(scdata)
-  scdata@misc$experimentId <- input$subsetExperimentId
+  scdata@misc$experimentId <- input$experimentId
 
   # delete parent_scdata to free memory
   rm(parent_scdata)
@@ -39,12 +38,24 @@ create_subset_experiment <- function(input, pipeline_config) {
   # split by sample
   scdata_list <- Seurat::SplitObject(scdata, split.by = "samples")
 
+  # TODO: remove from here and refactor all pipeline.
+  config <- list(
+    name = input$experimentName,
+    samples = input$sampleIds,
+    organism = input$organism,
+    input = list(type = input$input$type),
+    sampleOptions = input$sampleOptions
+  )
+
   # structure step output
   res <- list(
     data = list(),
-    output = list(scdata_list = scdata_list,
-                  annot = scdata@misc$gene_annotations,
-                  sample_id_map = sample_id_map)
+    output = list(
+      scdata_list = scdata_list,
+      annot = scdata@misc$gene_annotations,
+      sample_id_map = sample_id_map,
+      config = config
+    )
   )
 
   message("\nSubsetting of Seurat object step complete.")
@@ -65,9 +76,11 @@ create_subset_experiment <- function(input, pipeline_config) {
 #' @export
 #'
 create_sample_id_map <- function(parent_sample_id) {
-  subset_sample_id <-  uuid::UUIDgenerate(n = length(parent_sample_id))
-  sample_id_map <-data.table::data.table(parent_sample_id = parent_sample_id,
-                                         subset_sample_id = subset_sample_id)
+  subset_sample_id <- uuid::UUIDgenerate(n = length(parent_sample_id))
+  sample_id_map <- data.table::data.table(
+    parent_sample_id = parent_sample_id,
+    subset_sample_id = subset_sample_id
+  )
 
   return(sample_id_map)
 }
@@ -99,13 +112,17 @@ add_new_sample_ids <- function(scdata, sample_id_map) {
 #' @export
 #'
 diet_scdata <- function(scdata) {
-  lean_scdata <- Seurat::CreateSeuratObject(counts = scdata@assays$RNA@counts,
-                             meta.data = scdata@meta.data,
-                             min.cells = 0,
-                             min.features = 0)
+  lean_scdata <- Seurat::CreateSeuratObject(
+    counts = scdata@assays$RNA@counts,
+    meta.data = scdata@meta.data,
+    min.cells = 0,
+    min.features = 0
+  )
 
-  lean_scdata@misc <- list(gene_annotations = scdata@misc$gene_annotations,
-                           parent_experimentId = scdata@misc$experimentId)
+  lean_scdata@misc <- list(
+    gene_annotations = scdata@misc$gene_annotations,
+    parent_experimentId = scdata@misc$experimentId
+  )
 
   return(lean_scdata)
 }

From fc968011eefd728533e582659b1edc2fe0e84aaa Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Wed, 21 Dec 2022 14:08:08 -0300
Subject: [PATCH 11/34] make everything work

---
 pipeline-runner/R/gem2s-6-prepare_experiment.R |  2 +-
 pipeline-runner/R/gem2s-7-upload_to_aws.R      |  6 +++++-
 pipeline-runner/R/init-functions.R             | 10 +++++++++-
 pipeline-runner/R/subset-1-subset_seurat.R     |  9 ++++-----
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/pipeline-runner/R/gem2s-6-prepare_experiment.R b/pipeline-runner/R/gem2s-6-prepare_experiment.R
index 0d9b13b4..72e58a38 100644
--- a/pipeline-runner/R/gem2s-6-prepare_experiment.R
+++ b/pipeline-runner/R/gem2s-6-prepare_experiment.R
@@ -14,7 +14,7 @@
 prepare_experiment <- function(input, pipeline_config, prev_out) {
   message("Preparing experiment ...")
 
-  check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "disable_qc_filters")
+  check_names <- c("config", "edrops", "annot", "scdata_list", "disable_qc_filters")
   check_prev_out(prev_out, check_names)
 
   scdata_list <- prev_out$scdata_list
diff --git a/pipeline-runner/R/gem2s-7-upload_to_aws.R b/pipeline-runner/R/gem2s-7-upload_to_aws.R
index 913fcc86..fccc8cd7 100644
--- a/pipeline-runner/R/gem2s-7-upload_to_aws.R
+++ b/pipeline-runner/R/gem2s-7-upload_to_aws.R
@@ -10,7 +10,7 @@
 #'
 upload_to_aws <- function(input, pipeline_config, prev_out) {
   message("Uploading to AWS ...")
-  check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "qc_config", "disable_qc_filters")
+  check_names <- c("config", "scdata_list", "qc_config", "disable_qc_filters")
   check_prev_out(prev_out, check_names)
 
   experiment_id <- input$experimentId
@@ -21,6 +21,10 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
   config <- prev_out$config
   qc_config <- prev_out$qc_config
   disable_qc_filters <- prev_out$disable_qc_filters
+  if("sample_id_map" %in% names(prev_out)) {
+    input$sampleIds <- names(scdata_list)
+    input$sampleNames <- names(scdata_list)
+  }
 
   message("Constructing cell sets ...")
   cell_sets <- get_cell_sets(scdata_list, input)
diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 56719f26..fe03f867 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -280,6 +280,15 @@ call_subset <- function(task_name, input, pipeline_config) {
   c(data, task_out) %<-% run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
   assign("prev_out", task_out, pos = ".GlobalEnv")
 
+  if (task_name == names(tasks)[1]) {
+    assign("cells_id", generate_first_step_ids(prev_out$scdata_list), pos = ".GlobalEnv")
+    next_task <- "dataIntegration"
+    for(sample_id in names(prev_out$scdata_list)) {
+      object_key <- paste0(experiment_id, "/", next_task, "/", sample_id, ".rds")
+      upload_cells_id(pipeline_config, object_key, cells_id)
+    }
+  }
+
   message_id <- send_gem2s_update_to_api(pipeline_config, experiment_id, task_name, data, input)
 
   return(message_id)
@@ -474,7 +483,6 @@ wrapper <- function(input, pipeline_config) {
   message("\n------\nStarting task: ", task_name, "\n")
   message("Input:")
   str(input)
-  message("")
 
   # common to gem2s and data processing
   server <- input$server
diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index d0b607d4..b5de9cf0 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -42,10 +42,7 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
   # TODO: remove from here and refactor all pipeline.
   config <- list(
     name = input$experimentName,
-    samples = input$sampleIds,
-    organism = input$organism,
-    input = list(type = input$input$type),
-    sampleOptions = input$sampleOptions
+    samples = sample_id_map$subset_sample_id
   )
 
   # structure step output
@@ -54,8 +51,10 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
     output = list(
       scdata_list = scdata_list,
       annot = scdata@misc$gene_annotations,
+      edrops = NULL,
       sample_id_map = sample_id_map,
-      config = config
+      config = config,
+      disable_qc_filters = TRUE
     )
   )
 

From 8e6218a84636e54573c0b80a3f9f0f7df1f7893a Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Wed, 21 Dec 2022 14:33:17 -0300
Subject: [PATCH 12/34] send sample_id_map to API

---
 pipeline-runner/R/gem2s-7-upload_to_aws.R  | 1 +
 pipeline-runner/R/subset-1-subset_seurat.R | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pipeline-runner/R/gem2s-7-upload_to_aws.R b/pipeline-runner/R/gem2s-7-upload_to_aws.R
index fccc8cd7..37294f98 100644
--- a/pipeline-runner/R/gem2s-7-upload_to_aws.R
+++ b/pipeline-runner/R/gem2s-7-upload_to_aws.R
@@ -21,6 +21,7 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
   config <- prev_out$config
   qc_config <- prev_out$qc_config
   disable_qc_filters <- prev_out$disable_qc_filters
+
   if("sample_id_map" %in% names(prev_out)) {
     input$sampleIds <- names(scdata_list)
     input$sampleNames <- names(scdata_list)
diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index b5de9cf0..6608cd57 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -47,7 +47,9 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 
   # structure step output
   res <- list(
-    data = list(),
+    data = list(
+      sample_id_map = sample_id_map
+      ),
     output = list(
       scdata_list = scdata_list,
       annot = scdata@misc$gene_annotations,

From 97a51da74ff37caf3b08fd9c295a0a8032128492 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Wed, 21 Dec 2022 14:42:34 -0300
Subject: [PATCH 13/34] better sample mapping

---
 pipeline-runner/R/init-functions.R         | 2 --
 pipeline-runner/R/subset-1-subset_seurat.R | 9 ++++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index fe03f867..9080a5b7 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -266,8 +266,6 @@ call_gem2s <- function(task_name, input, pipeline_config) {
 #'
 call_subset <- function(task_name, input, pipeline_config) {
   experiment_id <- input$experimentId
-  # remove when it's added to the input
-  input$subset_experiment <- TRUE
 
   if (!exists("prev_out")) {
     remove_cell_ids(pipeline_config, experiment_id)
diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 6608cd57..de20cf6c 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -48,7 +48,7 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
   # structure step output
   res <- list(
     data = list(
-      sample_id_map = sample_id_map
+      sampleIdMap = sample_id_map
       ),
     output = list(
       scdata_list = scdata_list,
@@ -79,10 +79,9 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 #'
 create_sample_id_map <- function(parent_sample_id) {
   subset_sample_id <- uuid::UUIDgenerate(n = length(parent_sample_id))
-  sample_id_map <- data.table::data.table(
-    parent_sample_id = parent_sample_id,
-    subset_sample_id = subset_sample_id
-  )
+
+  sample_id_map <- as.list(subset_sample_id)
+  names(sample_id_map) <- parent_sample_id
 
   return(sample_id_map)
 }

From ba84375480dda67406642e316fc8f13dd139d496 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 22 Dec 2022 12:57:16 -0300
Subject: [PATCH 14/34] add cellset type to cellset data.table

---
 pipeline-runner/R/handle_data.R | 47 ++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 18398920..9b80d7f2 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -434,6 +434,37 @@ load_cellsets <- function(s3, pipeline_config, experiment_id) {
 }
 
 
+#' Bind columns not failing if there's an empty data.table
+#'
+#' @param dt data.table
+#' @param ... columns to add
+#'
+#' @return data.table with new columns
+#' @export
+#'
+safe_cbind <- function(dt, ...) {
+  if (nrow(dt) > 0) {
+    dt <- cbind(dt, ...)
+  }
+  return(dt)
+}
+
+
+#' add cellset type column to cellsets data.table
+#'
+#' helper to correctly name the cellset_type column.
+#'
+#' @param dt data.table
+#' @param col string of corresponding cellset type
+#'
+#' @return
+#' @export
+#'
+cbind_cellset_type <- function(dt, col) {
+  dt <- safe_cbind(dt, cellset_type = col)
+}
+
+
 #' Parse cellsets object to data.table
 #'
 #' Gets the cellsets list and converts it to a tidy data.table
@@ -445,10 +476,18 @@ load_cellsets <- function(s3, pipeline_config, experiment_id) {
 #'
 parse_cellsets <- function(cellsets) {
 
-  data.table::setDT(cellsets$cellSets)
+  dt_list <- cellsets$cellSets$children
+
+  lapply(dt_list, data.table::setDT)
+  dt_list <- purrr::map2(dt_list, cellsets$cellSets$key, cbind_cellset_type)
+
   # fill columns in case there are empty cellset classes
-  dt <- data.table::rbindlist(cellsets$cellSets$children, fill = TRUE)
-  # unnest, and change column name
-  dt[, setNames(.(unlist(cellIds)), "cell_id"), by = .(key, name)]
+  dt <- data.table::rbindlist(dt_list, fill = TRUE)
+
+  # rename cellset type to metadata in case of metadata cellsets
+  dt[!cellset_type%in% c("louvain", "scratchpad", "sample"), cellset_type := "metadata"]
 
+  # unnest, and change column name
+  dt[, setNames(.(unlist(cellIds)), "cell_id"), by = .(key, name, cellset_type)]
 }
+

From a057e5ab6c96531200cda827aa788596ada6f015 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 22 Dec 2022 15:49:35 -0300
Subject: [PATCH 15/34] add unittest for helpers

---
 .../tests/testthat/test-handle_data.R         | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/pipeline-runner/tests/testthat/test-handle_data.R b/pipeline-runner/tests/testthat/test-handle_data.R
index cfd59197..e5c3ac42 100644
--- a/pipeline-runner/tests/testthat/test-handle_data.R
+++ b/pipeline-runner/tests/testthat/test-handle_data.R
@@ -6,6 +6,12 @@ mock_sns <- function(config) {
     ))
 }
 
+mock_cellsets <- function(){
+  # get a snapshot cellsets json
+  jsonlite::fromJSON("tests/testthat/_snaps/gem2s/gem2s-7-mock_experiment_id-cellsets.json", flatten = TRUE)
+
+}
+
 test_that("send_gem2s_update_to_api completes successfully", {
     pipeline_config <- list(
         sns_topic = 'ExampleTopic',
@@ -88,3 +94,79 @@ test_that("send_output_to_api completes successfully", {
 
     expect_true(response == 'ok')
 })
+
+
+test_that("safe_cbind returns empty data.table when binding an empty data.table with a vector", {
+
+  dt_empty <- data.table::data.table()
+  col <- c(a_col = "a_value")
+
+  res <- safe_cbind(dt_empty, col)
+
+  expect_identical(res, dt_empty)
+
+})
+
+
+test_that("safe_cbind adds a column to a non-empty data.table", {
+  dt <- data.table::data.table(col1 = 1:10, col2 = 11:20)
+  values <- seq(1, 20, 2)
+  res <- safe_cbind(dt, bound_col = values)
+
+  expect_identical(res[,bound_col], values)
+  expect_equal(ncol(res), ncol(dt) + 1)
+})
+
+
+test_that("safe_cbind names bound column as expected", {
+  dt <- data.table::data.table(col1 = 1:10, col2 = 11:20)
+  values <- seq(1, 20, 2)
+  res <- safe_cbind(dt, my_expected_column_name = values)
+
+  expect_true("my_expected_column_name" %in% names(res))
+  expect_identical(res[,my_expected_column_name], values)
+
+
+})
+
+
+test_that("safe_cbind binds more than one column and names accordingly", {
+
+  dt <- data.table::data.table(col1 = 1:10, col2 = 11:20)
+  values_1 <- seq(1, 20, 2)
+  values_2 <- values_1 + 2
+
+  res <- safe_cbind(dt, an_interesting_variable = values_1, an_interesting_variable_plus_2 = values_2)
+
+  expect_true("an_interesting_variable" %in% names(res))
+  expect_identical(res[,an_interesting_variable], values_1)
+
+  expect_true("an_interesting_variable_plus_2" %in% names(res))
+  expect_identical(res[,an_interesting_variable_plus_2], values_2)
+
+})
+
+
+test_that("cbind_cellset_type names the bound column correctly", {
+
+  dt <- data.table::data.table(col1 = 1:10, col2 = 11:20)
+  values <- seq(1, 20, 2)
+
+  res <- cbind_cellset_type(dt, values)
+
+  expect_true("cellset_type" %in% names(res))
+  expect_identical(res[,cellset_type], values)
+
+})
+
+
+test_that("parse_cellsets parses a cellset object", {
+
+  cellsets <- mock_cellsets()
+
+  res <- parse_cellsets(cellsets)
+
+  expect_s3_class(res, "data.table")
+  expect_identical(names(res), c("key", "name", "cellset_type", "cell_id"))
+
+})

From 723b3df63b4536b8a3be185f25555719ebaf2ba9 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 22 Dec 2022 16:22:22 -0300
Subject: [PATCH 16/34] refactor, fix addition of sample_ids

---
 pipeline-runner/R/subset-1-subset_seurat.R | 43 ++++++++++++++++------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index de20cf6c..9cc73791 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -16,21 +16,16 @@
 #' @export
 #'
 create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
-  # load parent processed scdata and cellsets
-  s3 <- paws::s3(config = pipeline_config$aws_config)
-  parent_scdata <- load_processed_scdata(s3, pipeline_config, input$parentExperimentId)
-  parent_cellsets <- parse_cellsets(load_cellsets(s3, pipeline_config, input$parentExperimentId))
 
-  cell_ids_to_keep <- parent_cellsets[key %in% input$cellSetKeys, cell_id]
+  parent <- load_parental_data(input, pipeline_config)
+
+  cell_ids_to_keep <- parent$cellsets[key %in% input$cellSetKeys, cell_id]
 
   # subset seurat object, remove unnecesary data
-  scdata <- subset_ids(parent_scdata, cell_ids_to_keep)
+  scdata <- subset_ids(parent$scdata, cell_ids_to_keep)
   scdata <- diet_scdata(scdata)
   scdata@misc$experimentId <- input$experimentId
 
-  # delete parent_scdata to free memory
-  rm(parent_scdata)
-
   # add new sample_ids, keep originals in a new variable
   scdata$parent_samples <- scdata$samples
   sample_id_map <- create_sample_id_map(unique(scdata$parent_samples))
@@ -56,7 +51,8 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
       edrops = NULL,
       sample_id_map = sample_id_map,
       config = config,
-      disable_qc_filters = TRUE
+      disable_qc_filters = TRUE,
+      parent_cellsets = parent$cellsets
     )
   )
 
@@ -65,6 +61,29 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 }
 
 
+#' load parent experiment data
+#'
+#' Loads the processed rds and cellsets file from the parent experiment from s3.
+#'
+#' @param input list of input parameters
+#' @param pipeline_config list of pipeline parameters
+#'
+#' @return list with scdata and parsed cellsets
+#' @export
+#'
+load_parental_data <- function(input, pipeline_config) {
+  # load parent processed scdata and cellsets
+  s3 <- paws::s3(config = pipeline_config$aws_config)
+  parent_scdata <-
+    load_processed_scdata(s3, pipeline_config, input$parentExperimentId)
+  parent_cellsets <-
+    parse_cellsets(load_cellsets(s3, pipeline_config, input$parentExperimentId))
+
+  return(list(scdata = parent_scdata, cellsets = parent_cellsets))
+
+}
+
+
 #' generate a sample id mapping for remaining samples after subset
 #'
 #' New sample ids must be created, but the number of samples depends on which
@@ -96,8 +115,8 @@ create_sample_id_map <- function(parent_sample_id) {
 #' @export
 #'
 add_new_sample_ids <- function(scdata, sample_id_map) {
-  sample_map_idx <- match(scdata$parent_samples, sample_id_map$parent_sample_id)
-  scdata$samples <- sample_id_map$subset_sample_id[sample_map_idx]
+  sample_map_idx <- match(scdata$parent_samples, names(sample_id_map))
+  scdata$samples <- unname(unlist(sample_id_map[sample_map_idx]))
   return(scdata)
 }
 

From 624f0ad8bf19cb07c9002179d4d31dc965d09176 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 22 Dec 2022 16:42:36 -0300
Subject: [PATCH 17/34] more refactor

---
 pipeline-runner/R/subset-1-subset_seurat.R | 93 +++++++++++++++-------
 1 file changed, 63 insertions(+), 30 deletions(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 9cc73791..810f38c6 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -19,19 +19,10 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 
   parent <- load_parental_data(input, pipeline_config)
 
-  cell_ids_to_keep <- parent$cellsets[key %in% input$cellSetKeys, cell_id]
-
-  # subset seurat object, remove unnecesary data
-  scdata <- subset_ids(parent$scdata, cell_ids_to_keep)
-  scdata <- diet_scdata(scdata)
-  scdata@misc$experimentId <- input$experimentId
-
-  # add new sample_ids, keep originals in a new variable
-  scdata$parent_samples <- scdata$samples
-  sample_id_map <- create_sample_id_map(unique(scdata$parent_samples))
-  scdata <- add_new_sample_ids(scdata, sample_id_map)
+  scdata <- subset_experiment(input, parent)
+  sample_id_map <- create_sample_id_map(unique(scdata$samples))
+  scdata <- add_subset_metadata(input, scdata, sample_id_map)
 
-  # split by sample
   scdata_list <- Seurat::SplitObject(scdata, split.by = "samples")
 
   # TODO: remove from here and refactor all pipeline.
@@ -80,7 +71,54 @@ load_parental_data <- function(input, pipelne_config) {
     parse_cellsets(load_cellsets(s3, pipeline_config, input$parentExperimentId))
 
   return(list(scdata = parent_scdata, cellsets = parent_cellsets))
+}
+
 
+#' Remove all unnecessary data from the parent seurat object
+#'
+#' Seurat::DietSeurat is not able to remove certain slots from a seurat object.
+#' This function also removes elements from the misc slot which are not necessary
+#'
+#' @param scdata SeuratObject
+#'
+#' @return leaner SeuratObject
+#' @export
+#'
+diet_scdata <- function(scdata) {
+  lean_scdata <- Seurat::CreateSeuratObject(
+    counts = scdata@assays$RNA@counts,
+    meta.data = scdata@meta.data,
+    min.cells = 0,
+    min.features = 0
+  )
+
+  lean_scdata@misc <- list(
+    gene_annotations = scdata@misc$gene_annotations,
+    parent_experimentId = scdata@misc$experimentId
+  )
+
+  return(lean_scdata)
+}
+
+
+#' Subset seurat object by the input cellset keys
+#'
+#' This function takes the cellset keys sent by the API, extracts the cell_ids
+#' that belong to them, subsets the seurat object and removes all unnecessary
+#' data from it.
+#'
+#' @param input list of input parameters, containing cellSetKeys to subset
+#' @param parent list containing parent scdata and parsed cellsets
+#'
+#' @return subset seurat object
+#' @export
+#'
+subset_experiment <- function(input, parent) {
+  # subset seurat object, remove unnecesary data
+  cell_ids_to_keep <- parent$cellsets[key %in% input$cellSetKeys, cell_id]
+  scdata <- subset_ids(parent$scdata, cell_ids_to_keep)
+  scdata <- diet_scdata(scdata)
+  return(scdata)
 }
 
 
@@ -115,34 +153,29 @@ create_sample_id_map <- function(parent_sample_id) {
 #' @export
 #'
 add_new_sample_ids <- function(scdata, sample_id_map) {
+
   sample_map_idx <- match(scdata$parent_samples, names(sample_id_map))
   scdata$samples <- unname(unlist(sample_id_map[sample_map_idx]))
   return(scdata)
 }
 
 
-#' Remove all unnecessary data from the parent seurat object
+#' add experiment level metadata to subset seurat object
 #'
-#' Seurat::DietSeurat is not able to remove certain slots from a seurat object.
-#' This function also removes elements from the misc slot which are not necessary
-#'
-#' @param scdata SeuratObject
+#' @param input list of input params, containing the experimentId
+#' @param scdata seurat object
+#' @param sample_id_map list with mapping between sample_ids from
+#'  parent and subset experiments
 #'
-#' @return leaner SeuratObject
+#' @return scdata with additional metadata
 #' @export
 #'
-diet_scdata <- function(scdata) {
-  lean_scdata <- Seurat::CreateSeuratObject(
-    counts = scdata@assays$RNA@counts,
-    meta.data = scdata@meta.data,
-    min.cells = 0,
-    min.features = 0
-  )
+add_subset_metadata <- function(input, scdata, sample_id_map) {
 
-  lean_scdata@misc <- list(
-    gene_annotations = scdata@misc$gene_annotations,
-    parent_experimentId = scdata@misc$experimentId
-  )
+  # add new sample_ids, keep originals in a new variable
+  scdata$parent_samples <- scdata$samples
+  scdata <- add_new_sample_ids(scdata, sample_id_map)
+  scdata@misc$experimentId <- input$experimentId
 
-  return(lean_scdata)
+  return(scdata)
 }

From e73e87fabef0f5d3e202669dd134165e5a2bc0f2 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Thu, 22 Dec 2022 16:47:16 -0300
Subject: [PATCH 18/34] document

---
 pipeline-runner/NAMESPACE                     |  5 +++++
 pipeline-runner/R/handle_data.R               |  2 +-
 pipeline-runner/man/add_subset_metadata.Rd    | 22 +++++++++++++++++++
 .../{call_subset_seurat.Rd => call_subset.Rd} |  0
 pipeline-runner/man/cbind_cellset_type.Rd     | 19 ++++++++++++++++
 .../man/create_subset_experiment.Rd           |  4 +++-
 pipeline-runner/man/load_parental_data.Rd     | 19 ++++++++++++++++
 ...run_gem2s_step.Rd => run_pipeline_step.Rd} |  6 ++---
 pipeline-runner/man/safe_cbind.Rd             | 19 ++++++++++++++++
 pipeline-runner/man/subset_experiment.Rd      | 21 ++++++++++++++++++
 10 files changed, 112 insertions(+), 5 deletions(-)
 create mode 100644 pipeline-runner/man/add_subset_metadata.Rd
 rename pipeline-runner/man/{call_subset_seurat.Rd => call_subset.Rd} (100%)
 create mode 100644 pipeline-runner/man/cbind_cellset_type.Rd
 create mode 100644 pipeline-runner/man/load_parental_data.Rd
 rename pipeline-runner/man/{run_gem2s_step.Rd => run_pipeline_step.Rd} (84%)
 create mode 100644 pipeline-runner/man/safe_cbind.Rd
 create mode 100644 pipeline-runner/man/subset_experiment.Rd

diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
index fc511d60..c111a52a 100644
--- a/pipeline-runner/NAMESPACE
+++ b/pipeline-runner/NAMESPACE
@@ -2,8 +2,10 @@
 
 export(add_metadata)
 export(add_new_sample_ids)
+export(add_subset_metadata)
 export(build_cc_gene_list)
 export(build_metadata_cellsets)
+export(cbind_cellset_type)
 export(create_sample_id_map)
 export(create_scdata)
 export(create_seurat)
@@ -30,6 +32,7 @@ export(integrate_scdata)
 export(learn_from_sketches)
 export(list_exclude_genes)
 export(load_cellsets)
+export(load_parental_data)
 export(load_user_files)
 export(log_normalize)
 export(make_annot_with_ids)
@@ -46,7 +49,9 @@ export(runClusters)
 export(run_emptydrops)
 export(run_geosketch)
 export(run_pca)
+export(safe_cbind)
 export(score_doublets)
+export(subset_experiment)
 export(subset_ids)
 export(subset_safe)
 export(sym_to_ids)
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 9b80d7f2..f180b4af 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -457,7 +457,7 @@ safe_cbind <- function(dt, ...) {
 #' @param dt data.table
 #' @param col string of corresponding cellset type
 #'
-#' @return
+#' @return data.table with cellset_type
 #' @export
 #'
 cbind_cellset_type <- function(dt, col) {
diff --git a/pipeline-runner/man/add_subset_metadata.Rd b/pipeline-runner/man/add_subset_metadata.Rd
new file mode 100644
index 00000000..d4e5c61c
--- /dev/null
+++ b/pipeline-runner/man/add_subset_metadata.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/subset-1-subset_seurat.R
+\name{add_subset_metadata}
+\alias{add_subset_metadata}
+\title{add experiment level metadata to subset seurat object}
+\usage{
+add_subset_metadata(input, scdata, sample_id_map)
+}
+\arguments{
+\item{input}{list of input params, containing the experimentId}
+
+\item{scdata}{seurat object}
+
+\item{sample_id_map}{list with mapping between sample_ids from
+parent and subset experiments}
+}
+\value{
+scdata with additional metadata
+}
+\description{
+add experiment level metadata to subset seurat object
+}
diff --git a/pipeline-runner/man/call_subset_seurat.Rd b/pipeline-runner/man/call_subset.Rd
similarity index 100%
rename from pipeline-runner/man/call_subset_seurat.Rd
rename to pipeline-runner/man/call_subset.Rd
diff --git a/pipeline-runner/man/cbind_cellset_type.Rd b/pipeline-runner/man/cbind_cellset_type.Rd
new file mode 100644
index 00000000..27b345c5
--- /dev/null
+++ b/pipeline-runner/man/cbind_cellset_type.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/handle_data.R
+\name{cbind_cellset_type}
+\alias{cbind_cellset_type}
+\title{add cellset type column to cellsets data.table}
+\usage{
+cbind_cellset_type(dt, col)
+}
+\arguments{
+\item{dt}{data.table}
+
+\item{col}{string of corresponding cellset type}
+}
+\value{
+data.table with cellset_type
+}
+\description{
+helper to correctly name the cellset_type column.
+}
diff --git a/pipeline-runner/man/create_subset_experiment.Rd b/pipeline-runner/man/create_subset_experiment.Rd
index 964e1a47..cb8321ef 100644
--- a/pipeline-runner/man/create_subset_experiment.Rd
+++ b/pipeline-runner/man/create_subset_experiment.Rd
@@ -4,7 +4,7 @@
 \alias{create_subset_experiment}
 \title{create a subset experiment}
 \usage{
-create_subset_experiment(input, pipeline_config)
+create_subset_experiment(input, pipeline_config, prev_out = NULL)
 }
 \arguments{
 \item{input}{list containing:
@@ -16,6 +16,8 @@ create_subset_experiment(input, pipeline_config)
 }}
 
 \item{pipeline_config}{list}
+
+\item{prev_out}{list, ignored because this is the first step in the subset pipeline}
 }
 \value{
 list containing scdata_list, annotations and sample_id_map
diff --git a/pipeline-runner/man/load_parental_data.Rd b/pipeline-runner/man/load_parental_data.Rd
new file mode 100644
index 00000000..4fedfe97
--- /dev/null
+++ b/pipeline-runner/man/load_parental_data.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/subset-1-subset_seurat.R
+\name{load_parental_data}
+\alias{load_parental_data}
+\title{load parent experiment data}
+\usage{
+load_parental_data(input, pipelne_config)
+}
+\arguments{
+\item{input}{list of input parameters}
+
+\item{pipelne_config}{list of pipeline parameters}
+}
+\value{
+list with scdata and parsed cellsets
+}
+\description{
+Loads the processed rds and cellsets file from the parent experiment from s3.
+}
diff --git a/pipeline-runner/man/run_gem2s_step.Rd b/pipeline-runner/man/run_pipeline_step.Rd
similarity index 84%
rename from pipeline-runner/man/run_gem2s_step.Rd
rename to pipeline-runner/man/run_pipeline_step.Rd
index b34f0348..924f7e33 100644
--- a/pipeline-runner/man/run_gem2s_step.Rd
+++ b/pipeline-runner/man/run_pipeline_step.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/init-functions.R
-\name{run_gem2s_step}
-\alias{run_gem2s_step}
+\name{run_pipeline_step}
+\alias{run_pipeline_step}
 \title{Run GEM2S step}
 \usage{
-run_gem2s_step(prev_out, input, pipeline_config, tasks, task_name)
+run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
 }
 \arguments{
 \item{prev_out}{list output from previous step}
diff --git a/pipeline-runner/man/safe_cbind.Rd b/pipeline-runner/man/safe_cbind.Rd
new file mode 100644
index 00000000..c9182e73
--- /dev/null
+++ b/pipeline-runner/man/safe_cbind.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/handle_data.R
+\name{safe_cbind}
+\alias{safe_cbind}
+\title{Bind columns not failing if there's an empty data.table}
+\usage{
+safe_cbind(dt, ...)
+}
+\arguments{
+\item{dt}{data.table}
+
+\item{...}{columns to add}
+}
+\value{
+data.table with new columns
+}
+\description{
+Bind columns not failing if there's an empty data.table
+}
diff --git a/pipeline-runner/man/subset_experiment.Rd b/pipeline-runner/man/subset_experiment.Rd
new file mode 100644
index 00000000..ba3dc297
--- /dev/null
+++ b/pipeline-runner/man/subset_experiment.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/subset-1-subset_seurat.R
+\name{subset_experiment}
+\alias{subset_experiment}
+\title{Subset seurat object by the input cellset keys}
+\usage{
+subset_experiment(input, parent)
+}
+\arguments{
+\item{input}{list of input parameters, containing cellSetKeys to subset}
+
+\item{parent}{list containing parent scdata and parsed cellsets}
+}
+\value{
+subset seurat object
+}
+\description{
+This function takes the cellset keys sent by the API, extracts the cell_ids
+that belong to them, subsets the seurat object and removes all unnecessary
+data from it.
+}

From c6f531b1bc05c70b6a1512ae083e0a101cc299f1 Mon Sep 17 00:00:00 2001
From: cosa65 <martin@biomage.net>
Date: Thu, 22 Dec 2022 17:10:31 -0300
Subject: [PATCH 19/34] Fix mistyped parameter name

---
 pipeline-runner/R/subset-1-subset_seurat.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 810f38c6..82a886bb 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -62,7 +62,7 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 #' @return list with scdata and parsed cellsets
 #' @export
 #'
-load_parental_data <- function(input, pipelne_config) {
+load_parental_data <- function(input, pipeline_config) {
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
   parent_scdata <-

From 7e4ce6a9191cf69fe40cbbb9b52a5f8e561f2d9d Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 12:35:29 -0300
Subject: [PATCH 20/34] fix parsing of cellsets that already have a type column

---
 pipeline-runner/R/handle_data.R | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index f180b4af..03e24a84 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -452,12 +452,13 @@ safe_cbind <- function(dt, ...) {
 
 #' add cellset type column to cellsets data.table
 #'
-#' helper to correctly name the cellset_type column.
+#' helper to correctly name the cellset type column. some cellsets already
+#' contain a "type" slot, which complicates matters, so we chose `cellset_type`,
 #'
 #' @param dt data.table
 #' @param col string of corresponding cellset type
 #'
-#' @return data.table with cellset_type
+#' @return data.table with cellset_type column
 #' @export
 #'
 cbind_cellset_type <- function(dt, col) {
@@ -484,10 +485,13 @@ parse_cellsets <- function(cellsets) {
   # fill columns in case there are empty cellset classes
   dt <- data.table::rbindlist(dt_list, fill = TRUE)
 
-  # rename cellset type to metadata in case of metadata cellsets
-  dt[!cellset_type%in% c("louvain", "scratchpad", "sample"), cellset_type := "metadata"]
+  # change cellset type to more generic names
+  dt[cellset_type %in% c("louvain", "leiden"), cellset_type := "cluster"]
+  dt[!cellset_type %in% c("cluster", "scratchpad", "sample"), cellset_type := "metadata"]
 
   # unnest, and change column name
-  dt[, setNames(.(unlist(cellIds)), "cell_id"), by = .(key, name, cellset_type)]
+  dt <- dt[, setNames(.(unlist(cellIds)), "cell_id"), by = .(key, name, cellset_type)]
+  data.table::setnames(dt, "cellset_type", "type")
+  return(dt)
 }
 

From ca96c112d2a65eea38a9f98133ca1e457d093174 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 12:35:55 -0300
Subject: [PATCH 21/34] fix handle data test

---
 pipeline-runner/tests/testthat/test-handle_data.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline-runner/tests/testthat/test-handle_data.R b/pipeline-runner/tests/testthat/test-handle_data.R
index e5c3ac42..0869f5fb 100644
--- a/pipeline-runner/tests/testthat/test-handle_data.R
+++ b/pipeline-runner/tests/testthat/test-handle_data.R
@@ -167,6 +167,6 @@ test_that("parse_cellsets parses a cellset object", {
   res <- parse_cellsets(cellsets)
 
   expect_s3_class(res, "data.table")
-  expect_identical(names(res), c("key", "name", "cellset_type", "cell_id"))
+  expect_identical(names(res), c("key", "name", "type", "cell_id"))
 
 })

From a9f5dac3e368efdda318441a635ffdb192dc1e7e Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 12:36:25 -0300
Subject: [PATCH 22/34] add unique for non-mutually-exclusive cellsets

---
 pipeline-runner/R/subset-1-subset_seurat.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 82a886bb..7f6495cc 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -115,7 +115,7 @@ diet_scdata <- function(scdata) {
 #'
 subset_experiment <- function(input, parent) {
   # subset seurat object, remove unnecesary data
-  cell_ids_to_keep <- parent$cellsets[key %in% input$cellSetKeys, cell_id]
+  cell_ids_to_keep <- unique(parent$cellsets[key %in% input$cellSetKeys, cell_id])
   scdata <- subset_ids(parent$scdata, cell_ids_to_keep)
   scdata <- diet_scdata(scdata)
   return(scdata)

From 1fea094fb8e1282843efcef9dd5a60a99a8c9b57 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 12:50:38 -0300
Subject: [PATCH 23/34] remove unnecessary block

---
 pipeline-runner/R/init-functions.R | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 9080a5b7..414a463d 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -278,20 +278,12 @@ call_subset <- function(task_name, input, pipeline_config) {
   c(data, task_out) %<-% run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
   assign("prev_out", task_out, pos = ".GlobalEnv")
 
-  if (task_name == names(tasks)[1]) {
-    assign("cells_id", generate_first_step_ids(prev_out$scdata_list), pos = ".GlobalEnv")
-    next_task <- "dataIntegration"
-    for(sample_id in names(prev_out$scdata_list)) {
-      object_key <- paste0(experiment_id, "/", next_task, "/", sample_id, ".rds")
-      upload_cells_id(pipeline_config, object_key, cells_id)
-    }
-  }
-
   message_id <- send_gem2s_update_to_api(pipeline_config, experiment_id, task_name, data, input)
 
   return(message_id)
 }
 
+
 #' Call QC pipeline
 #'
 #' Runs step `task_name` of the data processing pipeline, sends plot data to s3
@@ -443,6 +435,7 @@ pipeline_heartbeat <- function(task_token, aws_config) {
   }
 }
 
+
 #' Start heartbeat as a background process
 #'
 #' messages inside the background process will ONLY be printed into
@@ -501,6 +494,7 @@ wrapper <- function(input, pipeline_config) {
   return(message_id)
 }
 
+
 #' Pipeline error handler
 #'
 #' Pretty prints errors, sends roport to the API, and uploads debug output to

From 1ee3b7a88e0fa9a5bc958373c246328d373d0d6d Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 12:50:53 -0300
Subject: [PATCH 24/34] document

---
 pipeline-runner/man/cbind_cellset_type.Rd | 5 +++--
 pipeline-runner/man/load_parental_data.Rd | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pipeline-runner/man/cbind_cellset_type.Rd b/pipeline-runner/man/cbind_cellset_type.Rd
index 27b345c5..c64160db 100644
--- a/pipeline-runner/man/cbind_cellset_type.Rd
+++ b/pipeline-runner/man/cbind_cellset_type.Rd
@@ -12,8 +12,9 @@ cbind_cellset_type(dt, col)
 \item{col}{string of corresponding cellset type}
 }
 \value{
-data.table with cellset_type
+data.table with cellset_type column
 }
 \description{
-helper to correctly name the cellset_type column.
+helper to correctly name the cellset type column. some cellsets already
+contain a "type" slot, which complicates matters, so we chose \code{cellset_type},
 }
diff --git a/pipeline-runner/man/load_parental_data.Rd b/pipeline-runner/man/load_parental_data.Rd
index 4fedfe97..089dc3b2 100644
--- a/pipeline-runner/man/load_parental_data.Rd
+++ b/pipeline-runner/man/load_parental_data.Rd
@@ -4,7 +4,7 @@
 \alias{load_parental_data}
 \title{load parent experiment data}
 \usage{
-load_parental_data(input, pipelne_config)
+load_parental_data(input, pipeline_config)
 }
 \arguments{
 \item{input}{list of input parameters}

From 06d426b9105a3242ba9f375c754c50ecf7ed3e58 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 13:46:24 -0300
Subject: [PATCH 25/34] rename function

---
 pipeline-runner/R/subset-1-subset_seurat.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index 7f6495cc..c25dee55 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -17,7 +17,7 @@
 #'
 create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 
-  parent <- load_parental_data(input, pipeline_config)
+  parent <- load_parent_experiment_data(input, pipeline_config)
 
   scdata <- subset_experiment(input, parent)
   sample_id_map <- create_sample_id_map(unique(scdata$samples))
@@ -62,7 +62,7 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
 #' @return list with scdata and parsed cellsets
 #' @export
 #'
-load_parental_data <- function(input, pipeline_config) {
+load_parent_experiment_data <- function(input, pipeline_config) {
   # load parent processed scdata and cellsets
   s3 <- paws::s3(config = pipeline_config$aws_config)
   parent_scdata <-

From e9fcfb50e88a04f1470fa5468b27d360c890195c Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 13:49:50 -0300
Subject: [PATCH 26/34] document

---
 pipeline-runner/NAMESPACE                                   | 2 +-
 ...load_parental_data.Rd => load_parent_experiment_data.Rd} | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename pipeline-runner/man/{load_parental_data.Rd => load_parent_experiment_data.Rd} (76%)

diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
index c111a52a..6a1e6117 100644
--- a/pipeline-runner/NAMESPACE
+++ b/pipeline-runner/NAMESPACE
@@ -32,7 +32,7 @@ export(integrate_scdata)
 export(learn_from_sketches)
 export(list_exclude_genes)
 export(load_cellsets)
-export(load_parental_data)
+export(load_parent_experiment_data)
 export(load_user_files)
 export(log_normalize)
 export(make_annot_with_ids)
diff --git a/pipeline-runner/man/load_parental_data.Rd b/pipeline-runner/man/load_parent_experiment_data.Rd
similarity index 76%
rename from pipeline-runner/man/load_parental_data.Rd
rename to pipeline-runner/man/load_parent_experiment_data.Rd
index 089dc3b2..953aebea 100644
--- a/pipeline-runner/man/load_parental_data.Rd
+++ b/pipeline-runner/man/load_parent_experiment_data.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/subset-1-subset_seurat.R
-\name{load_parental_data}
-\alias{load_parental_data}
+\name{load_parent_experiment_data}
+\alias{load_parent_experiment_data}
 \title{load parent experiment data}
 \usage{
-load_parental_data(input, pipeline_config)
+load_parent_experiment_data(input, pipeline_config)
 }
 \arguments{
 \item{input}{list of input parameters}

From 67a4311da6dac50f6cf39ee9627b592ac8774889 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 13:54:50 -0300
Subject: [PATCH 27/34] comment out WIP tests

---
 .../testthat/test-subset-1-subset_seurat.R    | 58 ++++++++++++-------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R b/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
index fcae2442..171553df 100644
--- a/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
+++ b/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
@@ -1,25 +1,39 @@
-mock_scdata <- function(){
-  processed_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/processed_r.rds"
-  readRDS(processed_path)
-}
+# mock_scdata <- function(){
+#   paths <- path_setup()
+#   source("tests/testthat/_snaps/qc/mock_experiment_id-integrated_scdata.R")
+#   scdata <- snap_list$data
+#   rm(snap_list, envir = parent.frame())
+#   return(scdata)
+# }
+#
+# mock_cellsets <- function(){
+#
+#   jsonlite::fromJSON("tests/testthat/_snaps/gem2s/gem2s-7-mock_experiment_id-cellsets.json", flatten = TRUE)
+#
+# }
+#
+# mock_cluster_cellsets <- function(cellsets) {
+#
+# }
+#
+# mock_input <- function() {
+#   input <- list(
+#     name = "mock_subset_experiment_name",
+#     parentExperimentId = "mock_parent_experiment_id",
+#     subsetExperimentId = "mock_subset_experiment_id",
+#     cellSetKeys =  c("louvain-0", "louvain-1")
+#   )
+#
+#   return(input)
+# }
+#
+# parent_scdata <- mock_scdata()
+# cellsets <- mock_cellsets()
+# parent_cellsets <- parse_cellsets(mock_cellsets())
+#
+# parent <- list(scdata = parent_scdata, cellsets = parent_cellsets)
+#
+# input <- mock_input()
 
-mock_cellsets <- function(){
-  cellsets_path <- "/Users/german/bm/cellenics/data/8ecc9d20-30e4-49eb-b536-a0d1f0ba420d/cellsets.json"
-  jsonlite::fromJSON(cellsets_path, flatten = TRUE)
-}
 
-mock_input <- function() {
-  input <- list(
-    name = "mock_subset_experiment_name",
-    parentExperimentId = "mock_parent_experiment_id",
-    subsetExperimentId = "mock_subset_experiment_id",
-    cellSetKeys =  c("louvain-0", "louvain-1")
-  )
 
-  return(input)
-}
-
-parent_scdata <- mock_scdata()
-parent_cellsets <- parse_cellsets(mock_cellsets())
-sample_mapping <- mock_sample_id_mapping()
-input <- mock_input()

From df596e1e4bacf0d3c7d874d860a3712049c76ad1 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 14:09:34 -0300
Subject: [PATCH 28/34] fix tests

---
 pipeline-runner/tests/testthat/test-handle_data.R            | 5 +++--
 pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R | 3 ---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pipeline-runner/tests/testthat/test-handle_data.R b/pipeline-runner/tests/testthat/test-handle_data.R
index 0869f5fb..fe96b8de 100644
--- a/pipeline-runner/tests/testthat/test-handle_data.R
+++ b/pipeline-runner/tests/testthat/test-handle_data.R
@@ -8,7 +8,8 @@ mock_sns <- function(config) {
 
 mock_cellsets <- function(){
   # get a snapshot cellsets json
-  jsonlite::fromJSON("tests/testthat/_snaps/gem2s/gem2s-7-mock_experiment_id-cellsets.json", flatten = TRUE)
+  paths <- setup_test_paths()
+  jsonlite::fromJSON(file.path(paths$snaps, "gem2s", "gem2s-7-mock_experiment_id-cellsets.json"), flatten = TRUE)
 
 }
 
@@ -152,7 +153,7 @@ test_that("cbind_cellset_type names the bound column correctly", {
   dt <- data.table::data.table(col1 = 1:10, col2 = 11:20)
   values <- seq(1, 20, 2)
 
-  res <- cbind_cellset_type(dt, values_1)
+  res <- cbind_cellset_type(dt, values)
 
   expect_true("cellset_type" %in% names(res))
   expect_identical(res[,cellset_type], values)
diff --git a/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R b/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
index 171553df..19d75f32 100644
--- a/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
+++ b/pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
@@ -34,6 +34,3 @@
 # parent <- list(scdata = parent_scdata, cellsets = parent_cellsets)
 #
 # input <- mock_input()
-
-
-

From 9c851a9558e02c00f445745e77dd3b4e2e205c50 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 14:16:21 -0300
Subject: [PATCH 29/34] rename function call in test

---
 pipeline-runner/tests/testthat/test-gem2s.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline-runner/tests/testthat/test-gem2s.R b/pipeline-runner/tests/testthat/test-gem2s.R
index fb17e345..3d4abdc7 100644
--- a/pipeline-runner/tests/testthat/test-gem2s.R
+++ b/pipeline-runner/tests/testthat/test-gem2s.R
@@ -74,7 +74,7 @@ test_gem2s <- function(experiment_id) {
     res <- list()
 
     for (task_name in names(tasks)) {
-      res <- run_gem2s_step(res$output,
+      res <- run_pipeline_step(res$output,
                             input,
                             pipeline_config,
                             tasks,

From 90a75c4ac3a0f3c62bd1683f248572221b7a4d3c Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Fri, 23 Dec 2022 14:26:23 -0300
Subject: [PATCH 30/34] update snaps

---
 .../_snaps/gem2s-6-prepare_experiment.md      |  63 ++---
 .../tests/testthat/_snaps/gem2s.md            | 256 ++++++------------
 .../gem2s/gem2s-6-mock_experiment_id-out.R    |  51 ++--
 .../gem2s-6-mock_experiment_id-qc_config.R    |  51 ++--
 4 files changed, 139 insertions(+), 282 deletions(-)

diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md b/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md
index c3859e51..054f2e31 100644
--- a/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md
+++ b/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md
@@ -5,52 +5,34 @@
     Output
       List of 7
        $ cellSizeDistribution:List of 1
-        ..$ sample_a:List of 4
-        .. ..$ enabled              : logi FALSE
-        .. ..$ auto                 : logi TRUE
-        .. ..$ filterSettings       :List of 2
-        .. .. ..$ minCellSize: num 10
-        .. .. ..$ binStep    : num 200
-        .. ..$ defaultFilterSettings:List of 2
+        ..$ sample_a:List of 3
+        .. ..$ enabled       : logi FALSE
+        .. ..$ auto          : logi TRUE
+        .. ..$ filterSettings:List of 2
         .. .. ..$ minCellSize: num 10
         .. .. ..$ binStep    : num 200
        $ mitochondrialContent:List of 1
-        ..$ sample_a:List of 4
-        .. ..$ enabled              : logi TRUE
-        .. ..$ auto                 : logi TRUE
-        .. ..$ filterSettings       :List of 2
-        .. .. ..$ method        : chr "absoluteThreshold"
-        .. .. ..$ methodSettings:List of 1
-        .. .. .. ..$ absoluteThreshold:List of 2
-        .. .. .. .. ..$ maxFraction: num 0
-        .. .. .. .. ..$ binStep    : num 0.3
-        .. ..$ defaultFilterSettings:List of 2
+        ..$ sample_a:List of 3
+        .. ..$ enabled       : logi TRUE
+        .. ..$ auto          : logi TRUE
+        .. ..$ filterSettings:List of 2
         .. .. ..$ method        : chr "absoluteThreshold"
         .. .. ..$ methodSettings:List of 1
         .. .. .. ..$ absoluteThreshold:List of 2
         .. .. .. .. ..$ maxFraction: num 0
         .. .. .. .. ..$ binStep    : num 0.3
        $ classifier          :List of 1
-        ..$ sample_a:List of 5
-        .. ..$ enabled              : logi TRUE
-        .. ..$ prefiltered          : logi FALSE
-        .. ..$ auto                 : logi TRUE
-        .. ..$ filterSettings       :List of 1
-        .. .. ..$ FDR: num 0.01
-        .. ..$ defaultFilterSettings:List of 1
+        ..$ sample_a:List of 4
+        .. ..$ enabled       : logi TRUE
+        .. ..$ prefiltered   : logi FALSE
+        .. ..$ auto          : logi TRUE
+        .. ..$ filterSettings:List of 1
         .. .. ..$ FDR: num 0.01
        $ numGenesVsNumUmis   :List of 1
-        ..$ sample_a:List of 4
-        .. ..$ enabled              : logi TRUE
-        .. ..$ auto                 : logi TRUE
-        .. ..$ filterSettings       :List of 2
-        .. .. ..$ regressionType        : chr "linear"
-        .. .. ..$ regressionTypeSettings:List of 2
-        .. .. .. ..$ linear:List of 1
-        .. .. .. .. ..$ p.level: num 0.000132
-        .. .. .. ..$ spline:List of 1
-        .. .. .. .. ..$ p.level: num 0.001
-        .. ..$ defaultFilterSettings:List of 2
+        ..$ sample_a:List of 3
+        .. ..$ enabled       : logi TRUE
+        .. ..$ auto          : logi TRUE
+        .. ..$ filterSettings:List of 2
         .. .. ..$ regressionType        : chr "linear"
         .. .. ..$ regressionTypeSettings:List of 2
         .. .. .. ..$ linear:List of 1
@@ -58,13 +40,10 @@
         .. .. .. ..$ spline:List of 1
         .. .. .. .. ..$ p.level: num 0.001
        $ doubletScores       :List of 1
-        ..$ sample_a:List of 4
-        .. ..$ enabled              : logi TRUE
-        .. ..$ auto                 : logi TRUE
-        .. ..$ filterSettings       :List of 2
-        .. .. ..$ probabilityThreshold: num 0.8
-        .. .. ..$ binStep             : num 0.02
-        .. ..$ defaultFilterSettings:List of 2
+        ..$ sample_a:List of 3
+        .. ..$ enabled       : logi TRUE
+        .. ..$ auto          : logi TRUE
+        .. ..$ filterSettings:List of 2
         .. .. ..$ probabilityThreshold: num 0.8
         .. .. ..$ binStep             : num 0.02
        $ dataIntegration     :List of 2
diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s.md b/pipeline-runner/tests/testthat/_snaps/gem2s.md
index 08e6fb81..c8c5f0d5 100644
--- a/pipeline-runner/tests/testthat/_snaps/gem2s.md
+++ b/pipeline-runner/tests/testthat/_snaps/gem2s.md
@@ -482,7 +482,7 @@
     Code
       rlang::hash(res)
     Output
-      [1] "988133ae29394ba9af3dcf9c03e0dba7"
+      [1] "e449dea4dd4b69f18d8fc7fd7fb58063"
     Code
       str(res)
     Output
@@ -697,101 +697,65 @@
         ..$ disable_qc_filters: logi FALSE
         ..$ qc_config         :List of 7
         .. ..$ cellSizeDistribution:List of 2
-        .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. ..$ enabled              : logi FALSE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. ..$ minCellSize: num 17
-        .. .. .. .. ..$ binStep    : num 200
-        .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. ..$ enabled       : logi FALSE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ minCellSize: num 17
         .. .. .. .. ..$ binStep    : num 200
-        .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. ..$ enabled              : logi FALSE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. ..$ minCellSize: num 27
-        .. .. .. .. ..$ binStep    : num 200
-        .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. ..$ enabled       : logi FALSE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ minCellSize: num 27
         .. .. .. .. ..$ binStep    : num 200
         .. ..$ mitochondrialContent:List of 2
-        .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
+        .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ method        : chr "absoluteThreshold"
         .. .. .. .. ..$ methodSettings:List of 1
         .. .. .. .. .. ..$ absoluteThreshold:List of 2
         .. .. .. .. .. .. ..$ maxFraction: num 0.521
         .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. ..$ method        : chr "absoluteThreshold"
-        .. .. .. .. ..$ methodSettings:List of 1
-        .. .. .. .. .. ..$ absoluteThreshold:List of 2
-        .. .. .. .. .. .. ..$ maxFraction: num 0.521
-        .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. ..$ method        : chr "absoluteThreshold"
-        .. .. .. .. ..$ methodSettings:List of 1
-        .. .. .. .. .. ..$ absoluteThreshold:List of 2
-        .. .. .. .. .. .. ..$ maxFraction: num 0.603
-        .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ method        : chr "absoluteThreshold"
         .. .. .. .. ..$ methodSettings:List of 1
         .. .. .. .. .. ..$ absoluteThreshold:List of 2
         .. .. .. .. .. .. ..$ maxFraction: num 0.603
         .. .. .. .. .. .. ..$ binStep    : num 0.3
         .. ..$ classifier          :List of 2
-        .. .. ..$ mock_sample_2_id:List of 5
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ prefiltered          : logi FALSE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 1
-        .. .. .. .. ..$ FDR: num 0.01
-        .. .. .. ..$ defaultFilterSettings:List of 1
-        .. .. .. .. ..$ FDR: num 0.01
-        .. .. ..$ mock_sample_1_id:List of 5
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ prefiltered          : logi FALSE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 1
+        .. .. ..$ mock_sample_2_id:List of 4
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ prefiltered   : logi FALSE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 1
         .. .. .. .. ..$ FDR: num 0.01
-        .. .. .. ..$ defaultFilterSettings:List of 1
+        .. .. ..$ mock_sample_1_id:List of 4
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ prefiltered   : logi FALSE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 1
         .. .. .. .. ..$ FDR: num 0.01
         .. ..$ numGenesVsNumUmis   :List of 2
-        .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
+        .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ regressionType        : chr "linear"
         .. .. .. .. ..$ regressionTypeSettings:List of 2
         .. .. .. .. .. ..$ linear:List of 1
         .. .. .. .. .. .. ..$ p.level: num 0.001
         .. .. .. .. .. ..$ spline:List of 1
         .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. ..$ regressionType        : chr "linear"
-        .. .. .. .. ..$ regressionTypeSettings:List of 2
-        .. .. .. .. .. ..$ linear:List of 1
-        .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. .. ..$ spline:List of 1
-        .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. ..$ regressionType        : chr "linear"
-        .. .. .. .. ..$ regressionTypeSettings:List of 2
-        .. .. .. .. .. ..$ linear:List of 1
-        .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. .. ..$ spline:List of 1
-        .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ regressionType        : chr "linear"
         .. .. .. .. ..$ regressionTypeSettings:List of 2
         .. .. .. .. .. ..$ linear:List of 1
@@ -799,22 +763,16 @@
         .. .. .. .. .. ..$ spline:List of 1
         .. .. .. .. .. .. ..$ p.level: num 0.001
         .. ..$ doubletScores       :List of 2
-        .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
+        .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ probabilityThreshold: num 0.979
         .. .. .. .. ..$ binStep             : num 0.02
-        .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. ..$ probabilityThreshold: num 0.979
-        .. .. .. .. ..$ binStep             : num 0.02
-        .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. ..$ probabilityThreshold: num 0.84
-        .. .. .. .. ..$ binStep             : num 0.02
-        .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. ..$ auto          : logi TRUE
+        .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. ..$ probabilityThreshold: num 0.84
         .. .. .. .. ..$ binStep             : num 0.02
         .. ..$ dataIntegration     :List of 2
@@ -862,7 +820,7 @@
     Code
       rlang::hash(res)
     Output
-      [1] "b43eb24698e7e929755e256573b8b817"
+      [1] "4afaaf94e9fd057c28b870c5eca95131"
     Code
       str(res)
     Output
@@ -877,101 +835,65 @@
         .. .. ..$ type    : chr "10x"
         .. ..$ processingConfig:List of 7
         .. .. ..$ cellSizeDistribution:List of 2
-        .. .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. .. ..$ enabled              : logi FALSE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. .. ..$ minCellSize: num 17
-        .. .. .. .. .. ..$ binStep    : num 200
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. .. ..$ enabled       : logi FALSE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ minCellSize: num 17
         .. .. .. .. .. ..$ binStep    : num 200
-        .. .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. .. ..$ enabled              : logi FALSE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. .. ..$ minCellSize: num 27
-        .. .. .. .. .. ..$ binStep    : num 200
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. .. ..$ enabled       : logi FALSE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ minCellSize: num 27
         .. .. .. .. .. ..$ binStep    : num 200
         .. .. ..$ mitochondrialContent:List of 2
-        .. .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
+        .. .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ method        : chr "absoluteThreshold"
         .. .. .. .. .. ..$ methodSettings:List of 1
         .. .. .. .. .. .. ..$ absoluteThreshold:List of 2
         .. .. .. .. .. .. .. ..$ maxFraction: num 0.521
         .. .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. .. ..$ method        : chr "absoluteThreshold"
-        .. .. .. .. .. ..$ methodSettings:List of 1
-        .. .. .. .. .. .. ..$ absoluteThreshold:List of 2
-        .. .. .. .. .. .. .. ..$ maxFraction: num 0.521
-        .. .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. .. ..$ method        : chr "absoluteThreshold"
-        .. .. .. .. .. ..$ methodSettings:List of 1
-        .. .. .. .. .. .. ..$ absoluteThreshold:List of 2
-        .. .. .. .. .. .. .. ..$ maxFraction: num 0.603
-        .. .. .. .. .. .. .. ..$ binStep    : num 0.3
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ method        : chr "absoluteThreshold"
         .. .. .. .. .. ..$ methodSettings:List of 1
         .. .. .. .. .. .. ..$ absoluteThreshold:List of 2
         .. .. .. .. .. .. .. ..$ maxFraction: num 0.603
         .. .. .. .. .. .. .. ..$ binStep    : num 0.3
         .. .. ..$ classifier          :List of 2
-        .. .. .. ..$ mock_sample_2_id:List of 5
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ prefiltered          : logi FALSE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 1
-        .. .. .. .. .. ..$ FDR: num 0.01
-        .. .. .. .. ..$ defaultFilterSettings:List of 1
-        .. .. .. .. .. ..$ FDR: num 0.01
-        .. .. .. ..$ mock_sample_1_id:List of 5
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ prefiltered          : logi FALSE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 1
+        .. .. .. ..$ mock_sample_2_id:List of 4
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ prefiltered   : logi FALSE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 1
         .. .. .. .. .. ..$ FDR: num 0.01
-        .. .. .. .. ..$ defaultFilterSettings:List of 1
+        .. .. .. ..$ mock_sample_1_id:List of 4
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ prefiltered   : logi FALSE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 1
         .. .. .. .. .. ..$ FDR: num 0.01
         .. .. ..$ numGenesVsNumUmis   :List of 2
-        .. .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
+        .. .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ regressionType        : chr "linear"
         .. .. .. .. .. ..$ regressionTypeSettings:List of 2
         .. .. .. .. .. .. ..$ linear:List of 1
         .. .. .. .. .. .. .. ..$ p.level: num 0.001
         .. .. .. .. .. .. ..$ spline:List of 1
         .. .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. .. ..$ regressionType        : chr "linear"
-        .. .. .. .. .. ..$ regressionTypeSettings:List of 2
-        .. .. .. .. .. .. ..$ linear:List of 1
-        .. .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. .. .. ..$ spline:List of 1
-        .. .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. .. ..$ regressionType        : chr "linear"
-        .. .. .. .. .. ..$ regressionTypeSettings:List of 2
-        .. .. .. .. .. .. ..$ linear:List of 1
-        .. .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. .. .. ..$ spline:List of 1
-        .. .. .. .. .. .. .. ..$ p.level: num 0.001
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ regressionType        : chr "linear"
         .. .. .. .. .. ..$ regressionTypeSettings:List of 2
         .. .. .. .. .. .. ..$ linear:List of 1
@@ -979,22 +901,16 @@
         .. .. .. .. .. .. ..$ spline:List of 1
         .. .. .. .. .. .. .. ..$ p.level: num 0.001
         .. .. ..$ doubletScores       :List of 2
-        .. .. .. ..$ mock_sample_2_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
+        .. .. .. ..$ mock_sample_2_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ probabilityThreshold: num 0.979
         .. .. .. .. .. ..$ binStep             : num 0.02
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
-        .. .. .. .. .. ..$ probabilityThreshold: num 0.979
-        .. .. .. .. .. ..$ binStep             : num 0.02
-        .. .. .. ..$ mock_sample_1_id:List of 4
-        .. .. .. .. ..$ enabled              : logi TRUE
-        .. .. .. .. ..$ auto                 : logi TRUE
-        .. .. .. .. ..$ filterSettings       :List of 2
-        .. .. .. .. .. ..$ probabilityThreshold: num 0.84
-        .. .. .. .. .. ..$ binStep             : num 0.02
-        .. .. .. .. ..$ defaultFilterSettings:List of 2
+        .. .. .. ..$ mock_sample_1_id:List of 3
+        .. .. .. .. ..$ enabled       : logi TRUE
+        .. .. .. .. ..$ auto          : logi TRUE
+        .. .. .. .. ..$ filterSettings:List of 2
         .. .. .. .. .. ..$ probabilityThreshold: num 0.84
         .. .. .. .. .. ..$ binStep             : num 0.02
         .. .. ..$ dataIntegration     :List of 2
diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-out.R b/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-out.R
index 3a2b9164..2e018970 100644
--- a/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-out.R
+++ b/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-out.R
@@ -49746,53 +49746,34 @@ list(data = list(), output = list(config = list(name = "mock_experiment",
         )), commands = list(), tools = list(flag_filtered = FALSE))), 
     disable_qc_filters = FALSE, qc_config = list(cellSizeDistribution = list(
         mock_sample_2_id = list(enabled = FALSE, auto = TRUE, 
-            filterSettings = list(minCellSize = 17, binStep = 200), 
-            defaultFilterSettings = list(minCellSize = 17, binStep = 200)), 
+            filterSettings = list(minCellSize = 17, binStep = 200)), 
         mock_sample_1_id = list(enabled = FALSE, auto = TRUE, 
-            filterSettings = list(minCellSize = 27, binStep = 200), 
-            defaultFilterSettings = list(minCellSize = 27, binStep = 200))), 
+            filterSettings = list(minCellSize = 27, binStep = 200))), 
         mitochondrialContent = list(mock_sample_2_id = list(enabled = TRUE, 
             auto = TRUE, filterSettings = list(method = "absoluteThreshold", 
-                methodSettings = list(absoluteThreshold = list(
-                  maxFraction = 0.52054794520547942, binStep = 0.29999999999999999))), 
-            defaultFilterSettings = list(method = "absoluteThreshold", 
                 methodSettings = list(absoluteThreshold = list(
                   maxFraction = 0.52054794520547942, binStep = 0.29999999999999999)))), 
             mock_sample_1_id = list(enabled = TRUE, auto = TRUE, 
                 filterSettings = list(method = "absoluteThreshold", 
-                  methodSettings = list(absoluteThreshold = list(
-                    maxFraction = 0.60256410256410253, binStep = 0.29999999999999999))), 
-                defaultFilterSettings = list(method = "absoluteThreshold", 
                   methodSettings = list(absoluteThreshold = list(
                     maxFraction = 0.60256410256410253, binStep = 0.29999999999999999))))), 
         classifier = list(mock_sample_2_id = list(enabled = TRUE, 
             prefiltered = FALSE, auto = TRUE, filterSettings = list(
-                FDR = 0.01), defaultFilterSettings = list(FDR = 0.01)), 
-            mock_sample_1_id = list(enabled = TRUE, prefiltered = FALSE, 
-                auto = TRUE, filterSettings = list(FDR = 0.01), 
-                defaultFilterSettings = list(FDR = 0.01))), numGenesVsNumUmis = list(
+                FDR = 0.01)), mock_sample_1_id = list(enabled = TRUE, 
+            prefiltered = FALSE, auto = TRUE, filterSettings = list(
+                FDR = 0.01))), numGenesVsNumUmis = list(mock_sample_2_id = list(
+            enabled = TRUE, auto = TRUE, filterSettings = list(
+                regressionType = "linear", regressionTypeSettings = list(
+                  linear = list(p.level = 0.001), spline = list(
+                    p.level = 0.001)))), mock_sample_1_id = list(
+            enabled = TRUE, auto = TRUE, filterSettings = list(
+                regressionType = "linear", regressionTypeSettings = list(
+                  linear = list(p.level = 0.001), spline = list(
+                    p.level = 0.001))))), doubletScores = list(
             mock_sample_2_id = list(enabled = TRUE, auto = TRUE, 
-                filterSettings = list(regressionType = "linear", 
-                  regressionTypeSettings = list(linear = list(
-                    p.level = 0.001), spline = list(p.level = 0.001))), 
-                defaultFilterSettings = list(regressionType = "linear", 
-                  regressionTypeSettings = list(linear = list(
-                    p.level = 0.001), spline = list(p.level = 0.001)))), 
-            mock_sample_1_id = list(enabled = TRUE, auto = TRUE, 
-                filterSettings = list(regressionType = "linear", 
-                  regressionTypeSettings = list(linear = list(
-                    p.level = 0.001), spline = list(p.level = 0.001))), 
-                defaultFilterSettings = list(regressionType = "linear", 
-                  regressionTypeSettings = list(linear = list(
-                    p.level = 0.001), spline = list(p.level = 0.001))))), 
-        doubletScores = list(mock_sample_2_id = list(enabled = TRUE, 
-            auto = TRUE, filterSettings = list(probabilityThreshold = 0.97920405864715576, 
-                binStep = 0.02), defaultFilterSettings = list(
-                probabilityThreshold = 0.97920405864715576, binStep = 0.02)), 
-            mock_sample_1_id = list(enabled = TRUE, auto = TRUE, 
-                filterSettings = list(probabilityThreshold = 0.83960545063018799, 
-                  binStep = 0.02), defaultFilterSettings = list(
-                  probabilityThreshold = 0.83960545063018799, 
+                filterSettings = list(probabilityThreshold = 0.97920405864715576, 
+                  binStep = 0.02)), mock_sample_1_id = list(enabled = TRUE, 
+                auto = TRUE, filterSettings = list(probabilityThreshold = 0.83960545063018799, 
                   binStep = 0.02))), dataIntegration = list(dataIntegration = list(
             method = "harmony", methodSettings = list(seuratv4 = list(
                 numGenes = 2000, normalisation = "logNormalize"), 
diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-qc_config.R b/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-qc_config.R
index 3cced61a..198f146c 100644
--- a/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-qc_config.R
+++ b/pipeline-runner/tests/testthat/_snaps/gem2s/gem2s-6-mock_experiment_id-qc_config.R
@@ -1,52 +1,33 @@
 qc_config <-
 list(cellSizeDistribution = list(mock_sample_2_id = list(enabled = FALSE, 
-    auto = TRUE, filterSettings = list(minCellSize = 17, binStep = 200), 
-    defaultFilterSettings = list(minCellSize = 17, binStep = 200)), 
+    auto = TRUE, filterSettings = list(minCellSize = 17, binStep = 200)), 
     mock_sample_1_id = list(enabled = FALSE, auto = TRUE, filterSettings = list(
-        minCellSize = 27, binStep = 200), defaultFilterSettings = list(
         minCellSize = 27, binStep = 200))), mitochondrialContent = list(
     mock_sample_2_id = list(enabled = TRUE, auto = TRUE, filterSettings = list(
         method = "absoluteThreshold", methodSettings = list(absoluteThreshold = list(
-            maxFraction = 0.52054794520547942, binStep = 0.29999999999999999))), 
-        defaultFilterSettings = list(method = "absoluteThreshold", 
-            methodSettings = list(absoluteThreshold = list(maxFraction = 0.52054794520547942, 
-                binStep = 0.29999999999999999)))), mock_sample_1_id = list(
-        enabled = TRUE, auto = TRUE, filterSettings = list(method = "absoluteThreshold", 
-            methodSettings = list(absoluteThreshold = list(maxFraction = 0.60256410256410253, 
-                binStep = 0.29999999999999999))), defaultFilterSettings = list(
-            method = "absoluteThreshold", methodSettings = list(
-                absoluteThreshold = list(maxFraction = 0.60256410256410253, 
-                  binStep = 0.29999999999999999))))), classifier = list(
-    mock_sample_2_id = list(enabled = TRUE, prefiltered = FALSE, 
-        auto = TRUE, filterSettings = list(FDR = 0.01), defaultFilterSettings = list(
+            maxFraction = 0.52054794520547942, binStep = 0.29999999999999999)))), 
+    mock_sample_1_id = list(enabled = TRUE, auto = TRUE, filterSettings = list(
+        method = "absoluteThreshold", methodSettings = list(absoluteThreshold = list(
+            maxFraction = 0.60256410256410253, binStep = 0.29999999999999999))))), 
+    classifier = list(mock_sample_2_id = list(enabled = TRUE, 
+        prefiltered = FALSE, auto = TRUE, filterSettings = list(
             FDR = 0.01)), mock_sample_1_id = list(enabled = TRUE, 
         prefiltered = FALSE, auto = TRUE, filterSettings = list(
-            FDR = 0.01), defaultFilterSettings = list(FDR = 0.01))), 
-    numGenesVsNumUmis = list(mock_sample_2_id = list(enabled = TRUE, 
-        auto = TRUE, filterSettings = list(regressionType = "linear", 
+            FDR = 0.01))), numGenesVsNumUmis = list(mock_sample_2_id = list(
+        enabled = TRUE, auto = TRUE, filterSettings = list(regressionType = "linear", 
             regressionTypeSettings = list(linear = list(p.level = 0.001), 
-                spline = list(p.level = 0.001))), defaultFilterSettings = list(
-            regressionType = "linear", regressionTypeSettings = list(
-                linear = list(p.level = 0.001), spline = list(
-                  p.level = 0.001)))), mock_sample_1_id = list(
+                spline = list(p.level = 0.001)))), mock_sample_1_id = list(
         enabled = TRUE, auto = TRUE, filterSettings = list(regressionType = "linear", 
             regressionTypeSettings = list(linear = list(p.level = 0.001), 
-                spline = list(p.level = 0.001))), defaultFilterSettings = list(
-            regressionType = "linear", regressionTypeSettings = list(
-                linear = list(p.level = 0.001), spline = list(
-                  p.level = 0.001))))), doubletScores = list(
+                spline = list(p.level = 0.001))))), doubletScores = list(
         mock_sample_2_id = list(enabled = TRUE, auto = TRUE, 
             filterSettings = list(probabilityThreshold = 0.97920405864715576, 
-                binStep = 0.02), defaultFilterSettings = list(
-                probabilityThreshold = 0.97920405864715576, binStep = 0.02)), 
-        mock_sample_1_id = list(enabled = TRUE, auto = TRUE, 
-            filterSettings = list(probabilityThreshold = 0.83960545063018799, 
-                binStep = 0.02), defaultFilterSettings = list(
-                probabilityThreshold = 0.83960545063018799, binStep = 0.02))), 
-    dataIntegration = list(dataIntegration = list(method = "harmony", 
-        methodSettings = list(seuratv4 = list(numGenes = 2000, 
-            normalisation = "logNormalize"), unisample = list(
+                binStep = 0.02)), mock_sample_1_id = list(enabled = TRUE, 
+            auto = TRUE, filterSettings = list(probabilityThreshold = 0.83960545063018799, 
+                binStep = 0.02))), dataIntegration = list(dataIntegration = list(
+        method = "harmony", methodSettings = list(seuratv4 = list(
             numGenes = 2000, normalisation = "logNormalize"), 
+            unisample = list(numGenes = 2000, normalisation = "logNormalize"), 
             harmony = list(numGenes = 2000, normalisation = "logNormalize"), 
             fastmnn = list(numGenes = 2000, normalisation = "logNormalize"))), 
         dimensionalityReduction = list(method = "rpca", numPCs = NULL, 

From 1124f30324ded8b51ed631c31105d75a965c7b5d Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 26 Dec 2022 12:25:22 -0300
Subject: [PATCH 31/34] add comment

---
 pipeline-runner/R/handle_data.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index 03e24a84..a2d74631 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -87,6 +87,8 @@ reload_data_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks
   integration_index <- match("dataIntegration", task_names)
   s3 <- paws::s3(config = pipeline_config$aws_config)
 
+  # TODO: remove the `if` block below.
+  # It never runs, because embed and cluster runs in the worker if modified.
   # If the task is after data integration, we need to get scdata from processed_matrix
   if (match(task_name, task_names) > integration_index) {
     return(load_processed_scdata(s3, pipeline_config, experiment_id))

From 6fe9f8f616042125f1162914e46c27ef9fa26489 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 26 Dec 2022 12:52:46 -0300
Subject: [PATCH 32/34] address martin's comments

---
 pipeline-runner/NAMESPACE                     |  2 +-
 pipeline-runner/R/handle_data.R               |  7 ++-
 pipeline-runner/R/subset-1-subset_seurat.R    | 45 +++++++++----------
 pipeline-runner/data-raw/sysdata.R            |  2 +-
 pipeline-runner/man/add_new_sample_ids.Rd     |  6 +--
 pipeline-runner/man/add_subset_metadata.Rd    |  6 +--
 pipeline-runner/man/safe_cbind.Rd             |  7 ++-
 pipeline-runner/man/subset_experiment.Rd      |  2 +-
 ..._subset_experiment.Rd => subset_seurat.Rd} |  6 +--
 9 files changed, 44 insertions(+), 39 deletions(-)
 rename pipeline-runner/man/{create_subset_experiment.Rd => subset_seurat.Rd} (85%)

diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
index 6a1e6117..290c77cb 100644
--- a/pipeline-runner/NAMESPACE
+++ b/pipeline-runner/NAMESPACE
@@ -9,7 +9,6 @@ export(cbind_cellset_type)
 export(create_sample_id_map)
 export(create_scdata)
 export(create_seurat)
-export(create_subset_experiment)
 export(diet_scdata)
 export(download_user_files)
 export(embed_and_cluster)
@@ -54,6 +53,7 @@ export(score_doublets)
 export(subset_experiment)
 export(subset_ids)
 export(subset_safe)
+export(subset_seurat)
 export(sym_to_ids)
 export(upload_to_aws)
 import(data.table)
diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R
index a2d74631..6323f8bc 100644
--- a/pipeline-runner/R/handle_data.R
+++ b/pipeline-runner/R/handle_data.R
@@ -436,7 +436,12 @@ load_cellsets <- function(s3, pipeline_config, experiment_id) {
 }
 
 
-#' Bind columns not failing if there's an empty data.table
+#' Bind columns not creating rows if there's an empty data.table
+#'
+#' `cbind` on `data.table` adds a row if binding an empty data.table to a non-empty
+#' one. We do not want that behavior when parsing cellsets, because it implies
+#' the "creation" of a cell that does not exist (i.e. when binding scratchpad
+#' cellset slots of an experiment without custom cellsets).
 #'
 #' @param dt data.table
 #' @param ... columns to add
diff --git a/pipeline-runner/R/subset-1-subset_seurat.R b/pipeline-runner/R/subset-1-subset_seurat.R
index c25dee55..537db54c 100644
--- a/pipeline-runner/R/subset-1-subset_seurat.R
+++ b/pipeline-runner/R/subset-1-subset_seurat.R
@@ -15,15 +15,14 @@
 #' @return list containing scdata_list, annotations and sample_id_map
 #' @export
 #'
-create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
+subset_seurat <- function(input, pipeline_config, prev_out = NULL) {
+  parent_data <- load_parent_experiment_data(input, pipeline_config)
 
-  parent <- load_parent_experiment_data(input, pipeline_config)
+  subset_scdata <- subset_experiment(input, parent_data)
+  sample_id_map <- create_sample_id_map(unique(subset_scdata$samples))
+  subset_scdata <- add_subset_metadata(input, subset_scdata, sample_id_map)
 
-  scdata <- subset_experiment(input, parent)
-  sample_id_map <- create_sample_id_map(unique(scdata$samples))
-  scdata <- add_subset_metadata(input, scdata, sample_id_map)
-
-  scdata_list <- Seurat::SplitObject(scdata, split.by = "samples")
+  subset_scdata_list <- Seurat::SplitObject(subset_scdata, split.by = "samples")
 
   # TODO: remove from here and refactor all pipeline.
   config <- list(
@@ -37,13 +36,13 @@ create_subset_experiment <- function(input, pipeline_config, prev_out = NULL) {
       sampleIdMap = sample_id_map
       ),
     output = list(
-      scdata_list = scdata_list,
-      annot = scdata@misc$gene_annotations,
+      scdata_list = subset_scdata_list,
+      annot = subset_scdata@misc$gene_annotations,
       edrops = NULL,
       sample_id_map = sample_id_map,
       config = config,
       disable_qc_filters = TRUE,
-      parent_cellsets = parent$cellsets
+      parent_cellsets = parent_data$cellsets
     )
   )
 
@@ -113,10 +112,10 @@ diet_scdata <- function(scdata) {
 #' @return subset seurat object
 #' @export
 #'
-subset_experiment <- function(input, parent) {
+subset_experiment <- function(input, parent_data) {
   # subset seurat object, remove unnecesary data
-  cell_ids_to_keep <- unique(parent$cellsets[key %in% input$cellSetKeys, cell_id])
-  scdata <- subset_ids(parent$scdata, cell_ids_to_keep)
+  cell_ids_to_keep <- unique(parent_data$cellsets[key %in% input$cellSetKeys, cell_id])
+  scdata <- subset_ids(parent_data$scdata, cell_ids_to_keep)
   scdata <- diet_scdata(scdata)
   return(scdata)
 }
@@ -152,11 +151,10 @@ create_sample_id_map <- function(parent_sample_id) {
 #' @return SeuratObject with new sample ids
 #' @export
 #'
-add_new_sample_ids <- function(scdata, sample_id_map) {
-
-  sample_map_idx <- match(scdata$parent_samples, names(sample_id_map))
-  scdata$samples <- unname(unlist(sample_id_map[sample_map_idx]))
-  return(scdata)
+add_new_sample_ids <- function(subset_scdata, sample_id_map) {
+  sample_map_idx <- match(subset_scdata$parent_samples, names(sample_id_map))
+  subset_scdata$samples <- unname(unlist(sample_id_map[sample_map_idx]))
+  return(subset_scdata)
 }
 
 
@@ -170,12 +168,11 @@ add_new_sample_ids <- function(scdata, sample_id_map) {
 #' @return scdata with additional metadata
 #' @export
 #'
-add_subset_metadata <- function(input, scdata, sample_id_map) {
-
+add_subset_metadata <- function(input, subset_scdata, sample_id_map) {
   # add new sample_ids, keep originals in a new variable
-  scdata$parent_samples <- scdata$samples
-  scdata <- add_new_sample_ids(scdata, sample_id_map)
-  scdata@misc$experimentId <- input$experimentId
+  subset_scdata$parent_samples <- subset_scdata$samples
+  subset_scdata <- add_new_sample_ids(subset_scdata, sample_id_map)
+  subset_scdata@misc$experimentId <- input$experimentId
 
-  return(scdata)
+  return(subset_scdata)
 }
diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R
index 9c6e8f50..9b6a6171 100644
--- a/pipeline-runner/data-raw/sysdata.R
+++ b/pipeline-runner/data-raw/sysdata.R
@@ -25,7 +25,7 @@ GEM2S_TASK_LIST <- list(
 )
 
 SUBSET_SEURAT_TASK_LIST <- list(
-  "subsetSeurat" = "create_subset_experiment",
+  "subsetSeurat" = "subset_seurat",
   "prepareExperiment" = "prepare_experiment",
   "uploadToAWS" = "upload_to_aws"
 )
diff --git a/pipeline-runner/man/add_new_sample_ids.Rd b/pipeline-runner/man/add_new_sample_ids.Rd
index c1f8875b..b6c7aaaf 100644
--- a/pipeline-runner/man/add_new_sample_ids.Rd
+++ b/pipeline-runner/man/add_new_sample_ids.Rd
@@ -4,12 +4,12 @@
 \alias{add_new_sample_ids}
 \title{Add new sample ids to the subset Seurat Object}
 \usage{
-add_new_sample_ids(scdata, sample_id_map)
+add_new_sample_ids(subset_scdata, sample_id_map)
 }
 \arguments{
-\item{scdata}{Seurat Object}
-
 \item{sample_id_map}{data.table of parent/subset sample id map}
+
+\item{scdata}{Seurat Object}
 }
 \value{
 SeuratObject with new sample ids
diff --git a/pipeline-runner/man/add_subset_metadata.Rd b/pipeline-runner/man/add_subset_metadata.Rd
index d4e5c61c..688edcd5 100644
--- a/pipeline-runner/man/add_subset_metadata.Rd
+++ b/pipeline-runner/man/add_subset_metadata.Rd
@@ -4,15 +4,15 @@
 \alias{add_subset_metadata}
 \title{add experiment level metadata to subset seurat object}
 \usage{
-add_subset_metadata(input, scdata, sample_id_map)
+add_subset_metadata(input, subset_scdata, sample_id_map)
 }
 \arguments{
 \item{input}{list of input params, containing the experimentId}
 
-\item{scdata}{seurat object}
-
 \item{sample_id_map}{list with mapping between sample_ids from
 parent and subset experiments}
+
+\item{scdata}{seurat object}
 }
 \value{
 scdata with additional metadata
diff --git a/pipeline-runner/man/safe_cbind.Rd b/pipeline-runner/man/safe_cbind.Rd
index c9182e73..f8471e22 100644
--- a/pipeline-runner/man/safe_cbind.Rd
+++ b/pipeline-runner/man/safe_cbind.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/handle_data.R
 \name{safe_cbind}
 \alias{safe_cbind}
-\title{Bind columns not failing if there's an empty data.table}
+\title{Bind columns not creating rows if there's an empty data.table}
 \usage{
 safe_cbind(dt, ...)
 }
@@ -15,5 +15,8 @@ safe_cbind(dt, ...)
 data.table with new columns
 }
 \description{
-Bind columns not failing if there's an empty data.table
+\code{cbind} on \code{data.table} adds a row if binding an empty data.table to a non-empty
+one. We do not want that behavior when parsing cellsets, because it implies
+the "creation" of a cell that does not exist (i.e. when binding scratchpad
+cellset slots of an experiment without custom cellsets).
 }
diff --git a/pipeline-runner/man/subset_experiment.Rd b/pipeline-runner/man/subset_experiment.Rd
index ba3dc297..16c10453 100644
--- a/pipeline-runner/man/subset_experiment.Rd
+++ b/pipeline-runner/man/subset_experiment.Rd
@@ -4,7 +4,7 @@
 \alias{subset_experiment}
 \title{Subset seurat object by the input cellset keys}
 \usage{
-subset_experiment(input, parent)
+subset_experiment(input, parent_data)
 }
 \arguments{
 \item{input}{list of input parameters, containing cellSetKeys to subset}
diff --git a/pipeline-runner/man/create_subset_experiment.Rd b/pipeline-runner/man/subset_seurat.Rd
similarity index 85%
rename from pipeline-runner/man/create_subset_experiment.Rd
rename to pipeline-runner/man/subset_seurat.Rd
index cb8321ef..f1eddefc 100644
--- a/pipeline-runner/man/create_subset_experiment.Rd
+++ b/pipeline-runner/man/subset_seurat.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/subset-1-subset_seurat.R
-\name{create_subset_experiment}
-\alias{create_subset_experiment}
+\name{subset_seurat}
+\alias{subset_seurat}
 \title{create a subset experiment}
 \usage{
-create_subset_experiment(input, pipeline_config, prev_out = NULL)
+subset_seurat(input, pipeline_config, prev_out = NULL)
 }
 \arguments{
 \item{input}{list containing:

From 7d4b71b436ef3d9243fa5842d6d20a93551271fe Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 26 Dec 2022 12:56:48 -0300
Subject: [PATCH 33/34] fix docs

---
 pipeline-runner/NAMESPACE                    |  1 +
 pipeline-runner/R/gem2s-3-run_emptydrops.R   |  6 ++++
 pipeline-runner/R/init-functions.R           |  6 ++--
 pipeline-runner/R/qc-1-filter_emptydrops.R   |  5 ++-
 pipeline-runner/filter_emptydrops.Rd         |  0
 pipeline-runner/man/compute_sample_edrops.Rd | 17 ++++++++++
 pipeline-runner/man/filter_emptydrops.Rd     | 34 ++++++++++++++++++++
 pipeline-runner/man/run_pipeline_step.Rd     |  6 ++--
 8 files changed, 68 insertions(+), 7 deletions(-)
 create mode 100644 pipeline-runner/filter_emptydrops.Rd
 create mode 100644 pipeline-runner/man/compute_sample_edrops.Rd
 create mode 100644 pipeline-runner/man/filter_emptydrops.Rd

diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
index 290c77cb..bbf2a269 100644
--- a/pipeline-runner/NAMESPACE
+++ b/pipeline-runner/NAMESPACE
@@ -6,6 +6,7 @@ export(add_subset_metadata)
 export(build_cc_gene_list)
 export(build_metadata_cellsets)
 export(cbind_cellset_type)
+export(compute_sample_edrops)
 export(create_sample_id_map)
 export(create_scdata)
 export(create_seurat)
diff --git a/pipeline-runner/R/gem2s-3-run_emptydrops.R b/pipeline-runner/R/gem2s-3-run_emptydrops.R
index 462d1478..956f31a1 100644
--- a/pipeline-runner/R/gem2s-3-run_emptydrops.R
+++ b/pipeline-runner/R/gem2s-3-run_emptydrops.R
@@ -33,7 +33,13 @@ run_emptydrops <- function(input, pipeline_config, prev_out) {
 }
 
 
+#' Calculate empty drops scores for sample
+#'
 #' @param sample_counts dgCMatrix with counts for one sample.
+#'
+#' @return data.frame with edrops scores
+#' @export
+#'
 compute_sample_edrops <- function(sample_counts) {
   # check if filtered
   num_empty_drops <- sum(Matrix::colSums(sample_counts) < gem2s$max.empty.counts)
diff --git a/pipeline-runner/R/init-functions.R b/pipeline-runner/R/init-functions.R
index 414a463d..58e8aba5 100644
--- a/pipeline-runner/R/init-functions.R
+++ b/pipeline-runner/R/init-functions.R
@@ -187,12 +187,12 @@ run_qc_step <- function(scdata, config, tasks, task_name, cells_id, sample_id, d
 }
 
 
-#' Run GEM2S step
+#' Run pipeline step
 #'
-#' Calls the corresponding task_name GEM2S step function.
+#' Calls the corresponding `task_name` pipeline step function.
 #'
 #' The input list only contains experiment level parameters, such as project ID,
-#' and sample names. It's only used for downloading user files.
+#' and sample names; it is only used for downloading user files.
 #'
 #' @param task_name character
 #' @param input list
diff --git a/pipeline-runner/R/qc-1-filter_emptydrops.R b/pipeline-runner/R/qc-1-filter_emptydrops.R
index e808e33f..b2dbfbab 100644
--- a/pipeline-runner/R/qc-1-filter_emptydrops.R
+++ b/pipeline-runner/R/qc-1-filter_emptydrops.R
@@ -1,6 +1,9 @@
 # STEP 1. Classifier filter
 
-#' @description Filters seurat object based on mitochondrialContent
+#' Filter empty droplets
+#'
+#' Filters seurat objects based on edrops scores.
+#'
 #' @param config list containing the following information
 #'          - enable: true/false. Refering to apply or not the filter.
 #'          - auto: true/false. 'True' indicates that the filter setting need to be changed depending on some sensible value (it requires
diff --git a/pipeline-runner/filter_emptydrops.Rd b/pipeline-runner/filter_emptydrops.Rd
new file mode 100644
index 00000000..e69de29b
diff --git a/pipeline-runner/man/compute_sample_edrops.Rd b/pipeline-runner/man/compute_sample_edrops.Rd
new file mode 100644
index 00000000..0919993a
--- /dev/null
+++ b/pipeline-runner/man/compute_sample_edrops.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gem2s-3-run_emptydrops.R
+\name{compute_sample_edrops}
+\alias{compute_sample_edrops}
+\title{Calculate empty drops scores for sample}
+\usage{
+compute_sample_edrops(sample_counts)
+}
+\arguments{
+\item{sample_counts}{dgCMatrix with counts for one sample.}
+}
+\value{
+data.frame with edrops scores
+}
+\description{
+Calculate empty drops scores for sample
+}
diff --git a/pipeline-runner/man/filter_emptydrops.Rd b/pipeline-runner/man/filter_emptydrops.Rd
new file mode 100644
index 00000000..66375661
--- /dev/null
+++ b/pipeline-runner/man/filter_emptydrops.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/qc-1-filter_emptydrops.R
+\name{filter_emptydrops}
+\alias{filter_emptydrops}
+\title{Filter empty droplets}
+\usage{
+filter_emptydrops(
+  scdata_list,
+  config,
+  sample_id,
+  cells_id,
+  task_name = "classifier",
+  num_cells_to_downsample = 6000
+)
+}
+\arguments{
+\item{config}{list containing the following information
+- enable: true/false. Refering to apply or not the filter.
+- auto: true/false. 'True' indicates that the filter setting need to be changed depending on some sensible value (it requires
+to call generate_default_values_mitochondrialContent)
+- filterSettings: slot with thresholds
+- method: String. Method to be used {absoluteThreshold}
+- methodSettings: List with the method as key and contain all the filterSettings for this specific method.
+* absoluteThreshold: based on a cut-off threshold
+- maxFraction: Float. maximun pct MT-content that we considere for a alive cell
+- binStep: Float. Bin size for the histogram
+* we are supposed to add more methods ....}
+}
+\value{
+a list with the filtered seurat object by mitochondrial content, the config and the plot values
+}
+\description{
+Filters seurat objects based on edrops scores.
+}
diff --git a/pipeline-runner/man/run_pipeline_step.Rd b/pipeline-runner/man/run_pipeline_step.Rd
index 924f7e33..f62d2b80 100644
--- a/pipeline-runner/man/run_pipeline_step.Rd
+++ b/pipeline-runner/man/run_pipeline_step.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/init-functions.R
 \name{run_pipeline_step}
 \alias{run_pipeline_step}
-\title{Run GEM2S step}
+\title{Run pipeline step}
 \usage{
 run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
 }
@@ -24,9 +24,9 @@ run_pipeline_step(prev_out, input, pipeline_config, tasks, task_name)
 list of task results
 }
 \description{
-Calls the corresponding task_name GEM2S step function.
+Calls the corresponding \code{task_name} pipeline step function.
 }
 \details{
 The input list only contains experiment level parameters, such as project ID,
-and sample names. It's only used for downloading user files.
+and sample names; it is only used for downloading user files.
 }

From f937381d6f814d619cd421fff34818ccafa53fa8 Mon Sep 17 00:00:00 2001
From: German Beldorati Stark <gerbeldo@gmail.com>
Date: Mon, 26 Dec 2022 13:18:21 -0300
Subject: [PATCH 34/34] fix sysdata

---
 pipeline-runner/R/sysdata.rda | Bin 3169 -> 3152 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
index c750a90d5513b4f53e06689742df642eb0dbfa12..819509cc1bfe8f44563a8060d604dbafe63294fd 100644
GIT binary patch
delta 3151
zcmV-V46yUz7|<9FLRx4!F+o`-Q(5!=3(=7dB7gX_YkTj$!3}Rj?9<m_ZBr}IDBE5X
zZMK><d?if~jMP;BN#sn?q{c+qMD&e>Xeeo>o{6K>Y@lcW0077V>HugUng)Q4G%4w-
zZ7`-WG}Q8%Jx!<sKxhB}0000qGz3%0fgqVv^-ojM8UO$Q0MG!?000000000Qo<Rge
z)qg!lq|ByeJx!F;)XAor007g}27mwn000Jn0F>2JX-DdtBxqwIVH#w~fCRxZX|*<*
z5xqe(Mn;1{=`?9Dm_rZ*iY6$~&}eFCq3UVnJ){sepfUge05UWH000IF`l0G%<?BZO
z5B612v{3>Ck`RO>k{C%Q5@r)_<XU1RRDWl<p;9Udg2+ZN$W#Ihq#%+7m|{T-N)##u
zoQz_FK+)jR0x=B*!$rToqZv>S^Ynl0<b|YSNM5hG(&#t0i*e(!F*?!yh_jV&dfcFu
zO#06dp!iYK)kwht%&f3_mLmtKLs|)#W@c^fj4}vygv=qlA=eYIZ;Q@oG@6bBgn!m?
z9A_cQVwlIk$37-O82K3_b10BynTBceBcDS!c(yam14Pn3rkSE@5=~L#RMsUA5{YKH
zuxJwvlW5x{l1;*Cx^Eli`M^F-I7#Ob#!M(2huE4<<s}mNx%xi+JzUSf*Ftw6PpO-$
znAtwQeiG~zY}kHDBp-!RIgORc3V&R11q2Ktz=sr~bWq%OLxZWICx?0>QA335Ff%Yx
zA(93dQThw2dFHMh9V4OJqocpl_n5XLA?PilCe@dU!~y2)UUk&gr?n9-;vT<TSwOJ4
z(SlNyxg%}}xQW7z=!1=XV%3dU>nCj4he9}VhPPz1vtC$p*C%;fcXwlFiGP<tw)QnF
zWyDLY+&19ag<115#kyIfJGD}}ksX4j!WCvCN9;C{34$GjPDDaD24>NgAyk{-;g&Z!
z^T^>fLIK<}&4ulI^5qO0KbBp>6*@)CbzySS=9UGB43#-tj-PmQTYJ(?z}(J0(@oxC
z6I;#NWuZ&i)V)>S_cI2iG=DoQRI{+NL)!6p44SrSyt<O^&3TK~!y%rO0Z4VdB*dB;
zPFdRRh6%}o3q!o<n26IVD`ika=@+co?hQ?>tev9hD~h;_nF^>(#MnVn%564^&06Tn
zFs0`x()_0K*9z$)e-LAY66T~F3`hXdgpy2YAV9<j5s3){N&tqNX@3k91OPCL1q3D}
z0tmE|Vgx~gNJzP>%tAlLkOC%LI5aT;SuD?yyf$;A(Oh6Mag3zN2V5?5@0igb?SmAG
ziy7<Fopd$YL)z9X$Ze#A2;xFWR4FD-`8xgSUzDIqsJB&hQd1I*nV#;2pr;js2s=Ey
zII*#-p){sro4vum*MCyRQtR7x1mw>2wKubC;akwX4w#JiYG>PCm-M|?1>6_*j0M%Y
z>fNE-7Ux`!VP;j|9In8jx5d?KQ4xX}kQ&4`M?F!aQQx+<5Xm4-7C3I=gyukrKt8@#
z&Sr$sj)%NFBra@Ct7Q?E5V52YTC5Q_RW`U(V(h`Iz*lk#tABa1EPz4**70$~GqD|$
zaK}vd<!wYs=UmvPo28u0$&v-yWMwq^Q1DZ#<+qvFl5CThm{PYWagHS`b=9MYgz2{R
zXk?W=W|`k+*xff7X2H=D(BN~0ZBdzCL5hboBxYGlNHJ0!^*}DeClc!uQJQ>=*6{az
zZEU6IBTL=*B7X%Ojsnx$9W3ig8cjK5MCFGgUGJqtLOOSHS#EYYME3FD!n^En%GS1}
z552l_*(jqNFxGx)waaozSAE18#MQ^0!gmq5dp%!o1UV9J&1PqEh{9M-jbo9sH+41)
z?5<0!XVmGfO)3tIUBGb8VT2I@ktbXl2_7(JvlrfxZGR9{gow(z=u6DyW4Ks`V;3Pp
zPU)({5S-AdV~Co<&pd+*V<Ov{6h06+<(s@{K(ZnUk`82&wX-$TNXEgzcCZexP%N^`
zGHo7Rvzif}(6<fQV^JY&se-$)AP75_94wgl^GU5jV5>8r)uN+FWC5oc9tl+00xL<u
z5RArfMt@>L7=n<)g~Jvr;=r`2s{z3p6-#R(BL(d0W@xVROF^83L>-<E;hd6oQ%ItV
z1D8K0RvJyLF2c>QO98>7*C~Uo2=i$!8uiz8BZr1HUMn>^*s*IVbD|vgzP47PcfD0*
zRD8H+t}Pa}D$xq&#vPcou<Wgh;v!SBc*8NPT7RmkYE+ygFd+~~vJ?=dm>k?>1uKEB
zyGx`ym>zS=AxJHov!{B&+1aaWUf95!;YO;6B2DV(-PNGl7FWFrqA1}C1fvX2Q-~Il
zC>o(8-#MFm!n(|rlv*1OM2AN>HPCF+Y65d(2BFn%!G?hNA+N@iEoN9|t6iYD$&;pM
zmw&9%xq(gqL7@@khgMCd+y?;8ICCK7>s7zMWg!k^n5te1XF<p)p-jrrH8x;_z>Pb~
zA)g>sIy#my#>z$-jn?7Q3%HChQR~Z}Zi2wM0FXAB5*XrxYca1LhDx-tL;^rWk>E1t
zr`#x;Dha@-!0qy-4D@(KCNay%{{_wUDt{Gd#*J5oXiSca_3}4#2UQ3cPz06YvqNS=
zc`)hQ+}?_}RPYs++hPPn34tYky7M^bZ(EIxJ(>jeu$!V)CD8G^usPuXy8-xV++|xD
z@Gd$8dxJpt=!N8Y?!?0e1d3olX_W2opNdOH1x>u?T~s|$5X&|klA&uj-a{q%6Mr^a
z=WJwv0Em*aQWCW}YZb#S(XzN4vskRuqO*7t10X~|n<1QtIhK(}F_t&5OPdU|nXt<#
z++m2^V%7>|bL^}%$FA{Du{QJG48=qHp#;_9cD^0HhBi&vn@w4$kYonO7@~>P1*B9r
z#WL%;>ce|$ZzG~f1enqi8*CFB0Dodv=u`#(2#FYS*KvNZ6^aK#us4qGtVW!JIa9Lw
z=_N(UOsac(ldbzucxLcGk4J+rVAdj>Xme`J)<Sv4%d>z8iW5i>#`G3}xM8w`LEa81
zUl{N_RxB`p7Am#tED_x<GjV&a<at{G&pB>4MY=lC!o>qm4Dv;`&0oo{jDKqiYu;Ie
zkjHUGd7Pg~`41^Iie^xX+GjM<mAOLN%;QgIN={st_;!Ri&|IZh8oFAq76BYVVy1Qg
zgusP}vN^n#M{*E;rd;}<mk*84CI=B=)9byFI3^G-=U&MpKo~Ycd#+qAZ1+<pDR1+j
z$kyTvK*5h^$XmxZ^Wf>h9DgS5waB32*7k)M%fu@0Y-u_>p1&R<Vj}MPP>L8inSqAM
zW<+){z<~Yln_{iyVyPI@FIG<MJVcf)1$B5WNJ~L_n%V)?=p24!Wop%hf>anrsR7nO
z0whdu+VO^mc@7&Inm|n{-<eI&k)$lfjZl5m3?mGI?b77#ch;8MjDPJalOO`20?}dx
z1V>b(@IMU;-eQff9!$GqvD@mfJ%67)Ija^_+XCof*e*-fgFGZkd`emkmU*tJ`AudD
z@8Pjjs-v8XbG9lT%|XQ|MHzKujC{+Za}MxVfJVS}xudPzSnwZ)UIUt~hv_3NBnMeZ
z`vZCrOF8RsH%cO$@PE?=1+v85X=vXu2PzfBoH1D2GU68jlQ(u*O%$98pcE!n3!{UO
z?BI?LG$161gb_)8zU{78C+k(mSw=S3&9ipvvfpdMT}f_%AeuI^9<+~Ss9{p~c^)vg
zrm_J1#)L=NmaZ!@UtZBsa$Cw7*DY1TcIM;A26}72U_@5E{eM_aU`oSpvA;J4U0Yur
z2w-3i(+Z95^MKYoWdmBRw!=6H%{^KNemJO%g0d6TUSy;y7~cas+mS|ADv@yufZ`Wr
zQArt!ki1o`i9@5xTv@Q4r9(YZc82?a)jNq*l~q~Oee1ega>5LmsWqI{3??d^M?l*d
zWMi>N1wdSaCV#p9Ho8MZm|1r1iu(ltMWaK?_ih~Z6S(W&Ht9<No&7|L^kBhYsBVum
zTqYV{WcIi)h};`)%SoEhAS?QT`|K#nOvnjYM(<$_cT9Jv7>n+^As#J6*J)}nXy1;!
zz+npx%?FOb1xsYvpUgSJ;1qOq7IODyY@-V*Wz~lwfjH307s(mX48g@&Qm(x`?86@k
pYrLk+Vx%M&&XKU=<?-?55y_bw3sHgq5KH6!F64@Ep&{q|7ouCm#ZCYK

delta 3168
zcmV-m44?DR7~vQWLRx4!F+o`-Q(096^DdDNB7gZZ7ktor$8OOghrY{gQ!CG5_j<0p
zNSE7n?aB#LM9>mb{S6VAlQbr20XCW%38H#xcu4gRC=CDr000Jn8VDe1lT#**B%i8e
zW|5P_RP{YWH8k{q8UO$QVrT#W(9jW6@-T%bsP#Ql)HGm?2AX6HBPI|U8Z;R+U`;d(
zBYz>IOqv)>8mHtFLMfD!@;0ZcndDQ$RP`~bqerO8pfgiJpa1{>00HU%0!2YhsLfMN
zsiX8L(9jwH01W^DJwR<TLqVpVQ%9hWMvy&2K_rCAWMXNEAg77tPc){FQ%@ApwHgM1
z00000000JusoCz=>+2SEYG5iu1`W0`jDKymfws`vLt&dTEh#1-U6_#sKq3nm_q!PY
z5Lh|MAXx?y2U<vx09Oxt&|We-xj025Aj~vuE>kc_l6pQ5_8AzqLlQ#q*Rhb@Np^1s
zT?{AY=>mso71*t}PdgI)an5F({Yg+ojACx8;Y@=RV&9D+B+OdF4M7cYHUu_AHh+XW
zhS<EO;h5bwjq^FcaGXaO-8l~N-#n9)@EpgKbDalC)Ob%U^A2;Y^G`U}JoB*aJja~$
z9#bH!WR*;kxnQbTGLK2>Jcq2yJ%_ZcRz1hO%svKbntV$oqvU5FGw5ZO%=19ZEohbu
ztuL?I=ANdNt2cA)Th;r~{82Aoet)p>iKoSS@t~4HZYa(RW&~(a>p%(w3@JlGkpzRG
z6rL!F8lLKsT$&0K<Pao+RwmGe3O^BVoo4R+-qE4WPj6+1d6vXveTQNqY1nkALLF76
zYg@eEy{&EDQ#Nm;#i{C36;+p(a*a14Pz$DFgNIQ!4$Y@6Hmt+dj~-#G+<z@;uPi;v
z?srvo`+VJ=zI|5c=H9WMx?a<y*Mp7s@!<Yba^p04dN~`RDd%O<wCfSOZy@g_ql^0K
zc#_cELcN;I2O5crmd1{|#4@36YYn72R{lIK)esR6%?`(mEj)$Hc7pR#@RkLRjFz2V
zXR_{(3wM5!Zv%PWUlyCZ!ha^$o3zV9m#0$pS9{#d8x+v&s>Pj!G(BD~0h3yqZ!V;}
zvtDBMp^>j50HizqM8usAlbveqh6%}o3uk%SVk44NT9tth$S-N9xHdIPleAq0ac2>8
zAz2BSnh;d7nwpDEn&`~;O9M>#Pc&M}9Ym%zQWd$taikmUBmiYXNq;6ZkRW0N2$cx~
zN&rJmw1x@+01P2uph99mAcdrx5F!Q%AtGj^Xc7Er03!I9a5OZ4A}Gs?xf?X&)?}~{
zZyG9b^N?i8-EpJm_8|c(qnfqJnLAy$Jl44Qj@U^E5y*s)q)KKZIT|5?5OD}SniIk^
zL`pVhdUPrZacD4uvwzFOiZ$yhge|6;j}Ldj-pP@S$k%aJ6N@Xey|b~OeQw6=d8?Uk
zJ(m*mL7CU@OM2zf@haO3x2H(2LgabHnOnR%yz+-fKKiW@7$KPfxQ5Bsd~Lk_irs`V
zkS9foVWBx7JRl!_D>FJ1qa9xG@Q}HpYSf}Jged6*W~s78uz#rE0cge9gH(uQ4=u3P
zSpb9sVc_AcGr3+vAm$2h&oob>U%t98QAcYz*@p}(vdaZD`VsF_yK70-Y}>!7fT4ve
za~q6tDO*~!aS)+9_Gk}V?A==HZ`afy$<v2C?i%eN>KZrh%UeL~j&Dz#%SbU|9rwU4
z!!9M(Cc{+uGk<Ht-S=+HyyWS7ySUMa_NAl5c4u25vD3{WCm3=%&wZ3cBe!b|k!53;
z@95y3>gQvID58n%a677T*fHb59CaX~jPu&JI_@}wI{W;&=_jQdo89U9AcrE2*{PY_
z_*o34l?c>yahy7u4D7H=tnJUb);d@nv3CK(Iff8K27g4p*LW%T4>ON1EMI<?Sqz<R
zwvo#2+?7L;9nJdCuNb;(FvKT16)bTRXj<!#!m*Owt&~0xI_Fn-f(4-vR+PDvRMytl
zS3?_1g7Y+oOh8U^o_=BDwsq4RC<@Ax@m_*LyhnmFDHsSd6-tVB77;k5F?b?MM@ZGN
ztMQ!FgMYUOz7)^dijkdXTqI6TZk@^_fYf$76gY6^zaZYTuIIXVTF(1KMTa%2UEU_C
zqj<F<NTp~#af^;KN<wHV3|SbCndcRljtMiSdpy>gpbV6bieAjamCBZDM$MMS@gq~Q
zi)&ePoe<}}_cG-cJ?g7ct78oI#YI*u(F*2_JAW~1!?LVf5fYt~#u<vasne3l34y}_
z2tbV-WD->ddkcV_I8Q%9)>Ei7ahQc5wyw^d?Ho8+_g=AqH^Pnf5Ja2V(YvaH)(Gkv
zk|2q&K>(0oq{SFORmOnyGB~xJlTZ~^MoP?7HXMl#j<YJ;7ip~lI<bRb>epbyVE7@g
z&wnUYGQ%~R?FGhJI%au$PL|XSJ+L6?M0jcMXxnYzZw*6clpJ0Rx3{NgL#-<*F8#J3
z;ShqEmSJ79(1WKNcT`iWI8``on;dMkXx47cqi`=Z7+|B)t5)0#oVfsyII<)$#)FQd
z%8#LyZ78AvAR<WkOuZC+;uYe8z$9RHx_?Oqdb{G&8Dro-#B+U#32kK!cm{1IcSc+r
z+n0y%gbVipC4iv4n8rsQEV~;zUaip_B?Y$Bgpi;*iEmk=Elk|qI8559F<S*T)eleF
zk;A%a92aLk`;@s>^%j-AphtZcJoNBcQcYr194#qqK!CXlN#J@_!wm&)ZrRt$pMPXT
zGJU&}q%KLv0vYhmnsLI$#)?1$M3)+nsj1PpT~0T~cT#m%HD<LJ8^Ej?0wM(34Cq0|
zwuo_=WrGD;bYp3o46!@RF&mUx!EFwF?hP^Ryj$*9xc7rGA^pJxcVu~=PivQy>+~z+
zOH3#k!Li063ls&kNN$s`x3fBQB!8t-Ugso`M#hkk+hCa30}{WIq<{%0L8!9j`p#HV
zc0MV@*K$^E<u+AXrDaL(ep;}{ztyXAcduanctKfM5(q80a&~r@-@+dH>N}If0Fs{$
z0nBku`b}(2K+ZXiS?{jCl#2|EI2Oa-X7bLR(%{_wpQJulFmvcGmi8Oi-G3f7C^FO5
zJ;7ZDALQHDHX>!!s}V56W-zhhaQRKvYY3`hSPxm)<1(dffo%oBliAOsi#{GUV22nN
z5nKkoe!K+AIReQ{9Kb?gLdZ_N9t#iIbDPYv?G8&{f87xSkf{B?rt7@lh&av7gB-=+
zMWPrba?xu?r8Ov?=UNPrZhs)e3%GQ77U|FY`n{KAjMma!wFd@wcNAeJBUgsDW3!{h
z{Bd6}5))-ih@pd<ED9`^Wk%Qw$PiER*M4<hdchG-j~7&YB(-dYg1lC3mogJ-)XWWE
zDGBtgcwDz$nnTsAKur25X2|#*g+>ta818ZO@RS`5qt1yfz?jO4tA9p^HyeyY1~huo
z%$qY#va>TjHjXt2KzM{&#j7D*UOC7if;v~U#TtB;@!}gMmc3Eioq2L>R4ghgqUmDT
zF5K2bEM;{-CP;!Q@L!csPG>IYL3*p>>N?8$yuzdJyf~!@D<1Boga$p$TiYR_2$4c^
z^T|`8WI|c<bej%6*?(=OTWC6uQr$SGt=Tlb;iExYw39HwZ55l0mHLW9vULw4>S|_U
z%0<3o$;wqL<rffeNQnq8o$e#Qc1U<ogtuIcLMbk2YgyxS2D|ubD8m+p#*DJIdy+tn
zkx^a@604!`%4K{=LB&xqJOM?+m;q~qfB<~ath%kseVND^l7G^ja_FdQIE|LZ1=8Tc
zVMEsjtr(8XCaBoC1Fnu(tk<KRXcmN#1}(Yf2DtI0qJ}a_Aea~4saA2uI3XGZ#4M3{
z^$?KD3QX{Bbr`W!i-2BS0{m23MrBkcQ}rlN(bX=##p9Jx;C#Unjkf|NHicAE>dKpL
z%bG5I=ymvV7Jt;TA$D1MCt=?mih1m-hC@#f+*^<0iK3NaclT3Q%s?p2Hl8qdH0+T(
zhV9dDAtns)-$<`IEE5QDo@sKUZdowf5(zX(t<^Gy^M?xl*q3Vnu7wN4_*28Q?hLUT
z1cH)wqNhKTITg^g7;n`641i(^jnp0k0D?<8^uNO#kx6+d=W8uR@YdRfiOHAT9Et}o
z5WPXE5ZD>&s46JS&5<zL%Uc4A6^c%nCp~knbM$%n>=DBgt<?0;JPD3d{}*yaI8cyP
G1oJMzo5(!?