Merge pull request #8 from broadinstitute/lauren_refactor

Lauren refactor
broadinstitute · Jun 6, 2023 · d9c9246 · d9c9246
2 parents 28e6eac + a02793e
commit d9c9246
Show file tree

Hide file tree

Showing 25 changed files with 1,190 additions and 3,231 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,13 @@ data/*
 .LSOverride
 .vscode
 
+# output model
+*.pkl
+
+# sparkles
+.kubeque-cached-file-hashes
+.sparkles-cache
+
 # Icon must end with two \r
 Icon
 

diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "celligner/contrastive"]
-path = celligner/contrastive
-url = https://github.com/jkobject/contrastive.git
-[submodule "celligner/mnnpy"]
-path = celligner/mnnpy
-url = https://github.com/jkobject/mnnpy.git
+[submodule "mnnpy"]
+	path = mnnpy
+	url = git@github.com:DeKegel/mnnpy.git
diff --git a/Celligner_demo.ipynb b/Celligner_demo.ipynb
diff --git a/Dockerfile b/Dockerfile
@@ -1,69 +1,17 @@
-## celligner
-FROM debian:latest
-LABEL version="1.0"
+#!/bin/bash
 
-RUN apt-get update && apt-get install -y --no-install-recommends apt-utils &&\
-  apt-get install -y sudo &&\
-  sudo apt-get install -y wget libterm-readline-gnu-perl &&\
+# Dockerfile to create celligner image
+#
+# Run build_docker.sh
 
-  # all nice packages
-  ## install the [tools](https://www.datacamp.com/community/tutorials/google-cloud-data-science) sudo apt-get -y install htop parallel curl  tar  vim  nano  bzip2  unzip libssl-dev  make cmake libcurl4-openssl-dev  default-jre  && sudo apt-get -y install dirmngr apt-transport-https  ca-certificates  gnupg2  software-properties-common  zlib1g-dev  libbz2-dev  liblzma-dev  openssh-server  default-libmysqlclient-dev  acl  g++
-  ## sudo apt install git libmagickwand-dev libtool libexpat1-dev ghostscript graphviz libgraphviz-dev pkg-config libxml-simple-perl zlib1g-dev
-  sudo apt-get -y install \
-  htop \
-  parallel \
-  curl \
-  tar \
-  vim \
-  nano \
-  bc \
-  bzip2 \
-  unzip \
-  libssl-dev \
-  make \
-  cmake \
-  libcurl4-openssl-dev \
-  default-jre && \
-  sudo apt-get -y install dirmngr apt-transport-https \
-  ca-certificates \
-  gnupg2 \
-  software-properties-common \
-  zlib1g-dev \
-  libbz2-dev \
-  liblzma-dev \
-  libxml2-dev \
-  openssh-server \
-  default-libmysqlclient-dev \
-  acl \
-  g++ \
-  autoconf \
-  automake \
-  git libmagickwand-dev libtool \
-  libexpat1-dev \
-  ghostscript \
-  graphviz \
-  libgraphviz-dev \
-  pkg-config \
-  libxml-simple-perl \
-  zlib1g-dev && \
+FROM python:3.8
 
-  # install R	sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' && \ echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list && \ echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list && \ sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' && \ sudo apt update && \ sudo apt -y install r-base && \ sudo apt -y install python3 python3-pip && \
-  sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' && \
-  echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list && \
-  echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list && \
-  sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' && \
-  sudo apt update && \
-  sudo apt -y install r-base && \
-  sudo apt -y install python3 python3-pip && \
-  # all python config pip3 install numpy pandas && \ pip3 install MACS2 && \ pip3 install dxpy  jupytext  scikit-learn  google-api-core  igv  igv-jupyter firecloud-dalmatian  awscli  seaborn  pipreqs  pysradb  nbstripout  bokeh  matplotlib  deeptools  tensorflow  cutadapt ipykernel jupyter_contrib_nbextensions && \ jupyter contrib nbextension install && \ nbstripout --install --global && \ ipykernel install && \ nbstripout --install --global
-  pip3 install numpy pandas && \
+# Add R and CMake; the apt package lists are removed in the same layer so they do not bloat the image
+RUN apt-get update && apt-get install -y r-base cmake && rm -rf /var/lib/apt/lists/*
+# Install limma through BiocManager (fetched over HTTPS); the final library(limma) fails the build if the install did not succeed
+RUN R -e 'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="https://cran.us.r-project.org")};BiocManager::install("limma");library(limma)'
 
-  # search history
-  touch ~/.inputrc && \
-  echo "$include /etc/inputrc" >~/.inputrc && \
-  echo ""\e[A":history-search-backward" >~/.inputrc && \
-  echo ""\e[B":history-search-forward" >~/.inputrc
-
-COPY . /app
-WORKDIR /app
-RUN pip install .
+# Install Python requirements; --no-cache-dir keeps pip's download cache out of the image layer
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip &&\
+	pip install --no-cache-dir -r requirements.txt
diff --git a/README.md b/README.md
@@ -1,167 +1,110 @@
 # Celligner
 
-![](docs/typical_celligner.webp)
+![](docs/celligner_public22q2.png)
 
-__Celligner__ is a computational project to align multiple cancer datasets across sequencing modalities, tissue conditions (media, perturbations..) and format (CL/tumor/organoids/spheroids)
+__Celligner__ is a computational approach for aligning tumor and cell line transcriptional profiles.
 
-See our latest paper on aligning CCLE cell lines with TCGA tumors:
-[2020 paper](https://www.nature.com/articles/s41467-020-20294-x)
+To learn more, see the [paper](https://www.nature.com/articles/s41467-020-20294-x)
 
 ## Remark
 
 __Celligner__ is initially an R project that you can find in the `R/` folder.
 
-A Python version was made that performs the exact same computations as the R version. However one should not expect the exact same plot for a couple reasons:
+A Python version was made that performs the same computations as the R version, but the results may differ slightly due to small implementation differences in the Louvain clustering and contrastive PCA steps.
 
-#### UMAP
+## Overview
 
-The plot some users have been used to is a unique run of UMAP on the __Celligner__ realignment data. This is done by fixing the seed of the UMAP algorithm. You can still do that for the python version but it is disabled by default and not recommended. We recommend users to play with the UMAP parameter and make multiple plots. 
-This helps to prevent reading too much into UMAP's output. Things that don't stay the same are not necessarily true attributes of the data.
+A **reference** expression dataset (e.g. CCLE cell lines) should be fit using the `fit()` function, and a **target** expression dataset (e.g. TCGA+ tumor samples) can then be aligned to this reference using the `transform()` function. See the `run_celligner.py` script for example usage. Celligner is unsupervised and does not require annotations to be run; as such they are not used in this version of the model but can be added post-hoc to aid in interpretation of the output. See the `celligner_output.ipynb` notebook for an example of how to draw an output UMAP.
 
-Learn more here: [distill](https://distill.pub/2016/misread-tsne/), [Lior's twittorial](https://twitter.com/lpachter/status/1431325969411821572).
-
-Additionally we also advice users to complement assumptions by applying methods like differential expression analysis across clusters to find any meaningful information.
-
-#### Algorithmic differences
-
-__Celligner__ is composed of 4 key steps:
-
-1. A Louvain clustering: this version is the ScanPy implementation of this method while __Celligner__ is using Seurat's. There might be some slight implementation differences.
-2. A limma diff expression analysis to find key variance genes across clusters for each dataset: this version is 100% similar to the R version of __Celligner__.
-3. A cPCA to remove tumor impurity signal. This method is exactly the same except that the python version does exact PCA computation while the R version does an approximate version.
-4. An MNN allignment: this version is 100% similar to the R version of __Celligner__ in its output.
-
-#### Is there any other differences?
-
-Overall improvements, yes:
-
-1. A “pre-fitted” model is available to download here: `gs://celligner/model.pkl` (on request for now)
-2. Using your own dataset and adding new dataset is super simple now with `fit()`, `transform()` syntax
-3. You don’t need to rerun the entire model when adding new (adding 600 new samples take only 5mns to run)
-4. The model takes much less memory to run and can run on any machine now (you don’t need 64Gb of RAM anymore), and it also takes less than an hour to fully run (on a good machine).
-5. There is now an interactive plot using _Bokeh_ to better visualise your samples of interest.
-6. You can now easily choose parameters and even choose between 2 different versions of MNN.
-
-## Just want a quick look?
-
-Go here for the production version: [https://depmap.org/portal/celligner/](https://depmap.org/portal/celligner/)
-
-Go here for some usage examples: [https://raw.githack.com/broadinstitute/celligner/master/docs/example.html](https://raw.githack.com/broadinstitute/celligner/master/docs/example.html)
+The Celligner output can be explored at: [https://depmap.org/portal/celligner/](https://depmap.org/portal/celligner/)
 
 ## Install
 
-> TO see the old R package installation instruction, see the `R/`folder.
+> To see the old R package installation instructions, see the `R/` folder.
 
 Before running pip, make sure that you have R installed.
-Please use an up to date version of python (>=3.8).
-Note that some people have had issues with mnnpy installation and needed to have cython installed first.
-
-`pip install celligner`
-
-Even with R, some platform might not have all the required packages already installed.
 
-In that case, please refer to our docker image (see Dockerfile)
-
-The dockerized version is available at `jkobject:pycelligner`
-
-to install the latest version of Celligner in dev mode, do:
+To install the latest version of Celligner in dev mode, run the following (note that Celligner requires the specific version of mnnpy that is associated with the repository as a submodule):
 
 ```bash
 git clone https://github.com/broadinstitute/celligner.git
 cd celligner
+git checkout new_dev
 pip install -e .
+cd mnnpy 
+pip install .
 ```
 
-## For developers
+A Dockerfile and build script are also provided.
 
-see `CONTRIBUTING.md`
 
-## Use Celligner
+## Using Celligner
 
-See `docs/Celligner_demo.[html|pdf]` for an example of usage.
-(view [here](https://raw.githack.com/broadinstitute/celligner/master/docs/example.html))
+Celligner has `fit()` and `transform()` functions in the style of scikit-learn models.
 
-celligner works like most scikit learn tool.
-
-A user fits a dataset (e.g. CCLE tpm expression),
+A reference expression dataset (e.g. CCLE cell lines TPM expression) should first be fit:
 
 ```python
 from celligner import Celligner
 
-my_alligner = Celligner(make_plots=True)
-my_alligner.fit(CCLE_expression, CCLE_annotation)
+my_celligner = Celligner()
+my_celligner.fit(CCLE_expression)
 ```
 
-and then transforms another one based on this fitted dataset
+A target expression dataset (e.g. TCGA+ tumor samples) can then be aligned to this reference using the transform function:
 
 ```python
-my_alligner.method = "mnn_marioni"
-my_alligner.mnn_kwargs = {'k1': 5, 'k2': 50, 'cosine_norm': True, "fk":5}
-transformed_TCGA = my_alligner.transform(TCGA_expression, TCGA_annotation)
-
-my_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, umap_kwargs={'n_neighbors': 15,'min_dist': 0.2, 'metric': 'cosine'})
+my_celligner.transform(TCGA_expression)
 ```
 
-Users can access other methods such as save(), load(), addToFit(), etc, as well as many data in values: pca_transform, transform_clusters, differential_genes_names, mnn_pairs, etc.
+The combined transformed expression matrix can then be accessed via `my_celligner.combined_output`. Clusters, UMAP coordinates and tumor-model distances for all samples can be computed with `my_celligner.computeMetricsForOutput()`. There are also functions to save/load a fitted Celligner model as a .pkl file.
 
-Please have a look at `docs/Celligner_demo.[html|pdf]` for an example of usage.
-(view [here](https://raw.githack.com/broadinstitute/celligner/master/docs/example.html))
-
-### Computational complexity
-
-Depending on the dataset, Celligner can be quite memory hungry.
-for TCGA, expect at least _50-60Gb_ of memory being used. You might need a powerfull computer, lots of _swap_ and to increase R's default _maximum allowed memory_.
-
-You can also use the `low_memory=True` option to reduce the memory used by celligner in the memory intensive `PCA` & `cPCA` methods.
-
-### Add your own data to a pretrained model
-
-If you want to see your dataset in celligner, you can decide to use our own prefitted version.
+### Aligning the target dataset to a new reference dataset
+This use case is for the scenario where you want to align the same target dataset to a new reference dataset (which might be the same reference dataset as before with some new samples). In this case you can call transform without the target dataset to re-use the previous target dataset and skip re-doing some computation (see diagram below).
 
 ```python
-! curl https://storage.googleapis.com/celligner/model.pkl -output temp/model.pkl
-
-from celligner import Celligner
-
-my_alligner = Celligner()
-my_alligner.load('temp/model.pkl')
+my_celligner.fit(new_reference_expression)
+my_celligner.transform()
 ```
 
-We fit the model with CCLE and then transform TCGA. But you can decide differently.
-
-For example: If you want to see how some of your newly sequenced tumors mapped to the CCLE (and TCGA) dataset, just load the model as displayed above and then run:
+### Aligning a third dataset to the previous combined output
+This use case is for the scenario where you have a third dataset (e.g. Met500 tumor samples) that you want to align to the previously aligned (e.g. CCLE+TCGA) dataset. This is the current approach for multi-dataset alignment taken by the Celligner app.
 
 ```python
-my_alligner.addTotransform(your_tpm, your_annotations)
-my_alligner.plot()
+my_celligner.makeNewReference()
+# The value of k1 should be selected based on the size of the new dataset. 
+# We use k1=20 for Met500 (n=~850), and k1=10 for the PDX datasets (n=~250-450).
+my_celligner.mnn_kwargs.update({"k1":20, "k2":50}) 
+my_celligner.transform(met500_TPM, compute_cPCs=False)
 ```
 
-This way you will not rerun the entire model.
+### Diagram 
+This diagram provides an overview of how Celligner works, including for the different use cases described above.
 
-See `docs/Celligner_demo.[html|pdf]` for other examples of usage.
-(view [here](https://raw.githack.com/broadinstitute/celligner/master/docs/example.html))
+![](docs/celligner_diagram.png)
 
-## Multidataset alignment
+### Computational complexity
 
-Dee `docs/Celligner_demo.[html|pdf]` for an example of usage.
-(view [here](https://raw.githack.com/broadinstitute/celligner/master/docs/example.html))
+Depending on the dataset, Celligner can be quite memory hungry.
+For TCGA, expect at least _50-60Gb_ of memory to be used. You might need a powerful computer, lots of _swap_, and to increase R's default _maximum allowed memory_.
 
-One can use addToFit(), addToPredict() depending on whether they want to align their dataset to another or align another dataset to theirs.
+You can also use the `low_memory=True` option to reduce the memory used by Celligner in the memory intensive `PCA` & `cPCA` methods.
 
-If you have a very small dataset and want to align to CCLE or CGA, use the parameter `doAdd=True` in order to not rerun the entire pipeline and use cached information.
 
 # R Celligner
 
 For the original R version of celligner, please check the R/README.md file here: [https://github.com/broadinstitute.org/celligner/tree/master/R/README.md](https://github.com/broadinstitute.org/celligner/tree/master/R/README.md)
 
-Please use _github issues_ for any problem related to the tool.
-
 ---
 
-__Initial Project:__
+__Initial project:__
 
 Allie Warren @awarren
 
-__Maintainer:__
+__Initial python version:__
 
 Jérémie Kalfon @jkobject
+
+__Current maintainer:__
+
+Barbara De Kegel @bdekegel
diff --git a/build_docker.sh b/build_docker.sh
@@ -0,0 +1,4 @@
+# Builds the celligner docker image for linux/amd64 from the current directory and pushes it as us.gcr.io/bdekegel/celligner:latest
+# Note that this docker image does not have taigapy or mnnpy installed
+
+docker buildx build --platform linux/amd64 --push -t us.gcr.io/bdekegel/celligner:latest .