Skip to content

Commit

Permalink
Update CellLineMetaData (#334)
Browse files Browse the repository at this point in the history
* update cell line annotation

* add LookUp object

* pass nox test

* fix test

* keep downloaded files

* improve error message and lookup initialization

* add annotate_from_gdsc

* add metadata source to the docstring

* fix typo

* update test script and speed up downloading metadata

* add remotezip dependency

* parametrize Pytest fixture

* fix import error

* poetry update

Signed-off-by: zethson <[email protected]>

* Fix fixture

Signed-off-by: zethson <[email protected]>

* fix test error

* Refactor

Signed-off-by: zethson <[email protected]>

* Refactor

Signed-off-by: zethson <[email protected]>

* Refactor

Signed-off-by: zethson <[email protected]>

* improve LookUp object

* add openpyxl

Signed-off-by: zethson <[email protected]>

* update lookup

* replace dict with namedtuple

---------

Signed-off-by: zethson <[email protected]>
Co-authored-by: Lukas Heumos <[email protected]>
  • Loading branch information
wxicu and Zethson authored Aug 8, 2023
1 parent 5dc2b9b commit bbdf99b
Show file tree
Hide file tree
Showing 8 changed files with 873 additions and 2,282 deletions.
682 changes: 367 additions & 315 deletions pertpy/tools/_metadata/_cell_line.py

Large diffs are not rendered by default.

340 changes: 340 additions & 0 deletions pertpy/tools/_metadata/_look_up.py

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pertpy/tools/_perturbation_space/_perturbation_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,14 @@ def add(
for key in data["layers"]:
key_name = key
if key.endswith("_control_diff"):
key_name = key.remove_suffix("_control_diff")
key_name = key.removesuffix("_control_diff")
new_perturbation.layers[key_name] = data["layers"][key]

if "embeddings" in data.keys():
key_name = key
for key in data["embeddings"]:
if key.endswith("_control_diff"):
key_name = key.remove_suffix("_control_diff")
key_name = key.removesuffix("_control_diff")
new_perturbation.obsm[key_name] = data["embeddings"][key]

if ensure_consistency:
Expand Down Expand Up @@ -275,14 +275,14 @@ def subtract(
for key in data["layers"]:
key_name = key
if key.endswith("_control_diff"):
key_name = key.remove_suffix("_control_diff")
key_name = key.removesuffix("_control_diff")
new_perturbation.layers[key_name] = data["layers"][key]

if "embeddings" in data.keys():
key_name = key
for key in data["embeddings"]:
if key.endswith("_control_diff"):
key_name = key.remove_suffix("_control_diff")
key_name = key.removesuffix("_control_diff")
new_perturbation.obsm[key_name] = data["embeddings"][key]

if ensure_consistency:
Expand Down
214 changes: 128 additions & 86 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ pyqt5 = {version = "^5.15.9", optional = true}
ott-jax = "^0.4.0"
sparsecca = ">=0.3.0"
numba = "^0.57.1"
remotezip = "^0.12.1"
openpyxl = "^3.1.2"

[tool.poetry.dev-dependencies]
black = ">=22.12.0"
Expand Down
65 changes: 31 additions & 34 deletions tests/tools/_metadata/test_cell_line.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import anndata
import numpy as np
import pandas as pd
import pytest
from anndata import AnnData
from scipy import sparse

Expand All @@ -12,7 +13,10 @@


class TestMetaData:
def make_test_adata(self) -> AnnData:
pt_metadata = pt.tl.CellLineMetaData()

@pytest.fixture
def adata(self) -> AnnData:
np.random.seed(1)

X = np.random.normal(0, 1, (NUM_CELLS, NUM_GENES))
Expand All @@ -21,13 +25,14 @@ def make_test_adata(self) -> AnnData:
cell_line = {
"DepMap_ID": ["ACH-000016"] * NUM_CELLS_PER_ID
+ ["ACH-000049"] * NUM_CELLS_PER_ID
+ ["ACH-000130"] * NUM_CELLS_PER_ID
+ ["ACH-000216"] * NUM_CELLS_PER_ID
+ ["ACH-001208"] * NUM_CELLS_PER_ID
+ ["ACH-000956"] * NUM_CELLS_PER_ID
}
cell_line = pd.DataFrame(cell_line)
obs = pd.concat([cell_line], axis=1)
obs = obs.set_index(np.arange(NUM_GENES))
obs.index.rename("index", inplace=True)
obs["perturbation"] = "Midostaurin"

var_data = {"gene_name": ["gene" + str(i) for i in range(1, NUM_GENES + 1)]}
var = pd.DataFrame(var_data)
Expand All @@ -39,59 +44,51 @@ def make_test_adata(self) -> AnnData:

return adata

def test_cell_line_annotation(self):
adata = self.make_test_adata()
pt_metadata = pt.tl.CellLineMetaData()
pt_metadata.annotate_cell_lines(adata=adata)

assert len(adata.obs.columns) == len(pt_metadata.cell_line_meta.columns)
assert set(pt_metadata.cell_line_meta.columns).issubset(adata.obs)
def test_cell_line_annotation(self, adata):
self.pt_metadata.annotate_cell_lines(adata=adata)
assert (
len(adata.obs.columns) == len(self.pt_metadata.cell_line_meta.columns) + 1
) # due to the perturbation column
assert set(self.pt_metadata.cell_line_meta.columns).issubset(adata.obs)
stripped_cell_line_name = (
["SLR21"] * NUM_CELLS_PER_ID
+ ["HEKTE"] * NUM_CELLS_PER_ID
+ ["NALM19"] * NUM_CELLS_PER_ID
+ ["JHESOAD1"] * NUM_CELLS_PER_ID
+ ["TK10"] * NUM_CELLS_PER_ID
+ ["22RV1"] * NUM_CELLS_PER_ID
)

assert stripped_cell_line_name == list(adata.obs["stripped_cell_line_name"])

def test_ccle_expression_annotation(self):
adata = self.make_test_adata()
pt_metadata = pt.tl.CellLineMetaData()
pt_metadata.annotate_cell_lines(adata)
pt_metadata.annotate_ccle_expression(adata)

assert len(adata.obsm) == 1
assert adata.obsm["CCLE_expression"].shape == (NUM_CELLS, len(pt_metadata.ccle_expr.columns))
def test_gdsc_annotation(self, adata):
self.pt_metadata.annotate_cell_lines(adata)
self.pt_metadata.annotate_from_gdsc(adata, query_id="stripped_cell_line_name")
assert "drug_name" in adata.obs
assert "ln_ic50" in adata.obs

def test_protein_expression_annotation(self):
adata = self.make_test_adata()
pt_metadata = pt.tl.CellLineMetaData()
pt_metadata.annotate_cell_lines(adata)
pt_metadata.annotate_protein_expression(adata)
def test_protein_expression_annotation(self, adata):
self.pt_metadata.annotate_cell_lines(adata)
self.pt_metadata.annotate_protein_expression(adata, query_id="stripped_cell_line_name")

assert len(adata.obsm) == 1
assert adata.obsm["proteomics_protein_intensity"].shape == (
NUM_GENES,
len(pt_metadata.proteomics_data.uniprot_id.unique()),
len(self.pt_metadata.proteomics_data.uniprot_id.unique()),
)

def test_bulk_rna_expression_annotation(self):
adata = self.make_test_adata()
pt_metadata = pt.tl.CellLineMetaData()
pt_metadata.annotate_cell_lines(adata)
pt_metadata.annotate_bulk_rna_expression(adata)
def test_bulk_rna_expression_annotation(self, adata):
self.pt_metadata.annotate_cell_lines(adata)
self.pt_metadata.annotate_bulk_rna_expression(adata, query_id="DepMap_ID", cell_line_source="broad")

assert len(adata.obsm) == 1
assert adata.obsm["bulk_rna_expression_broad"].shape == (
NUM_GENES,
len(pt_metadata.bulk_rna_broad.gene_id.unique()),
self.pt_metadata.bulk_rna_broad.shape[1],
)

pt_metadata.annotate_bulk_rna_expression(adata, bulk_rna_source="sanger")
self.pt_metadata.annotate_bulk_rna_expression(adata, query_id="stripped_cell_line_name")

assert len(adata.obsm) == 2
assert adata.obsm["bulk_rna_expression_sanger"].shape == (
NUM_GENES,
len(pt_metadata.bulk_rna_sanger.gene_id.unique()),
self.pt_metadata.bulk_rna_sanger.shape[1],
)
Loading

0 comments on commit bbdf99b

Please sign in to comment.