From aeb407309603f1746b588b8e98fc6ce281e9bda9 Mon Sep 17 00:00:00 2001 From: matin Date: Wed, 24 Jul 2024 10:44:01 +0100 Subject: [PATCH] added preprocess --- notebooks/preprocess.ipynb | 182 ++ .../cistarget/tf_lists/allTFs_hg38.txt | 1892 ----------------- src/workflows/run_benchmark/config.vsh.yaml | 9 +- 3 files changed, 184 insertions(+), 1899 deletions(-) create mode 100644 notebooks/preprocess.ipynb delete mode 100644 notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt diff --git a/notebooks/preprocess.ipynb b/notebooks/preprocess.ipynb new file mode 100644 index 000000000..25ead9f45 --- /dev/null +++ b/notebooks/preprocess.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad ./resources_raw/ --no-sign-request\n", + "!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/multiome_counts.h5mu ./resources_raw/ --no-sign-request" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install sctk, anndata\n", + "\n", + "import anndata as ad \n", + "import pandas as pd\n", + "import numpy as np\n", + "import sctk\n", + "par = {\n", + " 'sc_counts': '../resources_raw/sc_counts.h5ad',\n", + " 'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n", + "}\n", + "def preprocess_sc(par):\n", + " # clean up\n", + " sc_counts = ad.read_h5ad(par['sc_counts'])\n", + " sc_counts.obs = sc_counts.obs[['well', 'row', 'col', 'plate_name', 'cell_type', 'donor_id']]\n", + " sc_counts.X = sc_counts.layers['counts']\n", + " del sc_counts.layers \n", + " del sc_counts.obsm \n", + " sc_counts.var_names_make_unique()\n", + " sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_name'].astype('str') \\\n", + " + '_' + sc_counts.obs['well'].astype('str') \\\n", + " + '_' + sc_counts.obs['cell_type'].astype('str')\n", + " sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_well_cell_type'].astype('category')\n", + "\n", + " # merge cell types\n", + " CELL_TYPES = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells', 'B cells', 'Myeloid cells']\n", + " T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']\n", + " cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in CELL_TYPES}\n", + " sc_counts.obs['cell_type'] = sc_counts.obs['cell_type'].map(cell_type_map)\n", + " sc_counts.obs['cell_type'].unique()\n", + "\n", + " # qc \n", + " sctk.calculate_qc(sc_counts)\n", + " sctk.cellwise_qc(sc_counts)\n", + "\n", + " # filtering\n", + " # cell wise\n", + " filter_percent_hb = sc_counts.obs.percent_hb>.2\n", + " filter_percent_hb.sum()\n", + " # gene wise\n", + " plates = sc_counts.obs['plate_name'].unique()\n", + "\n", + " # Step 2: Initialize a DataFrame to store counts\n", + " gene_counts_per_plate = pd.DataFrame(index=sc_counts.var_names, columns=plates, dtype=int)\n", + "\n", + " # Step 3: Iterate over each plate and calculate expression counts\n", + " for plate in plates:\n", + " # Subset the AnnData object for the current plate\n", + " subset = sc_counts[sc_counts.obs['plate_name'] == plate]\n", + "\n", + " # Calculate expression counts (genes x cells > 0)\n", + " expressed_genes = (subset.X > 0).sum(axis=0)\n", + "\n", + " # Check if the result needs conversion from sparse matrix format\n", + " if isinstance(expressed_genes, np.matrix):\n", + " expressed_genes = np.array(expressed_genes).flatten()\n", + "\n", + " # Store the counts in the DataFrame\n", + " gene_counts_per_plate[plate] = expressed_genes\n", + "\n", + " # Step 4: Aggregate counts across plates (max or sum based on the requirement)\n", + " # We use `max` here to find if any gene meets the criteria in at least one plate\n", + " max_counts = gene_counts_per_plate.max(axis=1)\n", + "\n", + " # Step 5: Create a mask for genes to keep (genes expressed in at least 100 cells in any plate)\n", + " genes_to_keep = max_counts >= 100\n", + " print('retained genes:', genes_to_keep.sum())\n", + " # actual filtering\n", + " sc_counts = sc_counts[(~filter_percent_hb), genes_to_keep]\n", + " # clean\n", + " sc_counts.obs = sc_counts.obs[['cell_type', 'sm_name', 'donor_id', 'row', 'plate_name', 'well']]\n", + " sc_counts.var = sc_counts.var[[]]\n", + "\n", + " del sc_counts.obsm\n", + " del sc_counts.uns\n", + "\n", + " sc_counts.write(par['sc_counts_filtered'])\n", + "preprocess_sc(par)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import anndata as ad \n", + "from scipy import sparse\n", + "par = {\n", + " 'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n", + " 'pseudobulked': '../resources_raw/pseudobulked.h5ad',\n", + "}\n", + "def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:\n", + " \"\"\"\n", + " Adapted from this forum post: \n", + " https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4\n", + " \"\"\"\n", + " \n", + " assert pd.api.types.is_categorical_dtype(adata.obs[col])\n", + "\n", + " # sum `.X` entries for each unique value in `col`\n", + " cat = adata.obs[col].values\n", + "\n", + " indicator = sparse.coo_matrix(\n", + " (\n", + " np.broadcast_to(True, adata.n_obs),\n", + " (cat.codes, np.arange(adata.n_obs))\n", + " ),\n", + " shape=(len(cat.categories), adata.n_obs),\n", + " )\n", + " \n", + " sum_adata = ad.AnnData(\n", + " indicator @ adata.X,\n", + " var=adata.var,\n", + " obs=pd.DataFrame(index=cat.categories),\n", + " )\n", + " \n", + " # copy over `.obs` values that have a one-to-one-mapping with `.obs[col]`\n", + " obs_cols = adata.obs.columns\n", + " obs_cols = list(set(adata.obs.columns) - set([col]))\n", + " \n", + " one_to_one_mapped_obs_cols = []\n", + " nunique_in_col = adata.obs[col].nunique()\n", + " for other_col in obs_cols:\n", + " if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col:\n", + " one_to_one_mapped_obs_cols.append(other_col)\n", + "\n", + " joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)\n", + " assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all()\n", + " sum_adata.obs = sum_adata.obs.join(joining_df)\n", + " sum_adata.obs.index.name = col\n", + " sum_adata.obs = sum_adata.obs.reset_index()\n", + " sum_adata.obs.index = sum_adata.obs.index.astype('str')\n", + "\n", + " return sum_adata\n", + "def pseudobulked_and_filter(par):\n", + " # pseudobulk\n", + " sc_counts = ad.read_h5ad(par['sc_counts_filtered'])\n", + " bulk_adata = sum_by(sc_counts, 'plate_well_cell_type')\n", + " bulk_adata.obs['cell_count'] = sc_counts.obs.groupby('plate_well_cell_type').size().values\n", + " bulk_adata.X = np.array(bulk_adata.X.todense())\n", + "\n", + " print('ratio of missingness' , (bulk_adata.X==0).sum()/bulk_adata.X.size)\n", + " bulk_adata.var = bulk_adata.var.reset_index()\n", + " bulk_adata.var.set_index('index', inplace=True)\n", + "\n", + " bulk_adata.X = np.nan_to_num(bulk_adata.X, nan=0)\n", + "\n", + " \n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt b/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt deleted file mode 100644 index 6769dac51..000000000 --- a/notebooks/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt +++ /dev/null @@ -1,1892 +0,0 @@ -ZNF354C -KLF12 -ZNF143 -ZIC2 -ZNF274 -SP2 -ZBTB7A -BCL6B -ZBTB49 -ZIC1 -ZNF232 -ZNF282 -ZNF410 -ZSCAN16 -ZNF524 -ZNF713 -ZNF75A -ZSCAN4 -ZNF200 -SNAI2 -KLF1 -BCL6 -EGR2 -OVOL2 -GFI1 -GFI1B -KLF11 -WT1 -ZNF655 -FOXC1 -ARX -VSX1 -CRX -PBX4 -PHOX2B -VAX2 -VSX2 -MSX2 -ESX1 -HOXD13 -NKX2-8 -VENTX -HESX1 -PITX2 -PROP1 -ISX -NKX2-5 -SIX6 -HOXC4 -HOXB7 -PAX6 -PAX7 -PAX4 -PAX3 -POU4F3 -POU6F2 -POU3F4 -NR1H4 -NR2E3 -POU2F1 -RBPJ -FOXP1 -MAX -PHF1 -MTF2 -BCL11A -BCL11B -FOXN2 -FOXR1 -SOX4 -SOHLH2 -ZSCAN29 -PLAGL2 -VEZF1 -ZBTB44 -CENPBD1 -TIGD1 -CXXC5 -FOXN3 -HDX -DUXA -MSANTD3 -ZZZ3 -LCORL -NFATC4 -CUX2 -CUX1 -DLX3 -LHX9 -POU5F1B -NFATC2 -ZFHX3 -KDM2B -KMT2A -DNMT1 -TET1 -DMRT3 -DMRT1 -DMRTA2 -DMRT2 -E2F2 -FOXK1 -FOXG1 -GCM1 -HOXA2 -NOBOX -LHX2 -LHX6 -TLX2 -EMX1 -ZFHX2 -LBX1 -HOXB13 -ZHX1 -POU6F1 -SHOX -ANHX -MYRF -NR2E1 -NR3C2 -NR2F6 -RARG -NFATC3 -RFX2 -GMEB1 -THAP12 -GLI2 -GLI3 -GLI1 -ETS1 -NFIL3 -MZF1 -RREB1 -SPIB -FOXF2 -FOXD1 -PBX1 -IRF2 -RORA -PPARG -REL -RELA -SOX9 -SRY -TFEB -TCF4 -CEBPE -XBP1 -PRDM1 -EHF -ERG -FOXJ3 -GATA1 -MEIS2 -POU2F2 -HSF2 -MEF2C -RXRG -NFATC1 -RFX3 -RUNX3 -EOMES -TFAP2B -TFAP2C -TFAP2A -ZBED1 -MLXIPL -TFE3 -MNT -TCF3 -TFAP4 -TFEC -MLX -MYF6 -BHLHE41 -BHLHE23 -ARNTL -BHLHE40 -CLOCK -HEY2 -USF1 -HEY1 -MESP1 -NEUROD2 -NHLH1 -OLIG3 -NEUROG2 -MSC -HES7 -BHLHA15 -BHLHE22 -FIGLA -OLIG1 -HES5 -SREBF2 -OLIG2 -MGA -DBP -CREB3 -HLF -NFE2 -BATF3 -ATF4 -NRL -JDP2 -CEBPG -CREB3L1 -TEF -CEBPB -MAFF -MAFG -MAFK -CEBPD -ATF7 -YY1 -CTCF -SP4 -GLIS3 -PRDM4 -EGR1 -GLIS2 -KLF16 -EGR4 -ZNF740 -ZIC3 -ZBTB7B -SP8 -HIC2 -KLF13 -HINFP -SP3 -GLIS1 -ZIC4 -EGR3 -ZBTB18 -ZNF784 -ZBTB7C -SP1 -MTF1 -ZKSCAN3 -SCRT2 -YY2 -SCRT1 -KLF14 -CENPB -ONECUT2 -ONECUT1 -ONECUT3 -E2F1 -E2F3 -E2F8 -E2F7 -E2F4 -EBF1 -ETV1 -SPI1 -ELF4 -ETV2 -ERF -ELK3 -ETV3 -ELF1 -SPDEF -ELK1 -ELF5 -ETV6 -FLI1 -GABPA -ELK4 -ELF3 -FEV -SPIC -ETV4 -ETV5 -FOXP3 -FOXJ2 -FOXO3 -FOXO1 -FOXI1 -FOXB1 -FOXL1 -FOXC2 -FOXO4 -FOXD2 -FOXD3 -FOXO6 -GATA3 -GATA5 -GATA4 -GCM2 -GRHL1 -TFCP2 -MEOX1 -DLX6 -ALX4 -GSC2 -PITX1 -HOXA9 -RHOXF1 -MEIS3 -DLX5 -HOXA1 -HOXA13 -EVX1 -MEOX2 -PITX3 -DLX4 -CDX1 -OTX1 -DLX2 -PRRX1 -TGIF2 -HOXB5 -HOXB3 -HOXC13 -HOXC11 -HOXC12 -HOXD11 -MNX1 -BARX1 -GSC -RAX -HNF1A -LMX1B -PDX1 -BARHL2 -MEIS1 -DLX1 -HMBOX1 -VAX1 -TGIF2LX -ALX3 -ISL2 -PKNOX1 -LMX1A -EN1 -MSX1 -EN2 -UNCX -GBX1 -PHOX2A -PKNOX2 -CDX2 -OTX2 -DRGX -PRRX2 -GBX2 -SHOX2 -GSX1 -HOXD12 -EMX2 -IRX2 -HOXB2 -RAX2 -EVX2 -HOXD8 -IRX5 -TGIF1 -LBX2 -ALX1 -GSX2 -HOXC10 -MIXL1 -HMX3 -HMX2 -BSX -DMBX1 -DPRX -NOTO -HOMEZ -HMX1 -HNF1B -PAX2 -POU1F1 -POU2F3 -POU4F2 -POU4F1 -POU3F2 -POU3F1 -POU3F3 -HSF4 -HSFY2 -HSF1 -IRF3 -IRF5 -IRF4 -IRF8 -IRF7 -IRF9 -MEF2A -SRF -MEF2D -MEF2B -MYBL2 -MYBL1 -RARB -ESR1 -HNF4A -VDR -NR3C1 -ESRRB -THRA -RARA -THRB -NR4A2 -AR -ESRRA -NR2F1 -NR2C2 -RXRA -ESRRG -RXRB -TP63 -PAX1 -PAX5 -PAX9 -PROX1 -NFKB2 -NFAT5 -NFKB1 -RFX4 -RFX5 -RUNX2 -GMEB2 -NFIX -NFIB -NFIA -SMAD3 -SOX8 -SOX10 -SOX21 -SOX15 -LEF1 -TCF7L1 -SOX14 -SOX7 -SOX2 -SOX18 -TBX21 -TBX5 -TBX15 -TBX2 -TBX4 -TBR1 -TBX19 -TBX20 -TBX1 -TEAD3 -TEAD1 -TEAD4 -NRF1 -CPEB1 -PGR -NR1I3 -NR1I2 -NFE2L1 -ATF2 -ATF6 -CREB1 -ATF3 -FOSL1 -JUN -MAFB -ATF6B -CEBPA -TFAP2E -HES2 -SREBF1 -TCFL5 -USF2 -HES1 -TCF21 -MYOG -MYOD1 -MYCN -ASCL1 -TCF12 -HES6 -FERD3L -MSGN1 -NEUROD1 -HAND2 -PTF1A -NPAS2 -ATOH1 -ARNT2 -NHLH2 -ATOH7 -NEUROG1 -ASCL2 -MESP2 -CREM -BACH2 -FOSB -JUND -CREB3L4 -CREB5 -BATF -FOS -JUNB -MAF -MAFA -ZNF263 -DPF1 -ZBTB32 -ZNF76 -KLF6 -ZNF343 -KLF5 -ZNF821 -ZNF174 -KLF3 -ZNF684 -ZBTB45 -SNAI1 -ZNF384 -KLF2 -ZSCAN5A -KLF4 -ZSCAN9 -ZIC5 -ZNF787 -OSR1 -ZNF660 -ZNF385D -ZSCAN1 -KLF10 -ZNF276 -ZNF281 -KLF15 -ZNF12 -ZNF704 -OSR2 -ZNF23 -ZNF444 -ZNF597 -ZBTB43 -ZNF32 -ZNF296 -ZBTB26 -KLF17 -OVOL1 -ZNF449 -HIC1 -ZBTB33 -ZNF454 -ZFP42 -ZNF771 -ZBTB2 -ZFP41 -ZBTB20 -ZFP1 -ZBTB37 -SNAI3 -ZNF501 -ZNF396 -ZSCAN23 -ZNF177 -ZNF250 -ZNF140 -ZNF460 -ZBTB14 -ZBTB12 -ZNF580 -SP9 -ZSCAN31 -ZBTB22 -ZNF345 -MBNL2 -YBX1 -LIN28B -DMRTC2 -DMRTA1 -ETV7 -ELF2 -ETS2 -FOXA2 -FOXA1 -FOXQ1 -FOXA3 -FOXE1 -FOXL2 -FOXR2 -GATA6 -GATA2 -TFCP2L1 -UBP1 -HOXA11 -ISL1 -HOXC8 -BARX2 -LHX5 -SIX4 -HOXA5 -HOXA6 -HOXB6 -NKX3-2 -NANOG -NKX2-3 -HOXB8 -HOXB1 -LHX4 -HOXA7 -BARHL1 -SIX1 -HOXD1 -HOXD3 -HOXD9 -HOXD10 -CDX4 -RHOXF2 -SIX3 -NKX6-2 -LHX8 -TLX3 -NKX6-3 -NKX3-1 -HOXD4 -IRX1 -SIX2 -HOXB9 -TGIF2LY -IRX3 -HOXC9 -HOXB4 -ARGFX -HOXA4 -HOXA10 -LHX1 -POU5F1 -HSFY1 -HSF5 -IRF6 -PPARD -NR5A2 -NR2C1 -NR4A1 -NR1D1 -NR5A1 -RORC -NR6A1 -NR1D2 -RORB -PAX8 -RFX1 -RFX7 -SKOR2 -SMAD5 -NFIC -SOX30 -TCF7 -BBX -SOX3 -SOX12 -TBX18 -TBX3 -TBX6 -TBXT -TEAD2 -XPA -SKOR1 -FOSL2 -ZKSCAN1 -ZFP14 -ZNF415 -ZNF135 -ZFP82 -ZKSCAN7 -ZNF777 -ZNF682 -FOXP2 -SOX6 -SOX5 -SOX17 -PLAG1 -ZKSCAN2 -ZNF582 -ZNF506 -ZNF324 -ZNF671 -ZNF264 -ZNF302 -ZNF184 -ZNF419 -ZNF85 -ZNF430 -ZNF549 -ZNF211 -ZNF205 -ZNF45 -ZNF133 -ZNF484 -ZNF557 -ZNF337 -ZNF317 -ZNF331 -ZNF141 -ZNF304 -ZNF132 -ZNF189 -ZNF287 -ZIM3 -ZNF614 -ZNF300 -RBAK -ZNF157 -ZNF182 -ZNF7 -ZNF214 -ZNF547 -ZNF776 -ZNF18 -ZNF19 -ZNF222 -ZNF235 -ZNF714 -ZNF333 -ZNF382 -ZNF496 -PRDM9 -ZNF202 -ZNF3 -ZNF180 -ZNF641 -ZNF610 -ZNF528 -ZNF701 -ZNF283 -ZNF558 -ZNF30 -ZNF354A -ZNF764 -ZNF778 -ZNF212 -ZNF439 -ZNF440 -ZNF562 -ZNF561 -ZNF584 -ZIK1 -ZNF540 -ZNF570 -ZNF621 -ZNF680 -ZNF483 -ZNF417 -ZNF791 -ZNF266 -ZNF519 -ZNF25 -ZNF77 -ZNF169 -ZNF613 -ZNF620 -ZNF619 -ZNF114 -ZNF543 -ZNF354B -ZNF223 -ZNF552 -ZNF154 -ZNF816 -ZNF571 -ZNF443 -ZNF792 -ZNF707 -ZNF875 -ZNF101 -ZNF716 -ZNF708 -ZNF662 -ZNF320 -ZNF530 -ZNF730 -ZNF93 -ZFP90 -ZNF479 -ZNF445 -ZNF74 -ZNF267 -ZNF566 -ZNF529 -ZNF284 -ZNF749 -ZNF17 -ZNF555 -ZNF75D -ZNF197 -ZFP69B -ZFP69 -ZNF626 -ZNF793 -ZNF383 -ZNF669 -ZNF548 -ZNF567 -ZNF573 -ZNF527 -ZNF33A -ZNF79 -ZNF681 -ZNF766 -ZNF565 -ZNF765 -ZNF124 -ZNF605 -ZNF799 -ZNF782 -ZNF846 -ZNF136 -ZKSCAN5 -ZNF33B -ZNF431 -ZNF418 -ZNF585A -ZNF429 -ZNF100 -ZNF398 -ZNF441 -ZNF257 -ZNF785 -ZNF786 -ZNF675 -ZNF860 -ZNF695 -ZNF615 -ZNF433 -ZNF81 -ZNF780A -ZNF181 -ZNF44 -ZNF790 -ZNF823 -ZNF311 -ZNF273 -ZNF84 -ZNF667 -ZNF649 -ZNF248 -ZNF334 -ZNF485 -ZNF442 -ZNF26 -ZNF69 -ZNF480 -ZNF587 -ZNF808 -ZNF28 -ZNF627 -ZNF789 -ZNF534 -ZNF525 -ZNF805 -ZNF468 -ZNF616 -ZFP57 -ZNF783 -ZNF425 -ZNF611 -ZNF254 -ZNF90 -ZNF891 -ZNF705G -ZNF880 -ZNF492 -ZNF879 -ZNF736 -ZNF737 -ZNF324B -ZNF564 -ZNF674 -ZNF550 -ZNF432 -ZNF10 -ZNF486 -ZNF225 -ZNF285 -ZNF224 -ZIM2 -ZNF2 -ZNF8 -ZNF487 -MXI1 -MYC -ZEB1 -REST -CTCFL -E2F6 -PBX3 -STAT1 -STAT3 -STAT2 -THAP1 -TP73 -HIF1A -TWIST1 -MITF -KLF9 -ZNF24 -NFYA -TFDP1 -FOXK2 -FOXH1 -GRHL2 -PBX2 -DUX4 -IRF1 -MYB -ESR2 -HNF4G -NR2F2 -RELB -SOX13 -TCF7L2 -NFYB -BACH1 -SIX5 -TBP -ZNF416 -ZNF574 -ZNF41 -ZNF653 -ZNF35 -ZNF16 -ZNF692 -ZFP3 -ZNF322 -ZNF467 -ZSCAN22 -ZNF71 -ZFP64 -PRDM6 -ZNF37A -ZNF586 -MYNN -ZNF213 -PATZ1 -MAZ -ZNF175 -KLF7 -GTF3A -ZNF436 -FEZF1 -ZNF341 -ZNF394 -IKZF3 -ZNF513 -ZNF22 -ZNF146 -ZNF280A -ZNF768 -ZNF554 -ZNF596 -ZBTB42 -ZNF594 -ZNF329 -ZBTB6 -ZSCAN30 -ZNF490 -ZNF563 -ZNF34 -ZNF774 -ZNF502 -ZFP28 -ZNF98 -ZNF677 -ZNF121 -ZNF770 -ZSCAN5C -ZBTB48 -ZNF134 -GLI4 -ZNF260 -ZNF350 -ZNF595 -INSM1 -ARID5B -LYL1 -AHR -EPAS1 -ARNT -TAL1 -NFE2L2 -ATF1 -ZFX -MECOM -SALL4 -KLF8 -ZBTB17 -PRDM14 -IKZF1 -ZNF335 -E2F5 -FOXM1 -LHX3 -NKX2-1 -NKX6-1 -MBD2 -MECP2 -NR1H3 -PPARA -TP53 -RUNX1 -AIRE -SMAD4 -STAT5A -STAT4 -STAT6 -STAT5B -THAP11 -NFYC -ZNF711 -ARID3A -HMGA1 -HMGA2 -MYF5 -NFE2L3 -ATF5 -DDIT3 -ZEB2 -HIVEP2 -IKZF2 -ZBTB11 -ZNF423 -ZBTB16 -ZNF541 -GZF1 -ZSCAN10 -PRDM12 -ZNF236 -PRDM15 -PRDM16 -ZNF761 -ZNF148 -ZNF589 -ZNF219 -SALL2 -E4F1 -SP7 -ZNF581 -ZNF217 -ZFP92 -ZSCAN26 -ZNF628 -ZNF521 -SP5 -ZNF316 -ZNF705E -ZNF727 -ZNF735 -ZNF883 -ZNF718 -ZNF658 -SATB1 -CXXC1 -EBF4 -EBF3 -EBF2 -FOXF1 -FOXN1 -FOXJ1 -FOXD4L4 -TRPS1 -GTF2IRD1 -GTF2I -HOXA3 -NKX2-2 -SETDB1 -MTERF1 -CDC5L -SMAD9 -SMAD1 -HBP1 -SOX11 -TBX22 -LTF -DNTTIP1 -POU2AF1 -CEBPZ -GTF2B -CARF -SPZ1 -NR0B1 -BPTF -PURA -TOPORS -NFE4 -ADNP -CHAMP1 -DACH1 -DRAP1 -GATAD1 -GATAD2A -HHEX -HMG20A -HMG20B -HMGXB4 -IKZF5 -INSM2 -KAT7 -KMT2B -MBD1 -MXD3 -MXD4 -NCOA1 -NCOA3 -NFXL1 -PHF20 -PRDM10 -SKI -ZBED5 -ZBTB10 -ZBTB21 -ZBTB25 -ZBTB40 -ZBTB8A -ZFP37 -ZFP91 -ZGPAT -ZKSCAN8 -ZNF239 -ZNF362 -ZNF366 -ZNF407 -ZNF426 -ZNF48 -ZNF507 -ZNF511 -ZNF512 -ZNF518A -ZNF577 -ZNF579 -ZNF585B -ZNF592 -ZNF600 -ZNF629 -ZNF639 -ZNF644 -ZNF652 -ZNF654 -ZNF664 -ZNF697 -ZNF781 -ZNF83 -ZNF843 -ZSCAN21 -ZXDB -AFF4 -ASCC1 -BAD -CBFA2T2 -CBFB -ZNF830 -CNOT6 -NELFB -DDX20 -ENO1 -FEZF2 -FHL2 -FOXP4 -GTF2H3 -GTF3C2 -GTF3C5 -HCFC2 -HCLS1 -HDAC8 -UBE2K -HTATIP2 -ID2 -KDM5A -LARP1 -CERS4 -MAGED4 -MAGEF1 -MYEF2 -NCALD -NME1 -NMRAL1 -NUCB1 -OTUD4 -PAXIP1 -PDCD11 -PDLIM5 -PHTF1 -PIR -PLAGL1 -PQBP1 -PURG -RAB18 -RAN -RBBP5 -RBFOX2 -RFXANK -SCAND2P -SCMH1 -SEMA4A -SF1 -SMAD2 -SNAPC4 -SNAPC5 -SND1 -SSBP3 -SSX2 -SSX3 -TAF1A -TAF9 -TBPL1 -TCEAL2 -TFAM -THAP5 -MED30 -TIMELESS -TRMT1 -TSC22D4 -TSNAX -TULP1 -VPS4B -YEATS4 -ZBTB4 -ZBTB46 -ZHX3 -ZNF131 -ZNF160 -ZNF207 -RNF114 -ZNF326 -ZNF385A -ZNF503 -ZNF510 -ZNF706 -TFAP2D -BRCA1 -CREB3L2 -FUBP1 -HAND1 -HLTF -HOXC6 -ID4 -NR1H2 -NR4A3 -SMARCA1 -SMARCA5 -SOX1 -TAF1 -TLX1 -HIVEP1 -ZNF165 -NF1 -BNC2 -ZBED2 -NKX2-4 -ARID5A -BCL3 -CHD1 -CHD2 -DBX2 -DMC1 -EP300 -EZH2 -GTF2F1 -HCFC1 -HLX -HOXC5 -IRX4 -IRX6 -MTA3 -NKX1-1 -NKX1-2 -NKX2-6 -OTP -PML -RAD21 -RCOR1 -SIN3A -SMARCC1 -SMARCC2 -SMC3 -SP100 -TBL1XR1 -WRNIP1 -ZBTB3 -ZNF691 -TRAF4 -CPSF4 -MYCLP1 -TCF15 -TAF6 -GABPB1 -ILF2 -SIRT6 -ING4 -CHURC1 -MXD1 -TAL2 -RFXAP -GTF2A2 -GTF2A1 -TFDP2 -RB1 -SMAD7 -SMAD6 -DEAF1 -ARNTL2 -TRIM28 -PARP1 -TERF1 -CNOT3 -DBX1 -BRF1 -BDP1 -POLR3A -EWSR1 -CTNNB1 -FOXN4 -BCLAF1 -CCNT2 -HDAC2 -OVOL3 -ZNF536 -ZBTB5 -ZNF688 -TBX10 -FOXD4L6 -FOXE3 -RLF -SP6 -ZNF746 -FOXD4L5 -FOXD4L3 -TBPL2 -ZNF687 -ZNF438 -ZNF516 -ZSCAN18 -PRDM13 -FOXD4L1 -SALL1 -ZBTB41 -ZBTB1 -ZSCAN5B -GTF2A1L -ZBTB8B -ZNF575 -ZNF280B -ZBTB34 -IKZF4 -AEBP2 -ZNF772 -ZSCAN25 -FIZ1 -ZNF215 -SALL3 -ZNF500 -ZFY -ZBTB24 -ZNF853 -ZSCAN20 -ZNF80 -ZNF20 -ZNF630 -ZNF699 -ZNF470 -ZNF57 -ZXDC -ZNF648 -ZNF544 -ZNF546 -ZNF517 -ZFP2 -ZNF572 -ZNF66 -ZNF689 -ZNF837 -ZNF710 -ZNF625 -ZNF491 -ZNF709 -ZNF526 -ZNF676 -ZNF556 -ZNF408 -ZNF700 -ZNF286A -ZNF471 -ZFP30 -ZNF230 -ZNF233 -ZNF275 -ZNF729 -ZSCAN32 -ZNF195 -ZNF814 -ZNF878 -ZNF726 -ZNF208 -ZNF732 -ZNF99 -ZNF253 -ZNF623 -ZNF14 -ZNF705D -ZNF43 -ZNF92 -ZNF117 -ZNF138 -ZNF91 -ZXDA -ZNF155 -ZNF234 -ZNF844 -ZNF763 -ZNF569 -ZNF404 -ZNF678 -ZNF829 -ZNF672 -ZNF568 -ZNF841 -ZNF813 -ZNF836 -ZNF705A -ZNF773 -ZNF551 -ZSCAN2 -ZNF227 -ZNF497 -ZNF493 -ZNF679 -ZNF683 -ZFP62 -ZNF721 -ZNF461 -ZNF397 -ZNF420 -ZNF578 -ZNF775 -ZNF845 -ZNF560 -ZNF606 -ZNF668 -ZKSCAN4 -ZNF514 -ZNF696 -ZNF607 -ZNF599 -ZNF559 -ZNF251 -ZNF583 -ZNF665 -ZNF670 -ZNF358 -ZNF319 -ZNF70 -ZNF226 -ZNF624 -PRDM5 -ZNF112 -ZNF780B -ZBTB47 -ZBTB39 -ZNF646 -ZNF835 -ZNF107 -ZNF391 -ZSCAN12 -ZFPM1 -PEG3 -ZBTB38 -ZNF367 -ZNF256 -HDAC1 -APEX1 -CTBP1 -BANP -CRTC2 -NONO -SFPQ -ABL1 -HELT -DIDO1 -HNRNPUL1 -DPF2 -NCOA2 -ILF3 -RHOXF2B -AHDC1 -HMGXB3 -LCOR -MLLT10 -SATB2 -GPBP1L1 -ZNF280D -ZNF142 -ZNF462 -ZNF576 -ATF7-NPFF -NANOGP8 -MBTPS2 -CIC -SETBP1 -FOXL3 -SEBOX -DMRTB1 -RFX6 -TAF1L -TWIST2 -FREM1 -ARID3B -RBPJL -CREBL2 -FOXB2 -FOXD4 -SP140 -CPHXL -AHCTF1 -DNAJC21 -MYPOP -PRDM11 -PHF21A -CCDC169-SOHLH2 -MLXIP -CREBZF -TERF2 -SP110 -NFX1 -ASH2L -METTL14 -VPS72 -CERS6 -CERS3 -CERS5 -CERS2 -PRDM7 -HIF3A -BNC1 -FANK1 -IL21 -ZNF622 -NPAS4 -ZBED6 -TMEM33 -ACAA1 -ZNF800 -ADNP2 -ZNF414 -ZFP91-CNTF -ZNF587B -ZNF451 -ZNF532 -LDB1 -LMO2 -YOD1 -METTL3 -A1CF -ABCF2 -ACO1 -ADARB1 -AGAP2 -AGGF1 -AGMAT -AHRR -AKR1A1 -ANXA1 -ANXA11 -APEX2 -ARFGAP1 -ARG1 -ARG2 -ARID3C -ASAP3 -ASPSCR1 -ATOH8 -AVEN -BAX -BOLL -BORCS8-MEF2B -BRF2 -C19orf25 -CANX -CAT -CBX7 -CCDC25 -CD59 -CDK2AP1 -CELF4 -CELF5 -CELF6 -CFL2 -CKMT1B -CLK1 -CNOT4 -CPTP -CSNK2B -CSTF2 -CYB5R1 -CYCS -DAB2 -DAZAP1 -DDX4 -DDX43 -DDX53 -DGCR8 -DHX36 -DIABLO -DIS3 -DMAP1 -DNMT3A -DR1 -DTL -DUS3L -DUSP22 -DUSP26 -ECSIT -EDN1 -EEF1D -EIF5A2 -ESRP1 -ESRP2 -ETFB -EXO5 -EXOSC3 -EZR -FAAP24 -FAM127B -FBXL19 -FEZ1 -FGF19 -FIP1L1 -FOXS1 -GADD45A -GAR1 -GIT2 -GLYCTK -GOT1 -GPAM -GPANK1 -GPD1 -GRHL3 -GRHPR -GTPBP1 -GTPBP6 -H1FX -H2AFY -H2AFZ -HADHB -HDAC3 -HES4 -HEYL -HHAT -HIRIP3 -HIST1H2BN -HIST2H2AB -HIST2H2BE -HIVEP3 -HKR1 -HLCS -HMGB1 -HMGB2 -HMGB3 -HMGB4 -HNRNPA0 -HNRNPA1 -HNRNPC -HNRNPH3 -HNRNPLL -HP1BP3 -HSPA1L -HSPA5 -HUNK -ID1 -IL24 -ING3 -IVD -JAZF1 -JRK -JRKL -KCNIP1 -KDM2A -KDM4A -KDM4B -KDM4C -KDM4D -KDM4E -KDM5D -KDM7A -KIAA0907 -KIF22 -KLF18 -KLRG1 -LARP4 -LAS1L -LIN28A -LRRFIP1 -LSM6 -LUZP1 -LUZP2 -MAGEA8 -MAGOH -MAP4K2 -MAPK1 -MCTP2 -MDM2 -MELK -METTL21B -MEX3C -MIEF1 -MIOS -MKX -MORN1 -MRPL1 -MRPL2 -MRPS25 -MSI1 -MSI2 -MSRA -MSRB3 -MTHFD1 -MYCL -MYLK -NAGS -NANOS1 -NAP1L1 -NCBP2 -NCOR1 -NCOR2 -NELFA -NEUROG3 -NMI -NNT -NOC2L -NPDC1 -NUP107 -NUP133 -NXPH3 -ODC1 -P4HB -PCK2 -PDE6H -PDS5A -PGAM2 -PHF12 -PHF2 -PHF8 -PHLDA2 -PICK1 -PIK3C3 -PKM -PLG -POLD2 -POLE3 -POLE4 -POLI -POLR2A -POLR3G -PPARGC1A -PPP1R10 -PPP2R3B -PPP5C -PRDX5 -PRKAA1 -PRKAA2 -PRNP -PROX2 -PSMA6 -PSMC2 -PSMD12 -PTCD1 -PTPMT1 -PUM3 -R3HDM2 -RAB14 -RAB2A -RAB7A -RBBP9 -RBM17 -RBM22 -RBM3 -RBM42 -RBM7 -RBM8A -RBMS1 -RFC2 -RFC3 -RFX8 -RIOK2 -RNASEH2C -RNF138 -RPL35 -RPL6 -RPP25 -RPS10 -RPS4X -RPS6KA5 -RUFY3 -RUVBL1 -SCAND1 -SCX -SF3B1 -SFT2D1 -SIM1 -SIM2 -SLC18A1 -SMAP2 -SMPX -SMUG1 -SNRNP70 -SNRPB2 -SOCS4 -SOD1 -SPAG7 -SPATS2 -SPR -SRBD1 -SRP9 -SRRM3 -SSRP1 -STAU2 -STK40 -STUB1 -SUCLG1 -T -TAF7 -TAGLN2 -TCEAL6 -TCF23 -TCF24 -TFDP3 -TFF3 -THOC2 -TIA1 -TIGD2 -TIGD3 -TIGD4 -TIGD5 -TIGD6 -TIGD7 -TIMM44 -TIMM8A -TMSB4XP8 -TOB2 -TPI1 -TPPP -TRIB1 -TRIB2 -TRIB3 -TRIM21 -TRIM69 -TRIP10 -TRMO -TROVE2 -TSN -U2AF1 -UBB -UBE2V1 -UBTF -UBXN1 -UGP2 -UQCRB -USP39 -UTP18 -VAMP3 -WDR83 -WISP2 -XG -XRCC1 -YWHAE -YWHAZ -ZC3H7A -ZCCHC14 -ZCCHC17 -ZDHHC15 -ZDHHC24 -ZDHHC5 -ZHX2 -ZMAT2 -ZMAT4 -ZNF286B -ZNF355P -ZNF542P -ZNF598 -ZNF658B -ZNF702P -ZNF705CP -ZNF717 -ZNF720 -ZNF788 -ZNF806 -ZNF826P -ZNF827 -ZNF831 -ZRSR2 -ZSWIM1 diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index ba46c1230..da17eb566 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -51,14 +51,9 @@ functionality: entrypoint: run_wf - type: file path: "../../api/task_info.yaml" - dependencies: - - name: common/extract_metadata - repository: openproblemsv2 # dependencies: - # # - name: common/extract_metadata - # # repository: openproblemsv2 - # # - name: control_methods/positive_control - # # - name: control_methods/negative_control + # - name: common/extract_metadata + # repository: openproblemsv2 # - name: methods/scglue # - name: metrics/regression_1 # repositories: