From 8b6e4f57dd9bc58f19952576dc7c3ed16ec80815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 7 Aug 2024 19:10:35 +0100 Subject: [PATCH] Benchmark for MDA 2.4.3 (#5) * switch to chembl33 and update env * formatting with black * standardization code modifies mol inplace * benchmark results * simplify conda env * add scaffold analysis * use rdkit.js to generate images on hover * add link to scaffold network viz in readme * cleanup --- Makefile | 12 +- README.md | 15 +- environment.yaml | 173 +-- results/badge.json | 2 +- results/failed_molecules.html | 277 ++-- results/failed_molecules.smi | 2446 +++++++++++++++++++++--------- results/failed_scaffolds.html | 585 +++++++ results/failed_scaffolds.smi | 2069 +++++++++++++++++++++++++ results/results.json | 17 +- results/scaffold_network.html | 70 + results/scaffold_networkx.pkl.xz | Bin 0 -> 321140 bytes scripts/benchmark.py | 96 +- scripts/fetch_chembl.sh | 4 +- scripts/process_molecules.py | 31 +- scripts/report.py | 130 +- scripts/scaffold_analysis.py | 353 +++++ scripts/utils.py | 36 +- 17 files changed, 5138 insertions(+), 1178 deletions(-) create mode 100644 results/failed_scaffolds.html create mode 100644 results/failed_scaffolds.smi create mode 100644 results/scaffold_network.html create mode 100644 results/scaffold_networkx.pkl.xz create mode 100644 scripts/scaffold_analysis.py diff --git a/Makefile b/Makefile index 6750f30..7ed0e1a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,5 @@ # MDAnalysis fork to use -GITHUB_USER ?= cbouy -# Branch of MDAnalysis to install -BRANCH ?= fix-converter +MDA_VERSION ?= 2.4.3 # Use conda or mamba CONDA ?= conda # Number of threads to use in parallel @@ -15,9 +13,9 @@ MAX_ATOMS ?= 50 SHELL := /bin/bash SET_CONDA_ENV := source $$(conda info --base)/etc/profile.d/conda.sh && conda activate && conda activate rdkitconverter -CHEMBL_SDF := ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_30/chembl_30.sdf.gz +CHEMBL_SDF := ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_33/chembl_33.sdf.gz -fetch := data/chembl_30.sdf.gz +fetch := data/chembl_33.sdf.gz process := data/chembl_processed_unique.smi.gz benchmark := data/chembl_failed.smi report := results/failed_molecules.html @@ -30,7 +28,7 @@ help: @echo 'targets:' @echo ' help Show this help' @echo ' install Install dependencies' - @echo ' fetch Fetch ChEMBL 30' + @echo ' fetch Fetch ChEMBL 33' @echo ' process Filter, standardize and remove duplicate molecules' @echo ' benchmark Run the benchmark' @echo ' report Generate the report' @@ -42,7 +40,7 @@ help: install: $(CONDA) env create -f environment.yaml @$(SET_CONDA_ENV) - @pip install git+https://github.com/$(GITHUB_USER)/mdanalysis.git@$(BRANCH)#subdirectory=package + @$(CONDA) install 'mdanalysis==$(MDA_VERSION)' $(fetch): @export CHEMBL_SDF=$(CHEMBL_SDF) diff --git a/README.md b/README.md index 3c645d7..db5f735 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,19 @@ To cite this repository, please use the following DOI: | Description | Value | | --- | --- | -| **MDAnalysis version** | 2.2.0-dev0 | -| **Accuracy** | 99.14% | -| **Number of molecules fetched** | 2,136,187 | -| **Number of molecules processed** | 1,942,004 | -| **Number of molecules failed** | 16,615 | +| **MDAnalysis version** | 2.4.3 | +| **Accuracy** | 99.19% | +| **Number of molecules fetched** | 2,372,174 | +| **Number of molecules processed** | 2,166,327 | +| **Number of molecules failed** | 17,577 | Details on the benchmark can also be found [here](results/results.json). The **interactive list of molecules** currently failing can be accessed [here](https://raw.githack.com/MDAnalysis/RDKitConverter-benchmark/main/results/failed_molecules.html) (click on a molecule's image to zoom in). +Failing **scaffolds** can be accessed [here](https://raw.githack.com/MDAnalysis/RDKitConverter-benchmark/main/results/failed_scaffolds.html). The scaffold network used to +create this file can be viewed [here](https://raw.githack.com/MDAnalysis/RDKitConverter-benchmark/main/results/scaffold_network.html). + ## Instructions Running the benchmark requires conda (or mamba) on a Linux machine. @@ -53,7 +56,7 @@ The results are available in the `results/` directory: ## Methods -The benchmark will fetch ChEMBL 30 as an SDF file and process the molecules the following way: +The benchmark will fetch ChEMBL 33 as an SDF file and process the molecules the following way: - Discard molecules that could not be read or sanitized by RDKit - Keep only the largest fragment - Keep only molecules with 2 to 50 heavy atoms diff --git a/environment.yaml b/environment.yaml index 26dc2ce..688141d 100644 --- a/environment.yaml +++ b/environment.yaml @@ -1,165 +1,18 @@ name: rdkitconverter channels: + - uclcheminformatics - conda-forge - defaults dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu - - alsa-lib=1.2.3=h516909a_0 - - biopython=1.79=py39h3811e60_1 - - boost=1.74.0=py39h5472131_5 - - boost-cpp=1.74.0=h6cacc03_7 - - brotli=1.0.9=h7f98852_6 - - brotli-bin=1.0.9=h7f98852_6 - - brotlipy=0.7.0=py39h3811e60_1003 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.18.1=h7f98852_0 - - ca-certificates=2022.2.1=h06a4308_0 - - cairo=1.16.0=ha12eb4b_1010 - - certifi=2021.10.8=py39hf3d152e_1 - - cffi=1.15.0=py39h4bc2ebd_0 - - charset-normalizer=2.0.12=pyhd8ed1ab_0 - - colorama=0.4.4=pyh9f0ad1d_0 - - cryptography=36.0.0=py39h9ce1e76_0 - - cycler=0.11.0=pyhd8ed1ab_0 - - cython=0.29.28=py39he80948d_0 - - dbus=1.13.18=hb2f20db_0 - - easydict=1.9=py_0 - - expat=2.4.7=h27087fc_0 - - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - - font-ttf-inconsolata=3.000=h77eed37_0 - - font-ttf-source-code-pro=2.038=h77eed37_0 - - font-ttf-ubuntu=0.83=hab24e00_0 - - fontconfig=2.13.96=h8e229c2_2 - - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - fonttools=4.30.0=py39hb9d737c_0 - - freetype=2.11.0=h70c0345_0 - - gettext=0.21.0=hf68c758_0 - - gevent=21.12.0=py39h3811e60_0 - - giflib=5.2.1=h36c2ea0_2 - - glib=2.70.2=h780b84a_4 - - glib-tools=2.70.2=h780b84a_4 - - greenlet=1.1.2=py39he80948d_1 - - grequests=0.6.0=pyh9f0ad1d_0 - - griddataformats=0.7.0=pyhd8ed1ab_0 - - gst-plugins-base=1.18.5=hf529b03_3 - - gstreamer=1.18.5=h9f60fe5_3 - - icu=69.1=h9c3ff4c_0 - - idna=3.3=pyhd8ed1ab_0 - - jbig=2.1=h7f98852_2003 - - jinja2=3.0.3=pyhd8ed1ab_0 - - jpeg=9e=h7f98852_0 - - keyutils=1.6.1=h166bdaf_0 - - kiwisolver=1.4.0=py39hf939315_0 - - krb5=1.19.3=h3790be6_0 - - lcms2=2.12=hddcbb42_0 - - ld_impl_linux-64=2.36.1=hea4e1c9_2 - - lerc=3.0=h9c3ff4c_0 - - libblas=3.9.0=13_linux64_openblas - - libbrotlicommon=1.0.9=h7f98852_6 - - libbrotlidec=1.0.9=h7f98852_6 - - libbrotlienc=1.0.9=h7f98852_6 - - libcblas=3.9.0=13_linux64_openblas - - libclang=13.0.1=default_hc23dcda_0 - - libdeflate=1.10=h7f98852_0 - - libedit=3.1.20210910=h7f8727e_0 - - libev=4.33=h516909a_1 - - libevent=2.1.10=h9b69904_4 - - libffi=3.4.2=h7f98852_5 - - libgcc-ng=11.2.0=h1d223b6_14 - - libgfortran-ng=11.2.0=h69a702a_14 - - libgfortran5=11.2.0=h5c6108e_14 - - libglib=2.70.2=h174f98d_4 - - libgomp=11.2.0=h1d223b6_14 - - libiconv=1.16=h516909a_0 - - liblapack=3.9.0=13_linux64_openblas - - libllvm13=13.0.1=hf817b99_2 - - libnsl=2.0.0=h7f98852_0 - - libogg=1.3.5=h27cfd23_1 - - libopenblas=0.3.18=pthreads_h8fe5266_0 - - libopus=1.3.1=h7f98852_1 - - libpng=1.6.37=h21135ba_2 - - libpq=14.2=hd57d9b9_0 - - libstdcxx-ng=11.2.0=he4da1e4_14 - - libtiff=4.3.0=h542a066_3 - - libuuid=2.32.1=h7f98852_1000 - - libuv=1.43.0=h7f98852_0 - - libvorbis=1.3.7=h9c3ff4c_0 - - libwebp=1.2.2=h3452ae3_0 - - libwebp-base=1.2.2=h7f98852_1 - - libxcb=1.13=h7f98852_1004 - - libxkbcommon=1.0.3=he3ba5ed_0 - - libxml2=2.9.12=h885dcf4_1 - - libzlib=1.2.11=h36c2ea0_1013 - - lz4-c=1.9.3=h9c3ff4c_1 - - markupsafe=2.1.1=py39hb9d737c_0 - - matplotlib=3.5.1=py39hf3d152e_0 - - matplotlib-base=3.5.1=py39h2fa2bec_0 - - mmtf-python=1.1.2=py_0 - - mols2grid=0.2.1=pyhd8ed1ab_0 - - mrcfile=1.3.0=pyh44b312d_0 - - msgpack-python=1.0.3=py39h1a9c180_0 - - munkres=1.1.4=pyh9f0ad1d_0 - - mysql-common=8.0.28=ha770c72_0 - - mysql-libs=8.0.28=hfa10184_0 - - ncurses=6.3=h9c3ff4c_0 - - networkx=2.7.1=pyhd8ed1ab_0 - - nspr=4.33=h295c915_0 - - nss=3.74=hb5efdd6_0 - - numpy=1.22.3=py39h18676bf_0 - - openjpeg=2.4.0=hb52868f_1 - - openssl=1.1.1m=h7f8727e_0 - - packaging=21.3=pyhd8ed1ab_0 - - pandas=1.4.1=py39hde0f152_0 - - pcre=8.45=h9c3ff4c_0 - - pillow=9.0.1=py39hae2aec6_2 - - pip=22.0.4=pyhd8ed1ab_0 - - pixman=0.40.0=h36c2ea0_0 - - pthread-stubs=0.4=h36c2ea0_1001 - - pycairo=1.21.0=py39h0934665_0 - - pycparser=2.21=pyhd8ed1ab_0 - - pyopenssl=22.0.0=pyhd8ed1ab_0 - - pyparsing=3.0.7=pyhd8ed1ab_0 - - pyqt=5.12.3=py39hf3d152e_8 - - pyqt-impl=5.12.3=py39hde8b62d_8 - - pyqt5-sip=4.19.18=py39he80948d_8 - - pyqtchart=5.12=py39h0fcd23e_8 - - pyqtwebengine=5.12.1=py39h0fcd23e_8 - - pysocks=1.7.1=py39hf3d152e_4 - - python=3.9.10=h85951f9_2_cpython - - python-dateutil=2.8.2=pyhd8ed1ab_0 - - python_abi=3.9=2_cp39 - - pytz=2021.3=pyhd8ed1ab_0 - - qt=5.12.9=ha98a1a1_5 - - rdkit=2020.03.6=py39h54e287e_1 - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd8ed1ab_0 - - requests_cache=0.4.13=py_0 - - scipy=1.8.0=py39hee8e79c_1 - - setuptools=60.10.0=py39hf3d152e_0 - - six=1.16.0=pyh6c4a22f_0 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.12=h27826a3_0 - - tornado=6.1=py39h3811e60_2 - - tqdm=4.63.0=pyhd8ed1ab_0 - - tzdata=2022a=h191b570_0 - - unicodedata2=14.0.0=py39h3811e60_0 - - urllib3=1.26.9=pyhd8ed1ab_0 - - wheel=0.37.1=pyhd8ed1ab_0 - - xorg-kbproto=1.0.7=h7f98852_1002 - - xorg-libice=1.0.10=h7f98852_0 - - xorg-libsm=1.2.3=hd9c2040_1000 - - xorg-libx11=1.7.2=h7f98852_0 - - xorg-libxau=1.0.9=h7f98852_0 - - xorg-libxdmcp=1.1.3=h7f98852_0 - - xorg-libxext=1.3.4=h7f98852_1 - - xorg-libxrender=0.9.10=h7f98852_1003 - - xorg-renderproto=0.11.1=h7f98852_1002 - - xorg-xextproto=7.3.0=h7f98852_1002 - - xorg-xproto=7.0.31=h7f98852_1007 - - xz=5.2.5=h516909a_1 - - zlib=1.2.11=h36c2ea0_1013 - - zope.event=4.5.0=pyh9f0ad1d_0 - - zope.interface=5.4.0=py39h3811e60_1 - - zstd=1.5.2=ha95c52a_0 + - bokeh=3.2.0 + - mols2grid=1.1.1 + - networkx=3.1 + - pandas=2.0.2 + - python=3.10.11 + - rdkit=2022.09.1 + - scaffoldgraph=1.1.2 + - tqdm=4.65.0 + - numpy=1.25.0 + - pydot=1.4.2 + - pygraphviz=1.11 + - scipy=1.9.3 diff --git a/results/badge.json b/results/badge.json index f5ef049..1b04d6c 100644 --- a/results/badge.json +++ b/results/badge.json @@ -1 +1 @@ -{"schemaVersion": 1, "label": "accuracy", "message": "99.14%", "color": "success"} \ No newline at end of file +{"schemaVersion": 1, "label": "accuracy", "message": "99.19%", "color": "success"} \ No newline at end of file diff --git a/results/failed_molecules.html b/results/failed_molecules.html index 05167e7..98eb782 100644 --- a/results/failed_molecules.html +++ b/results/failed_molecules.html @@ -5,15 +5,15 @@ #mols2grid.gridcontainer { display: block; padding-left: 1em; - max-width: 820px; - width: 820px; + max-width: 1632px; + width: 1632px; } #mols2grid .cell { border: 1px solid #cccccc; text-align: center; vertical-align: top; - max-width: 160px; - width: 160px; + max-width: 200px; + width: 200px; font-family: 'DejaVu', sans-serif; font-size: 12pt; padding: 0; @@ -52,10 +52,17 @@ + + - - + @@ -72,6 +79,7 @@