Merge pull request #233 from iomega/update_matchms_version

Update matchms version
iomega · Jan 19, 2024 · a3ed83d · a3ed83d
2 parents 66c4af2 + 8febebe
commit a3ed83d
Show file tree

Hide file tree

Showing 30 changed files with 157 additions and 396 deletions.
diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml
@@ -46,7 +46,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
-        python-version: ['3.7', '3.8']
+        python-version: ['3.8', '3.9']
         exclude:
           # already tested in first_check job
           - python-version: 3.8

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## unpublished
+## 1.3.0
+### Changed
+- New models have to be downloaded, since this version is not compatible with the older models! Embeddings have to be stored as parquet.
+- Embeddings are now stored as parquet instead of pickle
+- Made MS2Query compatible with matchms 0.24.0
+## 1.2.4
 ### Added
 - environment.yml and CI_build test for building a conda env from this file
 ### Fixed

diff --git a/environment.yml b/environment.yml
@@ -4,228 +4,19 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_gnu
-  - abseil-cpp=20210324.2=h9c3ff4c_0
-  - absl-py=1.4.0=pyhd8ed1ab_0
-  - aiohttp=3.8.5=py38h01eb140_0
-  - aiosignal=1.3.1=pyhd8ed1ab_0
-  - astunparse=1.6.3=pyhd8ed1ab_0
-  - async-timeout=4.0.3=pyhd8ed1ab_0
-  - attrs=23.1.0=pyh71513ae_1
-  - blinker=1.6.2=pyhd8ed1ab_0
-  - boost=1.78.0=py38h4e30db6_4
-  - boost-cpp=1.78.0=h5adbc97_2
-  - brotli=1.0.9=h166bdaf_9
-  - brotli-bin=1.0.9=h166bdaf_9
-  - brotli-python=1.0.9=py38hfa26641_9
-  - bzip2=1.0.8=h7f98852_4
-  - c-ares=1.19.1=hd590300_0
-  - ca-certificates=2023.7.22=hbcca054_0
-  - cached-property=1.5.2=hd8ed1ab_1
-  - cached_property=1.5.2=pyha770c72_1
-  - cachetools=5.3.1=pyhd8ed1ab_0
-  - cairo=1.16.0=ha61ee94_1014
-  - certifi=2023.7.22=pyhd8ed1ab_0
-  - cffi=1.15.1=py38h4a40e3a_3
-  - charset-normalizer=3.2.0=pyhd8ed1ab_0
-  - click=8.1.7=unix_pyh707e725_0
-  - colorama=0.4.6=pyhd8ed1ab_0
-  - coloredlogs=15.0.1=pyhd8ed1ab_3
-  - contourpy=1.1.0=py38h7f3f72f_0
-  - cryptography=39.0.0=py38h1724139_0
-  - cycler=0.11.0=pyhd8ed1ab_0
-  - deprecated=1.2.14=pyh1a96a4e_0
-  - exceptiongroup=1.1.3=pyhd8ed1ab_0
-  - expat=2.5.0=hcb278e6_1
-  - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
-  - font-ttf-inconsolata=3.000=h77eed37_0
-  - font-ttf-source-code-pro=2.038=h77eed37_0
-  - font-ttf-ubuntu=0.83=hab24e00_0
-  - fontconfig=2.14.2=h14ed4e7_0
-  - fonts-conda-ecosystem=1=0
-  - fonts-conda-forge=1=0
-  - fonttools=4.42.1=py38h01eb140_0
-  - freetype=2.12.1=hca18f0e_1
-  - freetype-py=2.3.0=pyhd8ed1ab_0
-  - frozenlist=1.4.0=py38h01eb140_0
-  - fst-pso=1.8.1=pyhd8ed1ab_0
-  - fuzzytm=2.0.5=pyhd8ed1ab_0
-  - gast=0.5.4=pyhd8ed1ab_0
-  - gensim=4.3.2=py38h53bb729_0
-  - gettext=0.21.1=h27087fc_0
-  - giflib=5.2.1=h0b41bf4_3
-  - gmp=6.2.1=h58526e2_0
-  - gmpy2=2.1.2=py38h793c122_1
-  - google-auth=2.17.3=pyh1a96a4e_0
-  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
-  - google-pasta=0.2.0=pyh8c360ce_0
-  - greenlet=2.0.2=py38h17151c0_1
-  - grpc-cpp=1.45.2=h9d3bbbb_5
-  - grpcio=1.45.0=py38ha0cdfde_0
-  - h5py=3.9.0=nompi_py38h89e2d6c_100
-  - hdf5=1.14.0=nompi_h5231ba7_103
-  - humanfriendly=10.0=py38h578d9bd_4
-  - icu=70.1=h27087fc_0
-  - idna=3.4=pyhd8ed1ab_0
-  - importlib-metadata=6.8.0=pyha770c72_0
-  - importlib-resources=6.0.1=pyhd8ed1ab_0
-  - importlib_resources=6.0.1=pyhd8ed1ab_0
-  - iniconfig=2.0.0=pyhd8ed1ab_0
-  - joblib=1.3.2=pyhd8ed1ab_0
-  - jpeg=9e=h0b41bf4_3
-  - keras=2.8.0=pyhd8ed1ab_0
-  - keras-preprocessing=1.1.2=pyhd8ed1ab_0
-  - keyutils=1.6.1=h166bdaf_0
-  - kiwisolver=1.4.5=py38h7f3f72f_0
-  - krb5=1.20.1=hf9c8cef_0
-  - lcms2=2.15=hfd0df8a_0
-  - ld_impl_linux-64=2.40=h41732ed_0
-  - lerc=4.0.0=h27087fc_0
-  - libaec=1.0.6=hcb278e6_1
-  - libblas=3.9.0=17_linux64_openblas
-  - libbrotlicommon=1.0.9=h166bdaf_9
-  - libbrotlidec=1.0.9=h166bdaf_9
-  - libbrotlienc=1.0.9=h166bdaf_9
-  - libcblas=3.9.0=17_linux64_openblas
-  - libcurl=7.87.0=h6312ad2_0
-  - libdeflate=1.17=h0b41bf4_0
-  - libedit=3.1.20191231=he28a2e2_2
-  - libev=4.33=h516909a_1
-  - libexpat=2.5.0=hcb278e6_1
-  - libffi=3.4.2=h7f98852_5
-  - libgcc-ng=13.1.0=he5830b7_0
-  - libgfortran-ng=13.1.0=h69a702a_0
-  - libgfortran5=13.1.0=h15d22d2_0
-  - libglib=2.76.4=hebfc3b9_0
-  - libgomp=13.1.0=he5830b7_0
-  - libiconv=1.17=h166bdaf_0
-  - liblapack=3.9.0=17_linux64_openblas
-  - libllvm14=14.0.6=hcd5def8_4
-  - libnghttp2=1.51.0=hdcd2b5c_0
-  - libnsl=2.0.0=h7f98852_0
-  - libopenblas=0.3.23=pthreads_h80387f5_0
-  - libpng=1.6.39=h753d276_0
-  - libprotobuf=3.20.3=h3eb15da_0
-  - libsqlite=3.43.0=h2797004_0
-  - libssh2=1.10.0=haa6b8db_3
-  - libstdcxx-ng=13.1.0=hfd8a6a1_0
-  - libtiff=4.5.0=h6adf6a1_2
-  - libuuid=2.38.1=h0b41bf4_0
-  - libwebp-base=1.3.1=hd590300_0
-  - libxcb=1.13=h7f98852_1004
-  - libxml2=2.10.3=hca2bb57_4
-  - libxslt=1.1.37=h873f0b0_0
-  - libzlib=1.2.13=hd590300_5
-  - llvmlite=0.40.1=py38h94a1851_0
-  - lxml=4.9.2=py38h215a2d7_0
-  - markdown=3.4.4=pyhd8ed1ab_0
-  - markupsafe=2.1.3=py38h01eb140_0
-  - matchms=0.17.0=pyh7cba7a3_0
-  - matchmsextras=0.4.1=pyhdfd78af_0
-  - matplotlib-base=3.7.2=py38hf5b0b65_0
-  - miniful=0.0.6=pyhd8ed1ab_0
-  - mpc=1.3.1=hfe3b2da_0
-  - mpfr=4.2.0=hb012696_0
-  - mpmath=1.3.0=pyhd8ed1ab_0
-  - ms2deepscore=0.4.0=pyhdfd78af_0
-  - ms2query=1.2.2=pyhdfd78af_0
-  - multidict=6.0.4=py38h1de0b5d_0
-  - munkres=1.1.4=pyh9f0ad1d_0
-  - ncurses=6.4=hcb278e6_0
-  - networkx=3.1=pyhd8ed1ab_0
-  - numba=0.57.1=py38hd559b08_0
-  - numpy=1.24.4=py38h59b608b_0
-  - oauthlib=3.2.2=pyhd8ed1ab_0
-  - onnx=1.12.0=py38h8d49f1e_0
-  - onnxconverter-common=1.13.0=pyhd8ed1ab_0
-  - onnxruntime=1.15.1=py38h896e202_2_cpu
-  - openjpeg=2.5.0=hfec8fc6_2
-  - openssl=1.1.1v=hd590300_0
-  - opt_einsum=3.3.0=pyhd8ed1ab_1
-  - packaging=23.1=pyhd8ed1ab_0
-  - pandas=1.5.3=py38hdc8b05c_1
-  - pcre2=10.40=hc3806b6_0
-  - pickydict=0.4.0=pyhd8ed1ab_0
-  - pillow=9.4.0=py38hde6dc18_1
-  - pip=23.2.1=pyhd8ed1ab_0
-  - pixman=0.40.0=h36c2ea0_0
-  - platformdirs=3.10.0=pyhd8ed1ab_0
-  - pluggy=1.3.0=pyhd8ed1ab_0
-  - pooch=1.7.0=pyha770c72_3
-  - protobuf=3.20.3=py38h8dc9893_1
-  - pthread-stubs=0.4=h36c2ea0_1001
-  - pubchempy=1.0.4=py_0
-  - pyasn1=0.4.8=py_0
-  - pyasn1-modules=0.2.7=py_0
-  - pycairo=1.24.0=py38h1a1917b_0
-  - pycparser=2.21=pyhd8ed1ab_0
-  - pyfume=0.2.25=pyhd8ed1ab_0
-  - pyjwt=2.8.0=pyhd8ed1ab_0
-  - pyopenssl=23.2.0=pyhd8ed1ab_1
-  - pyparsing=3.0.9=pyhd8ed1ab_0
-  - pysocks=1.7.1=pyha2e5f31_6
-  - pyteomics=4.6=pyh7cba7a3_0
-  - pytest=7.4.0=pyhd8ed1ab_0
-  - python=3.8.15=h257c98d_0_cpython
-  - python-dateutil=2.8.2=pyhd8ed1ab_0
-  - python-flatbuffers=23.5.26=pyhd8ed1ab_0
-  - python-louvain=0.16=pyhd8ed1ab_0
-  - python_abi=3.8=3_cp38
-  - pytz=2023.3=pyhd8ed1ab_0
-  - pyu2f=0.1.5=pyhd8ed1ab_0
-  - rdkit=2023.03.3=py38h36d2b2f_0
-  - re2=2022.06.01=h27087fc_1
-  - readline=8.2=h8228510_1
-  - reportlab=4.0.4=py38h01eb140_0
-  - requests=2.31.0=pyhd8ed1ab_0
-  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
-  - rlpycairo=0.2.0=pyhd8ed1ab_0
-  - rsa=4.9=pyhd8ed1ab_0
-  - scikit-learn=1.3.0=py38hc099248_0
-  - scipy=1.10.1=py38h59b608b_3
-  - setuptools=68.1.2=pyhd8ed1ab_0
-  - simpful=2.11.0=pyhd8ed1ab_0
-  - six=1.16.0=pyh6c4a22f_0
-  - skl2onnx=1.15.0=pyhd8ed1ab_0
-  - smart_open=6.3.0=pyhd8ed1ab_1
-  - snappy=1.1.10=h9fff704_0
-  - spec2vec=0.8.0=pyhdfd78af_0
-  - sqlalchemy=2.0.20=py38h01eb140_0
-  - sqlite=3.43.0=h2c6b66d_0
-  - sympy=1.12=pypyh9d50eac_103
-  - tensorboard=2.8.0=pyhd8ed1ab_1
-  - tensorboard-data-server=0.6.1=py38h2b5fc30_4
-  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
-  - tensorflow=2.8.1=cpu_py38h66f0ec1_0
-  - tensorflow-base=2.8.1=cpu_py38hc7a75a0_0
-  - tensorflow-estimator=2.8.1=cpu_py38h4e23bc6_0
-  - termcolor=2.3.0=pyhd8ed1ab_0
-  - threadpoolctl=3.2.0=pyha21a80b_0
-  - tk=8.6.12=h27826a3_0
-  - tomli=2.0.1=pyhd8ed1ab_0
-  - tqdm=4.66.1=pyhd8ed1ab_0
-  - typing-extensions=4.7.1=hd8ed1ab_0
-  - typing_extensions=4.7.1=pyha770c72_0
-  - unicodedata2=15.0.0=py38h0a891b7_0
-  - urllib3=2.0.4=pyhd8ed1ab_0
-  - werkzeug=2.3.7=pyhd8ed1ab_0
-  - wheel=0.41.2=pyhd8ed1ab_0
-  - wrapt=1.15.0=py38h1de0b5d_0
-  - xorg-kbproto=1.0.7=h7f98852_1002
-  - xorg-libice=1.1.1=hd590300_0
-  - xorg-libsm=1.2.4=h7391055_0
-  - xorg-libx11=1.8.4=h0b41bf4_0
-  - xorg-libxau=1.0.11=hd590300_0
-  - xorg-libxdmcp=1.1.3=h7f98852_0
-  - xorg-libxext=1.3.4=h0b41bf4_2
-  - xorg-libxrender=0.9.10=h7f98852_1003
-  - xorg-renderproto=0.11.1=h7f98852_1002
-  - xorg-xextproto=7.3.0=h0b41bf4_1003
-  - xorg-xproto=7.0.31=h7f98852_1007
-  - xz=5.2.6=h166bdaf_0
-  - yarl=1.9.2=py38h01eb140_0
-  - zip
-  - zipp=3.16.2=pyhd8ed1ab_0
-  - zlib=1.2.13=hd590300_5
-  - zstd=1.5.5=hfc55251_0
+  - python=3.8.18
+  - matchms=0.24.1
+  - numpy=1.24.4
+  - spec2vec=0.8.0
+  - h5py=3.9.0
+  - pyarrow=12.0.1
+  - tensorflow=2.12.1
+  - scikit-learn=1.3.2
+  - ms2deepscore=0.4.0
+  - pandas=2.0.3
+  - matplotlib=3.7.3
+  - skl2onnx=1.16.0
+  - onnxruntime=1.16.3
+  - pytest=7.4.0
+  - pytest-cov=4.1.0
+  - zip
diff --git a/ms2query/__version__.py b/ms2query/__version__.py
@@ -1 +1 @@
-__version__ = '1.2.4'
+__version__ = '1.3.0'
diff --git a/ms2query/benchmarking/collect_test_data_results.py b/ms2query/benchmarking/collect_test_data_results.py
@@ -157,17 +157,22 @@ def get_modified_cosine_score_results(lib_spectra,
         else:
             selected_lib_spectra = lib_spectra
         if len(selected_lib_spectra) != 0:
-            scores_list = calculate_scores(selected_lib_spectra,
-                                           [test_spectrum], ModifiedCosine()).scores_by_query(test_spectrum)
-            # Scores list is a List[spectrum, (mod_cos, matching_peaks)
-            cosine_scores = [scores_tuple[1]["score"] for scores_tuple in scores_list]
-            highest_cosine_score = float(max(cosine_scores))
-            highest_scoring_spectrum = scores_list[cosine_scores.index(highest_cosine_score)][0]
+            scores = calculate_scores(references=selected_lib_spectra,
+                                      queries=[test_spectrum],
+                                      similarity_function=ModifiedCosine())
+            # Matchms can return the best matches for any query via scores_by_query
+            sorted_scores = scores.scores_by_query(test_spectrum, 'ModifiedCosine_score', sort=True)
+            # Scores are not stored if the cosine score is 0 (no overlapping peaks).
+            if len(sorted_scores) == 0:
+                highest_scoring_spectrum = random.choice(selected_lib_spectra)
+                highest_cosine_score = (0, 0)
+            else:
+                highest_scoring_spectrum, highest_cosine_score = sorted_scores[0]
 
             tanimoto_score = calculate_single_tanimoto_score(test_spectrum.get("smiles"),
                                                              highest_scoring_spectrum.get("smiles"))
             exact_match = highest_scoring_spectrum.get("inchikey")[:14] == test_spectrum.get("inchikey")[:14]
-            best_matches_for_test_spectra.append((highest_cosine_score, tanimoto_score, exact_match))
+            best_matches_for_test_spectra.append((highest_cosine_score[0], tanimoto_score, exact_match))
         else:
             best_matches_for_test_spectra.append(None)
     return best_matches_for_test_spectra
@@ -189,6 +194,7 @@ def get_cosines_score_results(lib_spectra,
             scores_list = calculate_scores(selected_lib_spectra,
                                            [test_spectrum],
                                            CosineGreedy(tolerance=fragment_mass_tolerance)).scores_by_query(test_spectrum)
+            # TODO: This was built with old matchms in mind; this version will not find cosine scores = 0
             cosine_scores = [scores_tuple[1].item()[0] for scores_tuple in scores_list if scores_tuple[1].item()[1] >= minimum_matched_peaks]
             if len(cosine_scores) != 0:
                 highest_cosine_score = max(cosine_scores)

diff --git a/ms2query/benchmarking/create_accuracy_vs_recall_plot.py b/ms2query/benchmarking/create_accuracy_vs_recall_plot.py
@@ -8,7 +8,8 @@
 import numpy as np
 from matplotlib import pyplot as plt
 from tqdm import tqdm
-from ms2query.utils import load_json_file, load_pickled_file, save_pickled_file
+from ms2query.utils import (load_df_from_parquet_file, load_json_file,
+                            save_df_as_parquet_file)
 
 
 def plot_all_with_standard_deviation(means_and_standars_deviation,
@@ -231,10 +232,11 @@ def create_plot(exact_matches,
     if recalculate_means:
         dict_with_results = load_all_test_results(20, test_results_folder, exact_match=exact_matches)
         means_and_standard_deviation = calculate_all_means_and_standard_deviation(dict_with_results, exact_matches=exact_matches)
-        save_pickled_file(means_and_standard_deviation,
-                          os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
+        save_df_as_parquet_file(means_and_standard_deviation, os.path.join(test_results_folder,
+                                                                           f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
     else:
-        means_and_standard_deviation = load_pickled_file(os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
+        means_and_standard_deviation = load_df_from_parquet_file(
+            os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
 
     if exact_matches:
         optimal_results = means_and_standard_deviation["Optimal"]