Skip to content

Commit

Permalink
Merge pull request #233 from iomega/update_matchms_version
Browse files Browse the repository at this point in the history
Update matchms version
  • Loading branch information
niekdejonge authored Jan 19, 2024
2 parents 66c4af2 + 8febebe commit a3ed83d
Show file tree
Hide file tree
Showing 30 changed files with 157 additions and 396 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
python-version: ['3.7', '3.8']
python-version: ['3.8', '3.9']
exclude:
# already tested in first_check job
- python-version: 3.8
Expand Down
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## unpublished
## 1.3.0
### Changed
- New models have to be downloaded, since this version is not compatible with the older models! Embeddings have to be stored as parquet.
- Embeddings are now stored as parquet files instead of pickle files
- Made MS2Query compatible with matchms 0.24.0
## 1.2.4
### Added
- environment.yml and CI_build test for building a conda env from this file
### Fixed
Expand Down
241 changes: 16 additions & 225 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,228 +4,19 @@ channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- abseil-cpp=20210324.2=h9c3ff4c_0
- absl-py=1.4.0=pyhd8ed1ab_0
- aiohttp=3.8.5=py38h01eb140_0
- aiosignal=1.3.1=pyhd8ed1ab_0
- astunparse=1.6.3=pyhd8ed1ab_0
- async-timeout=4.0.3=pyhd8ed1ab_0
- attrs=23.1.0=pyh71513ae_1
- blinker=1.6.2=pyhd8ed1ab_0
- boost=1.78.0=py38h4e30db6_4
- boost-cpp=1.78.0=h5adbc97_2
- brotli=1.0.9=h166bdaf_9
- brotli-bin=1.0.9=h166bdaf_9
- brotli-python=1.0.9=py38hfa26641_9
- bzip2=1.0.8=h7f98852_4
- c-ares=1.19.1=hd590300_0
- ca-certificates=2023.7.22=hbcca054_0
- cached-property=1.5.2=hd8ed1ab_1
- cached_property=1.5.2=pyha770c72_1
- cachetools=5.3.1=pyhd8ed1ab_0
- cairo=1.16.0=ha61ee94_1014
- certifi=2023.7.22=pyhd8ed1ab_0
- cffi=1.15.1=py38h4a40e3a_3
- charset-normalizer=3.2.0=pyhd8ed1ab_0
- click=8.1.7=unix_pyh707e725_0
- colorama=0.4.6=pyhd8ed1ab_0
- coloredlogs=15.0.1=pyhd8ed1ab_3
- contourpy=1.1.0=py38h7f3f72f_0
- cryptography=39.0.0=py38h1724139_0
- cycler=0.11.0=pyhd8ed1ab_0
- deprecated=1.2.14=pyh1a96a4e_0
- exceptiongroup=1.1.3=pyhd8ed1ab_0
- expat=2.5.0=hcb278e6_1
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- font-ttf-inconsolata=3.000=h77eed37_0
- font-ttf-source-code-pro=2.038=h77eed37_0
- font-ttf-ubuntu=0.83=hab24e00_0
- fontconfig=2.14.2=h14ed4e7_0
- fonts-conda-ecosystem=1=0
- fonts-conda-forge=1=0
- fonttools=4.42.1=py38h01eb140_0
- freetype=2.12.1=hca18f0e_1
- freetype-py=2.3.0=pyhd8ed1ab_0
- frozenlist=1.4.0=py38h01eb140_0
- fst-pso=1.8.1=pyhd8ed1ab_0
- fuzzytm=2.0.5=pyhd8ed1ab_0
- gast=0.5.4=pyhd8ed1ab_0
- gensim=4.3.2=py38h53bb729_0
- gettext=0.21.1=h27087fc_0
- giflib=5.2.1=h0b41bf4_3
- gmp=6.2.1=h58526e2_0
- gmpy2=2.1.2=py38h793c122_1
- google-auth=2.17.3=pyh1a96a4e_0
- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
- google-pasta=0.2.0=pyh8c360ce_0
- greenlet=2.0.2=py38h17151c0_1
- grpc-cpp=1.45.2=h9d3bbbb_5
- grpcio=1.45.0=py38ha0cdfde_0
- h5py=3.9.0=nompi_py38h89e2d6c_100
- hdf5=1.14.0=nompi_h5231ba7_103
- humanfriendly=10.0=py38h578d9bd_4
- icu=70.1=h27087fc_0
- idna=3.4=pyhd8ed1ab_0
- importlib-metadata=6.8.0=pyha770c72_0
- importlib-resources=6.0.1=pyhd8ed1ab_0
- importlib_resources=6.0.1=pyhd8ed1ab_0
- iniconfig=2.0.0=pyhd8ed1ab_0
- joblib=1.3.2=pyhd8ed1ab_0
- jpeg=9e=h0b41bf4_3
- keras=2.8.0=pyhd8ed1ab_0
- keras-preprocessing=1.1.2=pyhd8ed1ab_0
- keyutils=1.6.1=h166bdaf_0
- kiwisolver=1.4.5=py38h7f3f72f_0
- krb5=1.20.1=hf9c8cef_0
- lcms2=2.15=hfd0df8a_0
- ld_impl_linux-64=2.40=h41732ed_0
- lerc=4.0.0=h27087fc_0
- libaec=1.0.6=hcb278e6_1
- libblas=3.9.0=17_linux64_openblas
- libbrotlicommon=1.0.9=h166bdaf_9
- libbrotlidec=1.0.9=h166bdaf_9
- libbrotlienc=1.0.9=h166bdaf_9
- libcblas=3.9.0=17_linux64_openblas
- libcurl=7.87.0=h6312ad2_0
- libdeflate=1.17=h0b41bf4_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libexpat=2.5.0=hcb278e6_1
- libffi=3.4.2=h7f98852_5
- libgcc-ng=13.1.0=he5830b7_0
- libgfortran-ng=13.1.0=h69a702a_0
- libgfortran5=13.1.0=h15d22d2_0
- libglib=2.76.4=hebfc3b9_0
- libgomp=13.1.0=he5830b7_0
- libiconv=1.17=h166bdaf_0
- liblapack=3.9.0=17_linux64_openblas
- libllvm14=14.0.6=hcd5def8_4
- libnghttp2=1.51.0=hdcd2b5c_0
- libnsl=2.0.0=h7f98852_0
- libopenblas=0.3.23=pthreads_h80387f5_0
- libpng=1.6.39=h753d276_0
- libprotobuf=3.20.3=h3eb15da_0
- libsqlite=3.43.0=h2797004_0
- libssh2=1.10.0=haa6b8db_3
- libstdcxx-ng=13.1.0=hfd8a6a1_0
- libtiff=4.5.0=h6adf6a1_2
- libuuid=2.38.1=h0b41bf4_0
- libwebp-base=1.3.1=hd590300_0
- libxcb=1.13=h7f98852_1004
- libxml2=2.10.3=hca2bb57_4
- libxslt=1.1.37=h873f0b0_0
- libzlib=1.2.13=hd590300_5
- llvmlite=0.40.1=py38h94a1851_0
- lxml=4.9.2=py38h215a2d7_0
- markdown=3.4.4=pyhd8ed1ab_0
- markupsafe=2.1.3=py38h01eb140_0
- matchms=0.17.0=pyh7cba7a3_0
- matchmsextras=0.4.1=pyhdfd78af_0
- matplotlib-base=3.7.2=py38hf5b0b65_0
- miniful=0.0.6=pyhd8ed1ab_0
- mpc=1.3.1=hfe3b2da_0
- mpfr=4.2.0=hb012696_0
- mpmath=1.3.0=pyhd8ed1ab_0
- ms2deepscore=0.4.0=pyhdfd78af_0
- ms2query=1.2.2=pyhdfd78af_0
- multidict=6.0.4=py38h1de0b5d_0
- munkres=1.1.4=pyh9f0ad1d_0
- ncurses=6.4=hcb278e6_0
- networkx=3.1=pyhd8ed1ab_0
- numba=0.57.1=py38hd559b08_0
- numpy=1.24.4=py38h59b608b_0
- oauthlib=3.2.2=pyhd8ed1ab_0
- onnx=1.12.0=py38h8d49f1e_0
- onnxconverter-common=1.13.0=pyhd8ed1ab_0
- onnxruntime=1.15.1=py38h896e202_2_cpu
- openjpeg=2.5.0=hfec8fc6_2
- openssl=1.1.1v=hd590300_0
- opt_einsum=3.3.0=pyhd8ed1ab_1
- packaging=23.1=pyhd8ed1ab_0
- pandas=1.5.3=py38hdc8b05c_1
- pcre2=10.40=hc3806b6_0
- pickydict=0.4.0=pyhd8ed1ab_0
- pillow=9.4.0=py38hde6dc18_1
- pip=23.2.1=pyhd8ed1ab_0
- pixman=0.40.0=h36c2ea0_0
- platformdirs=3.10.0=pyhd8ed1ab_0
- pluggy=1.3.0=pyhd8ed1ab_0
- pooch=1.7.0=pyha770c72_3
- protobuf=3.20.3=py38h8dc9893_1
- pthread-stubs=0.4=h36c2ea0_1001
- pubchempy=1.0.4=py_0
- pyasn1=0.4.8=py_0
- pyasn1-modules=0.2.7=py_0
- pycairo=1.24.0=py38h1a1917b_0
- pycparser=2.21=pyhd8ed1ab_0
- pyfume=0.2.25=pyhd8ed1ab_0
- pyjwt=2.8.0=pyhd8ed1ab_0
- pyopenssl=23.2.0=pyhd8ed1ab_1
- pyparsing=3.0.9=pyhd8ed1ab_0
- pysocks=1.7.1=pyha2e5f31_6
- pyteomics=4.6=pyh7cba7a3_0
- pytest=7.4.0=pyhd8ed1ab_0
- python=3.8.15=h257c98d_0_cpython
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python-flatbuffers=23.5.26=pyhd8ed1ab_0
- python-louvain=0.16=pyhd8ed1ab_0
- python_abi=3.8=3_cp38
- pytz=2023.3=pyhd8ed1ab_0
- pyu2f=0.1.5=pyhd8ed1ab_0
- rdkit=2023.03.3=py38h36d2b2f_0
- re2=2022.06.01=h27087fc_1
- readline=8.2=h8228510_1
- reportlab=4.0.4=py38h01eb140_0
- requests=2.31.0=pyhd8ed1ab_0
- requests-oauthlib=1.3.1=pyhd8ed1ab_0
- rlpycairo=0.2.0=pyhd8ed1ab_0
- rsa=4.9=pyhd8ed1ab_0
- scikit-learn=1.3.0=py38hc099248_0
- scipy=1.10.1=py38h59b608b_3
- setuptools=68.1.2=pyhd8ed1ab_0
- simpful=2.11.0=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- skl2onnx=1.15.0=pyhd8ed1ab_0
- smart_open=6.3.0=pyhd8ed1ab_1
- snappy=1.1.10=h9fff704_0
- spec2vec=0.8.0=pyhdfd78af_0
- sqlalchemy=2.0.20=py38h01eb140_0
- sqlite=3.43.0=h2c6b66d_0
- sympy=1.12=pypyh9d50eac_103
- tensorboard=2.8.0=pyhd8ed1ab_1
- tensorboard-data-server=0.6.1=py38h2b5fc30_4
- tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
- tensorflow=2.8.1=cpu_py38h66f0ec1_0
- tensorflow-base=2.8.1=cpu_py38hc7a75a0_0
- tensorflow-estimator=2.8.1=cpu_py38h4e23bc6_0
- termcolor=2.3.0=pyhd8ed1ab_0
- threadpoolctl=3.2.0=pyha21a80b_0
- tk=8.6.12=h27826a3_0
- tomli=2.0.1=pyhd8ed1ab_0
- tqdm=4.66.1=pyhd8ed1ab_0
- typing-extensions=4.7.1=hd8ed1ab_0
- typing_extensions=4.7.1=pyha770c72_0
- unicodedata2=15.0.0=py38h0a891b7_0
- urllib3=2.0.4=pyhd8ed1ab_0
- werkzeug=2.3.7=pyhd8ed1ab_0
- wheel=0.41.2=pyhd8ed1ab_0
- wrapt=1.15.0=py38h1de0b5d_0
- xorg-kbproto=1.0.7=h7f98852_1002
- xorg-libice=1.1.1=hd590300_0
- xorg-libsm=1.2.4=h7391055_0
- xorg-libx11=1.8.4=h0b41bf4_0
- xorg-libxau=1.0.11=hd590300_0
- xorg-libxdmcp=1.1.3=h7f98852_0
- xorg-libxext=1.3.4=h0b41bf4_2
- xorg-libxrender=0.9.10=h7f98852_1003
- xorg-renderproto=0.11.1=h7f98852_1002
- xorg-xextproto=7.3.0=h0b41bf4_1003
- xorg-xproto=7.0.31=h7f98852_1007
- xz=5.2.6=h166bdaf_0
- yarl=1.9.2=py38h01eb140_0
- zip
- zipp=3.16.2=pyhd8ed1ab_0
- zlib=1.2.13=hd590300_5
- zstd=1.5.5=hfc55251_0
- python=3.8.18
- matchms=0.24.1
- numpy=1.24.4
- spec2vec=0.8.0
- h5py=3.9.0
- pyarrow=12.0.1
- tensorflow=2.12.1
- scikit-learn=1.3.2
- ms2deepscore=0.4.0
- pandas=2.0.3
- matplotlib=3.7.3
- skl2onnx=1.16.0
- onnxruntime=1.16.3
- pytest=7.4.0
- pytest-cov=4.1.0
- zip
2 changes: 1 addition & 1 deletion ms2query/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.2.4'
__version__ = '1.3.0'
20 changes: 13 additions & 7 deletions ms2query/benchmarking/collect_test_data_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,17 +157,22 @@ def get_modified_cosine_score_results(lib_spectra,
else:
selected_lib_spectra = lib_spectra
if len(selected_lib_spectra) != 0:
scores_list = calculate_scores(selected_lib_spectra,
[test_spectrum], ModifiedCosine()).scores_by_query(test_spectrum)
# Scores list is a List[spectrum, (mod_cos, matching_peaks)
cosine_scores = [scores_tuple[1]["score"] for scores_tuple in scores_list]
highest_cosine_score = float(max(cosine_scores))
highest_scoring_spectrum = scores_list[cosine_scores.index(highest_cosine_score)][0]
scores = calculate_scores(references=selected_lib_spectra,
queries=[test_spectrum],
similarity_function=ModifiedCosine())
# Matchms can return the best matches for any query via scores_by_query
sorted_scores = scores.scores_by_query(test_spectrum, 'ModifiedCosine_score', sort=True)
# Scores are not stored if the cosine score is 0 (no overlapping peaks).
if len(sorted_scores) == 0:
highest_scoring_spectrum = random.choice(selected_lib_spectra)
highest_cosine_score = (0, 0)
else:
highest_scoring_spectrum, highest_cosine_score = sorted_scores[0]

tanimoto_score = calculate_single_tanimoto_score(test_spectrum.get("smiles"),
highest_scoring_spectrum.get("smiles"))
exact_match = highest_scoring_spectrum.get("inchikey")[:14] == test_spectrum.get("inchikey")[:14]
best_matches_for_test_spectra.append((highest_cosine_score, tanimoto_score, exact_match))
best_matches_for_test_spectra.append((highest_cosine_score[0], tanimoto_score, exact_match))
else:
best_matches_for_test_spectra.append(None)
return best_matches_for_test_spectra
Expand All @@ -189,6 +194,7 @@ def get_cosines_score_results(lib_spectra,
scores_list = calculate_scores(selected_lib_spectra,
[test_spectrum],
CosineGreedy(tolerance=fragment_mass_tolerance)).scores_by_query(test_spectrum)
# TODO: This was built with an older matchms in mind; with the current version, cosine scores of 0 are not found
cosine_scores = [scores_tuple[1].item()[0] for scores_tuple in scores_list if scores_tuple[1].item()[1] >= minimum_matched_peaks]
if len(cosine_scores) != 0:
highest_cosine_score = max(cosine_scores)
Expand Down
10 changes: 6 additions & 4 deletions ms2query/benchmarking/create_accuracy_vs_recall_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
from ms2query.utils import load_json_file, load_pickled_file, save_pickled_file
from ms2query.utils import (load_df_from_parquet_file, load_json_file,
save_df_as_parquet_file)


def plot_all_with_standard_deviation(means_and_standars_deviation,
Expand Down Expand Up @@ -231,10 +232,11 @@ def create_plot(exact_matches,
if recalculate_means:
dict_with_results = load_all_test_results(20, test_results_folder, exact_match=exact_matches)
means_and_standard_deviation = calculate_all_means_and_standard_deviation(dict_with_results, exact_matches=exact_matches)
save_pickled_file(means_and_standard_deviation,
os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
save_df_as_parquet_file(means_and_standard_deviation, os.path.join(test_results_folder,
f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
else:
means_and_standard_deviation = load_pickled_file(os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))
means_and_standard_deviation = load_df_from_parquet_file(
os.path.join(test_results_folder, f"means_and_standard_deviations_20_fold{extra_file_name}.pickle"))

if exact_matches:
optimal_results = means_and_standard_deviation["Optimal"]
Expand Down
Loading

0 comments on commit a3ed83d

Please sign in to comment.