diff --git a/package/CHANGELOG b/package/CHANGELOG index cd2819079e6..e9c481de426 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -17,7 +17,7 @@ The rules for this file: HenokB, umak1106, tamandeeps, Mrqeoqqt, megosato, AnirG, rishu235, mtiberti, manishsaini6421, Sukeerti1, robotjellyzone, markvrma, alescoulie, mjtadema, PicoCentauri, Atharva7K, aditi2906, orbeckst, yuxuanzhuang, - rsexton2, rafaelpap, richardjgowers + rsexton2, rafaelpap, richardjgowers, orioncohen * 2.2.0 @@ -29,7 +29,7 @@ Fixes * Fix the competition when generating offsets from multiple processes by using a interprocess file lock. (Issue #1988, PR #3375) * Fix the error when trying to load a corrupted offset file (Issue #3230 - PR #3375) + PR #3375) * Fixed `OpenMMTopologyParser` not working when some or none of the elements are present in the openmm topology (Issue #3317, PR #3511). * Remove None from filenames in analysis.hole.HoleAnalysis (PR #3650) @@ -39,8 +39,8 @@ Fixes an external reference (Issue #3539, PR #3621) * Fixed LinearDensity not working with grouping='residues' or 'segments' (Issue #3571, PR #3572) - * LinearDensity now uses the definition of Avogadro's number from - MDAnalysis.units, which has more significant digits. + * LinearDensity now uses the definition of Avogadro's number from + MDAnalysis.units, which has more significant digits. (Issue #3571, PR #3572) * MOL2 can read files without providing optional columns: subst_id, subst_name and charge. (Issue #3385, PR #3598) @@ -58,6 +58,9 @@ Fixes * Fixed BAT method Cartesian modifies input data. (Issue #3501) Enhancements + * Add smarts_kwargs to allow greater flexiblity with smarts atom selection + (Issue #3469, PR #3470). + * Added `frames` argument to AnalysisBase.run to allow analysis to run on * Added DL_POLY Classic support. Now HISTORY files from DL_POLY Classic can be * used. (Issue #3678) * Wheels and dist now get automatically built and deployed on release @@ -86,11 +89,13 @@ Enhancements * Added benchmarks in lib.distances for enhancement (Issue #3205, PR #3641) Changes + * smarts selection's `maxMatches` now defaults to max(1000, n_atoms*10) + instead of the standard RDKit default of 1000 (Issue #3469, PR #3470). * `fasteners` package is now a core dependency (Issues #3230, #1988, PR #3375) - * The minimm Python, NumPy, and SciPy versions (following NEP29) are now 3.8, + * The minimm Python, NumPy, and SciPy versions (following NEP29) are now 3.8, 1.19, and 1.5.0; GSD now also has a minimum version of 1.9.3 and networkx has minimum version 2.0 (Issue #3665) - * LinearDensity now saves the histogram bin edges for easier plotting as + * LinearDensity now saves the histogram bin edges for easier plotting as "hist_bin_edges" for each dimension xyz in the results dictionary. (Issue #2508, PR #3617) * ITPParser no longer adds an angle for water molecules that have the SETTLE diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 9fb7fdbb9d9..1082cd527c3 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -2861,7 +2861,7 @@ def ts(self): def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05, atol=1e-08, updating=False, sorted=True, - rdkit_kwargs=None, **selgroups): + rdkit_kwargs=None, smarts_kwargs=None, **selgroups): """Select atoms from within this Group using a selection string. Returns an :class:`AtomGroup` sorted according to their index in the @@ -2894,6 +2894,10 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05, Arguments passed to the :class:`~MDAnalysis.converters.RDKit.RDKitConverter` when using selection based on SMARTS queries + smarts_kwargs : dict (optional) + Arguments passed internally to RDKit's `GetSubstructMatches + `_. + **selgroups : keyword arguments of str: AtomGroup (optional) when using the "group" keyword in selections, groups are defined by passing them as keyword arguments. See section on **preexisting @@ -3012,7 +3016,25 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05, smarts *SMARTS-query* select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to find nitrogen atoms in rings. Requires RDKit. - All matches (max 1000) are combined as a unique match + All matches are combined as a single unique match. The `smarts` + selection accepts two sets of key word arguments from + `select_atoms()`: the ``rdkit_kwargs`` are passed internally to + `RDKitConverter.convert()` and the ``smarts_kwargs`` are passed to + RDKit's `GetSubstructMatches + `_. + By default, the `useChirality` kwarg in ``rdkit_kwargs`` is set to true + and maxMatches in ``smarts_kwargs`` is + ``max(1000, 10 * n_atoms)``, where ``n_atoms`` is either + ``len(AtomGroup)`` or ``len(Universe.atoms)``, whichever is + applicable. Note that the number of matches can occasionally + exceed the default value of maxMatches, causing too few atoms + to be returned. If this occurs, a warning will be issued. The + problem can be fixed by increasing the value of maxMatches. + This behavior may be updated in the future. + + >>> universe.select_atoms("C", smarts_kwargs={"maxMatches": 100}) + + chiral *R | S* select a particular stereocenter. e.g. ``name C and chirality S`` to select only S-chiral carbon atoms. Only ``R`` and @@ -3168,6 +3190,9 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05, Added the *smarts* selection. Added `atol` and `rtol` keywords to select float values. Added the ``sort`` keyword. Added `rdkit_kwargs` to pass parameters to the RDKitConverter. + .. versionchanged:: 2.2.0 + Added `smarts_kwargs` to pass parameters to the RDKit + GetSubstructMatch for *smarts* selection. """ if not sel: @@ -3187,7 +3212,8 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05, periodic=periodic, atol=atol, rtol=rtol, sorted=sorted, - rdkit_kwargs=rdkit_kwargs) + rdkit_kwargs=rdkit_kwargs, + smarts_kwargs=smarts_kwargs) for s in sel_strs)) if updating: atomgrp = UpdatingAtomGroup(self, selections, sel_strs) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 8775a6a771b..9692fc66395 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -637,6 +637,15 @@ class SmartsSelection(Selection): Uses RDKit to run the query and converts the result to MDAnalysis. Supports chirality. + + .. versionchanged:: 2.2.0 + ``rdkit_wargs`` and ``smarts_kwargs`` can now be passed to control + the behaviour of the RDKit converter and RDKit's ``GetSubstructMatches`` + respectively. + The default ``maxMatches`` value passed to ``GetSubstructMatches`` has + been changed from ``1000`` to ``max(1000, n_atoms * 10)`` in order to + limit cases where too few matches were generated. A warning is now also + thrown if ``maxMatches`` has been reached. """ token = 'smarts' @@ -665,6 +674,7 @@ def __init__(self, parser, tokens): pattern.append(val) self.pattern = "".join(pattern) self.rdkit_kwargs = parser.rdkit_kwargs + self.smarts_kwargs = parser.smarts_kwargs def _apply(self, group): try: @@ -678,7 +688,18 @@ def _apply(self, group): if not pattern: raise ValueError(f"{self.pattern!r} is not a valid SMARTS query") mol = group.convert_to("RDKIT", **self.rdkit_kwargs) - matches = mol.GetSubstructMatches(pattern, useChirality=True) + self.smarts_kwargs.setdefault("useChirality", True) + self.smarts_kwargs.setdefault("maxMatches", max(1000, len(group) * 10)) + matches = mol.GetSubstructMatches(pattern, **self.smarts_kwargs) + if len(matches) == self.smarts_kwargs["maxMatches"]: + warnings.warn("Your smarts-based atom selection returned the max" + "number of matches. This indicates that not all" + "matching atoms were selected. When calling" + "atom_group.select_atoms(), the default value" + "of maxMatches is max(100, len(atom_group * 10)). " + "To fix this, add the following argument to " + "select_atoms: \n" + "smarts_kwargs={maxMatches: }") # flatten all matches and remove duplicated indices indices = np.unique([idx for match in matches for idx in match]) # create boolean mask for atoms based on index @@ -1408,7 +1429,7 @@ def expect(self, token): "".format(self.tokens[0], token)) def parse(self, selectstr, selgroups, periodic=None, atol=1e-08, - rtol=1e-05, sorted=True, rdkit_kwargs=None): + rtol=1e-05, sorted=True, rdkit_kwargs=None, smarts_kwargs=None): """Create a Selection object from a string. Parameters @@ -1431,7 +1452,9 @@ def parse(self, selectstr, selgroups, periodic=None, atol=1e-08, rdkit_kwargs : dict, optional Arguments passed to the RDKitConverter when using selection based on SMARTS queries - + smarts_kwargs : dict, optional + Arguments passed internally to RDKit's `GetSubstructMatches + `_. Returns ------- @@ -1447,12 +1470,16 @@ def parse(self, selectstr, selgroups, periodic=None, atol=1e-08, .. versionchanged:: 2.0.0 Added `atol` and `rtol` keywords to select float values. Added `rdkit_kwargs` to pass arguments to the RDKitConverter + .. versionchanged:: 2.2.0 + Added ``smarts_kwargs`` argument, allowing users to pass a + a dictionary of arguments to RDKit's ``GetSubstructMatches``. """ self.periodic = periodic self.atol = atol self.rtol = rtol self.sorted = sorted self.rdkit_kwargs = rdkit_kwargs or {} + self.smarts_kwargs = smarts_kwargs or {} self.selectstr = selectstr self.selgroups = selgroups diff --git a/package/doc/sphinx/source/documentation_pages/selections.rst b/package/doc/sphinx/source/documentation_pages/selections.rst index 9431f0ef4b8..986cde2527d 100644 --- a/package/doc/sphinx/source/documentation_pages/selections.rst +++ b/package/doc/sphinx/source/documentation_pages/selections.rst @@ -164,9 +164,22 @@ moltype *molecule-type* the TPR format defines the molecule type. smarts *SMARTS-query* - select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to - find nitrogen atoms in rings. Requires RDKit. All matches (max 1000) are - combined as a unique match. + select atoms using Daylight's SMARTS queries, e.g. ``smarts + [#7;R]`` to find nitrogen atoms in rings. Requires RDKit. + All matches are combined as a single unique match. The `smarts` + selection accepts two sets of key word arguments from + `select_atoms()`: the ``rdkit_kwargs`` are passed internally to + `RDKitConverter.convert()` and the ``smarts_kwargs`` are passed to + RDKit's `GetSubstructMatches + `_. + By default, the `useChirality` kwarg in ``rdkit_kwargs`` is set to true + and maxMatches in ``smarts_kwargs`` is ``max(1000, 10 * n_atoms)``, where + ``n_atoms`` is either ``len(AtomGroup)`` or ``len(Universe.atoms)``, + whichever is applicable. Note that the number of matches can occasionally + exceed the default value of maxMatches, causing too few atoms to be + returned. If this occurs, a warning will be issued. The problem can be + fixed by increasing the value of maxMatches. This behavior may be updated + in the future chiral *R | S* select a particular stereocenter. e.g. ``name C and chirality S`` diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py index 67ad7f8c6a0..8f27c99d794 100644 --- a/testsuite/MDAnalysisTests/core/test_atomselections.py +++ b/testsuite/MDAnalysisTests/core/test_atomselections.py @@ -575,11 +575,29 @@ def test_invalid_smarts_sel_raises_error(self, u2): with pytest.raises(ValueError, match="not a valid SMARTS"): u2.select_atoms("smarts foo") - def test_passing_args_to_converter(self): + def test_passing_rdkit_kwargs_to_converter(self): u = mda.Universe.from_smiles("O=C=O") sel = u.select_atoms("smarts [$(O=C)]", rdkit_kwargs=dict(force=True)) assert sel.n_atoms == 2 + def test_passing_max_matches_to_converter(self, u2): + with pytest.warns(UserWarning, match="Your smarts-based") as wsmg: + sel = u2.select_atoms("smarts C", smarts_kwargs=dict(maxMatches=2)) + sel2 = u2.select_atoms( + "smarts C", smarts_kwargs=dict(maxMatches=1000)) + assert sel.n_atoms == 2 + assert sel2.n_atoms == 3 + + sel3 = u2.select_atoms("smarts c") + assert sel3.n_atoms == 4 + + def test_passing_use_chirality_to_converter(self): + u = mda.Universe.from_smiles("CC[C@H](C)O") + sel3 = u.select_atoms("byres smarts CC[C@@H](C)O") + assert sel3.n_atoms == 0 + sel4 = u.select_atoms("byres smarts CC[C@@H](C)O", smarts_kwargs={"useChirality": False}) + assert sel4.n_atoms == 15 + class TestSelectionsNucleicAcids(object): @pytest.fixture(scope='class')