From 45014a91ce2b3585d62b7d6a51193d7916a3f0b7 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 30 Jun 2024 13:46:48 +0200 Subject: [PATCH 1/9] Enable code style checks --- .github/workflows/test_and_deploy.yml | 17 +++++++++++++ .gitignore | 1 - environment.yml | 2 ++ pyproject.toml | 36 +++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_and_deploy.yml b/.github/workflows/test_and_deploy.yml index 7c2b97591..6ce6f4990 100644 --- a/.github/workflows/test_and_deploy.yml +++ b/.github/workflows/test_and_deploy.yml @@ -31,6 +31,22 @@ env: jobs: + lint: + name: Check code style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install ruff + run: pip install ruff + - name: Check code formatting + run: ruff format --diff + - name: Lint code base + run: ruff check + + generate-wheels-matrix: name: "Generate wheels matrix" runs-on: "ubuntu-latest" @@ -245,6 +261,7 @@ jobs: permissions: contents: write needs: + - lint - test-and-build - make-sdist - test-interfaces diff --git a/.gitignore b/.gitignore index b76f59e63..037c3f92e 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,6 @@ htmlcov # Ignore all compiled python files (e.g. 
from running the unit tests) *.pyc *.pyo -*.py{} *.py-e # Ignore potential directory created during install diff --git a/environment.yml b/environment.yml index 8e484d75c..9e2017a35 100644 --- a/environment.yml +++ b/environment.yml @@ -24,6 +24,8 @@ dependencies: # Testing # - mdtraj >=1.9.3, <1.10 # tempoarily disabled due to incompatibility with numpy 2.0 - pytest >=7.0 + # Code style + - ruff =0.5.0 # Interfaced software in biotite.application (can also be installed separately) - autodock-vina - clustalo diff --git a/pyproject.toml b/pyproject.toml index fe580be37..d4016888c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,42 @@ homepage = "https://www.biotite-python.org" repository = "https://github.com/biotite-dev/biotite" documentation = "https://www.biotite-python.org" +[tool.ruff.lint] +# pyflakes, pycodestyle isort and varibale naming +select = ["F", "E", "W", "I", "TID", "N"] +ignore = [ + # In docstrings long lines are often intentional + # Most other ocassions are caught by the ruff formatter + "E501", + # Due to constants and class placeholders defined in functions + "N806", +] + +[tool.ruff.lint.per-file-ignores] +# Due to `* import` of BCIF encoding +"setup_ccd.py" = ["F405", "F403"] +# Due to imports after the PATH has been adjusted +"doc/conf.py" = ["E402"] +# Due to `from .module import *` imports in `__init__.py` modules +"__init__.py" = ["F403", "TID252"] +# Due to pymol scripts that are evaluated in other example scripts +"doc/examples/**/*_pymol.py" = ["F821"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.isort] +# No separator lines between import sections +no-lines-before = [ + "future", + "standard-library", + "third-party", + "first-party", + "local-folder", +] +order-by-type = true +known-first-party = ["biotite"] + [tool.hatch.build.targets.sdist] exclude = [ "tests", From 3d4b77fc1391fac13f567cddca4a54d3b6d5610a Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 30 Jun 
2024 22:03:20 +0200 Subject: [PATCH 2/9] Document new code style --- doc/contribution/development.rst | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/contribution/development.rst b/doc/contribution/development.rst index 9cee6fefa..9def9fb3a 100644 --- a/doc/contribution/development.rst +++ b/doc/contribution/development.rst @@ -53,13 +53,19 @@ Official support for PyPy might be added someday. Code style ---------- -*Biotite* is in compliance with PEP 8. -The maximum line length is 79 for code lines and 72 for docstring and -comment lines. +*Biotite* is compliant with :pep:`8` and uses `Ruff `_ for +code formatting and linting. +The maximum line length is 88 characters. An exception is made for docstring lines, if it is not possible to use a -maximum of 72 characters (e.g. tables), and for -`doctest `_ lines, -where the actual code may take up to 79 characters. +maximum of 88 characters (e.g. tables and parameter type descriptions). +To make code changes ready for a pull request, simply run + +.. code-block:: console + + $ ruff format + $ ruff check --fix + +and fix the remaining linter complaints. Dependencies ------------ @@ -124,14 +130,14 @@ accessible, in a relative manner. Import statements should be the only statements in a ``__init__.py`` file. In case a module needs functionality from another subpackage of *Biotite*, -use a relative import. +use an absolute import as suggested by PEP 8. This import should target the module directly and not the package to avoid circular imports and thus an ``ImportError``. So import statements like the following are totally OK: .. code-block:: python - from ...package.subpackage.module import foo + from biotite.subpackage.module import foo In order to prevent namespace pollution, all modules must define the `__all__` variable with all publicly accessible attributes of the module. 
From 5827c3ddf563dc67d2762984352983167f54935c Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 1 Jul 2024 09:44:27 +0200 Subject: [PATCH 3/9] Guard aligned list/dicts from formatting --- .../scripts/structure/protein/pb_alignment.py | 2 +- src/biotite/database/entrez/dbnames.py | 10 +- src/biotite/structure/bonds.pyx | 2 + src/biotite/structure/geometry.py | 116 +++++++++--------- src/biotite/structure/info/atoms.py | 12 +- src/biotite/structure/info/radii.py | 28 +++-- tests/application/test_msa.py | 105 +++++++++------- tests/database/test_rcsb.py | 2 +- tests/sequence/align/test_alignment.py | 12 +- tests/sequence/align/test_kmertable.py | 2 +- tests/sequence/align/test_localungapped.py | 18 +-- tests/sequence/align/test_matrix.py | 2 +- tests/sequence/align/test_pairwise.py | 49 ++++---- tests/sequence/test_phylo.py | 8 +- tests/structure/test_bonds.py | 2 +- tests/structure/test_box.py | 4 +- 16 files changed, 200 insertions(+), 174 deletions(-) diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py index 5b6dc3818..6145cedeb 100644 --- a/doc/examples/scripts/structure/protein/pb_alignment.py +++ b/doc/examples/scripts/structure/protein/pb_alignment.py @@ -79,7 +79,7 @@ [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], -]) +]) # fmt: skip # Fetch animal lysoyzme structures diff --git a/src/biotite/database/entrez/dbnames.py b/src/biotite/database/entrez/dbnames.py index dfa0a8e0a..2aa967a61 100644 --- a/src/biotite/database/entrez/dbnames.py +++ b/src/biotite/database/entrez/dbnames.py @@ -7,6 +7,7 @@ __all__ = ["get_database_name"] +# fmt: off _db_names = { "BioProject" : "bioproject", "BioSample" : "biosample", @@ -45,26 +46,27 @@ "UniGene" : "unigene", "UniSTS" : "unists" } +# fmt: on def get_database_name(database): """ Map a common NCBI 
Entrez database name to an E-utility database name. - + Parameters ---------- database : str Entrez database name. - + Returns ------- name : str E-utility database name. - + Examples -------- - + >>> print(get_database_name("Nucleotide")) nuccore """ diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx index 783fbda0e..e3d30105c 100644 --- a/src/biotite/structure/bonds.pyx +++ b/src/biotite/structure/bonds.pyx @@ -1330,6 +1330,7 @@ def _invert_index(IndexType[:] index_v, uint32 length): +# fmt: off _DEFAULT_DISTANCE_RANGE = { # Taken from Allen et al. # min - 2*std max + 2*std @@ -1376,6 +1377,7 @@ _DEFAULT_DISTANCE_RANGE = { ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), } +# fmt: on def connect_via_distances(atoms, dict distance_range=None, bint inter_residue=True, default_bond_type=BondType.ANY, bint periodic=False): diff --git a/src/biotite/structure/geometry.py b/src/biotite/structure/geometry.py index ce39d1e82..e27ba233a 100644 --- a/src/biotite/structure/geometry.py +++ b/src/biotite/structure/geometry.py @@ -27,7 +27,7 @@ def displacement(atoms1, atoms2, box=None): """ Measure the displacement vector, i.e. the vector difference, from one array of atom coordinates to another array of coordinates. - + Parameters ---------- atoms1, atoms2 : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,) or Atom or AtomArray or AtomArrayStack @@ -43,13 +43,13 @@ def displacement(atoms1, atoms2, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- disp : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,) The displacement vector(s). The shape is equal to the shape of the input `atoms` with the highest dimensionality. 
- + See also -------- index_displacement @@ -62,7 +62,7 @@ def displacement(atoms1, atoms2, box=None): diff = v2 - v1 else: diff = -(v1 - v2) - + # Use minimum-image convention if box is given if box is not None: # Transform difference vector @@ -131,7 +131,7 @@ def displacement(atoms1, atoms2, box=None): f"{diff.shape} is an invalid shape for atom coordinates" ) return disp - + else: return diff @@ -139,7 +139,7 @@ def displacement(atoms1, atoms2, box=None): def index_displacement(*args, **kwargs): """ index_displacement(atoms, indices, periodic=False, box=None) - + Measure the displacement, i.e. the vector difference, between pairs of atoms. @@ -159,7 +159,7 @@ def index_displacement(*args, **kwargs): :class:`ndarray`. indices : ndarray, shape=(k,2) Pairs of indices that point to `atoms`. - The displacement is measured from ``indices[x,0]`` to + The displacement is measured from ``indices[x,0]`` to ``indices[x,1]``. periodic : bool, optional If set to true, periodic boundary conditions are taken into @@ -171,14 +171,14 @@ def index_displacement(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- disp : ndarray, shape=(k,) or shape=(m,k) The pairwise displacements. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -199,7 +199,7 @@ def index_displacement(*args, **kwargs): def distance(atoms1, atoms2, box=None): """ Measure the euclidian distance between atoms. - + Parameters ---------- atoms1, atoms2 : ndarray or Atom or AtomArray or AtomArrayStack @@ -214,14 +214,14 @@ def distance(atoms1, atoms2, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- dist : float or ndarray The atom distances. 
The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See also -------- index_distance @@ -233,7 +233,7 @@ def distance(atoms1, atoms2, box=None): def index_distance(*args, **kwargs): """ index_distance(atoms, indices, periodic=False, box=None) - + Measure the euclidian distance between pairs of atoms. The pairs refer to indices of a given atom array, whose pairwise @@ -262,14 +262,14 @@ def index_distance(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- dist : ndarray, shape=(k,) or shape=(m,k) The pairwise distances. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -290,7 +290,7 @@ def index_distance(*args, **kwargs): def angle(atoms1, atoms2, atoms3, box=None): """ Measure the angle between 3 atoms. - + Parameters ---------- atoms1, atoms2, atoms3 : ndarray or Atom or AtomArray or AtomArrayStack @@ -302,14 +302,14 @@ def angle(atoms1, atoms2, atoms3, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- angle : float or ndarray The angle(s) between the atoms. The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See also -------- index_angle @@ -324,7 +324,7 @@ def angle(atoms1, atoms2, atoms3, box=None): def index_angle(*args, **kwargs): """ index_angle(atoms, indices, periodic=False, box=None) - + Measure the angle between triples of atoms. 
The triples refer to indices of a given atom array, whose triplewise @@ -351,14 +351,14 @@ def index_angle(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- angle : ndarray, shape=(k,) or shape=(m,k) The triplewise angles. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -379,7 +379,7 @@ def index_angle(*args, **kwargs): def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): """ Measure the dihedral angle between 4 atoms. - + Parameters ---------- atoms1, atoms2, atoms3, atoms4 : ndarray or Atom or AtomArray or AtomArrayStack @@ -392,14 +392,14 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- dihed : float or ndarray The dihedral angle(s) between the atoms. The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See Also -------- index_dihedral @@ -411,11 +411,11 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): norm_vector(v1) norm_vector(v2) norm_vector(v3) - + n1 = np.cross(v1, v2) n2 = np.cross(v2, v3) - - # Calculation using atan2, to ensure the correct sign of the angle + + # Calculation using atan2, to ensure the correct sign of the angle x = vector_dot(n1,n2) y = vector_dot(np.cross(n1,n2), v2) return np.arctan2(y,x) @@ -424,7 +424,7 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): def index_dihedral(*args, **kwargs): """ index_dihedral(atoms, indices, periodic=False, box=None) - + Measure the dihedral angle between quadruples of atoms. 
The triples refer to indices of a given atom array, whose @@ -452,14 +452,14 @@ def index_dihedral(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- dihedral : ndarray, shape=(k,) or shape=(m,k) The quadruplewise dihedral angles. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -482,7 +482,7 @@ def dihedral_backbone(atom_array): """ Measure the characteristic backbone dihedral angles of a protein structure. - + Parameters ---------- atom_array: AtomArray or AtomArrayStack @@ -492,7 +492,7 @@ def dihedral_backbone(atom_array): `NaN`. The order of the backbone atoms for each residue must be (N, CA, C). - + Returns ------- phi, psi, omega : ndarray @@ -502,20 +502,20 @@ def dihedral_backbone(atom_array): have *NaN* values. If an :class:`AtomArrayStack` is given, the output angles are 2-dimensional, the first dimension corresponds to the model number. - + Raises ------ BadStructureError If the amount of backbone atoms is not equal to amount of residues times 3 (for N, CA and C). 
- + See Also -------- dihedral - + Examples -------- - + >>> phi, psi, omega = dihedral_backbone(atom_array) >>> print(np.stack([np.rad2deg(phi), np.rad2deg(psi)]).T) [[ nan -56.145] @@ -541,7 +541,7 @@ def dihedral_backbone(atom_array): """ bb_filter = filter_peptide_backbone(atom_array) backbone = atom_array[..., bb_filter] - + if backbone.array_length() % 3 != 0 \ or (backbone.atom_name[0::3] != "N" ).any() \ or (backbone.atom_name[1::3] != "CA").any() \ @@ -577,43 +577,45 @@ def _dihedral_backbone(chain_bb): phi_coord = np.full(angle_coord_shape, np.nan) psi_coord = np.full(angle_coord_shape, np.nan) omega_coord = np.full(angle_coord_shape, np.nan) - - # Indices for coordinates of CA atoms + + # Indices for coordinates of CA atoms ca_i = np.arange(bb_coord.shape[-2]//3) * 3 + 1 - phi_coord [..., 1: , :, 0] = bb_coord[..., ca_i[1: ]-2 ,:] - phi_coord [..., 1: , :, 1] = bb_coord[..., ca_i[1: ]-1 ,:] - phi_coord [..., 1: , :, 2] = bb_coord[..., ca_i[1: ] ,:] - phi_coord [..., 1: , :, 3] = bb_coord[..., ca_i[1: ]+1 ,:] - psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1 ,:] - psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1] ,:] - psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1 ,:] - psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2 ,:] - omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1] ,:] - omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1 ,:] - omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2 ,:] - omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3 ,:] - + # fmt: off + phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :] + phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :] + phi_coord [..., 1:, :, 2] = bb_coord[..., ca_i[1: ], :] + phi_coord [..., 1:, :, 3] = bb_coord[..., ca_i[1: ]+1, :] + psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1, :] + psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1], :] + psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1, :] + psi_coord [..., :-1, :, 3] = 
bb_coord[..., ca_i[:-1]+2, :] + omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1], :] + omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1, :] + omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2, :] + omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :] + # fmt: on + phi = dihedral(phi_coord[...,0], phi_coord[...,1], phi_coord[...,2], phi_coord[...,3]) psi = dihedral(psi_coord[...,0], psi_coord[...,1], psi_coord[...,2], psi_coord[...,3]) omega = dihedral(omega_coord[...,0], omega_coord[...,1], omega_coord[...,2], omega_coord[...,3]) - + return phi, psi, omega def centroid(atoms): """ Measure the centroid of a structure. - + Parameters ---------- atoms: ndarray or AtomArray or AtomArrayStack The structures to determine the centroid from. Alternatively an ndarray containing the coordinates can be provided. - + Returns ------- centroid : float or ndarray @@ -656,7 +658,7 @@ def _displacement_orthogonal_box(fractions, box, disp): Fill in the PBC-aware displacement vector for non-PBC-aware displacements given as fractions of given box vectors. """ - # Fraction components are guaranteed to be positive + # Fraction components are guaranteed to be positive # Use fraction vector components with lower absolute # -> new_vec[i] = vec[i] - 1 if vec[i] > 0.5 else vec[i] fractions[fractions > 0.5] -= 1 @@ -669,7 +671,7 @@ def _displacement_triclinic_box(fractions, box, disp): displacements given as fractions of given box vectors. 
""" diffs = fraction_to_coord(fractions, box) - # Fraction components are guaranteed to be positive + # Fraction components are guaranteed to be positive # Test all 3 fraction vector components # with positive and negative sign # (i,j,k in {-1, 0}) diff --git a/src/biotite/structure/info/atoms.py b/src/biotite/structure/info/atoms.py index 6ab063a99..9b8f29113 100644 --- a/src/biotite/structure/info/atoms.py +++ b/src/biotite/structure/info/atoms.py @@ -9,12 +9,14 @@ from .ccd import get_ccd -non_hetero_residues = set([ - "ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS", - "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR", - "TRP","TYR","VAL", "SEC", +# fmt: off +NON_HETERO_RESIDUES = set([ + "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", + "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "THR", + "TRP", "TYR", "VAL", "SEC", "A", "DA", "G", "DG", "C", "DC", "U", "DT", ]) +# fmt: on def residue(res_name): @@ -78,5 +80,5 @@ def residue(res_name): raise KeyError( f"No atom information found for residue '{res_name}' in CCD" ) - component.hetero[:] = res_name not in non_hetero_residues + component.hetero[:] = res_name not in NON_HETERO_RESIDUES return component diff --git a/src/biotite/structure/info/radii.py b/src/biotite/structure/info/radii.py index 392dd0c00..e1c2651f7 100644 --- a/src/biotite/structure/info/radii.py +++ b/src/biotite/structure/info/radii.py @@ -9,6 +9,7 @@ from .bonds import bonds_in_residue +# fmt: off # Contains tuples for the different ProtOr groups: # Tuple contains: element, valency, H count _PROTOR_RADII = { @@ -35,28 +36,29 @@ _SINGLE_RADII = { "H": 1.20, "HE": 1.40, - + "C": 1.70, "N": 1.55, "O": 1.52, "F": 1.47, "NE": 1.54, - + "SI": 2.10, "P": 1.80, "S": 1.80, "CL": 1.75, "AR": 1.88, - + "AS": 1.85, "SE": 1.90, "BR": 1.85, "KR": 2.02, - + "TE": 2.06, "I": 1.98, "XE": 2.16, } +# fmt: on # A dictionary that caches radii for each residue _protor_radii = {} @@ -82,7 +84,7 @@ def vdw_radius_protor(res_name, 
atom_name): to. atom_name : str The name of the non-hydrogen atom. - + Returns ------- The Van-der-Waals radius of the given atom. @@ -91,12 +93,12 @@ def vdw_radius_protor(res_name, atom_name): See also -------- vdw_radius_single - + References ---------- - + .. footbibliography:: - + Examples -------- @@ -173,21 +175,21 @@ def vdw_radius_single(element): ---------- element : str The chemical element of the atoms. - + Returns ------- The Van-der-Waals radius of the atom. If the radius is unknown for the element, `None` is returned. - + See also -------- vdw_radius_protor - + References ---------- - + .. footbibliography:: - + Examples -------- diff --git a/tests/application/test_msa.py b/tests/application/test_msa.py index ca0554e1e..f4a8c16e6 100644 --- a/tests/application/test_msa.py +++ b/tests/application/test_msa.py @@ -34,32 +34,43 @@ def sequences(): ]] -@pytest.mark.parametrize("app_cls, exp_ali, exp_order", - [(MuscleApp, - "BIQT-ITE\n" - "TITANITE\n" - "BISM-ITE\n" - "-IQL-ITE", - [1, 2, 0, 3]), - (Muscle5App, - "BI-QTITE\n" - "TITANITE\n" - "BI-SMITE\n" - "-I-QLITE", - [0, 3, 1, 2]), - (MafftApp, - "-BIQTITE\n" - "TITANITE\n" - "-BISMITE\n" - "--IQLITE", - [0, 3, 2, 1]), - (ClustalOmegaApp, - "-BIQTITE\n" - "TITANITE\n" - "-BISMITE\n" - "--IQLITE", - [1, 2, 0, 3])] -) +@pytest.mark.parametrize( + "app_cls, exp_ali, exp_order", + [ + ( + MuscleApp, + "BIQT-ITE\n" + "TITANITE\n" + "BISM-ITE\n" + "-IQL-ITE", + [1, 2, 0, 3] + ), + ( + Muscle5App, + "BI-QTITE\n" + "TITANITE\n" + "BI-SMITE\n" + "-I-QLITE", + [0, 3, 1, 2] + ), + ( + MafftApp, + "-BIQTITE\n" + "TITANITE\n" + "-BISMITE\n" + "--IQLITE", + [0, 3, 2, 1] + ), + ( + ClustalOmegaApp, + "-BIQTITE\n" + "TITANITE\n" + "-BISMITE\n" + "--IQLITE", + [1, 2, 0, 3] + ) + ] +) # fmt: skip def test_msa(sequences, app_cls, exp_ali, exp_order): """ Test MSA software on short toy sequences with known alignment @@ -120,11 +131,11 @@ def test_additional_options(sequences): app1 = ClustalOmegaApp(sequences) 
app1.start() - + app2 = ClustalOmegaApp(sequences) app2.add_additional_options(["--full"]) app2.start() - + app1.join() app2.join() assert "--full" not in app1.get_command() @@ -137,7 +148,7 @@ def test_custom_substitution_matrix(sequences, app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.ProteinSequence.alphabet # Strong identity matrix score_matrix = np.identity(len(alph)) * 1000 @@ -147,7 +158,7 @@ def test_custom_substitution_matrix(sequences, app_cls): "TITANITE\n" "BI-SMITE\n" "-I-QLITE" - ) + ) # fmt: skip try: app = app_cls(sequences, matrix=matrix) except VersionError: @@ -165,12 +176,12 @@ def test_custom_sequence_type(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(("foo", "bar", 42)) sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] + ]] # fmt: skip exp_trace = [ [ 0, 0], [ 1, -1], @@ -206,12 +217,12 @@ def test_invalid_sequence_type_no_matrix(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(("foo", "bar", 42)) sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] + ]] # fmt: skip with pytest.raises(TypeError): try: app_cls(sequences) @@ -228,7 +239,7 @@ def test_invalid_sequence_type_unsuitable_alphabet(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(range(50)) sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ [1,2,3], @@ -249,7 +260,7 @@ def test_invalid_muscle_version(sequences): bin_path = BIN_PATH[MuscleApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not 
installed") - + if is_not_installed("muscle"): pytest.skip(f"'muscle' is not installed") @@ -262,13 +273,13 @@ def test_clustalo_matrix(sequences): bin_path = BIN_PATH[ClustalOmegaApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + ref_matrix = [ [0, 1, 2, 3], [1, 0, 1, 2], [2, 1, 0, 1], [3, 2, 1, 0] - ] + ] # fmt: skip app = ClustalOmegaApp(sequences) app.full_matrix_calculation() app.set_distance_matrix(np.array(ref_matrix)) @@ -282,7 +293,7 @@ def test_clustalo_tree(sequences): bin_path = BIN_PATH[ClustalOmegaApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + leaves = [phylo.TreeNode(index=i) for i in range(len(sequences))] inter1 = phylo.TreeNode([leaves[0], leaves[1]], [1.0, 1.0]) inter2 = phylo.TreeNode([leaves[2], leaves[3]], [2.5, 2.5]) @@ -305,7 +316,7 @@ def test_mafft_tree(sequences): bin_path = BIN_PATH[MafftApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + app = MafftApp(sequences) app.start() app.join() @@ -317,7 +328,7 @@ def test_muscle_tree(sequences): bin_path = BIN_PATH[MuscleApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + try: app = MuscleApp(sequences) except VersionError: @@ -334,7 +345,7 @@ def test_muscle5_options(sequences): bin_path = BIN_PATH[Muscle5App] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + try: app = Muscle5App(sequences) except VersionError: @@ -350,7 +361,9 @@ def test_muscle5_options(sequences): assert "-threads" in app.get_command() app.join() - assert str(app.get_alignment()) == "BI-QTITE\n" \ - "TITANITE\n" \ - "BI-SMITE\n" \ - "-I-QLITE" \ No newline at end of file + assert str(app.get_alignment()) == ( + "BI-QTITE\n" \ + "TITANITE\n" \ + "BI-SMITE\n" \ + "-I-QLITE" + ) # fmt: skip \ No newline at end of file diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index b05aa3cef..4d26cb9f6 100644 --- a/tests/database/test_rcsb.py 
+++ b/tests/database/test_rcsb.py @@ -209,7 +209,7 @@ def test_search_composite(): ("non_polymer_entity", [] ), ("polymer_instance", ["1L2Y.A"]), ] -) +) # fmt: skip @pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" diff --git a/tests/sequence/align/test_alignment.py b/tests/sequence/align/test_alignment.py index a56cee3c8..483b50c74 100644 --- a/tests/sequence/align/test_alignment.py +++ b/tests/sequence/align/test_alignment.py @@ -16,8 +16,10 @@ def test_alignment_str(): """ seq1 = seq.NucleotideSequence("ACCTGA") seq2 = seq.NucleotideSequence("TATGCT") - ali_str = ["A-CCTGA----", - "----T-ATGCT"] + ali_str = [ + "A-CCTGA----", + "----T-ATGCT" + ] # fmt: skip trace = align.Alignment.trace_from_strings(ali_str) alignment = align.Alignment([seq1, seq2], trace, None) assert str(alignment).split("\n") == ali_str @@ -69,16 +71,16 @@ def test_pairwise_identity(sequences, mode): sequences, matrix=align.SubstitutionMatrix.std_protein_matrix() ) - + ref_identity_matrix = np.zeros((len(sequences), len(sequences))) for i in range(len(sequences)): for j in range(len(sequences)): ref_identity_matrix[i,j] = align.get_sequence_identity( msa[:, [i,j]], mode=mode ) - + test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode) - + # Identity of two equal sequences should be 1, if only the length of # the sequence is counted if mode == "shortest": diff --git a/tests/sequence/align/test_kmertable.py b/tests/sequence/align/test_kmertable.py index 64439bd27..0f471ca34 100644 --- a/tests/sequence/align/test_kmertable.py +++ b/tests/sequence/align/test_kmertable.py @@ -433,7 +433,7 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask): ), ], ids = idfn -) +)# fmt: skip def test_masking(k, input_mask, ref_output_mask): """ Explicitly test the conversion of removal masks to k-mer masks diff --git a/tests/sequence/align/test_localungapped.py b/tests/sequence/align/test_localungapped.py index b3f24dc59..f2b0ff74e 
100644 --- a/tests/sequence/align/test_localungapped.py +++ b/tests/sequence/align/test_localungapped.py @@ -66,12 +66,12 @@ ], [["both"], ["upstream"], ["downstream"]], # direction - + [[False], [True]], # score_only [[False], [True]], # uint8_code )] -) +) # fmt: skip def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, ref_range1, ref_range2, direction, score_only, uint8_code): @@ -90,16 +90,16 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, seq1 = seq_type(seq1) seq2 = seq_type(seq2) - + if seq_type == seq.NucleotideSequence: matrix = align.SubstitutionMatrix.std_nucleotide_matrix() else: matrix = align.SubstitutionMatrix.std_protein_matrix() - + if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) - + ref_alignment = align.Alignment( [seq1, seq2], np.stack([ @@ -112,7 +112,7 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, test_result = align.align_local_ungapped( seq1, seq2, matrix, seed, threshold, direction, score_only) - + if score_only: assert test_result == ref_score else: @@ -141,7 +141,7 @@ def test_random_alignment(seed, uint8_code): CONSERVED_ENDS = 5 MUTATION_PROB = 0.1 THRESHOLD = 100 - + np.random.seed(seed) # Create conserved regions @@ -193,10 +193,10 @@ def test_random_alignment(seed, uint8_code): matrix = align.SubstitutionMatrix.std_protein_matrix() if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) - + ref_score = align.align_optimal( seq1, seq2, matrix, local=True, max_number=1, - # High gap penalty to prevent introduction of gaps, + # High gap penalty to prevent introduction of gaps, # since 'align_local_ungapped()' is also no able to place gaps gap_penalty=-1000 )[0].score diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 79763213f..582eec2d3 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -33,4 +33,4 @@ def test_matrix_str(): "a 0 1 
2", "b 3 4 5", "c 6 7 8"] - ) \ No newline at end of file + ) # fmt: skip \ No newline at end of file diff --git a/tests/sequence/align/test_pairwise.py b/tests/sequence/align/test_pairwise.py index 00717df15..727f8fc2f 100644 --- a/tests/sequence/align/test_pairwise.py +++ b/tests/sequence/align/test_pairwise.py @@ -26,28 +26,29 @@ def test_align_ungapped(): # [local, gap_penalty, input1, input2, expect] -align_cases = [(False,True, -7, "TATGGGTATCC","TATGTATAA", - ("TATGGGTATCC\nTATG--TATAA", - "TATGGGTATCC\nTAT-G-TATAA", - "TATGGGTATCC\nTAT--GTATAA",)), - (True, True, -6, "TATGGGTATCC","TATGTATAA", - ("TATGGGTAT\nTATG--TAT", - "TATGGGTAT\nTAT-G-TAT", - "TATGGGTAT\nTAT--GTAT",)), - (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", - ("TACTATGGGTATCC\nTCATATG--TATAA", - "TACTATGGGTATCC\nTCATAT--GTATAA",)), - (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", - ("TATGGGTAT\nTATG--TAT", - "TATGGGTAT\nTAT--GTAT",)), - (False,True, (-7,-1), "T","TTT", - ("T--\nTTT", - "--T\nTTT",)), - (False,True, -7, "TAAAGCGAAAT","TGCGT", - ("TAAAGCGAAAT\nT---GCG---T")), - (False,False,-7, "TAAAGCGAAAT","TGCGT", - ("TAAAGCGAAAT\n---TGCGT---")) - ] +align_cases = [ + (False,True, -7, "TATGGGTATCC","TATGTATAA", + ("TATGGGTATCC\nTATG--TATAA", + "TATGGGTATCC\nTAT-G-TATAA", + "TATGGGTATCC\nTAT--GTATAA",)), + (True, True, -6, "TATGGGTATCC","TATGTATAA", + ("TATGGGTAT\nTATG--TAT", + "TATGGGTAT\nTAT-G-TAT", + "TATGGGTAT\nTAT--GTAT",)), + (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", + ("TACTATGGGTATCC\nTCATATG--TATAA", + "TACTATGGGTATCC\nTCATAT--GTATAA",)), + (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", + ("TATGGGTAT\nTATG--TAT", + "TATGGGTAT\nTAT--GTAT",)), + (False,True, (-7,-1), "T","TTT", + ("T--\nTTT", + "--T\nTTT",)), + (False,True, -7, "TAAAGCGAAAT","TGCGT", + ("TAAAGCGAAAT\nT---GCG---T")), + (False,False,-7, "TAAAGCGAAAT","TGCGT", + ("TAAAGCGAAAT\n---TGCGT---")) +] # fmt: skip @pytest.mark.parametrize("local, term, gap_penalty, input1, input2, expect", 
align_cases) def test_align_optimal_simple(local, term, gap_penalty, @@ -63,7 +64,7 @@ def test_align_optimal_simple(local, term, gap_penalty, matrix, gap_penalty=gap_penalty, terminal_penalty=term, local=local) - + for ali in alignments: assert str(ali) in expect # Test if separate score function calculates the same score @@ -148,7 +149,7 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed): len(sequence.alphabet), size=length ) sequences.append(sequence) - + matrix = align.SubstitutionMatrix.std_nucleotide_matrix() ref_alignments = align.align_optimal( diff --git a/tests/sequence/test_phylo.py b/tests/sequence/test_phylo.py index e2e52349c..d483d2534 100644 --- a/tests/sequence/test_phylo.py +++ b/tests/sequence/test_phylo.py @@ -60,7 +60,7 @@ def test_neighbor_joining(): [ 7, 10, 7, 0, 5, 9], [ 6, 9, 6, 5, 0, 8], [ 8, 11, 8, 9, 8, 0], - ]) + ]) # fmt: skip ref_tree = phylo.Tree(phylo.TreeNode( [ @@ -107,7 +107,7 @@ def test_node_distance(tree): # Example topological distances assert tree.get_distance(0, 19, True) == 9 assert tree.get_distance(4, 2, True) == 10 - + # All pairwise leaf node distances should be sufficient # to reconstruct the same tree via UPGMA ref_dist_mat = np.zeros((len(tree), len(tree))) @@ -148,7 +148,7 @@ def test_get_leaves(tree): assert set(tree.leaves[10].get_indices()) == set([10]) assert tree.root.get_leaf_count() == 20 - + def test_copy(tree): assert tree is not tree.copy() assert tree == tree.copy() @@ -297,7 +297,7 @@ def test_as_binary_distances(): for i in range(len(tree)): for j in range(len(tree)): ref_dist_mat[i,j] = tree.get_distance(i,j) - + bin_tree = phylo.as_binary(tree) test_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index a5474ffde..a2c940f8f 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -498,7 +498,7 @@ def test_find_connected(bond_list): ("C17", "C22"), ]), ] -) +) # fmt: 
skip def test_find_rotatable_bonds(res_name, expected_bonds): """ Check the :func:`find_rotatable_bonds()` function based on diff --git a/tests/structure/test_box.py b/tests/structure/test_box.py index 513f9cd35..b280b85d0 100644 --- a/tests/structure/test_box.py +++ b/tests/structure/test_box.py @@ -21,14 +21,14 @@ (2, 4, 6, 100, 110, 120), (9, 9, 9, 90, 90, 170), (9, 8, 7, 50, 80, 50), -] +] # fmt: skip SAMPLE_COORD = [ ( 1, 1, 1), ( 5, 10, 20), (-1, 5, 8), ( 3, 1, 54) -] +] # fmt: skip From a786ae0d7af284161e6d2dfb9fb777647803168e Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 3 Jul 2024 09:26:23 +0200 Subject: [PATCH 4/9] Move fixture to 'conftest.py' --- tests/sequence/align/{util.py => conftest.py} | 0 tests/sequence/align/test_alignment.py | 1 - tests/sequence/align/test_banded.py | 13 ++++++------- tests/sequence/align/test_localgapped.py | 9 ++++----- tests/sequence/align/test_multiple.py | 7 +++---- tests/sequence/align/test_pairwise.py | 1 - tests/sequence/align/test_statistics.py | 5 ++--- 7 files changed, 15 insertions(+), 21 deletions(-) rename tests/sequence/align/{util.py => conftest.py} (100%) diff --git a/tests/sequence/align/util.py b/tests/sequence/align/conftest.py similarity index 100% rename from tests/sequence/align/util.py rename to tests/sequence/align/conftest.py diff --git a/tests/sequence/align/test_alignment.py b/tests/sequence/align/test_alignment.py index 483b50c74..ca1250d3e 100644 --- a/tests/sequence/align/test_alignment.py +++ b/tests/sequence/align/test_alignment.py @@ -6,7 +6,6 @@ import pytest import biotite.sequence as seq import biotite.sequence.align as align -from .util import sequences diff --git a/tests/sequence/align/test_banded.py b/tests/sequence/align/test_banded.py index 351139925..f6098f7c7 100644 --- a/tests/sequence/align/test_banded.py +++ b/tests/sequence/align/test_banded.py @@ -7,7 +7,6 @@ import numpy as np import biotite.sequence as seq import biotite.sequence.align as align -from .util import 
sequences @pytest.mark.parametrize( @@ -34,7 +33,7 @@ def test_simple_alignment(gap_penalty, local, band_width): # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments] - + test_alignments = align.align_banded( seq1, seq2, matrix, (-band_width, band_width), gap_penalty=gap_penalty, local=local @@ -59,7 +58,7 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): can return the optimal alignment(s). """ MAX_NUMBER = 100 - + matrix = align.SubstitutionMatrix.std_protein_matrix() index1, index2 = seq_indices seq1 = sequences[index1] @@ -73,7 +72,7 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments] - + identity = align.get_sequence_identity(ref_alignments[0]) # Use a relatively small band width, if the sequences are similar, # otherwise use the entire search space @@ -114,7 +113,7 @@ def test_large_sequence_mapping(length, excerpt_length, seed): sequence at the position, where the excerpt was taken from. 
""" BAND_WIDTH = 100 - + np.random.seed(seed) sequence = seq.NucleotideSequence() @@ -148,7 +147,7 @@ def test_large_sequence_mapping(length, excerpt_length, seed): np.arange(excerpt_pos, len(excerpt) + excerpt_pos) ], axis=1) assert np.array_equal(test_trace, ref_trace) - + @pytest.mark.parametrize( @@ -187,7 +186,7 @@ def test_swapping(gap_penalty, local, seed): return ref_alignment = ref_alignments[0] test_alignment = test_alignments[0] - + assert test_alignment.sequences[0] == ref_alignment.sequences[1] assert test_alignment.sequences[1] == ref_alignment.sequences[0] assert np.array_equal(test_alignment.trace, ref_alignment.trace[:, ::-1]) diff --git a/tests/sequence/align/test_localgapped.py b/tests/sequence/align/test_localgapped.py index 7fbe19f48..3d7d0854b 100644 --- a/tests/sequence/align/test_localgapped.py +++ b/tests/sequence/align/test_localgapped.py @@ -7,7 +7,6 @@ import numpy as np import biotite.sequence as seq import biotite.sequence.align as align -from .util import sequences @pytest.mark.parametrize( @@ -46,7 +45,7 @@ def test_simple_alignment(gap_penalty, seed, threshold, elif direction == "downstream": alignment.trace = alignment.trace[seed_index:] alignment.score = align.score(alignment, matrix, gap_penalty) - + test_result = align.align_local_gapped( seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction, score_only @@ -84,7 +83,7 @@ def test_complex_alignment(sequences, gap_penalty, score_only, # The linear gap penalty for longer gaps easily exceeds # a small threshold -> increase threshold for linear penalty THRESHOLD = 200 if isinstance(gap_penalty, int) else 50 - + matrix = align.SubstitutionMatrix.std_protein_matrix() index1, index2 = seq_indices seq1 = sequences[index1] @@ -98,7 +97,7 @@ def test_complex_alignment(sequences, gap_penalty, score_only, trace = ref_alignments[0].trace trace = trace[(trace != -1).all(axis=1)] seed = trace[len(trace) // 2] - + test_result = align.align_local_gapped( seq1, seq2, matrix, seed, 
THRESHOLD, gap_penalty, MAX_NUMBER, "both", score_only @@ -171,7 +170,7 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise): max_table_size = 1_000_000_000 # Align a long random sequence to itself, - # effectively resulting in a global alignment + # effectively resulting in a global alignment np.random.seed(0) seq1 = seq.NucleotideSequence() seq1.code = np.random.randint(len(seq1.alphabet), size=10000) diff --git a/tests/sequence/align/test_multiple.py b/tests/sequence/align/test_multiple.py index 3a5a470c9..2dd4f757a 100644 --- a/tests/sequence/align/test_multiple.py +++ b/tests/sequence/align/test_multiple.py @@ -7,7 +7,6 @@ import biotite.application.muscle as muscle from biotite.application import VersionError from ...util import is_not_installed -from .util import sequences @@ -26,14 +25,14 @@ def test_align_multiple(sequences, gap_penalty): score of the MUSCLE alignment. """ matrix = align.SubstitutionMatrix.std_protein_matrix() - + test_alignment, order, tree, distances = align.align_multiple( sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True ) test_score = align.score( test_alignment, matrix, gap_penalty, terminal_penalty=True ) - + try: ref_alignment = muscle.MuscleApp.align( sequences, matrix=matrix, gap_penalty=gap_penalty @@ -43,5 +42,5 @@ def test_align_multiple(sequences, gap_penalty): ref_score = align.score( ref_alignment, matrix, gap_penalty, terminal_penalty=True ) - + assert test_score >= ref_score * 0.5 \ No newline at end of file diff --git a/tests/sequence/align/test_pairwise.py b/tests/sequence/align/test_pairwise.py index 727f8fc2f..38c93ffb9 100644 --- a/tests/sequence/align/test_pairwise.py +++ b/tests/sequence/align/test_pairwise.py @@ -10,7 +10,6 @@ import biotite.application.muscle as muscle from biotite.application import VersionError from ...util import is_not_installed -from .util import sequences def test_align_ungapped(): diff --git a/tests/sequence/align/test_statistics.py 
b/tests/sequence/align/test_statistics.py index b9defcc46..c63b5513c 100644 --- a/tests/sequence/align/test_statistics.py +++ b/tests/sequence/align/test_statistics.py @@ -7,7 +7,6 @@ import biotite.sequence as seq import biotite.sequence.align as align from biotite.sequence.align.statistics import EValueEstimator -from .util import sequences BACKGROUND = np.array(list({ @@ -55,7 +54,7 @@ def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k): """ SAMPLE_LENGTH = 500 SAMPLE_SIZE = 1000 - + alphabet = seq.ProteinSequence.alphabet matrix = align.SubstitutionMatrix(alphabet, alphabet, matrix_name) @@ -185,6 +184,6 @@ def test_invalid_scoring_scheme(): ) # Uniform background frequencies freq = np.ones(len(alph)) - + with pytest.raises(ValueError): estimator = EValueEstimator.from_samples(alph, matrix, -10, freq) \ No newline at end of file From 0937591940807234efbeadf1c7e4252946c265fe Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 3 Jul 2024 10:13:38 +0200 Subject: [PATCH 5/9] Force import of attributes required for evaluation of `repr()` strings --- tests/test_repr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_repr.py b/tests/test_repr.py index 5f9714af8..0d986a38f 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -14,7 +14,7 @@ from biotite.sequence.align import Alignment from biotite.structure import Atom import numpy as np -from numpy import float32, int32 +from numpy import float32, int32 # noqa: F401 from biotite.sequence import CodonTable from biotite.sequence.align import SubstitutionMatrix from biotite.sequence import SequenceProfile From f81e561cad07a74135ad1a3da4e76aa665151d3f Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 3 Jul 2024 10:15:08 +0200 Subject: [PATCH 6/9] Reformat code base Uses `ruff format` and `ruff check --fix` --- doc/apidoc.py | 121 ++--- doc/bibliography.py | 32 +- doc/conf.py | 117 ++--- .../scripts/sequence/annotation/operon_map.py | 44 +- 
.../sequence/annotation/plasmid_map.py | 38 +- .../sequence/annotation/plasmid_map_custom.py | 189 +++---- .../annotation/region_visualization.py | 35 +- .../sequence/annotation/sigma_domains.py | 125 ++--- .../sequence/homology/avidin_alignment.py | 23 +- .../sequence/homology/bionigma_alignment.py | 350 +++++++------ .../sequence/homology/genome_comparison.py | 170 ++++--- .../sequence/homology/genome_search.py | 87 ++-- .../sequence/homology/gpcr_evolution.py | 33 +- .../sequence/homology/hcn_hydropathy.py | 119 +++-- .../sequence/homology/hcn_similarity.py | 69 +-- .../scripts/sequence/homology/homolog_msa.py | 19 +- .../sequence/homology/lexa_conservation.py | 46 +- .../sequence/homology/luxa_comparison.py | 24 +- .../sequence/homology/pi3k_alignment.py | 91 ++-- .../scripts/sequence/homology/plotepiscan.py | 419 ++++++++-------- .../sequence/homology/residue_coevolution.py | 64 ++- .../homology/thca_synthase_polymorphism.py | 43 +- .../sequence/misc/blosum_dendrogram.py | 26 +- .../scripts/sequence/misc/codon_usage.py | 15 +- .../scripts/sequence/misc/color_schemes.py | 48 +- .../sequence/misc/color_schemes_protein.py | 51 +- .../misc/local_alignment_statistics.py | 174 ++++--- .../sequence/misc/orf_identification.py | 18 +- .../scripts/sequence/profile/anderson_logo.py | 56 ++- .../sequence/profile/rbs_identification.py | 49 +- .../sequence/sequencing/gene_counts.py | 79 +-- .../sequence/sequencing/genome_assembly.py | 262 +++++----- .../sequence/sequencing/quality_control.py | 86 ++-- .../sequence/sequencing/read_quality.py | 16 +- .../structure/contacts/adjacency_matrix.py | 9 +- .../structure/contacts/contact_sites.py | 15 +- .../structure/contacts/contact_sites_pymol.py | 54 +- .../structure/contacts/disulfide_bonds.py | 91 ++-- .../structure/contacts/domain_hbonds.py | 28 +- .../scripts/structure/contacts/leaflet.py | 36 +- .../structure/contacts/leaflet_pymol.py | 23 +- .../structure/misc/biological_assembly.py | 7 +- 
.../misc/biological_assembly_pymol.py | 8 +- .../scripts/structure/misc/diameter.py | 8 +- .../scripts/structure/misc/gap_bars.py | 22 +- .../structure/misc/glycan_visualization.py | 100 ++-- .../structure/misc/homolog_superimposition.py | 11 +- .../misc/homolog_superimposition_pymol.py | 12 +- .../scripts/structure/misc/pdb_statistics.py | 54 +- .../scripts/structure/modeling/docking.py | 22 +- .../structure/modeling/docking_pymol.py | 16 +- .../scripts/structure/modeling/md_analysis.py | 16 +- .../structure/modeling/mmtf_trajectory.py | 22 +- .../structure/modeling/normal_modes.py | 21 +- .../structure/modeling/normal_modes_pymol.py | 35 +- .../structure/modeling/rotamer_library.py | 44 +- .../structure/modeling/solvation_shells.py | 42 +- .../structure/modeling/trajectory_sse.py | 47 +- .../modeling/water_exchange_noexec.py | 62 +-- .../structure/molecule/alkane_isomers.py | 42 +- .../molecule/molecular_visualization.py | 34 +- .../structure/molecule/peoe_visualization.py | 58 ++- .../structure/nucleotide/base_pairs.py | 33 +- .../structure/nucleotide/leontis_westhof.py | 50 +- .../structure/nucleotide/transfer_rnas.py | 65 +-- .../structure/nucleotide/watson_crick.py | 66 ++- .../scripts/structure/protein/pb_alignment.py | 49 +- .../structure/protein/peptide_assembly.py | 116 ++--- .../protein/peptide_assembly_pymol.py | 21 +- .../scripts/structure/protein/ramachandran.py | 25 +- .../structure/protein/residue_chirality.py | 17 +- .../structure/protein/sheet_arrangement.py | 198 ++++---- .../structure/protein/transketolase_sse.py | 124 +++-- doc/key.py | 2 +- doc/scraper.py | 32 +- doc/switcher.py | 42 +- doc/viewcode.py | 52 +- setup_ccd.py | 374 +++++++------- src/biotite/__init__.py | 4 +- src/biotite/application/__init__.py | 2 +- src/biotite/application/application.py | 30 +- src/biotite/application/autodock/__init__.py | 2 +- src/biotite/application/autodock/app.py | 149 +++--- src/biotite/application/blast/__init__.py | 2 +- 
src/biotite/application/blast/alignment.py | 27 +- src/biotite/application/blast/webapp.py | 175 +++---- src/biotite/application/clustalo/__init__.py | 2 +- src/biotite/application/clustalo/app.py | 101 ++-- src/biotite/application/dssp/__init__.py | 2 +- src/biotite/application/dssp/app.py | 17 +- src/biotite/application/localapp.py | 108 ++-- src/biotite/application/mafft/__init__.py | 2 +- src/biotite/application/mafft/app.py | 36 +- src/biotite/application/msaapp.py | 165 +++--- src/biotite/application/muscle/__init__.py | 2 +- src/biotite/application/muscle/app3.py | 107 ++-- src/biotite/application/muscle/app5.py | 50 +- src/biotite/application/sra/__init__.py | 2 +- src/biotite/application/sra/app.py | 121 +++-- src/biotite/application/tantan/__init__.py | 2 +- src/biotite/application/tantan/app.py | 63 +-- src/biotite/application/util.py | 14 +- .../application/viennarna/rnaalifold.py | 56 ++- src/biotite/application/viennarna/rnafold.py | 26 +- src/biotite/application/viennarna/rnaplot.py | 55 +- src/biotite/application/viennarna/util.py | 27 +- src/biotite/application/webapp.py | 25 +- src/biotite/copyable.py | 26 +- src/biotite/database/__init__.py | 2 +- src/biotite/database/entrez/__init__.py | 2 +- src/biotite/database/entrez/check.py | 3 +- src/biotite/database/entrez/dbnames.py | 2 +- src/biotite/database/entrez/download.py | 98 ++-- src/biotite/database/entrez/key.py | 2 +- src/biotite/database/entrez/query.py | 79 ++- src/biotite/database/error.py | 3 +- src/biotite/database/pubchem/__init__.py | 2 +- src/biotite/database/pubchem/download.py | 86 ++-- src/biotite/database/pubchem/error.py | 4 +- src/biotite/database/pubchem/query.py | 61 +-- src/biotite/database/pubchem/throttle.py | 7 +- src/biotite/database/rcsb/__init__.py | 2 +- src/biotite/database/rcsb/download.py | 85 ++-- src/biotite/database/rcsb/query.py | 157 +++--- src/biotite/database/uniprot/check.py | 7 +- src/biotite/database/uniprot/download.py | 15 +- 
src/biotite/database/uniprot/query.py | 144 ++++-- src/biotite/file.py | 17 +- src/biotite/sequence/__init__.py | 6 +- src/biotite/sequence/align/__init__.py | 4 +- src/biotite/sequence/align/alignment.py | 91 ++-- src/biotite/sequence/align/buckets.py | 22 +- src/biotite/sequence/align/cigar.py | 85 ++-- src/biotite/sequence/align/matrix.py | 163 +++--- src/biotite/sequence/align/statistics.py | 61 ++- src/biotite/sequence/alphabet.py | 85 ++-- src/biotite/sequence/annotation.py | 151 +++--- src/biotite/sequence/codon.py | 158 +++--- src/biotite/sequence/graphics/__init__.py | 2 +- src/biotite/sequence/graphics/alignment.py | 277 ++++++---- src/biotite/sequence/graphics/colorschemes.py | 20 +- src/biotite/sequence/graphics/dendrogram.py | 113 +++-- src/biotite/sequence/graphics/features.py | 232 +++++---- src/biotite/sequence/graphics/logo.py | 26 +- src/biotite/sequence/graphics/plasmid.py | 397 ++++++++------- src/biotite/sequence/io/fasta/__init__.py | 2 +- src/biotite/sequence/io/fasta/convert.py | 42 +- src/biotite/sequence/io/fasta/file.py | 97 ++-- src/biotite/sequence/io/fastq/__init__.py | 2 +- src/biotite/sequence/io/fastq/convert.py | 23 +- src/biotite/sequence/io/fastq/file.py | 94 ++-- src/biotite/sequence/io/genbank/__init__.py | 4 +- src/biotite/sequence/io/genbank/annotation.py | 26 +- src/biotite/sequence/io/genbank/file.py | 146 +++--- src/biotite/sequence/io/genbank/metadata.py | 134 ++--- src/biotite/sequence/io/genbank/sequence.py | 21 +- src/biotite/sequence/io/general.py | 47 +- src/biotite/sequence/io/gff/__init__.py | 4 +- src/biotite/sequence/io/gff/convert.py | 25 +- src/biotite/sequence/io/gff/file.py | 144 +++--- src/biotite/sequence/phylo/__init__.py | 2 +- src/biotite/sequence/profile.py | 79 ++- src/biotite/sequence/search.py | 32 +- src/biotite/sequence/seqtypes.py | 361 ++++++++------ src/biotite/sequence/sequence.py | 30 +- src/biotite/structure/__init__.py | 4 +- src/biotite/structure/atoms.py | 450 ++++++++--------- 
src/biotite/structure/basepairs.py | 471 +++++++++--------- src/biotite/structure/box.py | 128 +++-- src/biotite/structure/chains.py | 85 ++-- src/biotite/structure/compare.py | 60 +-- src/biotite/structure/density.py | 29 +- src/biotite/structure/dotbracket.py | 36 +- src/biotite/structure/error.py | 12 +- src/biotite/structure/filter.py | 110 ++-- src/biotite/structure/geometry.py | 129 ++--- src/biotite/structure/graphics/atoms.py | 103 ++-- src/biotite/structure/graphics/rna.py | 147 +++--- src/biotite/structure/hbond.py | 201 ++++---- src/biotite/structure/info/__init__.py | 2 - src/biotite/structure/info/atoms.py | 5 +- src/biotite/structure/info/bonds.py | 20 +- src/biotite/structure/info/ccd.py | 7 +- src/biotite/structure/info/groups.py | 4 +- src/biotite/structure/info/masses.py | 11 +- src/biotite/structure/info/radii.py | 10 +- src/biotite/structure/info/standardize.py | 39 +- src/biotite/structure/integrity.py | 28 +- src/biotite/structure/io/__init__.py | 2 +- src/biotite/structure/io/dcd/__init__.py | 2 +- src/biotite/structure/io/dcd/file.py | 37 +- src/biotite/structure/io/general.py | 42 +- src/biotite/structure/io/gro/__init__.py | 2 +- src/biotite/structure/io/gro/file.py | 139 +++--- src/biotite/structure/io/mol/__init__.py | 2 +- src/biotite/structure/io/mol/convert.py | 15 +- src/biotite/structure/io/mol/ctab.py | 67 +-- src/biotite/structure/io/mol/header.py | 24 +- src/biotite/structure/io/mol/mol.py | 16 +- src/biotite/structure/io/mol/sdf.py | 82 ++- src/biotite/structure/io/netcdf/__init__.py | 2 +- src/biotite/structure/io/netcdf/file.py | 42 +- src/biotite/structure/io/pdb/__init__.py | 2 +- src/biotite/structure/io/pdb/convert.py | 52 +- src/biotite/structure/io/pdb/file.py | 309 ++++++------ src/biotite/structure/io/pdbqt/__init__.py | 2 +- src/biotite/structure/io/pdbqt/convert.py | 28 +- src/biotite/structure/io/pdbqt/file.py | 196 +++++--- src/biotite/structure/io/pdbx/__init__.py | 4 +- src/biotite/structure/io/pdbx/bcif.py | 74 
++- src/biotite/structure/io/pdbx/cif.py | 73 +-- src/biotite/structure/io/pdbx/component.py | 20 +- src/biotite/structure/io/pdbx/convert.py | 388 +++++++-------- src/biotite/structure/io/tng/__init__.py | 2 +- src/biotite/structure/io/tng/file.py | 22 +- src/biotite/structure/io/trajfile.py | 146 +++--- src/biotite/structure/io/trr/__init__.py | 2 +- src/biotite/structure/io/trr/file.py | 22 +- src/biotite/structure/io/xtc/__init__.py | 2 +- src/biotite/structure/io/xtc/file.py | 20 +- src/biotite/structure/mechanics.py | 16 +- src/biotite/structure/molecules.py | 3 +- src/biotite/structure/pseudoknots.py | 120 ++--- src/biotite/structure/rdf.py | 38 +- src/biotite/structure/repair.py | 151 +++++- src/biotite/structure/residues.py | 31 +- src/biotite/structure/resutil.py | 45 +- src/biotite/structure/sequence.py | 17 +- src/biotite/structure/sse.py | 152 +++--- src/biotite/structure/superimpose.py | 78 ++- src/biotite/structure/transform.py | 166 +++--- src/biotite/structure/util.py | 24 +- src/biotite/visualize.py | 120 +++-- tests/application/test_autodock.py | 23 +- tests/application/test_blast.py | 69 +-- tests/application/test_dssp.py | 3 +- tests/application/test_msa.py | 95 ++-- tests/application/test_rnaalifold.py | 33 +- tests/application/test_rnafold.py | 48 +- tests/application/test_rnaplot.py | 25 +- tests/application/test_sra.py | 16 +- tests/application/test_tantan.py | 26 +- tests/conftest.py | 9 +- tests/database/test_entrez.py | 36 +- tests/database/test_pubchem.py | 80 +-- tests/database/test_rcsb.py | 288 +++++------ tests/database/test_uniprot.py | 53 +- tests/sequence/align/conftest.py | 2 +- tests/sequence/align/test_alignment.py | 33 +- tests/sequence/align/test_banded.py | 126 +++-- tests/sequence/align/test_cigar.py | 54 +- tests/sequence/align/test_kmeralphabet.py | 26 +- tests/sequence/align/test_kmersimilarity.py | 20 +- tests/sequence/align/test_kmertable.py | 218 +++----- tests/sequence/align/test_localgapped.py | 117 +++-- 
tests/sequence/align/test_localungapped.py | 107 ++-- tests/sequence/align/test_matrix.py | 16 +- tests/sequence/align/test_multiple.py | 22 +- tests/sequence/align/test_pairwise.py | 112 +++-- tests/sequence/align/test_permutation.py | 30 +- tests/sequence/align/test_selector.py | 66 +-- tests/sequence/align/test_statistics.py | 138 ++--- tests/sequence/test_alphabet.py | 105 ++-- tests/sequence/test_annotation.py | 87 ++-- tests/sequence/test_codon.py | 33 +- tests/sequence/test_fasta.py | 95 ++-- tests/sequence/test_fastq.py | 71 ++- tests/sequence/test_genbank.py | 126 +++-- tests/sequence/test_generalio.py | 36 +- tests/sequence/test_gff.py | 68 ++- tests/sequence/test_graphics.py | 22 +- tests/sequence/test_phylo.py | 213 ++++---- tests/sequence/test_profile.py | 151 ++++-- tests/sequence/test_search.py | 11 +- tests/sequence/test_seqtypes.py | 37 +- tests/sequence/test_sequence.py | 23 +- .../create_bond_orientation_test_data.py | 41 +- .../create_interacting_edge_test_data.py | 44 +- .../structure/data/create_test_structures.py | 62 ++- .../data/molecules/create_v3000_sdf.py | 2 +- tests/structure/test_atoms.py | 112 +++-- tests/structure/test_basepairs.py | 114 ++--- tests/structure/test_bonds.py | 232 +++++---- tests/structure/test_box.py | 118 ++--- tests/structure/test_celllist.py | 49 +- tests/structure/test_chains.py | 23 +- tests/structure/test_charges.py | 347 ++++++------- tests/structure/test_compare.py | 173 +++++-- tests/structure/test_density.py | 55 +- tests/structure/test_dotbracket.py | 88 ++-- tests/structure/test_filter.py | 171 ++++--- tests/structure/test_generalio.py | 67 +-- tests/structure/test_geometry.py | 154 +++--- tests/structure/test_gro.py | 52 +- tests/structure/test_hbond.py | 58 +-- tests/structure/test_info.py | 33 +- tests/structure/test_integrity.py | 31 +- tests/structure/test_mechanics.py | 56 ++- tests/structure/test_mol.py | 89 ++-- tests/structure/test_molecules.py | 47 +- tests/structure/test_pdb.py | 159 +++--- 
tests/structure/test_pdbqt.py | 17 +- tests/structure/test_pdbx.py | 145 +++--- tests/structure/test_pseudoknots.py | 42 +- tests/structure/test_rdf.py | 143 +++--- tests/structure/test_repair.py | 56 +-- tests/structure/test_residues.py | 49 +- tests/structure/test_sasa.py | 54 +- tests/structure/test_sequence.py | 11 +- tests/structure/test_sse.py | 33 +- tests/structure/test_superimpose.py | 101 ++-- tests/structure/test_trajectory.py | 88 ++-- tests/structure/test_transform.py | 98 ++-- tests/test_doctest.py | 170 +++---- tests/test_init.py | 3 +- tests/test_modname.py | 20 +- tests/test_repr.py | 114 +++-- tests/test_version.py | 2 +- tests/util.py | 12 +- 319 files changed, 11310 insertions(+), 10813 deletions(-) diff --git a/doc/apidoc.py b/doc/apidoc.py index 5bc412333..07c152620 100644 --- a/doc/apidoc.py +++ b/doc/apidoc.py @@ -5,15 +5,14 @@ __author__ = "Patrick Kunzmann" __all__ = ["create_api_doc", "skip_non_methods"] -from os.path import join, isdir -from os import listdir, makedirs -from importlib import import_module -import types -import json import enum -from textwrap import dedent +import json +import types from collections import OrderedDict - +from importlib import import_module +from os import listdir, makedirs +from os.path import isdir, join +from textwrap import dedent _INDENT = " " * 4 @@ -24,7 +23,6 @@ _pck_categories = json.load(file, object_pairs_hook=OrderedDict) - def create_api_doc(src_path, doc_path): """ Create *.rst files for API documentation. 
@@ -40,11 +38,7 @@ def create_api_doc(src_path, doc_path): # Create directory to store apidoc if not isdir(doc_path): makedirs(doc_path) - package_list = _create_package_doc( - "biotite", - join(src_path, "biotite"), - doc_path - ) + package_list = _create_package_doc("biotite", join(src_path, "biotite"), doc_path) _create_package_index(doc_path, package_list) @@ -67,19 +61,24 @@ def _create_package_doc(pck, src_path, doc_path): module = import_module(pck) attr_list = dir(module) # Classify attribute names into classes and functions - class_list = [attr for attr in attr_list - # Do not document private classes - if attr[0] != "_" - # Check if object is a class - and isinstance(getattr(module, attr), type)] - func_list = [attr for attr in attr_list - # Do not document private classes - if attr[0] != "_" - # All functions are callable... - and callable(getattr(module, attr)) - # ...but classes are also callable - and attr not in class_list - ] + class_list = [ + attr + for attr in attr_list + # Do not document private classes + if attr[0] != "_" + # Check if object is a class + and isinstance(getattr(module, attr), type) + ] + func_list = [ + attr + for attr in attr_list + # Do not document private classes + if attr[0] != "_" + # All functions are callable... 
+ and callable(getattr(module, attr)) + # ...but classes are also callable + and attr not in class_list + ] # Create *.rst files _create_package_page(doc_path, pck, class_list, func_list, sub_pck) for class_name in class_list: @@ -87,11 +86,10 @@ def _create_package_doc(pck, src_path, doc_path): for function_name in func_list: _create_function_page(doc_path, pck, function_name) - return([pck] + sub_pck) + return [pck] + sub_pck -def _create_package_page(doc_path, package_name, - classes, functions, subpackages): +def _create_package_page(doc_path, package_name, classes, functions, subpackages): attributes = classes + functions # Get categories for this package @@ -114,7 +112,6 @@ def _create_package_page(doc_path, package_name, misc_category_name = "Miscellaneous" if categories else "Content" categories[misc_category_name] = misc_attributes - # String for categorized class and function enumeration category_strings = [] for category, attrs in categories.items(): @@ -135,12 +132,11 @@ def _create_package_page(doc_path, package_name, attributes_string = "\n".join(category_strings) # String for subpackage enumeration - subpackages_string = "\n".join( - [_INDENT + pck for pck in subpackages] - ) + subpackages_string = "\n".join([_INDENT + pck for pck in subpackages]) # Assemble page - file_content = dedent(f""" + file_content = ( + dedent(f""" ``{package_name}`` {"=" * (len(package_name) + 4)} @@ -150,16 +146,21 @@ def _create_package_page(doc_path, package_name, .. currentmodule:: {package_name} - """) + attributes_string + """) + + attributes_string + ) if len(subpackages) > 0: - file_content += dedent(f""" + file_content += ( + dedent(""" Subpackages ----------- .. 
autosummary:: - """) + subpackages_string + """) + + subpackages_string + ) with open(join(doc_path, f"{package_name}.rst"), "w") as f: f.write(file_content) @@ -201,18 +202,19 @@ def _create_function_page(doc_path, package_name, function_name): def _create_package_index(doc_path, package_list): # String for package enumeration - packages_string = "\n".join( - [_INDENT + pck for pck in sorted(package_list)] - ) + packages_string = "\n".join([_INDENT + pck for pck in sorted(package_list)]) - file_content = dedent(f""" + file_content = ( + dedent(""" API Reference ============= .. autosummary:: :toctree: - """) + packages_string + """) + + packages_string + ) with open(join(doc_path, "index.rst"), "w") as f: f.write(file_content) @@ -249,20 +251,21 @@ def _is_relevant_type(obj): # These are some special built-in Python methods return False return ( - # Functions - type(obj) in [ - types.FunctionType, types.BuiltinFunctionType, types.MethodType - ] - ) | ( - # Functions from C-extensions - type(obj).__name__ in [ - "cython_function_or_method", - "fused_cython_function" - ] - ) | ( - # Enum instance - isinstance(obj, enum.Enum) - ) | ( - # Inner class - isinstance(obj, type) - ) \ No newline at end of file + ( + # Functions + type(obj) + in [types.FunctionType, types.BuiltinFunctionType, types.MethodType] + ) + | ( + # Functions from C-extensions + type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"] + ) + | ( + # Enum instance + isinstance(obj, enum.Enum) + ) + | ( + # Inner class + isinstance(obj, type) + ) + ) diff --git a/doc/bibliography.py b/doc/bibliography.py index 9c0bc4831..2d9093adf 100644 --- a/doc/bibliography.py +++ b/doc/bibliography.py @@ -5,14 +5,14 @@ __author__ = "Patrick Kunzmann" import warnings -from pybtex.richtext import Text, Tag, HRef +from pybtex.richtext import HRef, Tag, Text from pybtex.style.formatting import BaseStyle class IEEEStyle(BaseStyle): def format_article(self, param): entry = param["entry"] - + try: 
authors = [] for author in entry.persons["author"]: @@ -28,7 +28,7 @@ def format_article(self, param): text += " " text += " ".join([s for s in author.last_names]) authors.append(Text(text + ", ")) - + title = "" in_protected = False for char in entry.fields["title"]: @@ -46,34 +46,34 @@ def format_article(self, param): else: title += char.lower() title = Text('"', title, '," ') - + journal = Text(Tag("em", entry.fields["journal"]), ", ") - + if "volume" in entry.fields: volume = Text("vol. ", entry.fields["volume"], ", ") else: volume = Text() - + if "pages" in entry.fields: pages = Text("pp. ", entry.fields["pages"], ", ") else: pages = Text() - + date = entry.fields["year"] if "month" in entry.fields: date = entry.fields["month"] + " " + date date = Text(date, ". ") - - if "doi" in entry.fields: - doi = Text("doi: ", HRef( - "https://doi.org/" + entry.fields["doi"], - entry.fields["doi"] - )) + + if "doi" in entry.fields: + doi = Text( + "doi: ", + HRef("https://doi.org/" + entry.fields["doi"], entry.fields["doi"]), + ) else: doi = Text() - + return Text(*authors, title, journal, volume, pages, date, doi) - + except: warnings.warn(f"Invalid BibTeX entry '{entry.key}'") - return Text(entry.key) \ No newline at end of file + return Text(entry.key) diff --git a/doc/conf.py b/doc/conf.py index d3f6e53c0..e3a785800 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -5,24 +5,21 @@ __author__ = "Patrick Kunzmann" # Setup Cython for import of uncompiled *.pyx files -import pyximport import numpy as np +import pyximport + pyximport.install( - setup_args={'include_dirs': np.get_include()}, - build_in_temp=False, - language_level=3 + setup_args={"include_dirs": np.get_include()}, build_in_temp=False, language_level=3 ) -from os.path import realpath, dirname, join import sys import warnings -import pybtex -from sphinx_gallery.sorting import FileNameSortKey, ExplicitOrder +from os.path import dirname, join, realpath import matplotlib - +import pybtex +from sphinx_gallery.sorting 
import ExplicitOrder, FileNameSortKey import biotite - BIOTITE_DOMAIN = "www.biotite-python.org" DOC_PATH = dirname(realpath(__file__)) PACKAGE_PATH = join(dirname(DOC_PATH), "src") @@ -32,28 +29,21 @@ # in order to import modules for API doc generation etc. sys.path.insert(0, DOC_PATH) import apidoc -import viewcode -import scraper import bibliography import key +import scraper import switcher - +import viewcode # Reset matplotlib params matplotlib.rcdefaults() # Pregeneration of files apidoc.create_api_doc(PACKAGE_PATH, join(DOC_PATH, "apidoc")) -switcher.create_switcher_json( - join("static", "switcher.json"), - "v0.41.0", - n_versions=5 -) +switcher.create_switcher_json(join("static", "switcher.json"), "v0.41.0", n_versions=5) # Use custom citation style -pybtex.plugin.register_plugin( - "pybtex.style.formatting", "ieee", bibliography.IEEEStyle -) +pybtex.plugin.register_plugin("pybtex.style.formatting", "ieee", bibliography.IEEEStyle) #### Source code link ### @@ -61,14 +51,13 @@ #### General #### -import warnings # Removed standard matplotlib warning when generating gallery warnings.filterwarnings( "ignore", category=UserWarning, message="Matplotlib is currently using agg, which is a non-GUI backend, " - "so cannot show the figure." 
+ "so cannot show the figure.", ) extensions = [ @@ -127,10 +116,7 @@ html_theme = "pydata_sphinx_theme" html_static_path = ["static"] -html_css_files = [ - "biotite.css", - "fonts.css" -] +html_css_files = ["biotite.css", "fonts.css"] html_title = "Biotite" html_logo = "static/assets/general/biotite_logo.svg" html_favicon = "static/assets/general/biotite_icon_32p.png" @@ -162,11 +148,11 @@ "url": "https://biotite.bsky.social", "icon": "fa-brands fa-bluesky", "type": "fontawesome", - } - ], - "use_edit_page_button": True, - "show_prev_next": False, - "show_toc_level": 2, + }, + ], + "use_edit_page_button": True, + "show_prev_next": False, + "show_toc_level": 2, } html_sidebars = { # No primary sidebar for these pages @@ -183,53 +169,50 @@ } sphinx_gallery_conf = { - "examples_dirs" : [ - "examples/scripts/sequence", - "examples/scripts/structure" - ], - "gallery_dirs" : [ - "examples/gallery/sequence", - "examples/gallery/structure" - ], - "subsection_order": ExplicitOrder([ - "examples/scripts/sequence/homology", - "examples/scripts/sequence/sequencing", - "examples/scripts/sequence/profile", - "examples/scripts/sequence/annotation", - "examples/scripts/sequence/misc", - "examples/scripts/structure/protein", - "examples/scripts/structure/nucleotide", - "examples/scripts/structure/molecule", - "examples/scripts/structure/contacts", - "examples/scripts/structure/modeling", - "examples/scripts/structure/misc", - ]), - "within_subsection_order" : FileNameSortKey, + "examples_dirs": ["examples/scripts/sequence", "examples/scripts/structure"], + "gallery_dirs": ["examples/gallery/sequence", "examples/gallery/structure"], + "subsection_order": ExplicitOrder( + [ + "examples/scripts/sequence/homology", + "examples/scripts/sequence/sequencing", + "examples/scripts/sequence/profile", + "examples/scripts/sequence/annotation", + "examples/scripts/sequence/misc", + "examples/scripts/structure/protein", + "examples/scripts/structure/nucleotide", + 
"examples/scripts/structure/molecule", + "examples/scripts/structure/contacts", + "examples/scripts/structure/modeling", + "examples/scripts/structure/misc", + ] + ), + "within_subsection_order": FileNameSortKey, # Do not run example scripts with a trailing '_noexec' - "filename_pattern" : "^((?!_noexec).)*$", - "ignore_pattern" : "(.*ignore\.py)|(.*pymol\.py)", - "backreferences_dir" : None, - "download_all_examples" : False, + "filename_pattern": "^((?!_noexec).)*$", + "ignore_pattern": r"(.*ignore\.py)|(.*pymol\.py)", + "backreferences_dir": None, + "download_all_examples": False, # Never report run time - "min_reported_time" : sys.maxsize, - "default_thumb_file" : join( + "min_reported_time": sys.maxsize, + "default_thumb_file": join( DOC_PATH, "static/assets/general/biotite_icon_thumb.png" ), - "image_scrapers" : ( + "image_scrapers": ( "matplotlib", scraper.static_image_scraper, - scraper.pymol_scraper + scraper.pymol_scraper, ), - "matplotlib_animations" : True, - "backreferences_dir" : "examples/backreferences", - "doc_module" : ("biotite",), + "matplotlib_animations": True, + "backreferences_dir": "examples/backreferences", + "doc_module": ("biotite",), # Set the NCBI API key - "reset_modules" : (key.set_ncbi_api_key_from_env,), - "remove_config_comments" : True, + "reset_modules": (key.set_ncbi_api_key_from_env,), + "remove_config_comments": True, } #### App setup #### + def setup(app): - app.connect("autodoc-skip-member", apidoc.skip_nonrelevant) \ No newline at end of file + app.connect("autodoc-skip-member", apidoc.skip_nonrelevant) diff --git a/doc/examples/scripts/sequence/annotation/operon_map.py b/doc/examples/scripts/sequence/annotation/operon_map.py index dcd730ee7..37be67652 100644 --- a/doc/examples/scripts/sequence/annotation/operon_map.py +++ b/doc/examples/scripts/sequence/annotation/operon_map.py @@ -10,31 +10,39 @@ # License: BSD 3 clause import matplotlib.pyplot as plt -from biotite.sequence import Annotation, Feature, Location import 
biotite.sequence.graphics as graphics +from biotite.sequence import Annotation, Feature, Location strand = Location.Strand.FORWARD -prom = Feature("regulatory", [Location(10, 50, strand)], - {"regulatory_class" : "promoter", - "note" : "T7"}) -rbs1 = Feature("regulatory", [Location(60, 75, strand)], - {"regulatory_class" : "ribosome_binding_site", - "note" : "RBS1"}) -gene1 = Feature("gene", [Location(81, 380, strand)], - {"gene" : "gene1"}) -rbs2 = Feature("regulatory", [Location(400, 415, strand)], - {"regulatory_class" : "ribosome_binding_site", - "note" : "RBS2"}) -gene2 = Feature("gene", [Location(421, 1020, strand)], - {"gene" : "gene2"}) -term = Feature("regulatory", [Location(1050, 1080, strand)], - {"regulatory_class" : "terminator"}) +prom = Feature( + "regulatory", + [Location(10, 50, strand)], + {"regulatory_class": "promoter", "note": "T7"}, +) +rbs1 = Feature( + "regulatory", + [Location(60, 75, strand)], + {"regulatory_class": "ribosome_binding_site", "note": "RBS1"}, +) +gene1 = Feature("gene", [Location(81, 380, strand)], {"gene": "gene1"}) +rbs2 = Feature( + "regulatory", + [Location(400, 415, strand)], + {"regulatory_class": "ribosome_binding_site", "note": "RBS2"}, +) +gene2 = Feature("gene", [Location(421, 1020, strand)], {"gene": "gene2"}) +term = Feature( + "regulatory", [Location(1050, 1080, strand)], {"regulatory_class": "terminator"} +) annotation = Annotation([prom, rbs1, gene1, rbs2, gene2, term]) fig = plt.figure(figsize=(8.0, 0.8)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, multi_line=False, loc_range=(1, 1101), + ax, + annotation, + multi_line=False, + loc_range=(1, 1101), ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map.py b/doc/examples/scripts/sequence/annotation/plasmid_map.py index b25623bf4..bf4118352 100644 --- a/doc/examples/scripts/sequence/annotation/plasmid_map.py +++ 
b/doc/examples/scripts/sequence/annotation/plasmid_map.py @@ -18,26 +18,33 @@ # License: BSD 3 clause import io -import requests import matplotlib.pyplot as plt -import numpy as np +import requests import biotite -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb -PLASMID_URL = "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/"\ - "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/" \ - "addgene-plasmid-26092-sequence-12250.gbk" +PLASMID_URL = ( + "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/" + "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/" + "addgene-plasmid-26092-sequence-12250.gbk" +) response = requests.get(PLASMID_URL) gb_file = gb.GenBankFile.read(io.StringIO(response.text)) -annotation = gb.get_annotation(gb_file, include_only=[ - "promoter", "terminator", "protein_bind", - "RBS", "CDS", "rep_origin", "primer_bind" -]) +annotation = gb.get_annotation( + gb_file, + include_only=[ + "promoter", + "terminator", + "protein_bind", + "RBS", + "CDS", + "rep_origin", + "primer_bind", + ], +) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) # AddGene stores the plasmid name in the 'KEYWORDS' field # [0][0][0] -> @@ -69,8 +76,11 @@ def custom_feature_formatter(feature): fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="polar") graphics.plot_plasmid_map( - ax, annotation, plasmid_size=seq_length, - label=plasmid_name, feature_formatter=custom_feature_formatter + ax, + annotation, + plasmid_size=seq_length, + label=plasmid_name, + feature_formatter=custom_feature_formatter, ) fig.tight_layout() plt.show() diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py index 67fb87834..494b2d17e 100644 --- a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py +++ 
b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py @@ -5,129 +5,100 @@ .. currentmodule:: biotite.sequence This script shows how :class:`Feature` objects are displayed in a -plasmid map by using a custom 'toy' :class:`Annotation`. +plasmid map by using a custom 'toy' :class:`Annotation`. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import matplotlib.pyplot as plt -import numpy as np import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - -annotation = seq.Annotation([ - seq.Feature( - "source", - [seq.Location(0, 1500)], - {"organism": "Escherichia coli"} - ), - - # Ori - seq.Feature( - "rep_origin", - [seq.Location(600, 700, seq.Location.Strand.REVERSE)], - {"regulatory_class": "promoter", "note": "MyProm"} - ), - - # Promoter - seq.Feature( - "regulatory", - [seq.Location(1000, 1060)], - {"regulatory_class": "promoter", "note": "MyProm"} - ), - seq.Feature( - "protein_bind", - [seq.Location(1025, 1045)], - {"note": "repr"} - ), - - # Gene A - seq.Feature( - "regulatory", - [seq.Location(1070, 1080)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1091, 1150)], - {"product": "geneA"} - ), - - # Gene B - seq.Feature( - "regulatory", - [seq.Location(1180, 1190)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1201, 1350)], - {"product": "geneB"} - ), - seq.Feature( - "regulatory", - [seq.Location(1220, 1230)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1240, 1350)], - {"product": "geneB2"} - ), - - # Gene C - seq.Feature( - "regulatory", - [seq.Location(1380, 1390)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - # CDS extends over periodic boundary -> two locations - [seq.Location(1, 300), seq.Location(1402, 1500)], - {"product": "geneC"} - ), - - # Terminator - 
seq.Feature( - "regulatory", - [seq.Location(310, 350)], - {"regulatory_class": "terminator", "note": "MyTerm"} - ), - - # Primers - # The labels will be too long to be displayed on the map - # If you want to display them nevertheless, set the - # 'omit_oversized_labels' to False - seq.Feature( - "primer_bind", - [seq.Location(1385, 1405)], - {"note": "geneC"} - ), - seq.Feature( - "primer_bind", - [seq.Location(345, 365, seq.Location.Strand.REVERSE)], - {"note": "geneC_R"} - ), - - # Terminator - seq.Feature( - "regulatory", - [seq.Location(310, 350)], - {"regulatory_class": "terminator", "note": "MyTerm"} - ), -]) +annotation = seq.Annotation( + [ + seq.Feature( + "source", [seq.Location(0, 1500)], {"organism": "Escherichia coli"} + ), + # Ori + seq.Feature( + "rep_origin", + [seq.Location(600, 700, seq.Location.Strand.REVERSE)], + {"regulatory_class": "promoter", "note": "MyProm"}, + ), + # Promoter + seq.Feature( + "regulatory", + [seq.Location(1000, 1060)], + {"regulatory_class": "promoter", "note": "MyProm"}, + ), + seq.Feature("protein_bind", [seq.Location(1025, 1045)], {"note": "repr"}), + # Gene A + seq.Feature( + "regulatory", + [seq.Location(1070, 1080)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1091, 1150)], {"product": "geneA"}), + # Gene B + seq.Feature( + "regulatory", + [seq.Location(1180, 1190)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1201, 1350)], {"product": "geneB"}), + seq.Feature( + "regulatory", + [seq.Location(1220, 1230)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1240, 1350)], {"product": "geneB2"}), + # Gene C + seq.Feature( + "regulatory", + [seq.Location(1380, 1390)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature( + "CDS", + # CDS extends over periodic boundary -> two locations + [seq.Location(1, 300), seq.Location(1402, 1500)], + {"product": "geneC"}, + ), + # Terminator 
+ seq.Feature( + "regulatory", + [seq.Location(310, 350)], + {"regulatory_class": "terminator", "note": "MyTerm"}, + ), + # Primers + # The labels will be too long to be displayed on the map + # If you want to display them nevertheless, set the + # 'omit_oversized_labels' to False + seq.Feature("primer_bind", [seq.Location(1385, 1405)], {"note": "geneC"}), + seq.Feature( + "primer_bind", + [seq.Location(345, 365, seq.Location.Strand.REVERSE)], + {"note": "geneC_R"}, + ), + # Terminator + seq.Feature( + "regulatory", + [seq.Location(310, 350)], + {"regulatory_class": "terminator", "note": "MyTerm"}, + ), + ] +) fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="polar") graphics.plot_plasmid_map( - ax, annotation, plasmid_size=1500, label="My plasmid", - label_properties={"fontsize": 8} + ax, + annotation, + plasmid_size=1500, + label="My plasmid", + label_properties={"fontsize": 8}, ) ticks = ax.get_xticks() diff --git a/doc/examples/scripts/sequence/annotation/region_visualization.py b/doc/examples/scripts/sequence/annotation/region_visualization.py index 6bdd55455..09aa3b43c 100644 --- a/doc/examples/scripts/sequence/annotation/region_visualization.py +++ b/doc/examples/scripts/sequence/annotation/region_visualization.py @@ -9,16 +9,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq +import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb -import biotite.database.entrez as entrez -import numpy as np -import matplotlib.pyplot as plt # Download E. 
coli BL21 genome -file = entrez.fetch("CP001509", None, suffix="gb", - db_name="nuccore", ret_type="gb") +file = entrez.fetch("CP001509", None, suffix="gb", db_name="nuccore", ret_type="gb") gb_file = gb.GenBankFile.read(file) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) annotation = gb.get_annotation(gb_file, include_only=["gene"]) @@ -29,13 +26,15 @@ for loc in feature.locs: # Ignore if feature is only a pseudo-gene (e.g. gene fragment) # and check if feature is lacA gene (begin of lac operon) - if "gene" in feature.qual \ - and "pseudo" not in feature.qual \ - and feature.qual["gene"] == "lacA": - if min_loc > loc.first: - min_loc = loc.first - if max_loc < loc.last: - max_loc = loc.last + if ( + "gene" in feature.qual + and "pseudo" not in feature.qual + and feature.qual["gene"] == "lacA" + ): + if min_loc > loc.first: + min_loc = loc.first + if max_loc < loc.last: + max_loc = loc.last # Extend the location range by 1000 (arbitrary) in each direction min_loc -= 10000 max_loc += 10000 @@ -44,9 +43,13 @@ fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, loc_range=(min_loc, max_loc), symbols_per_line=2000, - show_numbers=True, show_line_position=True + ax, + annotation, + loc_range=(min_loc, max_loc), + symbols_per_line=2000, + show_numbers=True, + show_line_position=True, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/annotation/sigma_domains.py b/doc/examples/scripts/sequence/annotation/sigma_domains.py index 5d5a5ea36..d3c6b7365 100644 --- a/doc/examples/scripts/sequence/annotation/sigma_domains.py +++ b/doc/examples/scripts/sequence/annotation/sigma_domains.py @@ -11,37 +11,37 @@ import re from collections import OrderedDict -import numpy as np import matplotlib.pyplot as plt -from matplotlib.patches import Rectangle, FancyBboxPatch -import biotite.sequence as seq -import biotite.sequence.io.genbank as gb +import numpy as np +from 
matplotlib.patches import FancyBboxPatch, Rectangle import biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb # The names of the sigma factors and the corresponding genes -genes = OrderedDict({ - r"$\sigma^{70}$": "rpoD", - r"$\sigma^{24}$": "rpoE", - r"$\sigma^{28}$": "rpoF", - r"$\sigma^{32}$": "rpoH", - r"$\sigma^{38}$": "rpoS", -}) +genes = OrderedDict( + { + r"$\sigma^{70}$": "rpoD", + r"$\sigma^{24}$": "rpoE", + r"$\sigma^{28}$": "rpoF", + r"$\sigma^{32}$": "rpoH", + r"$\sigma^{38}$": "rpoS", + } +) # Find SwissProt entries for these genes in NCBI Entrez protein database uids = [] for name, gene in genes.items(): - query = entrez.SimpleQuery(gene, "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \ - & entrez.SimpleQuery("Escherichia coli K-12", "Organism") + query = ( + entrez.SimpleQuery(gene, "Gene Name") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + & entrez.SimpleQuery("Escherichia coli K-12", "Organism") + ) ids = entrez.search(query, "protein") # Only one entry per gene in E. coli K-12 is expected assert len(ids) == 1 uids += ids # Download corresponding GenBank files as single, merged file -file = entrez.fetch_single_file( - uids, None, "protein", ret_type="gb" -) +file = entrez.fetch_single_file(uids, None, "protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position @@ -55,53 +55,66 @@ # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file): _, length, _, _, _, _ = gb.get_locus(gb_file) - seq_lengths[i] = length + seq_lengths[i] = length annotation = gb.get_annotation(gb_file) # Find features, that represent a sigma factor domain for feature in annotation: - if feature.key == "Region" and "note" in feature.qual \ - and "Sigma-70 factor domain" in feature.qual["note"]: - # Extract the domain number - # and decrement for 0-based indexing - # - # e.g. 'Sigma-70 factor domain-2.' 
=> 1 - # ^ - domain_index = int(re.findall( - "(?<=Sigma-70 factor domain-)\d+", - feature.qual["note"] - )[0]) -1 - # Expect a single contiguous location of the domain - assert len(feature.locs) == 1 - loc = list(feature.locs)[0] - # Store first and last position of the domain - domain_pos[i, domain_index, :] = [loc.first, loc.last] + if ( + feature.key == "Region" + and "note" in feature.qual + and "Sigma-70 factor domain" in feature.qual["note"] + ): + # Extract the domain number + # and decrement for 0-based indexing + # + # e.g. 'Sigma-70 factor domain-2.' => 1 + # ^ + domain_index = ( + int( + re.findall( + r"(?<=Sigma-70 factor domain-)\d+", feature.qual["note"] + )[0] + ) + - 1 + ) + # Expect a single contiguous location of the domain + assert len(feature.locs) == 1 + loc = list(feature.locs)[0] + # Store first and last position of the domain + domain_pos[i, domain_index, :] = [loc.first, loc.last] fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.gca() # The color for each one of the four domains colors = ["firebrick", "forestgreen", "dodgerblue", "goldenrod"] # Draw each sequence -for i, (gene_name, domain_pos_for_gene, length) \ - in enumerate(zip(genes.keys(), domain_pos, seq_lengths)): - # Add base line representing the sequence itself - ax.add_patch(Rectangle( - (1, i-0.05), length, 0.1, color="gray" - )) - # Draw each domain - for j, ((first, last), color) \ - in enumerate(zip(domain_pos_for_gene, colors)): - if first != -1 and last != -1: - # FancyBboxPatch to get rounded corners in rectangle - ax.add_patch(FancyBboxPatch( - (first, i-0.4), last-first, 0.8, #color=color, - boxstyle="round,pad=0,rounding_size=10", - ec="black", fc=color, - mutation_aspect=0.02 - )) - ax.text( - x=(last+first)/2, y=i, s=fr"$\sigma_{j+1}$", - ha="center", va="center" - ) +for i, (gene_name, domain_pos_for_gene, length) in enumerate( + zip(genes.keys(), domain_pos, seq_lengths) +): + # Add base line representing the sequence itself + ax.add_patch(Rectangle((1, i - 0.05), 
length, 0.1, color="gray")) + # Draw each domain + for j, ((first, last), color) in enumerate(zip(domain_pos_for_gene, colors)): + if first != -1 and last != -1: + # FancyBboxPatch to get rounded corners in rectangle + ax.add_patch( + FancyBboxPatch( + (first, i - 0.4), + last - first, + 0.8, # color=color, + boxstyle="round,pad=0,rounding_size=10", + ec="black", + fc=color, + mutation_aspect=0.02, + ) + ) + ax.text( + x=(last + first) / 2, + y=i, + s=rf"$\sigma_{j+1}$", + ha="center", + va="center", + ) ax.set_xlim(0, max(seq_lengths)) ax.set_xlabel("Sequence position") # Inverted y-axis diff --git a/doc/examples/scripts/sequence/homology/avidin_alignment.py b/doc/examples/scripts/sequence/homology/avidin_alignment.py index 40b50083f..da67ff617 100644 --- a/doc/examples/scripts/sequence/homology/avidin_alignment.py +++ b/doc/examples/scripts/sequence/homology/avidin_alignment.py @@ -11,16 +11,16 @@ # License: BSD 3 clause import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.fasta as fasta -import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics +import biotite.sequence.io.fasta as fasta # Download and parse protein sequences of avidin and streptavidin -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - ["CAC34569", "ACL82594"], None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta") +) for name, sequence in fasta_file.items(): if "CAC34569" in name: avidin_seq = seq.ProteinSequence(sequence) @@ -31,16 +31,21 @@ matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized -alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix, - gap_penalty=(-10, -1), terminal_penalty=False) +alignments = align.align_optimal( + 
avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False +) # Draw first and only alignment # The color intensity indicates the similiarity fig = plt.figure(figsize=(8.0, 2.5)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"], - show_numbers=True, show_line_position=True + ax, + alignments[0], + matrix=matrix, + labels=["Avidin", "Streptavidin"], + show_numbers=True, + show_line_position=True, ) fig.tight_layout() diff --git a/doc/examples/scripts/sequence/homology/bionigma_alignment.py b/doc/examples/scripts/sequence/homology/bionigma_alignment.py index 9a3b8d1b5..4b7fe5be5 100644 --- a/doc/examples/scripts/sequence/homology/bionigma_alignment.py +++ b/doc/examples/scripts/sequence/homology/bionigma_alignment.py @@ -12,121 +12,132 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle from matplotlib.transforms import Bbox +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # The polygon coordinates for the different shapes -_hexagon_coord = np.array([ - (0.500, 0.000), - (0.937, 0.250), - (0.937, 0.750), - (0.500, 1.000), - (0.063, 0.750), - (0.063, 0.250) -]) - -_spiked_coord = np.array([ - (0.000, 0.000), - (0.500, 0.150), - (1.000, 0.000), - (0.850, 0.500), - (1.000, 1.000), - (0.500, 0.850), - (0.000, 1.000), - (0.150, 0.500), -]) - -_spiked_coord = np.array([ - (0.000, 0.000), - (0.500, 0.150), - (1.000, 0.000), - (0.850, 0.500), - (1.000, 1.000), - (0.500, 0.850), - (0.000, 1.000), - (0.150, 0.500), -]) - -_cross_coord = np.array([ - (0.220, 0.000), - (0.780, 0.000), - (0.780, 0.220), - (1.000, 
0.220), - (1.000, 0.780), - (0.780, 0.780), - (0.780, 1.000), - (0.220, 1.000), - (0.220, 0.780), - (0.000, 0.780), - (0.000, 0.220), - (0.220, 0.220), -]) - -_star_coord = np.array([ - (0.500, 0.000), - (0.648, 0.150), - (0.852, 0.150), - (0.852, 0.352), - (1.000, 0.500), - (0.852, 0.648), - (0.852, 0.852), - (0.648, 0.852), - (0.500, 1.000), - (0.352, 0.852), - (0.148, 0.852), - (0.148, 0.648), - (0.000, 0.500), - (0.148, 0.352), - (0.148, 0.148), - (0.352, 0.148), -]) - -_hourglass_coord = np.array([ - (0.000, 0.000), - (1.000, 0.000), - (1.000, 0.220), - (0.740, 0.420), - (0.740, 0.580), - (1.000, 0.780), - (1.000, 1.000), - (0.000, 1.000), - (0.000, 0.780), - (0.260, 0.580), - (0.260, 0.420), - (0.000, 0.220), -]) +_hexagon_coord = np.array( + [ + (0.500, 0.000), + (0.937, 0.250), + (0.937, 0.750), + (0.500, 1.000), + (0.063, 0.750), + (0.063, 0.250), + ] +) + +_spiked_coord = np.array( + [ + (0.000, 0.000), + (0.500, 0.150), + (1.000, 0.000), + (0.850, 0.500), + (1.000, 1.000), + (0.500, 0.850), + (0.000, 1.000), + (0.150, 0.500), + ] +) + +_spiked_coord = np.array( + [ + (0.000, 0.000), + (0.500, 0.150), + (1.000, 0.000), + (0.850, 0.500), + (1.000, 1.000), + (0.500, 0.850), + (0.000, 1.000), + (0.150, 0.500), + ] +) + +_cross_coord = np.array( + [ + (0.220, 0.000), + (0.780, 0.000), + (0.780, 0.220), + (1.000, 0.220), + (1.000, 0.780), + (0.780, 0.780), + (0.780, 1.000), + (0.220, 1.000), + (0.220, 0.780), + (0.000, 0.780), + (0.000, 0.220), + (0.220, 0.220), + ] +) + +_star_coord = np.array( + [ + (0.500, 0.000), + (0.648, 0.150), + (0.852, 0.150), + (0.852, 0.352), + (1.000, 0.500), + (0.852, 0.648), + (0.852, 0.852), + (0.648, 0.852), + (0.500, 1.000), + (0.352, 0.852), + (0.148, 0.852), + (0.148, 0.648), + (0.000, 0.500), + (0.148, 0.352), + (0.148, 0.148), + (0.352, 0.148), + ] +) + +_hourglass_coord = np.array( + [ + (0.000, 0.000), + (1.000, 0.000), + (1.000, 0.220), + (0.740, 0.420), + (0.740, 0.580), + (1.000, 0.780), + (1.000, 1.000), + (0.000, 
1.000), + (0.000, 0.780), + (0.260, 0.580), + (0.260, 0.420), + (0.000, 0.220), + ] +) # The shape color for each symbols _colors = { - "A" : "#1e67b6", - "C" : "#00a391", - "D" : "#ea42fc", - "E" : "#109c4b", - "F" : "#fed700", - "G" : "#8d4712", - "H" : "#ff8e00", - "I" : "#d82626", - "K" : "#109c4b", - "L" : "#d82626", - "M" : "#d82626", - "N" : "#ea42fc", - "P" : "#ffa9e3", - "Q" : "#109c4b", - "R" : "#109c4b", - "S" : "#1e67b6", - "T" : "#1e67b6", - "V" : "#d82626", - "W" : "#fed700", - "Y" : "#fed700" + "A": "#1e67b6", + "C": "#00a391", + "D": "#ea42fc", + "E": "#109c4b", + "F": "#fed700", + "G": "#8d4712", + "H": "#ff8e00", + "I": "#d82626", + "K": "#109c4b", + "L": "#d82626", + "M": "#d82626", + "N": "#ea42fc", + "P": "#ffa9e3", + "Q": "#109c4b", + "R": "#109c4b", + "S": "#1e67b6", + "T": "#1e67b6", + "V": "#d82626", + "W": "#fed700", + "Y": "#fed700", } @@ -134,31 +145,32 @@ class ShapePlotter(graphics.SymbolPlotter): """ A symbol plotter that depicts each symbol by color and shape. 
""" + def __init__(self, axes, font_size=None, font_param=None): super().__init__(axes) # The symbol to shape mapping self._draw_funcs = { - "A" : ShapePlotter._draw_circle, - "T" : ShapePlotter._draw_circle, - "S" : ShapePlotter._draw_circle, - "N" : ShapePlotter._draw_circle, - "D" : ShapePlotter._draw_rectangle, - "E" : ShapePlotter._draw_rectangle, - "Q" : ShapePlotter._draw_rectangle, - "K" : ShapePlotter._draw_rectangle, - "R" : ShapePlotter._draw_rectangle, - "I" : ShapePlotter._draw_hexagon, - "L" : ShapePlotter._draw_hexagon, - "V" : ShapePlotter._draw_hexagon, - "M" : ShapePlotter._draw_hexagon, - "F" : ShapePlotter._draw_spiked, - "W" : ShapePlotter._draw_spiked, - "Y" : ShapePlotter._draw_spiked, - "H" : ShapePlotter._draw_spiked, - "G" : ShapePlotter._draw_cross, - "P" : ShapePlotter._draw_star, - "C" : ShapePlotter._draw_hourglass + "A": ShapePlotter._draw_circle, + "T": ShapePlotter._draw_circle, + "S": ShapePlotter._draw_circle, + "N": ShapePlotter._draw_circle, + "D": ShapePlotter._draw_rectangle, + "E": ShapePlotter._draw_rectangle, + "Q": ShapePlotter._draw_rectangle, + "K": ShapePlotter._draw_rectangle, + "R": ShapePlotter._draw_rectangle, + "I": ShapePlotter._draw_hexagon, + "L": ShapePlotter._draw_hexagon, + "V": ShapePlotter._draw_hexagon, + "M": ShapePlotter._draw_hexagon, + "F": ShapePlotter._draw_spiked, + "W": ShapePlotter._draw_spiked, + "Y": ShapePlotter._draw_spiked, + "H": ShapePlotter._draw_spiked, + "G": ShapePlotter._draw_cross, + "P": ShapePlotter._draw_star, + "C": ShapePlotter._draw_hourglass, } self._font_size = font_size @@ -166,8 +178,8 @@ def __init__(self, axes, font_size=None, font_param=None): def plot_symbol(self, bbox, alignment, column_i, seq_i): trace = alignment.trace - if trace[column_i,seq_i] != -1: - symbol = alignment.sequences[seq_i][trace[column_i,seq_i]] + if trace[column_i, seq_i] != -1: + symbol = alignment.sequences[seq_i][trace[column_i, seq_i]] else: symbol = "" color = self._get_color(alignment, 
column_i, seq_i) @@ -178,16 +190,21 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): # Shrink Bbox slightly to get a small margin between shapes f = 0.04 shape_bbox = Bbox( - ((bbox.x0 + f*bbox.width, - bbox.y0 + f*bbox.height), - (bbox.x1 - f*bbox.width, - bbox.y1 - f*bbox.height)), + ( + (bbox.x0 + f * bbox.width, bbox.y0 + f * bbox.height), + (bbox.x1 - f * bbox.width, bbox.y1 - f * bbox.height), + ), ) draw_func(self, shape_bbox, color) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, ) text.set_clip_on(True) @@ -203,15 +220,17 @@ def _draw_circle(self, bbox, color): from matplotlib.patches import Circle circle = Circle( - (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2), bbox.width/2, - facecolor=color, edgecolor="None", fill=True + (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2), + bbox.width / 2, + facecolor=color, + edgecolor="None", + fill=True, ) self.axes.add_patch(circle) def _draw_rectangle(self, bbox, color): rectangle = Rectangle( - bbox.p0, bbox.width, bbox.height, - facecolor=color, edgecolor="None" + bbox.p0, bbox.width, bbox.height, facecolor=color, edgecolor="None" ) self.axes.add_patch(rectangle) @@ -241,45 +260,52 @@ def _draw_polygon(self, bbox, color, coord): self.axes.add_patch(polygon) -def plot_alignment_shapes(axes, alignment, symbols_per_line=30, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, color_symbols=False, - symbol_size=None, symbol_param=None): +def plot_alignment_shapes( + axes, + alignment, + symbols_per_line=30, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + 
spacing=1, + color_symbols=False, + symbol_size=None, + symbol_param=None, +): """ A thin wrapper around the 'ShapePlotter' and 'plot_alignment()' function. """ alphabet = alignment.sequences[0].get_alphabet() - symbol_plotter = ShapePlotter( - axes, font_size=symbol_size, font_param=symbol_param - ) + symbol_plotter = ShapePlotter(axes, font_size=symbol_size, font_param=symbol_param) graphics.plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing + spacing=spacing, ) twin = axes.get_shared_x_axes().get_siblings(axes)[0] for ax in (axes, twin): - ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color":"white"}) + ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"}) axes.get_figure().patch.set_facecolor("#181818") - - # Using cyclotide sequences as example -query = ( - entrez.SimpleQuery("Cyclotide") & - entrez.SimpleQuery("cter") & - entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ - entrez.SimpleQuery("Precursor") +query = entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery( + "cter" +) & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery( + "Precursor" ) uids = entrez.search(query, "protein") fasta_file = fasta.FastaFile.read( @@ -289,8 +315,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, # Currently there seems to b a bug in the NCBI search, # so that 'Precursor' results are still included # Solve this by filtering the sequence length -sequence_dict = {header: seq for header, seq in sequence_dict.items() - if len(seq) < 100} +sequence_dict = {header: seq for header, seq in 
sequence_dict.items() if len(seq) < 100} headers = list(sequence_dict.keys()) sequences = list(sequence_dict.values()) labels = [header[-1] for header in headers] @@ -306,8 +331,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.add_subplot(111) plot_alignment_shapes( - ax, alignment, labels=labels, symbols_per_line=len(alignment), - symbol_size=8 + ax, alignment, labels=labels, symbols_per_line=len(alignment), symbol_size=8 ) # The aspect ratio of the shapes should be preserved: # Squares should look like squares, circles should look like circles @@ -316,4 +340,4 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, ax.set_ylabel("Type", color="white") ax.set_title("Comparison of cyclotide sequences", color="white") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/genome_comparison.py b/doc/examples/scripts/sequence/homology/genome_comparison.py index fc360804d..066388ce5 100644 --- a/doc/examples/scripts/sequence/homology/genome_comparison.py +++ b/doc/examples/scripts/sequence/homology/genome_comparison.py @@ -31,28 +31,25 @@ # License: BSD 3 clause import tempfile -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle from matplotlib.ticker import MultipleLocator import biotite +import biotite.application.tantan as tantan +import biotite.database.entrez as entrez import biotite.sequence as seq +import biotite.sequence.align as align import biotite.sequence.io as seqio import biotite.sequence.io.genbank as gb -import biotite.sequence.align as align -import biotite.database.entrez as entrez -import biotite.application.tantan as tantan - fasta_file = entrez.fetch( - "NC_000932", tempfile.gettempdir(), "fasta", - db_name="Nucleotide", ret_type="fasta" + "NC_000932", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta" ) chloroplast_seq = 
seqio.load_sequence(fasta_file) fasta_file = entrez.fetch( - "NC_000911", tempfile.gettempdir(), "fasta", - db_name="Nucleotide", ret_type="fasta" + "NC_000911", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta" ) bacterium_seq = seqio.load_sequence(fasta_file) @@ -73,15 +70,13 @@ # one ``111∗1∗11∗1∗∗11∗111`` :footcite:`Choi2004` is used here. repeat_mask = tantan.TantanApp.mask_repeats(bacterium_seq) -bacterium_seqs = [ - bacterium_seq, bacterium_seq.reverse(copy=False).complement() -] +bacterium_seqs = [bacterium_seq, bacterium_seq.reverse(copy=False).complement()] table = align.KmerTable.from_sequences( - k = 12, - sequences = bacterium_seqs, - spacing = "111∗1∗11∗1∗∗11∗111", - ignore_masks = [repeat_mask, repeat_mask[::-1].copy()] + k=12, + sequences=bacterium_seqs, + spacing="111∗1∗11∗1∗∗11∗111", + ignore_masks=[repeat_mask, repeat_mask[::-1].copy()], ) ######################################################################## @@ -117,7 +112,7 @@ # Store the indices to the match array # for each combination of diagonal and strand on the bacterial genome matches_for_diagonals = {} -for i, (diag, strand) in enumerate(zip(diagonals, matches[:,1])): +for i, (diag, strand) in enumerate(zip(diagonals, matches[:, 1])): if (diag, strand) not in matches_for_diagonals: matches_for_diagonals[(diag, strand)] = [i] else: @@ -125,8 +120,9 @@ # If a diagonal has more than one match, # the first match on this diagonal is a double hit -double_hit_indices = [indices[0] for indices - in matches_for_diagonals.values() if len(indices) > 1] +double_hit_indices = [ + indices[0] for indices in matches_for_diagonals.values() if len(indices) > 1 +] double_hits = matches[double_hit_indices] print("Number of double hits:", len(double_hits)) @@ -148,13 +144,19 @@ ACCEPT_THRESHOLD = 100 matrix = align.SubstitutionMatrix.std_nucleotide_matrix() -ungapped_scores = np.array([ - align.align_local_ungapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), 
threshold=X_DROP, score_only=True - ) - for i, strand, j in double_hits -]) +ungapped_scores = np.array( + [ + align.align_local_ungapped( + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + threshold=X_DROP, + score_only=True, + ) + for i, strand, j in double_hits + ] +) accepted_hits = double_hits[ungapped_scores > ACCEPT_THRESHOLD] print("Number of accepted ungapped alignments:", len(accepted_hits)) @@ -190,19 +192,27 @@ estimator = align.EValueEstimator.from_samples( chloroplast_seq.alphabet, # The scoring scheme must be the same as used for the alignment - matrix, GAP_PENALTY, - background + matrix, + GAP_PENALTY, + background, ) # Compute similarity scores for each hit -gapped_scores = np.array([ - align.align_local_gapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP, score_only=True, - max_table_size=100_000_000 - ) - for i, strand, j in accepted_hits -]) +gapped_scores = np.array( + [ + align.align_local_gapped( + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + gap_penalty=GAP_PENALTY, + threshold=X_DROP, + score_only=True, + max_table_size=100_000_000, + ) + for i, strand, j in accepted_hits + ] +) # Calculate the E-values # For numeric stability reasons the method returns the common logarithm @@ -215,10 +225,14 @@ accepted_alignments = [ ( align.align_local_gapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP, + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + gap_penalty=GAP_PENALTY, + threshold=X_DROP, )[0], - log_evalue + log_evalue, ) for (i, strand, j), log_evalue in zip(accepted_hits, log_evalues) if log_evalue <= np.log10(EVALUE_THRESHOLD) @@ -248,11 +262,11 @@ stop = alignment.trace[-1, 0] # If this region was not covered by any other alignment before, # accept it and mark the region as covered - if not covered_range[start : stop].any(): + if not 
covered_range[start:stop].any(): unique_alignments.append((alignment, log_evalue)) - covered_range[start : stop] = True + covered_range[start:stop] = True -print("Number of unique alignments:", len(unique_alignments)) +print("Number of unique alignments:", len(unique_alignments)) ######################################################################## # To take a closer look on the found homologous regions, they are viewed @@ -269,9 +283,9 @@ MARGIN_SIZE = 250 COLORS = { - "CDS" : biotite.colors["dimgreen"], + "CDS": biotite.colors["dimgreen"], "tRNA": biotite.colors["orange"], - "rRNA": biotite.colors["orange"] + "rRNA": biotite.colors["orange"], } @@ -282,7 +296,6 @@ annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"]) - def draw_arrow(ax, feature, loc): x = loc.first dx = loc.last - loc.first + 1 @@ -294,18 +307,25 @@ def draw_arrow(ax, feature, loc): dx = loc.first - loc.last + 1 # Create head with 90 degrees tip -> head width/length ratio = 1/2 - ax.add_patch(biotite.AdaptiveFancyArrow( - x, 0.5, dx, 0, tail_width=0.4, head_width=0.7, head_ratio=0.5, - draw_head=True, color=COLORS[feature.key], linewidth=0 - )) + ax.add_patch( + biotite.AdaptiveFancyArrow( + x, + 0.5, + dx, + 0, + tail_width=0.4, + head_width=0.7, + head_ratio=0.5, + draw_head=True, + color=COLORS[feature.key], + linewidth=0, + ) + ) label = feature.qual.get("gene") if label is not None: - ax.text( - x + dx/2, 0.5, label, color="black", - ha="center", va="center", size=8 - ) + ax.text(x + dx / 2, 0.5, label, color="black", ha="center", va="center", size=8) # Fetch features of the chloroplast genome @@ -315,21 +335,15 @@ def draw_arrow(ax, feature, loc): annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"]) n_rows = int(np.ceil(len(unique_alignments) / N_COL)) -fig, axes = plt.subplots( - n_rows, N_COL, - figsize=(8.0, 24.0), - constrained_layout=True -) +fig, axes = plt.subplots(n_rows, N_COL, figsize=(8.0, 24.0), constrained_layout=True) -for 
(alignment, log_evalue), ax in zip( - unique_alignments, axes.flatten() -): +for (alignment, log_evalue), ax in zip(unique_alignments, axes.flatten()): # Transform 0-based sequence index to 1-based sequence position first = alignment.trace[0, 0] + 1 last = alignment.trace[-1, 0] + 1 center = (first + last) // 2 if last - first < EXCERPT_SIZE - MARGIN_SIZE * 2: - excerpt_loc = (center - EXCERPT_SIZE//2, center + EXCERPT_SIZE//2) + excerpt_loc = (center - EXCERPT_SIZE // 2, center + EXCERPT_SIZE // 2) else: # Exceed excerpt size to show entire alignment range excerpt_loc = (first - MARGIN_SIZE, last + MARGIN_SIZE) @@ -345,11 +359,18 @@ def draw_arrow(ax, feature, loc): for loc in feature.locs: draw_arrow(ax, feature, loc) # Draw rectangle representing homologuous region - ax.add_patch(Rectangle( - (first, 0.1), last - first + 1, 1 - 2*0.1, - facecolor="None", edgecolor="black", alpha=0.2, linewidth=1, - clip_on=False - )) + ax.add_patch( + Rectangle( + (first, 0.1), + last - first + 1, + 1 - 2 * 0.1, + facecolor="None", + edgecolor="black", + alpha=0.2, + linewidth=1, + clip_on=False, + ) + ) ax.xaxis.set_major_locator(MultipleLocator(1000)) ax.tick_params(labelsize=6) @@ -359,13 +380,13 @@ def draw_arrow(ax, feature, loc): ax.get_yaxis().set_tick_params(left=False, right=False, labelleft=False) exponent = int(np.floor(log_evalue)) - mantissa = 10**(log_evalue-exponent) + mantissa = 10 ** (log_evalue - exponent) homolog_excerpt = annotation[first : last + 1] if len(homolog_excerpt) > 0: # Select the longest feature in range for name display in title representative_feature = max( homolog_excerpt, - key=lambda feature: -np.subtract(*feature.get_location_range()) + key=lambda feature: -np.subtract(*feature.get_location_range()), ) feature_name = representative_feature.qual["product"] else: @@ -377,14 +398,15 @@ def draw_arrow(ax, feature, loc): ax.set_title( f"{feature_name}\n" - fr"E-Value: ${mantissa:.2f} \times 10^{{{exponent}}}$" + rf"E-Value: ${mantissa:.2f} \times 
10^{{{exponent}}}$" f"\nIdentity: {align.get_sequence_identity(alignment) * 100:3.1f} %", - loc="left", size=8 + loc="left", + size=8, ) # Hide empty axes -for ax in axes.flatten()[len(unique_alignments):]: - ax.axis('off') +for ax in axes.flatten()[len(unique_alignments) :]: + ax.axis("off") fig.tight_layout(h_pad=3.0, w_pad=0.5) @@ -399,4 +421,4 @@ def draw_arrow(ax, feature, loc): # ---------- # # .. footbibliography:: -# \ No newline at end of file +# diff --git a/doc/examples/scripts/sequence/homology/genome_search.py b/doc/examples/scripts/sequence/homology/genome_search.py index 8e98bd446..6649704b8 100644 --- a/doc/examples/scripts/sequence/homology/genome_search.py +++ b/doc/examples/scripts/sequence/homology/genome_search.py @@ -15,30 +15,26 @@ and is similar to the method used by software like *BLAST*. At first the sequences for the *M1* coding gene and the *S. enterica* -genome are downloaded from *NCBI Entrez*. +genome are downloaded from *NCBI Entrez*. """ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.collections import LineCollection import biotite -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as seqgraphics -import biotite.sequence.align as align -import biotite.database.entrez as entrez -import biotite.structure.graphics as strucgraphics import biotite.application.viennarna as viennarna - +import biotite.database.entrez as entrez +import biotite.sequence.align as align +import biotite.sequence.graphics as seqgraphics +import biotite.sequence.io.genbank as gb # Download Escherichia coli BL21 and Salmonella enterica genome -gb_file = gb.MultiFile.read(entrez.fetch_single_file( - ["CP001509", "CP019649"], None, "nuccore", "gb" -)) +gb_file = gb.MultiFile.read( + entrez.fetch_single_file(["CP001509", "CP019649"], None, "nuccore", "gb") +) ec_file, 
se_file = tuple(gb_file) annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"]) @@ -83,24 +79,27 @@ trigger_matches = [] # 0 represents the original genome sequence, 1 the reverse complement for strand in (0, 1): - matches_for_strand = matches[matches[:,1] == strand] + matches_for_strand = matches[matches[:, 1] == strand] # Plot match positions - ax = fig.add_subplot(1, 2, strand+1) + ax = fig.add_subplot(1, 2, strand + 1) ax.scatter( - matches_for_strand[:,0], matches_for_strand[:,2] / 1e6, - s=4, marker="o", color=biotite.colors["dimorange"] + matches_for_strand[:, 0], + matches_for_strand[:, 2] / 1e6, + s=4, + marker="o", + color=biotite.colors["dimorange"], ) ax.set_xlim(0, len(m1_sequence)) ax.set_ylim(0, len(se_genome) / 1e6) ax.set_xlabel("E. coli M1 position (b)") if strand == 0: ax.set_ylabel("S. enterica genome position (Mb)") - else: # strand == 1 + else: # strand == 1 ax.set_ylabel("S. enterica genome position (Mb) (reverse complement)") - + # Check if there are two adjacent matches on the same diagonal - diagonals = matches_for_strand[:,2] - matches_for_strand[:,0] + diagonals = matches_for_strand[:, 2] - matches_for_strand[:, 0] unique_diag = np.unique(diagonals) trigger_diagonals = np.array([], dtype=int) for diag in unique_diag: @@ -116,7 +115,7 @@ # The other match on the same diagonal should not overlap # with this match and should be within a cutoff range if np.any((distances > K) & (distances < DISCARD_RANGE)): - trigger_matches.append((strand, pos, pos+diag)) + trigger_matches.append((strand, pos, pos + diag)) trigger_diagonals = np.append(trigger_diagonals, diag) # Only add one match per diagonal at maximum break @@ -142,11 +141,14 @@ genome = genomic_seqs[strand] diagonal = genome_pos - m1_pos alignment = align.align_banded( - m1_sequence, genome, matrix, - band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), max_number=1 + m1_sequence, + genome, + matrix, + band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), + max_number=1, 
)[0] alignments.append((strand, alignment)) - + strand, best_alignment = max( alignments, key=lambda strand_alignment: alignment[1].score ) @@ -159,15 +161,19 @@ # genomic sequence. # Reverse sequence numbering for second sequence (genome) in alignment -number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x] +number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x] # Visualize alignment, use custom color fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.add_subplot(111) seqgraphics.plot_alignment_similarity_based( - ax, best_alignment, matrix=matrix, - labels=["E. coli M1 coding gene", "S. enterica genome"], show_numbers=True, - number_functions=number_funcs, show_line_position=True, - color=biotite.colors["brightorange"] + ax, + best_alignment, + matrix=matrix, + labels=["E. coli M1 coding gene", "S. enterica genome"], + show_numbers=True, + number_functions=number_funcs, + show_line_position=True, + color=biotite.colors["brightorange"], ) fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 @@ -199,22 +205,25 @@ # Plot base connections ax.plot(*plot_coord.T, color="black", linewidth=1, zorder=1) # Plot base pairings -ax.add_collection(LineCollection( - [(plot_coord[i], plot_coord[j]) for i, j in base_pairs], - color="silver", linewidth=1, zorder=1 -)) +ax.add_collection( + LineCollection( + [(plot_coord[i], plot_coord[j]) for i, j in base_pairs], + color="silver", + linewidth=1, + zorder=1, + ) +) # Plot base markers ax.scatter( *plot_coord.T, - s = 12, + s=12, # Render markers over lines - zorder = 2, - # Display base marker color based on the identity in the alignment - color = ["forestgreen" if identity else "firebrick" - for identity in identities] + zorder=2, + # Display base marker color based on the identity in the alignment + color=["forestgreen" if identity else "firebrick" for identity in identities], ) ax.set_aspect("equal") ax.axis("off") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git 
a/doc/examples/scripts/sequence/homology/gpcr_evolution.py b/doc/examples/scripts/sequence/homology/gpcr_evolution.py index 5ac3d7ea1..7072601c2 100644 --- a/doc/examples/scripts/sequence/homology/gpcr_evolution.py +++ b/doc/examples/scripts/sequence/homology/gpcr_evolution.py @@ -20,22 +20,23 @@ import re import matplotlib.pyplot as plt import networkx as nx +import biotite.application.clustalo as clustalo +import biotite.database.uniprot as uniprot import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.phylo as phylo import biotite.sequence.io.fasta as fasta -import biotite.database.uniprot as uniprot -import biotite.application.clustalo as clustalo - +import biotite.sequence.phylo as phylo # The bovine GPCRs are investigated SPECIES = "Bovine" query = ( - uniprot.SimpleQuery("reviewed", "true") & + uniprot.SimpleQuery("reviewed", "true") + & # Bovine proteins - uniprot.SimpleQuery("organism_name", "Bos taurus") & + uniprot.SimpleQuery("organism_name", "Bos taurus") + & # Keyword ID for GPCRs uniprot.SimpleQuery("keyword", "KW-0297") ) @@ -62,13 +63,11 @@ # The distance measure required for the tree calculation is the # percentage of non-identical amino acids in the respective two # sequences -distances = 1 - align.get_pairwise_sequence_identity( - alignment, mode="shortest" -) +distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") # Create tree via neighbor joining tree = phylo.neighbor_joining(distances) # Convert to NetworkX graph -#For the graph visualization, the edge directions are unnecessary +# For the graph visualization, the edge directions are unnecessary graph = tree.as_graph().to_undirected() fig = plt.figure(figsize=(8.0, 8.0)) @@ -78,15 +77,17 @@ pos = nx.kamada_kawai_layout(graph) # Assign the gene names to the nodes that represent a reference index node_labels = {i: name for i, name in enumerate(genes)} -nx.draw_networkx_edges( - graph, pos, ax=ax -) +nx.draw_networkx_edges(graph, 
pos, ax=ax) nx.draw_networkx_labels( - graph, pos, ax=ax, labels=node_labels, font_size=7, + graph, + pos, + ax=ax, + labels=node_labels, + font_size=7, # Draw a white background behind the labeled nodes # for better readability - bbox=dict(pad=0, color="white") + bbox=dict(pad=0, color="white"), ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py index 637959a52..ff879afca 100644 --- a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py +++ b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py @@ -16,17 +16,17 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Patch import biotite +import biotite.application.mafft as mafft import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb -import biotite.application.mafft as mafft # Taken from # Kyte, J and Doolittle, RF. @@ -35,37 +35,39 @@ # Journal of Molecular Biology (2015). 157(1):105–32. 
# doi:10.1016/0022-2836(82)90515-0 hydropathy_dict = { - "I" : 4.5, - "V" : 4.2, - "L" : 3.8, - "F" : 2.8, - "C" : 2.5, - "M" : 1.9, - "A" : 1.8, - "G" : -0.4, - "T" : -0.7, - "S" : -0.8, - "W" : -0.9, - "Y" : -1.3, - "P" : -1.6, - "H" : -3.2, - "E" : -3.5, - "Q" : -3.5, - "D" : -3.5, - "N" : -3.5, - "K" : -3.9, - "R" : -4.5 + "I": 4.5, + "V": 4.2, + "L": 3.8, + "F": 2.8, + "C": 2.5, + "M": 1.9, + "A": 1.8, + "G": -0.4, + "T": -0.7, + "S": -0.8, + "W": -0.9, + "Y": -1.3, + "P": -1.6, + "H": -3.2, + "E": -3.5, + "Q": -3.5, + "D": -3.5, + "N": -3.5, + "K": -3.9, + "R": -4.5, } # Look for the Swiss-Prot entry contaning the human HCN1 channel -query = entrez.SimpleQuery("HCN1", "Gene Name") \ - & entrez.SimpleQuery("homo sapiens", "Organism") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = ( + entrez.SimpleQuery("HCN1", "Gene Name") + & entrez.SimpleQuery("homo sapiens", "Organism") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +) uids = entrez.search(query, db_name="protein") -gp_file = gb.GenBankFile.read(entrez.fetch( - uids[0], None, "gp", db_name="protein", ret_type="gp" -)) +gp_file = gb.GenBankFile.read( + entrez.fetch(uids[0], None, "gp", db_name="protein", ret_type="gp") +) hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp")) print(hcn1) @@ -75,13 +77,15 @@ hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1]) + def moving_average(data_set, window_size): - weights = np.full(window_size, 1/window_size) - return np.convolve(data_set, weights, mode='valid') + weights = np.full(window_size, 1 / window_size) + return np.convolve(data_set, weights, mode="valid") + # Apply moving average over 15 amino acids for clearer visualization ma_radius = 7 -hydropathies = moving_average(hydropathies, 2*ma_radius+1) +hydropathies = moving_average(hydropathies, 2 * ma_radius + 1) ######################################################################## # In order to assess the positional conservation, the sequences @@ 
-91,14 +95,16 @@ def moving_average(data_set, window_size): uids = [] for name in names: - query = entrez.SimpleQuery(name, "Gene Name") \ - & entrez.SimpleQuery("homo sapiens", "Organism") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + query = ( + entrez.SimpleQuery(name, "Gene Name") + & entrez.SimpleQuery("homo sapiens", "Organism") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + ) uids += entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) for header in fasta_file: print(header) @@ -121,8 +127,8 @@ def moving_average(data_set, window_size): scores = np.zeros(len(hcn1)) for i in range(len(alignment)): # The column is also an alignment with length 1 - column = alignment[i:i+1] - hcn1_index = column.trace[0,0] + column = alignment[i : i + 1] + hcn1_index = column.trace[0, 0] if hcn1_index == -1: # Gap in HCN1 row # As similarity score should be analyzed in dependence of the @@ -131,7 +137,7 @@ def moving_average(data_set, window_size): continue scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5) -scores = moving_average(scores, 2*ma_radius+1) +scores = moving_average(scores, 2 * ma_radius + 1) ######################################################################## # Now the hydropathy and the similarity score can be plotted. 
@@ -141,11 +147,12 @@ def moving_average(data_set, window_size): # Plot hydropathy ax.plot( - np.arange(1+ma_radius, len(hcn1)-ma_radius+1), hydropathies, - color=biotite.colors["dimorange"] + np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), + hydropathies, + color=biotite.colors["dimorange"], ) ax.axhline(0, color="gray", linewidth=0.5) -ax.set_xlim(1, len(hcn1)+1) +ax.set_xlim(1, len(hcn1) + 1) ax.set_xlabel("HCN1 sequence position") ax.set_ylabel("Hydropathy (15 residues moving average)") @@ -153,8 +160,11 @@ def moving_average(data_set, window_size): # with hydropathy plot annotation = gb.get_annotation(gp_file, include_only=["Region"]) transmembrane_annotation = seq.Annotation( - [feature for feature in annotation - if feature.qual["region_name"] == "Transmembrane region"] + [ + feature + for feature in annotation + if feature.qual["region_name"] == "Transmembrane region" + ] ) for feature in transmembrane_annotation: first, last = feature.get_location_range() @@ -163,17 +173,18 @@ def moving_average(data_set, window_size): # Plot similarity score as measure for conservation ax2 = ax.twinx() ax2.plot( - np.arange(1+ma_radius, len(hcn1)-ma_radius+1), scores, - color=biotite.colors["brightorange"] + np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), + scores, + color=biotite.colors["brightorange"], ) ax2.set_ylabel("Similarity score (15 residues moving average)") ax.legend( handles=[ - Patch(color=biotite.colors["dimorange"], label="Hydropathy"), - Patch(color=biotite.colors["brightorange"], label="Score" ) + Patch(color=biotite.colors["dimorange"], label="Hydropathy"), + Patch(color=biotite.colors["brightorange"], label="Score"), ], - fontsize=9 + fontsize=9, ) ######################################################################## @@ -190,17 +201,20 @@ def moving_average(data_set, window_size): # values as input. # Hydrophilic amino acids are depicted in blue, hydrophobic ones in red. 
+ def hydropathy_to_color(hydropathy, colormap): # Normalize hydropathy to range between 0 and 1 # (orginally between -4.5 and 4.5) norm_hydropathy = (hydropathy - (-4.5)) / (4.5 - (-4.5)) return colormap(norm_hydropathy) + # Create a color scheme highlighting the hydropathy colormap = plt.get_cmap("coolwarm") colorscheme = [ hydropathy_to_color(hydropathy_dict[symbol], colormap) - if symbol in hydropathy_dict else None + if symbol in hydropathy_dict + else None for symbol in sequences[0].get_alphabet() ] @@ -210,8 +224,7 @@ def hydropathy_to_color(hydropathy, colormap): ax = fig.add_subplot(111) # Color the symbols instead of the background graphics.plot_alignment_type_based( - ax, alignment[:600], labels=names, show_numbers=True, - color_scheme=colorscheme + ax, alignment[:600], labels=names, show_numbers=True, color_scheme=colorscheme ) plt.show() diff --git a/doc/examples/scripts/sequence/homology/hcn_similarity.py b/doc/examples/scripts/sequence/homology/hcn_similarity.py index f81c55ee5..961abcd07 100644 --- a/doc/examples/scripts/sequence/homology/hcn_similarity.py +++ b/doc/examples/scripts/sequence/homology/hcn_similarity.py @@ -15,32 +15,31 @@ # Code source: Daniel Bauer # License: BSD 3 clause -import biotite.sequence.io.fasta as fasta +import matplotlib.pyplot as plt +import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.application.clustalo as clustalo import biotite.sequence.align as align -import biotite.sequence.phylo as phylo -import matplotlib.pyplot as plt import biotite.sequence.graphics as graphics - +import biotite.sequence.io.fasta as fasta +import biotite.sequence.phylo as phylo UNIPROT_IDS = dict( - hHCN1 = "O60741", - hHCN2 = "Q9UL51", - hHCN3 = "Q9P1Z3", - hHCN4 = "Q9Y3Q4", - spHCN = "O76977", - hEAG1 = "O95259", - hERG1 = "Q12809", - KAT1 = "Q39128", + hHCN1="O60741", + hHCN2="Q9UL51", + hHCN3="Q9P1Z3", + hHCN4="Q9Y3Q4", + spHCN="O76977", + hEAG1="O95259", 
+ hERG1="Q12809", + KAT1="Q39128", ) ### fetch sequences for UniProt IDs from NCBI Entrez -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - list(UNIPROT_IDS.values()), None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(list(UNIPROT_IDS.values()), None, "protein", "fasta") +) sequences = { name: seq.ProteinSequence(seq_str) for name, seq_str in zip(UNIPROT_IDS.keys(), fasta_file.values()) @@ -50,42 +49,44 @@ # create MSA alignment = clustalo.ClustalOmegaApp.align(list(sequences.values())) # build simple tree based on deviation from sequence identity -distances = 1 - align.get_pairwise_sequence_identity( - alignment, mode="shortest" -) +distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") tree = phylo.upgma(distances) ### plot the tree fig, ax = plt.subplots(1, 1, figsize=(8, 5)) graphics.plot_dendrogram( - ax, tree, orientation="left", labels=list(UNIPROT_IDS.keys()), - show_distance=False, linewidth=2 - ) + ax, + tree, + orientation="left", + labels=list(UNIPROT_IDS.keys()), + show_distance=False, + linewidth=2, +) ax.grid(False) ax.set_xticks([]) # distance indicator indicator_len = 0.1 indicator_start = ( - ax.get_xlim()[0] + ax.get_xlim()[1]*0.02, - ax.get_ylim()[1] - ax.get_ylim()[1]*0.15 -) -indicator_stop = ( - indicator_start[0] + indicator_len, - indicator_start[1] + ax.get_xlim()[0] + ax.get_xlim()[1] * 0.02, + ax.get_ylim()[1] - ax.get_ylim()[1] * 0.15, ) +indicator_stop = (indicator_start[0] + indicator_len, indicator_start[1]) indicator_center = ( - (indicator_start[0] + indicator_stop[0])/2, - (indicator_start[1] + 0.25) + (indicator_start[0] + indicator_stop[0]) / 2, + (indicator_start[1] + 0.25), ) ax.annotate( - "", xy=indicator_start, xytext=indicator_stop, xycoords="data", - textcoords="data", arrowprops={"arrowstyle": "|-|", "linewidth": 2} + "", + xy=indicator_start, + xytext=indicator_stop, + xycoords="data", + textcoords="data", + arrowprops={"arrowstyle": "|-|", 
"linewidth": 2}, ) ax.annotate( - f"{int(indicator_len * 100)} %", xy=indicator_center, - ha="center", va="center" + f"{int(indicator_len * 100)} %", xy=indicator_center, ha="center", va="center" ) ax.set_title("Sequence deviation of HCN to other CNG superfamily channels") diff --git a/doc/examples/scripts/sequence/homology/homolog_msa.py b/doc/examples/scripts/sequence/homology/homolog_msa.py index c186bd222..f3b91dd65 100644 --- a/doc/examples/scripts/sequence/homology/homolog_msa.py +++ b/doc/examples/scripts/sequence/homology/homolog_msa.py @@ -10,13 +10,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 cl from tempfile import gettempdir -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.graphics as graphics -import biotite.application.muscle as muscle +import matplotlib.pyplot as plt import biotite.application.blast as blast +import biotite.application.muscle as muscle import biotite.database.entrez as entrez -import matplotlib.pyplot as plt +import biotite.sequence.graphics as graphics +import biotite.sequence.io.fasta as fasta # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", gettempdir(), "fa", "protein", "fasta") @@ -49,7 +48,7 @@ print("MSA results:") gapped_seqs = alignment.get_gapped_sequences() for i in range(len(gapped_seqs)): - print(hits[i], " "*3, gapped_seqs[i]) + print(hits[i], " " * 3, gapped_seqs[i]) # Visualize the first 200 columns of the alignment # Reorder alignments to reflect sequence distance @@ -58,9 +57,11 @@ ax = fig.add_subplot(111) order = app.get_alignment_order() graphics.plot_alignment_type_based( - ax, alignment[:200, order.tolist()], labels=[hits[i] for i in order], - show_numbers=True + ax, + alignment[:200, order.tolist()], + labels=[hits[i] for i in order], + show_numbers=True, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/lexa_conservation.py 
b/doc/examples/scripts/sequence/homology/lexa_conservation.py index 104fe9fd4..957db2f6a 100644 --- a/doc/examples/scripts/sequence/homology/lexa_conservation.py +++ b/doc/examples/scripts/sequence/homology/lexa_conservation.py @@ -21,24 +21,22 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as graphics import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez +import biotite.sequence as seq +import biotite.sequence.graphics as graphics +import biotite.sequence.io.genbank as gb + # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database -query = entrez.SimpleQuery("lexA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("lexA", "Gene Name") & entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) # Search for the first 200 hits # More than 200 UIDs are not recommended for the EFetch service # for a single fetch uids = entrez.search(query, db_name="protein", number=200) -file = entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="gp" -) +file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp") # The file contains multiple concatenated GenPept files # -> Usage of MultiFile multi_file = gb.MultiFile.read(file) @@ -57,12 +55,14 @@ # on. Therefore, we write a function that creates a proper abbreviation # for a species name. + def abbreviate(species): # Remove possible brackets - species = species.replace("[","").replace("]","") - splitted_species= species.split() + species = species.replace("[", "").replace("]", "") + splitted_species = species.split() return "{:}. 
{:}".format(splitted_species[0][0], splitted_species[1]) + print("Sources:") all_sources = [abbreviate(gb.get_source(file)) for file in files] for source in all_sources[:20]: @@ -97,16 +97,16 @@ def abbreviate(species): # Ignore already listed species continue bind_feature = None - annot_seq = gb.get_annotated_sequence( - file, include_only=["Site"], format="gp" - ) + annot_seq = gb.get_annotated_sequence(file, include_only=["Site"], format="gp") # Find the feature for DNA-binding site for feature in annot_seq.annotation: # DNA binding site is a helix-turn-helix motif - if "site_type" in feature.qual \ - and feature.qual["site_type"] == "DNA binding" \ - and "H-T-H motif" in feature.qual["note"]: - bind_feature = feature + if ( + "site_type" in feature.qual + and feature.qual["site_type"] == "DNA binding" + and "H-T-H motif" in feature.qual["note"] + ): + bind_feature = feature if bind_feature is not None: # If the feature is found, # get the sequence slice that is defined by the feature... 
@@ -128,10 +128,10 @@ def abbreviate(species): fig = plt.figure(figsize=(4.5, 4.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment[:,:20], labels=sources[:20], symbols_per_line=len(alignment) + ax, alignment[:, :20], labels=sources[:20], symbols_per_line=len(alignment) ) # Source names in italic -ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"}) +ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) fig.tight_layout() ######################################################################## @@ -145,7 +145,7 @@ def abbreviate(species): fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_sequence_logo(ax, profile, scheme="flower") -ax.set_xticks([5,10,15,20]) +ax.set_xticks([5, 10, 15, 20]) ax.set_xlabel("Residue position") ax.set_ylabel("Bits") # Only show left and bottom spine @@ -154,4 +154,4 @@ def abbreviate(species): fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/luxa_comparison.py b/doc/examples/scripts/sequence/homology/luxa_comparison.py index 080531860..eda03243a 100644 --- a/doc/examples/scripts/sequence/homology/luxa_comparison.py +++ b/doc/examples/scripts/sequence/homology/luxa_comparison.py @@ -12,22 +12,21 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database -query = entrez.SimpleQuery("luxA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("luxA", "Gene Name") & 
entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) uids = entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) ids = [] sequences = [] @@ -39,7 +38,7 @@ matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, tree, distances = align.align_multiple( - sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False + sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False ) # Order alignment according to the guide tree alignment = alignment[:, order] @@ -48,9 +47,8 @@ fig = plt.figure(figsize=(8.0, 20.0)) ax = fig.add_subplot(111) graphics.plot_alignment_type_based( - ax, alignment, labels=ids, show_numbers=True, spacing=2.0, - color_scheme="blossom" + ax, alignment, labels=ids, show_numbers=True, spacing=2.0, color_scheme="blossom" ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/pi3k_alignment.py b/doc/examples/scripts/sequence/homology/pi3k_alignment.py index e705566eb..4f745876b 100644 --- a/doc/examples/scripts/sequence/homology/pi3k_alignment.py +++ b/doc/examples/scripts/sequence/homology/pi3k_alignment.py @@ -16,23 +16,23 @@ # License: BSD 3 clause import warnings -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite +import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.sequence.io.fasta as fasta -import biotite.application.clustalo as clustalo -uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"] -names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"] +uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"] 
+names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"] sequences = [] -file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) for header, seq_str in file.items(): sequences.append(seq.ProteinSequence(seq_str)) @@ -47,25 +47,27 @@ # Like the :class:`LetterSimilarityPlotter` we will use the # *average normalized similarity* as measure. + def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): - code1 = trace_code[seq_i, pos_i] - if code1 == -1: - return np.nan - similarities = np.zeros(trace_code.shape[0]) - for i in range(trace_code.shape[0]): - code2 = trace_code[i, pos_i] - if code2 == -1: - similarities[i] = 0 - else: - sim = matrix[code1, code2] - # Normalize (range 0.0 - 1.0) - min_sim = np.min(matrix[code1]) - max_sim = np.max(matrix[code1]) - sim = (sim - min_sim) / (max_sim - min_sim) - similarities[i] = sim - # Delete self-similarity - similarities = np.delete(similarities, seq_i) - return np.average(similarities) + code1 = trace_code[seq_i, pos_i] + if code1 == -1: + return np.nan + similarities = np.zeros(trace_code.shape[0]) + for i in range(trace_code.shape[0]): + code2 = trace_code[i, pos_i] + if code2 == -1: + similarities[i] = 0 + else: + sim = matrix[code1, code2] + # Normalize (range 0.0 - 1.0) + min_sim = np.min(matrix[code1]) + max_sim = np.max(matrix[code1]) + sim = (sim - min_sim) / (max_sim - min_sim) + similarities[i] = sim + # Delete self-similarity + similarities = np.delete(similarities, seq_i) + return np.average(similarities) + matrix = align.SubstitutionMatrix.std_protein_matrix() # Get the alignment columns as symbols codes (-1 for gaps) @@ -73,15 +75,13 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): similarities = np.zeros(trace_code.shape) for i in range(similarities.shape[0]): for j in range(similarities.shape[1]): - 
similarities[i,j] = get_average_normalized_similarity( + similarities[i, j] = get_average_normalized_similarity( trace_code, matrix.score_matrix(), i, j ) figure = plt.figure(figsize=(8.0, 3.0)) ax = figure.add_subplot(111) -heatmap = ax.pcolor( - similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0 -) +heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0) cbar = figure.colorbar(heatmap) figure.tight_layout() @@ -93,16 +93,19 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): # Hence, we create bins, that contain the mean similarity over a range of # columns. + def calculate_bins(similarities, bin_count): edges = np.linspace(0, similarities.shape[1], bin_count, dtype=int) edges = np.append(edges, similarities.shape[1]) binned_similarities = np.zeros(similarities.shape) for i in range(similarities.shape[0]): for j in range(len(edges) - 1): - binned_similarities[i, edges[j]:edges[j+1]] = \ - np.nanmean(similarities[i, edges[j]:edges[j+1]]) + binned_similarities[i, edges[j] : edges[j + 1]] = np.nanmean( + similarities[i, edges[j] : edges[j + 1]] + ) return binned_similarities + with warnings.catch_warnings(): # Catch warnings about empty slice for gap-only parts warnings.simplefilter("ignore") @@ -110,9 +113,7 @@ def calculate_bins(similarities, bin_count): figure = plt.figure(figsize=(8.0, 3.0)) ax = figure.add_subplot(111) -heatmap = ax.pcolor( - similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0 -) +heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0) cbar = figure.colorbar(heatmap) # Furthermore, add some labels to the figure cbar.set_label("Average normalized similarity") @@ -136,14 +137,14 @@ def calculate_bins(similarities, bin_count): # From beginning of the sequence... 
for i in range(len(trace)): # Check if all sequences have no gap at the given position - if trace[i,0] != -1: + if trace[i, 0] != -1: start_index = i break # ...and the end of the sequence -for i in range(len(trace)-1, -1, -1): +for i in range(len(trace) - 1, -1, -1): # Check if all sequences have no gap at the given position - if trace[i,0] != -1: - stop_index = i+1 + if trace[i, 0] != -1: + stop_index = i + 1 break # Truncate alignment to region where the 'PI3K' sequence exists @@ -155,11 +156,17 @@ def calculate_bins(similarities, bin_count): # The alignment is quite long # -> Reduce font size to reduce figure size graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, symbols_per_line=80, labels=names, + ax, + alignment, + matrix=matrix, + symbols_per_line=80, + labels=names, show_numbers=True, - label_size=10, number_size=10, symbol_size=6, - color=biotite.colors["orange"] + label_size=10, + number_size=10, + symbol_size=6, + color=biotite.colors["orange"], ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/plotepiscan.py b/doc/examples/scripts/sequence/homology/plotepiscan.py index 87a6e1c6d..c4edc9173 100644 --- a/doc/examples/scripts/sequence/homology/plotepiscan.py +++ b/doc/examples/scripts/sequence/homology/plotepiscan.py @@ -3,25 +3,25 @@ ========================================================== Peptide arrays can be used as a high-throughput platform for screening -biological interactions. Typical screenings involve the immobilization -of diverse peptides on a solid surface to study their interactions with -various target molecules. Specifically, arrays of peptides with -overlapping sequences can be used to identify the epitope of antibodies +biological interactions. Typical screenings involve the immobilization +of diverse peptides on a solid surface to study their interactions with +various target molecules. 
Specifically, arrays of peptides with +overlapping sequences can be used to identify the epitope of antibodies on a protein antigen at amino acid level. General scannings for molecular recognition using peptide arrays -are particlularly useful for epitope identification on monoclonal -antibodies. This example visualizes the data from two epitope mapping +are particlularly useful for epitope identification on monoclonal +antibodies. This example visualizes the data from two epitope mapping studies, using a color coded sequence alignment representation -of the antigens screened. The scannings interrogated a monoclonal +of the antigens screened. The scannings interrogated a monoclonal antibody (MAb) against two arrays of overlaping peptides :footcite:`Iyamu2023`. The files containing peptide array data can be downloaded :download:`here ` -and +and :download:`here `. The antigens screened span the extracellular domain of VAR2CSA, a virulence factor of *Plasmodiun falciparum* for the strains FCR3 -(residues 1-2659) and NF54 (residues 1-2652). The sequence of +(residues 1-2659) and NF54 (residues 1-2652). The sequence of the two domains can be downloaded :download:`here `. @@ -54,53 +54,55 @@ # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment -alignments = align.align_optimal(fcr3_seq, nf54_seq, matrix, - gap_penalty = (-10, -1), - terminal_penalty = False) +alignments = align.align_optimal( + fcr3_seq, nf54_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False +) alignment = alignments[0] print(alignment) ######################################################################## -# Epitope mapping data +# Epitope mapping data # -------------------- # # This study used arrays of overlaping peptides to achive high acurracy -# in mapping the epitope. Both FCR3 and NF54 arrays, consisted of +# in mapping the epitope. 
Both FCR3 and NF54 arrays, consisted of # 20-mer peptides with an overlap of 19 and 18 amino acids respectively. # Arbitrary units (AU) of fluorescence intensity quantified the antibody -# recognition for each peptide. -# Our goal is to decorate the aligment, with the fluorescence intensity -# scores of each peptide in the arrays. We used a -# color code from red to white for high to low intensity, respectively. -# The background color of the symbols on the aligment corresponds to the +# recognition for each peptide. +# Our goal is to decorate the aligment, with the fluorescence intensity +# scores of each peptide in the arrays. We used a +# color code from red to white for high to low intensity, respectively. +# The background color of the symbols on the aligment corresponds to the # score for the 20th amino acid at the end of the peptide. # -# Lets create a function that maps the peptide score to the 20th residue +# Lets create a function that maps the peptide score to the 20th residue # of the peptide: + def read_scan(filename, pep_len=20, score_res=20): - if not type(pep_len) is int: + if type(pep_len) is not int: raise TypeError("pep_len : only integers are allowed") - elif not type(score_res) is int: - raise TypeError("score_res : only integers are allowed") + elif type(score_res) is not int: + raise TypeError("score_res : only integers are allowed") elif pep_len < score_res: raise Exception("score_res can't be higher than pep_len") - - elif pep_len != 20 or score_res != 20: - s = (score_res) - pep_len -1 + + elif pep_len != 20 or score_res != 20: + s = (score_res) - pep_len - 1 else: - s =-1 + s = -1 - df= pd.read_csv(filename) - scor_res = df['Seq'].str[s] - df['s_res'] = scor_res + df = pd.read_csv(filename) + scor_res = df["Seq"].str[s] + df["s_res"] = scor_res return df + # Load epitope scan data -fcr3_file_path = "../../../download/FCR3_10ug.csv" -nf54_file_path = "../../../download/NF54_10ug.csv" +fcr3_file_path = "../../../download/FCR3_10ug.csv" 
+nf54_file_path = "../../../download/NF54_10ug.csv" # Define the score residues on the arrays files = [fcr3_file_path, nf54_file_path] @@ -114,66 +116,70 @@ def read_scan(filename, pep_len=20, score_res=20): ag1_scan.head(5) ######################################################################## -# The microarrays contained each peptide printed in duplicated spots. We -# need to combine the values of those experimental replicates into a +# The microarrays contained each peptide printed in duplicated spots. We +# need to combine the values of those experimental replicates into a # unique score for each peptide. Typically, this unique value could come # from the geometric mean between replicates that do not deviate wildly. -# If the average deviation between replicates is high, one can assumme +# If the average deviation between replicates is high, one can assumme # that experimental errors should result in a lower score at a given spot. -# It is easy to imagine that imperfections on the printing of the spot, -# will rather decrease and not increase, the antibody recognition, in -# which case the the peptide signal is better represented +# It is easy to imagine that imperfections on the printing of the spot, +# will rather decrease and not increase, the antibody recognition, in +# which case the the peptide signal is better represented # by the higher score replicate. # -# Now lets write a function to combine the scores adding the flexibility -# to choose cases for those criterias exposed above. We will flag with -# 0 or 1 every peptide entry on the arrays: 1 if the deviation between +# Now lets write a function to combine the scores adding the flexibility +# to choose cases for those criterias exposed above. We will flag with +# 0 or 1 every peptide entry on the arrays: 1 if the deviation between # replicates is higher than 40%, otherwise 0. 
-def combine_scores(dataframe, combine='max', flag_noisy=True): - df= dataframe + +def combine_scores(dataframe, combine="max", flag_noisy=True): + df = dataframe # mean - df['ave'] = df.iloc[:,[1,2]].mean(axis = 1) + df["ave"] = df.iloc[:, [1, 2]].mean(axis=1) # mean deviation - df['avedev'] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2 + df["avedev"] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2 # percent deviation between replicates - df['dev_ratio'] = df.apply(lambda x:0 - if x.avedev==0 else x.avedev/x.ave, axis=1) - + df["dev_ratio"] = df.apply( + lambda x: 0 if x.avedev == 0 else x.avedev / x.ave, axis=1 + ) + # signal value: - if combine == 'max': - df['comb_signal'] = df.apply(lambda x:max(x.r1, x.r2) - if x.dev_ratio >=0.4 else x.ave, axis=1) - elif combine == 'mean': - df['comb_signal'] = df.apply(lambda x:x.ave - if x.dev_ratio <= 0.4 else 0, axis=1) - + if combine == "max": + df["comb_signal"] = df.apply( + lambda x: max(x.r1, x.r2) if x.dev_ratio >= 0.4 else x.ave, axis=1 + ) + elif combine == "mean": + df["comb_signal"] = df.apply( + lambda x: x.ave if x.dev_ratio <= 0.4 else 0, axis=1 + ) + if flag_noisy: - df['flag'] = df.apply(lambda x:0 - if x.dev_ratio <= 0.4 else 1, axis=1) + df["flag"] = df.apply(lambda x: 0 if x.dev_ratio <= 0.4 else 1, axis=1) return df -# Make the corresponding signal equal the replicate with the higest + +# Make the corresponding signal equal the replicate with the higest # score value. -dfa = combine_scores(ag1_scan, combine = 'max', flag_noisy = True) -dfb = combine_scores(ag2_scan, combine = 'max', flag_noisy = True) +dfa = combine_scores(ag1_scan, combine="max", flag_noisy=True) +dfb = combine_scores(ag2_scan, combine="max", flag_noisy=True) dfa.head(5) ######################################################################## -# Many molecular recognition screening campaings e.g. epitope mapping -# screenings follow a long-tailed data distribution. 
To properly +# Many molecular recognition screening campaings e.g. epitope mapping +# screenings follow a long-tailed data distribution. To properly # represent such distribution one can normalize the date using linear or # non-linear transformations on the combined score data. + def data_transform(dataframe, threshold=0): df = dataframe - # Option to set a "threshold" for the signal scores. + # Option to set a "threshold" for the signal scores. t = threshold - df['cubic'] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal-t)), - axis=1) - df['signal_plot'] = df.apply(lambda x: x.cubic/df['cubic'].max(), - axis=1) + df["cubic"] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal - t)), axis=1) + df["signal_plot"] = df.apply(lambda x: x.cubic / df["cubic"].max(), axis=1) + # Normalize, using the power law with cubic exponent. No threshold data_transform(dfa, threshold=0) @@ -184,134 +190,136 @@ def data_transform(dataframe, threshold=0): # ------------------------------------------------------------------------------- # # So far, we have the peptide score data combined, normalized, and mapped -# to a residue for each peptide. +# to a residue for each peptide. # Next, using the alignment trace as a template, we will match the signal -# intensities associated to the score residues, to the position of each +# intensities associated to the score residues, to the position of each # symbol on the alignment, considering the gaps. -# Get the trace for each sequence on the alignment: +# Get the trace for each sequence on the alignment: trace_a = align.get_symbols(alignment)[0] trace_b = align.get_symbols(alignment)[1] + def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): """ - Generate a gapped sequence that relates peptide score data signal with a - template alignment trace. The function returns a list of tuples representing - the gapped sequence, where each tuple consists of a residue and its associated - signal value. 
+ Generate a gapped sequence that relates peptide score data signal with a + template alignment trace. The function returns a list of tuples representing + the gapped sequence, where each tuple consists of a residue and its associated + signal value. Parameters ---------- - dataframe : DataFrame - A *Pandas* dataframe containing columns for each peptide score data, + dataframe : DataFrame + A *Pandas* dataframe containing columns for each peptide score data, and its designated score residue. - seq_trace : list + seq_trace : list The sequence trace obtained from the alignment. - p_len : int + p_len : int The length of each overlapping peptide. overlap_step : int, optional The step size for overlapping peptides.Default is 1. Note: ----- - The 'gapped' sequence may be shorter than the aligment trace if the alignment results - in gaps at either end. Any remaining elements in the trace with 'None' values are + The 'gapped' sequence may be shorter than the aligment trace if the alignment results + in gaps at either end. Any remaining elements in the trace with 'None' values are filled with tuples: ('None', 0). 
""" template = seq_trace df = dataframe - step = overlap_step - gapped = list(zip(df.s_res , df.signal_plot)) - lk1 = df["s_res"].values.tolist() - plen = p_len # peptide length - + step = overlap_step + gapped = list(zip(df.s_res, df.signal_plot)) + lk1 = df["s_res"].values.tolist() + plen = p_len # peptide length + if step == 1: x, b = 0, 0 - c = 0 # cyclic counter up to the peptide length :20 - p = 0 # peptide counter + c = 0 # cyclic counter up to the peptide length :20 + p = 0 # peptide counter for b in range(len(lk1)): for a in template[x:]: - if c < plen-1 : - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if c < plen - 1: + if a == None: + gapped.insert(x, (template[x], 0)) + x = x + 1 elif a != lk1[b]: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif p==0: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif p == 0: + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 else: - x=x+1 - c=c+1 + x = x + 1 + c = c + 1 break else: - c = 0 # reset the counter - p=p+1 - x=x+1 + c = 0 # reset the counter + p = p + 1 + x = x + 1 break elif step == 2: x, b = 0, 0 - c=0 - p=0 + c = 0 + p = 0 for b in range(len(lk1)): for a in template[x:]: - if c < plen-1 and p==0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if c < plen - 1 and p == 0: + if a == None: + gapped.insert(x, (template[x], 0)) + x = x + 1 else: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif p==0 : - c = 0 # reset the counter - p=p+1 - x=x+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif p == 0: + c = 0 # reset the counter + p = p + 1 + x = x + 1 break - if p!=0: - if a==None and c == 0: - gapped.insert(x,(template[x],0)) - x=x+1 - elif c % 2 == 0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if p != 0: + if a == None and c == 0: + gapped.insert(x, (template[x], 0)) + x = x + 1 + elif c % 2 == 0: + if a == None: + gapped.insert(x, (template[x], 0)) + x = x 
+ 1 else: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif c % 2 != 0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif c % 2 != 0: + if a == None: + gapped.insert(x, (template[x], 0)) + x = x + 1 elif a != lk1[b]: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - else: - x=x+1 - c=c+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + else: + x = x + 1 + c = c + 1 break # For terminal gaps - if len(gapped) < len(template) and template[len(gapped)+1]== None: - gapped_tail=[] - for n in range(len(template)-len(gapped)): - gapped_tail.append(('None', 0)) + if len(gapped) < len(template) and template[len(gapped) + 1] == None: + gapped_tail = [] + for n in range(len(template) - len(gapped)): + gapped_tail.append(("None", 0)) gapped = gapped + gapped_tail - + return gapped + # Let's use gapped_seq() to build the gapped sequences # FCR3 array, overlap_step: 1 (pep = 20-mer with 19 overlap) gapd_s1 = gapped_seq(dfa, trace_a, 20, 1) # NF54 array, overlap_step: 2 (pep = 20-mer with 18 overlap) -gapd_s2 = gapped_seq(dfb, trace_b, 20, 2) +gapd_s2 = gapped_seq(dfb, trace_b, 20, 2) # Checkpoint. Both gapped sequences must have the same length. len(gapd_s1) == len(gapd_s2) @@ -320,116 +328,133 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): # Create a signal map # ------------------- # -# Now we will generate an object mapping the signal scores from two gapped +# Now we will generate an object mapping the signal scores from two gapped # sequences. -def signal_map(gapped_seq1, gapped_seq2,): + +def signal_map( + gapped_seq1, + gapped_seq2, +): """ Generate a mapping of signal scores from two gapped sequences. This function takes two gapped sequences, `gapped_seq1` and - `gapped_seq2`. Each sequence is represented as a list of tuples, - with the first element being an amino acid symbol and the second - element being a signal score. 
It extracts the signal scores from - each sequence and creates a 2D array with two columns, where the - first column contains signal scores from `gapped_seq1` and the + `gapped_seq2`. Each sequence is represented as a list of tuples, + with the first element being an amino acid symbol and the second + element being a signal score. It extracts the signal scores from + each sequence and creates a 2D array with two columns, where the + first column contains signal scores from `gapped_seq1` and the second column contains signal scores from `gapped_seq2`. Parameters: ----------- gapped_seq1: list The first gapped sequence. - gapped_seq2: list + gapped_seq2: list The second gapped sequence. Returns: -------- - numpy.ndarray: A 2D numpy array with two columns containing signal + numpy.ndarray: A 2D numpy array with two columns containing signal scores extracted from `gapped_seq1` and `gapped_seq2` respectively. """ gapd_s1 = gapped_seq1 gapd_s2 = gapped_seq2 - fl_score = np.zeros((len(gapd_s1),2)) - + fl_score = np.zeros((len(gapd_s1), 2)) + for v1 in range(len(gapd_s1)): - fl_score[v1,0] = gapd_s1[v1][1] - fl_score[v1,1] = gapd_s2[v1][1] - + fl_score[v1, 0] = gapd_s1[v1][1] + fl_score[v1, 1] = gapd_s2[v1][1] + return fl_score + score = signal_map(gapd_s1, gapd_s2) ######################################################################## -# Sequence alignment decorated with MAb recognition regions +# Sequence alignment decorated with MAb recognition regions # --------------------------------------------------------- # -# Now we can plot the sequence alignment using an :class:`ArrayPlotter` -# instance that higlights sequence recognition regions at the positions +# Now we can plot the sequence alignment using an :class:`ArrayPlotter` +# instance that higlights sequence recognition regions at the positions # of the respective score residue per alignment column. -# To easily interpret the intensity-decorated alignment we will add a -# colorbar scaled accordingly. 
The scale matches the transformation +# To easily interpret the intensity-decorated alignment we will add a +# colorbar scaled accordingly. The scale matches the transformation # applied to the recognition signal recorded on the score ndarray. # -# Let's build a function to create a custom colorbar object. We will -# specify the dataframes corresponding to the two antigens screened in -# this example, the colormap, and the transformation to be +# Let's build a function to create a custom colorbar object. We will +# specify the dataframes corresponding to the two antigens screened in +# this example, the colormap, and the transformation to be # represented with the colorbar. fig = plt.figure(figsize=(8.0, 15)) ax = fig.add_subplot(111) graphics.plot_alignment_array( - ax, alignments[0], fl_score=score, labels=["FCR3", "NF54"], - show_numbers=True, symbols_per_line=80, - show_line_position=True, label_size=10, - number_size=10, symbol_size=6) + ax, + alignments[0], + fl_score=score, + labels=["FCR3", "NF54"], + show_numbers=True, + symbols_per_line=80, + show_line_position=True, + label_size=10, + number_size=10, + symbol_size=6, +) # Add the axes where the colorbar will reside: -ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01]) +ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01]) ax2.set_frame_on(False) -# Access the colormap of the relevant instace of ArrayPlotter: +# Access the colormap of the relevant instace of ArrayPlotter: colormap = graphics.ArrayPlotter(ax2, score).get_cmap() -def draw_colorbar(axes, array1, array2, colormap, - orient=None, title=None): + +def draw_colorbar(axes, array1, array2, colormap, orient=None, title=None): df1 = array1 df2 = array2 cmp = colormap ax = axes orientation = orient label = title - + # custom Formtatter for tick labels on the colorbar def fmt(x, pos): - a, b = '{:.1e}'.format(x).split('e') + a, b = "{:.1e}".format(x).split("e") b = int(b) - return r'${}\cdot10^{{{}}}$'.format(a, b) - - vmiA = df1['comb_signal'].min() - vmiB = 
df2['comb_signal'].min() - vmxA = df1['comb_signal'].max() - vmxB = df2['comb_signal'].max() - - # The normalization of this colormap needs to be consistent with the - # data trasnformtion used earlier on this example. The "cubic" law: - norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA,vmiB), - vmax=max(vmxA,vmxB)) - - fig = mpl.pyplot.figure() - return fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmp), - cax=ax, orientation=orientation, label=label, - format=mpl.ticker.FuncFormatter(fmt)) - -# Draw the colorbar -cbar = draw_colorbar(ax2, dfa, dfb, colormap, orient='horizontal', - title='Fluorescence Intensity [AU]') + return r"${}\cdot10^{{{}}}$".format(a, b) + + vmiA = df1["comb_signal"].min() + vmiB = df2["comb_signal"].min() + vmxA = df1["comb_signal"].max() + vmxB = df2["comb_signal"].max() + + # The normalization of this colormap needs to be consistent with the + # data trasnformtion used earlier on this example. The "cubic" law: + norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA, vmiB), vmax=max(vmxA, vmxB)) + + fig = mpl.pyplot.figure() + return fig.colorbar( + mpl.cm.ScalarMappable(norm=norm, cmap=cmp), + cax=ax, + orientation=orientation, + label=label, + format=mpl.ticker.FuncFormatter(fmt), + ) + + +# Draw the colorbar +cbar = draw_colorbar( + ax2, dfa, dfb, colormap, orient="horizontal", title="Fluorescence Intensity [AU]" +) # To improve readability we tilt the ticklabels on the colorbar labels = cbar.ax.get_xticklabels() -plt.setp(labels, rotation=45, horizontalalignment='center') +plt.setp(labels, rotation=45, horizontalalignment="center") plt.show() ######################################################################## # References # ---------- # -# .. footbibliography:: \ No newline at end of file +# .. 
footbibliography:: diff --git a/doc/examples/scripts/sequence/homology/residue_coevolution.py b/doc/examples/scripts/sequence/homology/residue_coevolution.py index e1f2f7329..e84b59f2f 100644 --- a/doc/examples/scripts/sequence/homology/residue_coevolution.py +++ b/doc/examples/scripts/sequence/homology/residue_coevolution.py @@ -43,22 +43,21 @@ # License: BSD 3 clause import warnings -import numpy as np -import matplotlib.pyplot as plt import matplotlib.colors as colors +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx -import biotite.sequence.align as align -import biotite.sequence.graphics as graphics import biotite.application.blast as blast import biotite.application.clustalo as clustalo import biotite.database.rcsb as rcsb - +import biotite.sequence.align as align +import biotite.sequence.graphics as graphics +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx # Get structure and sequence pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif")) -sequence = pdbx.get_sequence(pdbx_file)['A'] +sequence = pdbx.get_sequence(pdbx_file)["A"] # 'use_author_fields' is set to false, # to ensure that values in the 'res_id' annotation point to the sequence structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) @@ -88,16 +87,24 @@ # Plot MSA number_functions = [] for start in hit_starts: + def some_func(x, start=start): return x + start + number_functions.append(some_func) fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.gca() graphics.plot_alignment_type_based( - ax, alignment, symbols_per_line=len(alignment), labels=hit_ids, - symbol_size=8, number_size=8, label_size=8, - show_numbers=True, number_functions=number_functions, - color_scheme="flower" + ax, + alignment, + symbols_per_line=len(alignment), + labels=hit_ids, + symbol_size=8, + number_size=8, + label_size=8, + show_numbers=True, + number_functions=number_functions, + 
color_scheme="flower", ) ax.set_title("C-Myb R1-like sequences") fig.tight_layout() @@ -111,6 +118,7 @@ def some_func(x, start=start): # High values indicate that the residues at the respective two # positions have coevolved. + def mutual_information_zscore(alignment, n_shuffle=100): codes = align.get_codes(alignment).T alph = alignment.sequences[0].alphabet @@ -127,6 +135,7 @@ def mutual_information_zscore(alignment, n_shuffle=100): z_score = (mi - mean) / std return z_score + def _shuffle(codes): shuffled_codes = codes.copy() # Shuffle each alignment column @@ -134,6 +143,7 @@ def _shuffle(codes): np.random.shuffle(shuffled_codes[i]) return shuffled_codes + def _mutual_information(codes, alph): mi = np.zeros((len(alignment), len(alignment))) # Iterate over all columns to choose first column @@ -147,10 +157,10 @@ def _mutual_information(codes, alph): # Iterate over all symbols in both columns for k in range(codes.shape[1]): # Skip rows where either column has a gap - if codes[i,k] != -1 and codes[j,k] != -1: - marginal_counts_i[codes[i,k]] += 1 - marginal_counts_j[codes[j,k]] += 1 - combined_counts[codes[i,k], codes[j,k]] += 1 + if codes[i, k] != -1 and codes[j, k] != -1: + marginal_counts_i[codes[i, k]] += 1 + marginal_counts_j[codes[j, k]] += 1 + combined_counts[codes[i, k], codes[j, k]] += 1 nrows += 1 marginal_probs_i = marginal_counts_i / nrows marginal_probs_j = marginal_counts_j / nrows @@ -159,27 +169,31 @@ def _mutual_information(codes, alph): with warnings.catch_warnings(): warnings.simplefilter("ignore") mi_before_sum = ( - combined_probs * np.log2( - combined_probs / ( - marginal_probs_i[:, np.newaxis] * - marginal_probs_j[np.newaxis, :] + combined_probs + * np.log2( + combined_probs + / ( + marginal_probs_i[:, np.newaxis] + * marginal_probs_j[np.newaxis, :] ) ) ).flatten() - mi[i,j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)]) + mi[i, j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)]) return mi # Remove alignment columns that have a gap in the 
C-Myb sequence -alignment = alignment[alignment.trace[:,0] != -1] +alignment = alignment[alignment.trace[:, 0] != -1] mi = mutual_information_zscore(alignment) # Create the color map for the plot color = colors.to_rgb(biotite.colors["dimorange"]) cmap_val = np.stack( - [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) + for i in range(len(color)) + ] ).transpose() cmap = colors.ListedColormap(cmap_val) @@ -196,4 +210,4 @@ def _mutual_information(codes, alph): fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py index a88d24284..b8c8ad276 100644 --- a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py +++ b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py @@ -23,29 +23,28 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.colors import LinearSegmentedColormap +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.genbank as gb -import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.clustalo as clustalo - +import biotite.sequence.io.genbank as gb # Search for DNA sequences that belong to the cited article -query = entrez.SimpleQuery("Forensic Sci. Int.", "Journal") \ - & entrez.SimpleQuery("159", "Volume") \ - & entrez.SimpleQuery("132-140", "Page Number") +query = ( + entrez.SimpleQuery("Forensic Sci. 
Int.", "Journal") + & entrez.SimpleQuery("159", "Volume") + & entrez.SimpleQuery("132-140", "Page Number") +) uids = entrez.search(query, db_name="nuccore") # Download and read file containing the Genbank records for the THCA # synthase genes -multi_file = gb.MultiFile.read(entrez.fetch_single_file( - uids, file_name=None, db_name="nuccore", ret_type="gb" -)) +multi_file = gb.MultiFile.read( + entrez.fetch_single_file(uids, file_name=None, db_name="nuccore", ret_type="gb") +) # This dictionary maps the strain ID to the protein sequence @@ -81,6 +80,7 @@ for sequence in sequences.values(): assert len(sequence) == seq_len + # Create consensus sequences for the drug-type and fiber-type cannabis # strains def create_consensus(sequences): @@ -89,9 +89,7 @@ def create_consensus(sequences): for seq_pos in range(seq_len): # Count the number of occurrences of each amino acid # at the given sequence position - counts = np.bincount( - [sequence.code[seq_pos] for sequence in sequences] - ) + counts = np.bincount([sequence.code[seq_pos] for sequence in sequences]) # The consensus amino acid is the most frequent amino acid consensus_code[seq_pos] = np.argmax(counts) # Create empty ProteinSequence object... 
@@ -101,6 +99,7 @@ def create_consensus(sequences): consensus_sequence.code = consensus_code return consensus_sequence + drug_type_consensus = create_consensus( [sequences[strain] for strain in (1, 10, 13, 20, 53, 54)] ) @@ -120,7 +119,8 @@ def create_consensus(sequences): # At low similarity the symbols are colored red, # at high similarity the symbols are colored white cmap = LinearSegmentedColormap.from_list( - "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)] + "custom", + colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)], # ^ reddish ^ white ) @@ -128,11 +128,16 @@ def create_consensus(sequences): ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, symbols_per_line=50, + ax, + alignment, + matrix=matrix, + symbols_per_line=50, labels=["Drug-type", "Fiber-type"], - show_numbers=True, cmap=cmap, symbol_size=8 + show_numbers=True, + cmap=cmap, + symbol_size=8, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py index 64d67f2f7..400497ef4 100644 --- a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py +++ b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py @@ -10,12 +10,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.phylo as phylo import biotite.sequence.graphics as graphics +import biotite.sequence.phylo as phylo # Obtain BLOSUM62 matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -31,11 +31,12 @@ matrix = align.SubstitutionMatrix( seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]), seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]), - matrix.score_matrix()[:-4, :-4] + matrix.score_matrix()[:-4, :-4], ) similarities = matrix.score_matrix() print(matrix) + 
######################################################################## # Now a function must be defined, that converts the similarity depicted # by a substitution matrix into a distance required by the UPGMA method. @@ -45,25 +46,26 @@ # # Finally the obtained (phylogenetic) tree is plotted as dendrogram. def get_distance(similarities, i, j): - s_max = (similarities[i,i] + similarities[j,j]) / 2 - return s_max - similarities[i,j] + s_max = (similarities[i, i] + similarities[j, j]) / 2 + return s_max - similarities[i, j] + distances = np.zeros(similarities.shape) for i in range(distances.shape[0]): for j in range(distances.shape[1]): - distances[i,j] = get_distance(similarities, i, j) + distances[i, j] = get_distance(similarities, i, j) tree = phylo.upgma(distances) fig = plt.figure(figsize=(8.0, 5.0)) ax = fig.add_subplot(111) # Use the 3-letter amino acid code aa label -labels = [seq.ProteinSequence.convert_letter_1to3(letter).capitalize() - for letter in matrix.get_alphabet1()] -graphics.plot_dendrogram( - ax, tree, orientation="top", labels=labels -) +labels = [ + seq.ProteinSequence.convert_letter_1to3(letter).capitalize() + for letter in matrix.get_alphabet1() +] +graphics.plot_dendrogram(ax, tree, orientation="top", labels=labels) ax.set_ylabel("Distance") # Add grid for clearer distance perception ax.yaxis.grid(color="lightgray") -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/codon_usage.py b/doc/examples/scripts/sequence/misc/codon_usage.py index dd6963c24..e6d7b888d 100644 --- a/doc/examples/scripts/sequence/misc/codon_usage.py +++ b/doc/examples/scripts/sequence/misc/codon_usage.py @@ -35,14 +35,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import tempfile import itertools +import tempfile import numpy as np +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta -import 
biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( @@ -56,8 +55,8 @@ # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { - codon: 0 for codon - in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) ) + codon: 0 + for codon in itertools.product(*([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) @@ -82,7 +81,7 @@ # Iterate over the sequence in non-overlapping frames of 3 # and count the occurence of each codon for i in range(0, len(cds_seq), 3): - codon_code = tuple(cds_seq.code[i:i+3]) + codon_code = tuple(cds_seq.code[i : i + 3]) codon_counter[codon_code] += 1 # Convert the total frequencies into relative frequencies @@ -165,4 +164,4 @@ # Print the contents of the created FASTA file print(fasta_file) # In a real application it would be written onto the hard drive via -# fasta_file.write("some_file.fasta") \ No newline at end of file +# fasta_file.write("some_file.fasta") diff --git a/doc/examples/scripts/sequence/misc/color_schemes.py b/doc/examples/scripts/sequence/misc/color_schemes.py index de2dd80ad..b84542932 100644 --- a/doc/examples/scripts/sequence/misc/color_schemes.py +++ b/doc/examples/scripts/sequence/misc/color_schemes.py @@ -8,57 +8,65 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq -import biotite.sequence.graphics as graphics -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.gridspec import GridSpec from matplotlib.patches import Rectangle +import biotite.sequence as seq +import biotite.sequence.graphics as graphics + def plot_colors(ax, alphabet): - x_space=0.1 - y_space=0.3 + x_space = 0.1 + y_space = 0.3 scheme_names = 
sorted(graphics.list_color_scheme_names(alphabet)) scheme_names.reverse() - schemes = [graphics.get_color_scheme(name, alphabet) - for name in scheme_names] + schemes = [graphics.get_color_scheme(name, alphabet) for name in scheme_names] for i, scheme in enumerate(schemes): for j, color in enumerate(scheme): - box = Rectangle((j - 0.5 + x_space/2, i - 0.5 + y_space/2), - 1 - x_space, 1 - y_space, color=color, - linewidth=0) + box = Rectangle( + (j - 0.5 + x_space / 2, i - 0.5 + y_space / 2), + 1 - x_space, + 1 - y_space, + color=color, + linewidth=0, + ) ax.add_patch(box) ax.set_xticks(np.arange(len(alphabet))) ax.set_yticks(np.arange(len(schemes))) ax.set_xticklabels([symbol for symbol in alphabet]) ax.set_yticklabels(scheme_names) - ax.set_xlim(-0.5, len(alphabet)-0.5) - ax.set_ylim(-0.5, len(schemes)-0.5) + ax.set_xlim(-0.5, len(alphabet) - 0.5) + ax.set_ylim(-0.5, len(schemes) - 0.5) ax.spines["left"].set_visible(False) ax.spines["right"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["top"].set_visible(False) - ax.xaxis.set_ticks_position("none") + ax.xaxis.set_ticks_position("none") ax.yaxis.set_ticks_position("none") + nuc_alphabet = seq.NucleotideSequence.alphabet_amb prot_alphabet = seq.ProteinSequence.alphabet pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") figure = plt.figure(figsize=(8.0, 5.0)) gs = GridSpec( - 3, 1, - height_ratios=[len(graphics.list_color_scheme_names(alphabet)) - for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)], + 3, + 1, + height_ratios=[ + len(graphics.list_color_scheme_names(alphabet)) + for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet) + ], ) -ax = figure.add_subplot(gs[0,0]) +ax = figure.add_subplot(gs[0, 0]) ax.set_title("Nucleotide color schemes") plot_colors(ax, nuc_alphabet) -ax = figure.add_subplot(gs[1,0]) +ax = figure.add_subplot(gs[1, 0]) ax.set_title("Protein color schemes") plot_colors(ax, prot_alphabet) -ax = figure.add_subplot(gs[2,0]) +ax = figure.add_subplot(gs[2, 0]) 
ax.set_title("Protein block color schemes") plot_colors(ax, pb_alphabet) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/color_schemes_protein.py b/doc/examples/scripts/sequence/misc/color_schemes_protein.py index d68b6cddc..a747c2c74 100644 --- a/doc/examples/scripts/sequence/misc/color_schemes_protein.py +++ b/doc/examples/scripts/sequence/misc/color_schemes_protein.py @@ -42,24 +42,23 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # Generate example alignment # (the same as in the bacterial luciferase example) -query = entrez.SimpleQuery("luxA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("luxA", "Gene Name") & entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) uids = entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) @@ -70,10 +69,22 @@ # Get color scheme names alphabet = seq.ProteinSequence.alphabet schemes = [ - "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean", - "rainbow", "clustalx", - "zappo", "taylor", "buried", "hydrophobicity", - "prophelix", "propstrand", "propturn" + "flower", + "blossom", + "spring", + 
"wither", + "autumn", + "sunset", + "ocean", + "rainbow", + "clustalx", + "zappo", + "taylor", + "buried", + "hydrophobicity", + "prophelix", + "propstrand", + "propturn", ] count = len(schemes) # Assert that this example displays all available amino acid color schemes @@ -82,20 +93,24 @@ # Visualize each scheme using the example alignment -fig = plt.figure(figsize=(8.0, count*2.0)) +fig = plt.figure(figsize=(8.0, count * 2.0)) gridspec = GridSpec(2, count) for i, name in enumerate(schemes): for j, color_symbols in enumerate([False, True]): - ax = fig.add_subplot(count, 2, 2*i + j + 1) + ax = fig.add_subplot(count, 2, 2 * i + j + 1) if j == 0: ax.set_ylabel(name) alignment_part = alignment[:40] else: alignment_part = alignment[40:] graphics.plot_alignment_type_based( - ax, alignment_part, symbols_per_line=len(alignment_part), - color_scheme=name, color_symbols=color_symbols, symbol_size=8 + ax, + alignment_part, + symbols_per_line=len(alignment_part), + color_scheme=name, + color_symbols=color_symbols, + symbol_size=8, ) fig.tight_layout() fig.subplots_adjust(wspace=0) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py index 88d0eb4ad..5c576c651 100644 --- a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py +++ b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py @@ -22,25 +22,23 @@ # License: BSD 3 clause import matplotlib.pyplot as plt -from matplotlib.lines import Line2D import numpy as np +from matplotlib.lines import Line2D from scipy.stats import linregress import biotite +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -from biotite.sequence.align.alignment import score -import biotite.sequence.io.fasta as fasta -import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics - +import 
biotite.sequence.io.fasta as fasta GAP_PENALTY = (-12, -1) # Download and parse protein sequences of avidin and streptavidin -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - ["CAC34569", "ACL82594"], None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta") +) for name, sequence in fasta_file.items(): if "CAC34569" in name: query_seq = seq.ProteinSequence(sequence) @@ -54,8 +52,7 @@ # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignment = align.align_optimal( - query_seq, hit_seq, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + query_seq, hit_seq, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] @@ -64,8 +61,12 @@ fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"], - show_numbers=True, show_line_position=True + ax, + alignment, + matrix=matrix, + labels=["Avidin (query)", "Database hit"], + show_numbers=True, + show_line_position=True, ) fig.tight_layout() @@ -103,11 +104,13 @@ # # f(x) = \lambda t(x) e^{-t(x)} + # The probability density function of the extreme value distribution def pdf(x, l, u): t = np.exp(-l * (x - u)) return l * t * np.exp(-t) + x = np.linspace(-5, 10, 1000) y = pdf(x, 1, 0) @@ -124,7 +127,7 @@ def pdf(x, l, u): # .. math:: # # u = \frac{\ln Kmn}{\lambda}, -# +# # where :math:`m` and :math:`n` are the lengths of the aligned # sequences. 
# :math:`K` and :math:`\lambda` can be calculated from the substitution @@ -166,32 +169,39 @@ def pdf(x, l, u): SAMPLE_SIZE = 10000 SEQ_LENGTH = 300 -BACKGROUND = np.array(list({ - "A": 35155, - "C": 8669, - "D": 24161, - "E": 28354, - "F": 17367, - "G": 33229, - "H": 9906, - "I": 23161, - "K": 25872, - "L": 40625, - "M": 10101, - "N": 20212, - "P": 23435, - "Q": 19208, - "R": 23105, - "S": 32070, - "T": 26311, - "V": 29012, - "W": 5990, - "Y": 14488, - "B": 0, - "Z": 0, - "X": 0, - "*": 0, -}.values())) / 450431 +BACKGROUND = ( + np.array( + list( + { + "A": 35155, + "C": 8669, + "D": 24161, + "E": 28354, + "F": 17367, + "G": 33229, + "H": 9906, + "I": 23161, + "K": 25872, + "L": 40625, + "M": 10101, + "N": 20212, + "P": 23435, + "Q": 19208, + "R": 23105, + "S": 32070, + "T": 26311, + "V": 29012, + "W": 5990, + "Y": 14488, + "B": 0, + "Z": 0, + "X": 0, + "*": 0, + }.values() + ) + ) + / 450431 +) # Generate the sequence code for random sequences @@ -199,7 +209,7 @@ def pdf(x, l, u): random_sequence_code = np.random.choice( np.arange(len(seq.ProteinSequence.alphabet)), size=(SAMPLE_SIZE, 2, SEQ_LENGTH), - p=BACKGROUND + p=BACKGROUND, ) # Sample alignment scores @@ -207,11 +217,10 @@ def pdf(x, l, u): for i in range(SAMPLE_SIZE): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_alignment = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] sample_scores[i] = sample_alignment.score @@ -238,18 +247,17 @@ def pdf(x, l, u): freqs = np.bincount(sample_scores) / SAMPLE_SIZE # Coordinates for the fit -x = np.linspace(0, len(freqs)-1, 1000) +x = np.linspace(0, len(freqs) - 1, 1000) y = pdf(x, l, u) fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.scatter( - np.arange(len(freqs)), 
freqs, color=biotite.colors["dimorange"], - label="Sample", s=8 + np.arange(len(freqs)), freqs, color=biotite.colors["dimorange"], label="Sample", s=8 ) ax.plot(x, y, color="gray", linestyle="--", label="Fit") ax.set_xlabel("Similarity score") ax.set_ylabel("Probability") -ax.set_xlim(0, len(freqs)-1) +ax.set_xlim(0, len(freqs) - 1) ax.legend(loc="upper left") fig.tight_layout() @@ -281,8 +289,7 @@ def pdf(x, l, u): SAMPLE_SIZE_PER_LENGTH = 1000 # The sequence lengths to be sampled -length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE) \ - .astype(int) +length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE).astype(int) u_series = np.zeros(LENGTH_SAMPLE_SIZE) l_series = np.zeros(LENGTH_SAMPLE_SIZE) for i, length in enumerate(length_samples): @@ -290,18 +297,17 @@ def pdf(x, l, u): random_sequence_code = np.random.choice( np.arange(len(seq.ProteinSequence.alphabet)), size=(SAMPLE_SIZE_PER_LENGTH, 2, length), - p=BACKGROUND + p=BACKGROUND, ) scores = np.zeros(SAMPLE_SIZE_PER_LENGTH, dtype=int) for j in range(SAMPLE_SIZE_PER_LENGTH): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[j,0] - seq2.code = random_sequence_code[j,1] + seq1.code = random_sequence_code[j, 0] + seq2.code = random_sequence_code[j, 1] sample_alignment = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] scores[j] = sample_alignment.score @@ -309,7 +315,7 @@ def pdf(x, l, u): u_series[i] = np.mean(scores) - np.euler_gamma / l_series[i] ######################################################################## -# Now we use a linear fit of :math:`u` to check if there is a linear +# Now we use a linear fit of :math:`u` to check if there is a linear # relation. 
# Furthermore, if this is true, the slope and intercept of # the fit should give us a more precise estimation of :math:`\lambda` @@ -319,7 +325,7 @@ def pdf(x, l, u): slope, intercept, r, _, _ = linregress(ln_mn, u_series) # More precise parameter estimation from fit -l = 1/slope +l = 1 / slope k = np.exp(intercept * l) # Coordinates for fit @@ -327,19 +333,17 @@ def pdf(x, l, u): y_fit = slope * x_fit + intercept fig, ax = plt.subplots(figsize=(8.0, 4.0)) -arrowprops = dict( - facecolor='black', shrink=0.1, width=3, headwidth=10, headlength=10 -) +arrowprops = dict(facecolor="black", shrink=0.1, width=3, headwidth=10, headlength=10) ax.scatter(ln_mn, u_series, color=biotite.colors["dimorange"], s=8) ax.plot(x_fit, y_fit, color=biotite.colors["darkorange"], linestyle="--") x_annot = 12 ax.annotate( f"R² = {r**2:.3f}\nK = {k:.3f}", - xy = (x_annot, slope * x_annot + intercept), - xytext = (-100, 50), - textcoords = "offset pixels", - arrowprops = arrowprops, + xy=(x_annot, slope * x_annot + intercept), + xytext=(-100, 50), + textcoords="offset pixels", + arrowprops=arrowprops, ) ax2 = ax.twinx() @@ -348,10 +352,10 @@ def pdf(x, l, u): x_annot = 2 ax2.annotate( f"λ = {l:.3f}", - xy = (x_annot, l), - xytext = (0, -50), - textcoords = "offset pixels", - arrowprops = arrowprops, + xy=(x_annot, l), + xytext=(0, -50), + textcoords="offset pixels", + arrowprops=arrowprops, ) ax.set_xlabel("ln(mn)") @@ -361,17 +365,25 @@ def pdf(x, l, u): ax.set_ylim(0, 50) ax2.set_ylim(0, 0.6) ax.legend( - handles = [ + handles=[ Line2D( - [0], [0], color=biotite.colors["dimorange"], label='u', - marker='o', linestyle="None" + [0], + [0], + color=biotite.colors["dimorange"], + label="u", + marker="o", + linestyle="None", ), Line2D( - [0], [0], color=biotite.colors["lightgreen"], label='λ', - marker='o', linestyle="None" - ) + [0], + [0], + color=biotite.colors["lightgreen"], + label="λ", + marker="o", + linestyle="None", + ), ], - loc = "upper left" + loc="upper left", ) fig.tight_layout() 
@@ -398,17 +410,17 @@ def pdf(x, l, u): # E-value calculation # ------------------- # -# Finally, we can use the estimated parameters to calculate the E-value +# Finally, we can use the estimated parameters to calculate the E-value # of the alignment of interest. # In this case we use :math:`K` and :math:`\lambda` from the linear fit, # but as already indicated we could alternatively use the parameters # from sampling alignments of sequences at a single length :math:`n`. # While :math:`\lambda` is a direct result of the method of moments as -# shown above, :math:`K` is calculated as +# shown above, :math:`K` is calculated as # # .. math:: # -# K = \frac{e^{\lambda u}}{n^2} +# K = \frac{e^{\lambda u}}{n^2} # # where :math:`n` is the length of both sequences in each sample. # @@ -425,12 +437,12 @@ def pdf(x, l, u): DATABASE_SIZE = 1_000_000 + def e_value(score, length1, length2, k, l): return k * length1 * length2 * np.exp(-l * score) -e = e_value( - alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, l -) + +e = e_value(alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, l) print(f"E-value = {e:.2e}") ######################################################################## diff --git a/doc/examples/scripts/sequence/misc/orf_identification.py b/doc/examples/scripts/sequence/misc/orf_identification.py index 6c7d87abd..695b30af8 100644 --- a/doc/examples/scripts/sequence/misc/orf_identification.py +++ b/doc/examples/scripts/sequence/misc/orf_identification.py @@ -16,10 +16,8 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez -import matplotlib.pyplot as plt +import biotite.sequence.io.fasta as fasta # Download Porcine circovirus genome file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta") @@ -29,13 +27,19 @@ proteins, positions = genome.translate() print("Forward strand:") for i in range(len(proteins)): - 
print("{:4d} - {:4d}: {:}" - .format(positions[i][0], positions[i][1], str(proteins[i]))) + print( + "{:4d} - {:4d}: {:}".format( + positions[i][0], positions[i][1], str(proteins[i]) + ) + ) print("\n") # Perform translation for complementary strand genome_rev = genome.reverse().complement() proteins, positions = genome_rev.translate() print("Reverse strand:") for i in range(len(proteins)): - print("{:5d} - {:5d}: {:}" - .format(positions[i][0], positions[i][1], str(proteins[i]))) \ No newline at end of file + print( + "{:5d} - {:5d}: {:}".format( + positions[i][0], positions[i][1], str(proteins[i]) + ) + ) diff --git a/doc/examples/scripts/sequence/profile/anderson_logo.py b/doc/examples/scripts/sequence/profile/anderson_logo.py index 50b195f56..20cee13b2 100644 --- a/doc/examples/scripts/sequence/profile/anderson_logo.py +++ b/doc/examples/scripts/sequence/profile/anderson_logo.py @@ -9,33 +9,35 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics # The list of Anderson promoters -seqs = [seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"), - seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"), - seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"), - seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"), - 
seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"), - seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"), - seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"), - seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")] +seqs = [ + seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"), + seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"), + seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"), + seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"), + seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"), + seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"), + seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"), + seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc"), +] # Sequences do not need to be aligned # -> Create alignment with trivial trace # [[0 0 0 ...] @@ -43,11 +45,11 @@ # [2 2 2 ...] # ... 
] alignment = align.Alignment( - sequences = seqs, - trace = np.tile(np.arange(len(seqs[0])), len(seqs)) \ - .reshape(len(seqs), len(seqs[0])) \ - .transpose(), - score = 0 + sequences=seqs, + trace=np.tile(np.arange(len(seqs[0])), len(seqs)) + .reshape(len(seqs), len(seqs[0])) + .transpose(), + score=0, ) # Create sequence logo from alignment fig = plt.figure(figsize=(8.0, 1.5)) @@ -57,4 +59,4 @@ # Remove the entire frame ax.axis("off") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/profile/rbs_identification.py b/doc/examples/scripts/sequence/profile/rbs_identification.py index acb30ecc1..fd58280bb 100644 --- a/doc/examples/scripts/sequence/profile/rbs_identification.py +++ b/doc/examples/scripts/sequence/profile/rbs_identification.py @@ -16,17 +16,15 @@ # License: BSD 3 clause import tempfile -import numpy as np import matplotlib.pyplot as plt -from matplotlib.patches import Patch import matplotlib.ticker as ticker +import numpy as np +from matplotlib.patches import Patch import biotite +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.muscle as muscle - +import biotite.sequence.io.genbank as gb UTR_LENGTH = 20 @@ -58,18 +56,15 @@ # CDS is on if loc.strand == seq.Location.Strand.FORWARD: utr_start = loc.first - UTR_LENGTH - utr_stop = loc.first + utr_stop = loc.first # Include the start codon (3 bases) in the UTRs for later # visualization - utrs.append( - bl21_genome[utr_start : utr_stop + 3].sequence - ) + utrs.append(bl21_genome[utr_start : utr_stop + 3].sequence) else: utr_start = loc.last + 1 - utr_stop = loc.last + 1 + UTR_LENGTH + utr_stop = loc.last + 1 + UTR_LENGTH utrs.append( - bl21_genome[utr_start - 3 : utr_stop].sequence \ - .reverse().complement() + bl21_genome[utr_start - 3 : 
utr_stop].sequence.reverse().complement() ) @@ -82,14 +77,15 @@ frequencies[np.arange(len(utr)), utr.code] += 1 profile = seq.SequenceProfile( - symbols = frequencies, - gaps = np.zeros(len(frequencies)), - alphabet = bl21_genome.sequence.alphabet + symbols=frequencies, + gaps=np.zeros(len(frequencies)), + alphabet=bl21_genome.sequence.alphabet, ) ### Visualize the profile + # Spend extra effort for correct sequence postion labels def normalize_seq_pos(x): """ @@ -103,15 +99,17 @@ def normalize_seq_pos(x): x -= 1 return x + @ticker.FuncFormatter def sequence_loc_formatter(x, pos): x = normalize_seq_pos(x) return f"{x:+}" + COLOR_SCHEME = [ - biotite.colors["lightgreen"], # A - biotite.colors["orange"], # C - biotite.colors["dimgreen"], # G + biotite.colors["lightgreen"], # A + biotite.colors["orange"], # C + biotite.colors["dimgreen"], # G biotite.colors["brightorange"], # T ] @@ -127,11 +125,14 @@ def sequence_loc_formatter(x, pos): ax.set_ylabel("Conservation (Bits)") ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) -ax.legend(loc="upper left", handles=[ - Patch(color=biotite.colors["green"], label="Purine"), - Patch(color=biotite.colors["lightorange"], label="Pyrimidine"), -]) +ax.legend( + loc="upper left", + handles=[ + Patch(color=biotite.colors["green"], label="Purine"), + Patch(color=biotite.colors["lightorange"], label="Pyrimidine"), + ], +) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/sequencing/gene_counts.py b/doc/examples/scripts/sequence/sequencing/gene_counts.py index 6dd4bed65..6fa73b0c1 100644 --- a/doc/examples/scripts/sequence/sequencing/gene_counts.py +++ b/doc/examples/scripts/sequence/sequencing/gene_counts.py @@ -19,21 +19,20 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from io import StringIO import functools -import multiprocessing import gzip -import numpy as np +import multiprocessing +from io import StringIO import 
matplotlib.pyplot as plt +import numpy as np import pandas as pd import requests import biotite +import biotite.application.sra as sra import biotite.sequence as seq +import biotite.sequence.align as align import biotite.sequence.io.fasta as fasta import biotite.sequence.io.fastq as fastq -import biotite.sequence.align as align -import biotite.application.sra as sra - # The number of processes for read mapping N_PROCESS = 2 @@ -93,6 +92,7 @@ # extracts the gene symbols, i.e. the 'names' of the genes, and the # corresponding cDNA sequences. + def get_gene_symbol(header): fields = header.split() for field in fields: @@ -103,6 +103,7 @@ def get_gene_symbol(header): # No gene symbol for this cDNA (e.g. non-coding) return None + response = requests.get(CDNA_URL) fasta_content = gzip.decompress(response.content).decode("UTF-8") @@ -123,9 +124,7 @@ def get_gene_symbol(header): # The k-mer code in restricted to int64, so a larger number # of base alphabet codes decreases the *k* that fits into # the integer type - sequences.append( - seq.NucleotideSequence(seq_string, ambiguous=False) - ) + sequences.append(seq.NucleotideSequence(seq_string, ambiguous=False)) except seq.AlphabetError: # For the simplicity of this example just ignore sequences # with unambiguous symbols @@ -172,13 +171,10 @@ def get_gene_symbol(header): base_alph = seq.NucleotideSequence.alphabet_unamb kmer_alph = align.KmerAlphabet(base_alph, K) -min_selector = align.MinimizerSelector( - kmer_alph, WINDOW, align.RandomPermutation() -) +min_selector = align.MinimizerSelector(kmer_alph, WINDOW, align.RandomPermutation()) kmer_table = align.BucketKmerTable.from_kmer_selection( - kmer_alph, - *zip(*[min_selector.select(sequence) for sequence in sequences]) + kmer_alph, *zip(*[min_selector.select(sequence) for sequence in sequences]) ) ######################################################################## @@ -202,6 +198,7 @@ def get_gene_symbol(header): # After all alignments have been collected, simply the 
highest-scoring # one is chosen as the *correct* one. + def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): try: read = seq.NucleotideSequence(read_string, ambiguous=False) @@ -226,10 +223,13 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): ( gene_i, align.align_banded( - read, gene_sequences[gene_i], substitution_matrix, + read, + gene_sequences[gene_i], + substitution_matrix, band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), - gap_penalty= -10, max_number=1 - )[0] + gap_penalty=-10, + max_number=1, + )[0], ) for gene_i, diagonal in zip(matched_gene_indices, matched_diagonals) ] @@ -243,9 +243,9 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix() -for i, (_, (seq_string, q)) in enumerate(fastq.FastqFile.read_iter( - fastq_path, offset="Sanger" -)): +for i, (_, (seq_string, q)) in enumerate( + fastq.FastqFile.read_iter(fastq_path, offset="Sanger") +): # For demonstration only a single clean read is mapped if i == 3: read_string = seq_string @@ -266,10 +266,11 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): # However, for the large number of reads which can be then processed in # parallel, it is still worth it. 
+ def read_iter(fastq_path): - for i, (_, (read_string, quality)) in enumerate(fastq.FastqFile.read_iter( - fastq_path, offset="Sanger" - )): + for i, (_, (read_string, quality)) in enumerate( + fastq.FastqFile.read_iter(fastq_path, offset="Sanger") + ): # For the purpose of this example only a faction of the reads # are processed to save computation time if i >= EXCERPT: @@ -279,21 +280,24 @@ def read_iter(fastq_path): continue yield read_string + with multiprocessing.Pool(processes=N_PROCESS) as p: # Use multiprocessing to map reads to genes # and remove non-mappable reads (None values) afterwards - mapping_results = list(filter( - lambda mapping: mapping is not None, - p.map( - functools.partial( - map_read, - kmer_table=kmer_table, - gene_sequences=sequences, - substitution_matrix=substitution_matrix + mapping_results = list( + filter( + lambda mapping: mapping is not None, + p.map( + functools.partial( + map_read, + kmer_table=kmer_table, + gene_sequences=sequences, + substitution_matrix=substitution_matrix, + ), + read_iter(fastq_path), ), - read_iter(fastq_path) ) - )) + ) ######################################################################## # Now the genes are counted: @@ -324,7 +328,7 @@ def read_iter(fastq_path): # Put into dataframe for prettier printing counts = pd.DataFrame( {"gene_symbol": ranked_gene_symbols, "count": ranked_counts}, - index = np.arange(1, len(ranked_counts) + 1) + index=np.arange(1, len(ranked_counts) + 1), ) # Show Top N @@ -335,10 +339,7 @@ def read_iter(fastq_path): # Finally the top expressed genes are plotted. 
figure, ax = plt.subplots(figsize=(8.0, 6.0), constrained_layout=True) -ax.barh( - top_counts["gene_symbol"], top_counts["count"], - color=biotite.colors["orange"] -) +ax.barh(top_counts["gene_symbol"], top_counts["count"], color=biotite.colors["orange"]) ax.invert_yaxis() ax.set_title(f"Top {N_TOP_LIST} expressed genes", weight="semibold") ax.set_xlabel("Counts") @@ -348,4 +349,4 @@ def read_iter(fastq_path): # References # ---------- # -# .. footbibliography:: \ No newline at end of file +# .. footbibliography:: diff --git a/doc/examples/scripts/sequence/sequencing/genome_assembly.py b/doc/examples/scripts/sequence/sequencing/genome_assembly.py index bf5474fd9..29ab83656 100644 --- a/doc/examples/scripts/sequence/sequencing/genome_assembly.py +++ b/doc/examples/scripts/sequence/sequencing/genome_assembly.py @@ -1,4 +1,4 @@ -""" +r""" Comparative genome assembly =========================== @@ -48,21 +48,20 @@ import itertools import tempfile from concurrent.futures import ProcessPoolExecutor -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D +import numpy as np from matplotlib.colors import LinearSegmentedColormap +from matplotlib.lines import Line2D import biotite +import biotite.application.sra as sra +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align +import biotite.sequence.graphics as graphics import biotite.sequence.io as seqio import biotite.sequence.io.fasta as fasta import biotite.sequence.io.fastq as fastq import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.sra as sra - # Download the sequencing data app = sra.FastqDumpApp("SRR13453793") @@ -73,8 +72,9 @@ # There is only one read per spot file_path = app.get_file_paths()[0] fastq_file = fastq.FastqFile.read(file_path, offset="Sanger") -reads = [seq.NucleotideSequence(seq_str) - for seq_str, score_array 
in fastq_file.values()] +reads = [ + seq.NucleotideSequence(seq_str) for seq_str, score_array in fastq_file.values() +] score_arrays = [score_array for seq_str, score_array in fastq_file.values()] print(f"Number of reads: {len(reads)}") @@ -93,7 +93,8 @@ length_ax.hist( [len(score_array) for score_array in score_arrays], - bins=np.logspace(1, 5, N_BINS), color="gray" + bins=np.logspace(1, 5, N_BINS), + color="gray", ) length_ax.set_xlabel("Read length") length_ax.set_ylabel("Number of reads") @@ -102,7 +103,8 @@ score_ax.hist( [np.mean(score_array) for score_array in score_arrays], - bins=N_BINS, color="gray", + bins=N_BINS, + color="gray", ) score_ax.set_xlim(0, 30) score_ax.set_xlabel("Phred score") @@ -134,8 +136,10 @@ fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.fill_between( # Value in megabases -> 1e-6 - np.arange(len(score_histogram)), score_histogram * 1e-6, - linewidth=0, color="gray" + np.arange(len(score_histogram)), + score_histogram * 1e-6, + linewidth=0, + color="gray", ) ax.set_xlim( np.min(np.where(score_histogram > 0)[0]), @@ -166,15 +170,14 @@ # Download and read the reference SARS-CoV-2 genome orig_genome_file = entrez.fetch( - "NC_045512", tempfile.gettempdir(), "gb", - db_name="Nucleotide", ret_type="gb" + "NC_045512", tempfile.gettempdir(), "gb", db_name="Nucleotide", ret_type="gb" ) orig_genome = seqio.load_sequence(orig_genome_file) # Create complementary reads -compl_reads = list(itertools.chain( - *[(read, read.reverse(False).complement()) for read in reads] -)) +compl_reads = list( + itertools.chain(*[(read, read.reverse(False).complement()) for read in reads]) +) ######################################################################## # To map the reads to their corresponding positions in the reference @@ -239,19 +242,27 @@ read_length = len(compl_reads[INDEX]) # Find the correct diagonal for the example read -diagonals = matches[:,2] - matches[:,0] +diagonals = matches[:, 2] - matches[:, 0] diag, counts = np.unique(diagonals, 
return_counts=True) correct_diagonal = diag[np.argmax(counts)] # Visualize the matches and the correct diagonal fig, ax = plt.subplots(figsize=(8.0, 8.0)) ax.scatter( - matches[:,0], matches[:,2], - s=4, marker="o", color=biotite.colors["dimorange"], label="Match" + matches[:, 0], + matches[:, 2], + s=4, + marker="o", + color=biotite.colors["dimorange"], + label="Match", ) ax.plot( - [0, read_length], [correct_diagonal, read_length+correct_diagonal], - linestyle=":", linewidth=1.0, color="black", label="Correct diagonal" + [0, read_length], + [correct_diagonal, read_length + correct_diagonal], + linestyle=":", + linewidth=1.0, + color="black", + label="Correct diagonal", ) ax.set_xlim(0, read_length) ax.set_xlabel("Read position") @@ -263,7 +274,7 @@ # Find the correct diagonal for all reads correct_diagonals = [None] * len(all_matches) for i, matches in enumerate(all_matches): - diagonals = matches[:,2] - matches[:,0] + diagonals = matches[:, 2] - matches[:, 0] unqiue_diag, counts = np.unique(diagonals, return_counts=True) if len(unqiue_diag) == 0: # If no match is found for this sequence, ignore this sequence @@ -325,23 +336,28 @@ matrix = align.SubstitutionMatrix.std_nucleotide_matrix() + def map_sequence(read, diag): deviation = int(3 * np.sqrt(len(read) * P_INDEL)) if diag is None: return None else: return align.align_banded( - read, orig_genome, matrix, gap_penalty=-10, - band = (diag - deviation, diag + deviation), - max_number = 1 + read, + orig_genome, + matrix, + gap_penalty=-10, + band=(diag - deviation, diag + deviation), + max_number=1, )[0] + # Each process can be quite memory consuming # -> Cap to two processes to make it work on low-RAM commodity hardware with ProcessPoolExecutor(max_workers=2) as executor: - alignments = list(executor.map( - map_sequence, compl_reads, correct_diagonals, chunksize=1000 - )) + alignments = list( + executor.map(map_sequence, compl_reads, correct_diagonals, chunksize=1000) + ) 
######################################################################## # Now we have to select for each read, whether the original or @@ -351,18 +367,25 @@ def map_sequence(read, diag): for_alignments = [alignments[i] for i in range(0, len(alignments), 2)] rev_alignments = [alignments[i] for i in range(1, len(alignments), 2)] -scores = np.stack(( - [ali.score if ali is not None else 0 for ali in for_alignments], - [ali.score if ali is not None else 0 for ali in rev_alignments] -),axis=-1) +scores = np.stack( + ( + [ali.score if ali is not None else 0 for ali in for_alignments], + [ali.score if ali is not None else 0 for ali in rev_alignments], + ), + axis=-1, +) correct_sense = np.argmax(scores, axis=-1) -correct_alignments = [for_a if sense == 0 else rev_a for for_a, rev_a, sense - in zip(for_alignments, rev_alignments, correct_sense)] +correct_alignments = [ + for_a if sense == 0 else rev_a + for for_a, rev_a, sense in zip(for_alignments, rev_alignments, correct_sense) +] # If we use a reverse complementary read, # we also need to reverse the Phred score arrays -correct_score_arrays = [score if sense == 0 else score[::-1] for score, sense - in zip(score_arrays, correct_sense)] +correct_score_arrays = [ + score if sense == 0 else score[::-1] + for score, sense in zip(score_arrays, correct_sense) +] ######################################################################## # Now we know for each read where its corresponding position on the @@ -371,12 +394,8 @@ def map_sequence(read, diag): # Eventually, we visualize the mapping. 
# Find genome positions for the starts and ends of all reads -starts = np.array( - [ali.trace[ 0, 1] for ali in correct_alignments if ali is not None] -) -stops = np.array( - [ali.trace[-1, 1] for ali in correct_alignments if ali is not None] -) +starts = np.array([ali.trace[0, 1] for ali in correct_alignments if ali is not None]) +stops = np.array([ali.trace[-1, 1] for ali in correct_alignments if ali is not None]) # For a nicer plot sort these by their start position order = np.argsort(starts) starts = starts[order] @@ -384,13 +403,17 @@ def map_sequence(read, diag): fig, ax = plt.subplots(figsize=(8.0, 12.0)) ax.barh( - np.arange(len(starts)), left=starts, width=stops-starts, height=1, - color=biotite.colors["dimgreen"], linewidth=0 + np.arange(len(starts)), + left=starts, + width=stops - starts, + height=1, + color=biotite.colors["dimgreen"], + linewidth=0, ) -ax.set_ylim(0, len(starts)+1) -ax.spines['top'].set_visible(False) -ax.spines['right'].set_visible(False) -ax.spines['left'].set_visible(False) +ax.set_ylim(0, len(starts) + 1) +ax.spines["top"].set_visible(False) +ax.spines["right"].set_visible(False) +ax.spines["left"].set_visible(False) ax.tick_params(left=False, labelleft=False) ax.set_xlabel("Sequence position") ax.set_title("Read mappings to reference genome") @@ -479,24 +502,21 @@ def map_sequence(read, diag): if alignment is not None: trace = alignment.trace - no_gap_trace = trace[(trace[:,0] != -1) & (trace[:,1] != -1)] + no_gap_trace = trace[(trace[:, 0] != -1) & (trace[:, 1] != -1)] # Get the sequence code for the aligned read symbols - seq_code = alignment.sequences[0].code[no_gap_trace[:,0]] + seq_code = alignment.sequences[0].code[no_gap_trace[:, 0]] # The sequence code contains the integers 0 - 3; # one for each possible base # Hence, we can use these integers directly to index the second # dimension of the Pred score sum # The index for the first dimension contains simply the genome # positions taken from the alignment trace - 
phred_sum[no_gap_trace[:,1], seq_code] \ - += score_array[no_gap_trace[:,0]] + phred_sum[no_gap_trace[:, 1], seq_code] += score_array[no_gap_trace[:, 0]] - sequencing_depth[ - trace[0,1] : trace[-1,1] - ] += 1 + sequencing_depth[trace[0, 1] : trace[-1, 1]] += 1 - read_gap_trace = trace[trace[:,0] == -1] - deletion_number[read_gap_trace[:,1]] += 1 + read_gap_trace = trace[trace[:, 0] == -1] + deletion_number[read_gap_trace[:, 1]] += 1 # Call the most probable base for each genome position according to the # formula above @@ -504,23 +524,21 @@ def map_sequence(read, diag): # Visualize the sequencing depth and score sum over the genome -max_phred_sum = phred_sum[ - np.arange(len(phred_sum)), most_probable_symbol_codes -] +max_phred_sum = phred_sum[np.arange(len(phred_sum)), most_probable_symbol_codes] + def moving_average(data_set, window_size): - weights = np.full(window_size, 1/window_size) - return np.convolve(data_set, weights, mode='valid') + weights = np.full(window_size, 1 / window_size) + return np.convolve(data_set, weights, mode="valid") + fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot( - moving_average(max_phred_sum, 100), - color="lightgray", linewidth=1.0 -) +ax.plot(moving_average(max_phred_sum, 100), color="lightgray", linewidth=1.0) ax2 = ax.twinx() ax2.plot( moving_average(sequencing_depth, 100), - color=biotite.colors["dimorange"], linewidth=1.0 + color=biotite.colors["dimorange"], + linewidth=1.0, ) ax.axhline(0, color="silver", linewidth=0.5) ax.set_xlim(0, len(orig_genome)) @@ -528,10 +546,9 @@ def moving_average(data_set, window_size): ax.set_ylabel("Phred score sum") ax2.set_ylabel("Sequencing depth") ax.legend( - [Line2D([0], [0], color=c) - for c in ("lightgray", biotite.colors["dimorange"])], + [Line2D([0], [0], color=c) for c in ("lightgray", biotite.colors["dimorange"])], ["Phred score sum", "Sequencing depth"], - loc="upper left" + loc="upper left", ) fig.tight_layout() @@ -551,14 +568,13 @@ def moving_average(data_set, window_size): 
var_genome.code = most_probable_symbol_codes # A deletion is called, if either enough reads include this deletion # or the sequence position is not covered by any read at all -deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \ - | (sequencing_depth == 0) +deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) | ( + sequencing_depth == 0 +) var_genome = var_genome[~deletion_mask] # Write the assembled genome into a FASTA file out_file = fasta.FastaFile() -fasta.set_sequence( - out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True -) +fasta.set_sequence(out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True) out_file.write(tempfile.NamedTemporaryFile("w")) ######################################################################## @@ -578,10 +594,13 @@ def moving_average(data_set, window_size): BAND_WIDTH = 1000 genome_alignment = align.align_banded( - var_genome, orig_genome, matrix, - band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1 + var_genome, + orig_genome, + matrix, + band=(-BAND_WIDTH // 2, BAND_WIDTH // 2), + max_number=1, )[0] -identity = align.get_sequence_identity(genome_alignment, 'all') +identity = align.get_sequence_identity(genome_alignment, "all") print(f"Sequence identity: {identity * 100:.2f} %") ######################################################################## @@ -599,9 +618,9 @@ def moving_average(data_set, window_size): # Calculate the sequence identity within each bin bin_identities = np.zeros(N_BINS) -edges = np.linspace(0, len(orig_genome), N_BINS+1) +edges = np.linspace(0, len(orig_genome), N_BINS + 1) for i, (bin_start, bin_stop) in enumerate(zip(edges[:-1], edges[1:])): - orig_genome_trace = genome_alignment.trace[:,1] + orig_genome_trace = genome_alignment.trace[:, 1] excerpt = genome_alignment[ (orig_genome_trace >= bin_start) & (orig_genome_trace < bin_stop) ] @@ -612,9 +631,11 @@ def moving_average(data_set, window_size): # Plot the deviation = 1 - sequence identity 
deviation_ax.bar( - edges[:-1], width=(edges[1:]-edges[:-1]), + edges[:-1], + width=(edges[1:] - edges[:-1]), height=(1 - bin_identities), - color=biotite.colors["dimorange"], align="edge" + color=biotite.colors["dimorange"], + align="edge", ) deviation_ax.set_xlim(0, len(orig_genome)) deviation_ax.set_ylabel("1 - Sequence identity") @@ -623,20 +644,24 @@ def moving_average(data_set, window_size): deviation_ax.set_ylim(1e-3, 1e-1) # Plot genmic coordinates of the genes -for i, feature in enumerate(sorted( - annot_seq.annotation, - key=lambda feature: min([loc.first for loc in feature.locs]) -)): +for i, feature in enumerate( + sorted( + annot_seq.annotation, + key=lambda feature: min([loc.first for loc in feature.locs]), + ) +): for loc in feature.locs: feature_ax.barh( - left=loc.first, width=loc.last-loc.first, y=i, height=1, - color=biotite.colors["dimgreen"] + left=loc.first, + width=loc.last - loc.first, + y=i, + height=1, + color=biotite.colors["dimgreen"], ) feature_ax.text( - loc.last + 100, i, feature.qual["gene"], - fontsize=8, ha="left", va="center" + loc.last + 100, i, feature.qual["gene"], fontsize=8, ha="left", va="center" ) -feature_ax.set_ylim(i+0.5, -0.5) +feature_ax.set_ylim(i + 0.5, -0.5) feature_ax.set_xlim(0, len(orig_genome)) feature_ax.xaxis.set_visible(False) feature_ax.yaxis.set_visible(False) @@ -671,17 +696,17 @@ def moving_average(data_set, window_size): # The locations of some notable spike protein regions FEATURES = { # Signal peptide - "SP": ( 1, 12), + "SP": (1, 12), # N-terminal domain - "NTD": ( 14, 303), + "NTD": (14, 303), # Receptor binding domain - "RBD": ( 319, 541), + "RBD": (319, 541), # Fusion peptide - "FP": ( 788, 806), + "FP": (788, 806), # Transmembrane domain - "TM": (1214, 1234), + "TM": (1214, 1234), # Cytoplasmatic tail - "CT": (1269, 1273), + "CT": (1269, 1273), } # Get RNA sequence coding for spike protein from the reference genome @@ -694,11 +719,11 @@ def moving_average(data_set, window_size): alignment = 
align.align_optimal( var_genome, orig_spike_seq, matrix, local=True, max_number=1 )[0] -var_spike_seq = var_genome[alignment.trace[alignment.trace[:,0] != -1, 0]] +var_spike_seq = var_genome[alignment.trace[alignment.trace[:, 0] != -1, 0]] # Obtain protein sequences from RNA sequences orig_spike_prot_seq = orig_spike_seq.translate(complete=True).remove_stops() -var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops() +var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops() # Align both protein sequences with each other for later comparison blosum_matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -712,47 +737,50 @@ def moving_average(data_set, window_size): # Plot alignment cmap = LinearSegmentedColormap.from_list( - "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)] + "custom", + colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)], # ^ reddish ^ white ) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=blosum_matrix, symbols_per_line=SYMBOLS_PER_LINE, - labels=["B.1.1.7", "Reference"], show_numbers=True, label_size=9, - number_size=9, symbol_size=7, spacing=SPACING, cmap=cmap + ax, + alignment, + matrix=blosum_matrix, + symbols_per_line=SYMBOLS_PER_LINE, + labels=["B.1.1.7", "Reference"], + show_numbers=True, + label_size=9, + number_size=9, + symbol_size=7, + spacing=SPACING, + cmap=cmap, ) ## Add indicator for features to the alignment for row in range(1 + len(alignment) // SYMBOLS_PER_LINE): col_start = SYMBOLS_PER_LINE * row - col_stop = SYMBOLS_PER_LINE * (row + 1) + col_stop = SYMBOLS_PER_LINE * (row + 1) if col_stop > len(alignment): # This happens in the last line col_stop = len(alignment) seq_start = alignment.trace[col_start, 1] - seq_stop = alignment.trace[col_stop-1, 1] + 1 + seq_stop = alignment.trace[col_stop - 1, 1] + 1 n_sequences = len(alignment.sequences) y_base = (n_sequences + SPACING) * row + n_sequences for feature_name, (first, last) in FEATURES.items(): # Zero based sequence indexing - 
start = first-1 + start = first - 1 # Exclusive stop stop = last if start < seq_stop and stop > seq_start: # The feature is found in this line x_begin = np.clip(start - seq_start, 0, SYMBOLS_PER_LINE) - x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE) + x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE) x_mean = (x_begin + x_end) / 2 y_line = y_base + 0.3 y_text = y_base + 0.6 - ax.plot( - [x_begin, x_end], [y_line, y_line], - color="black", linewidth=2 - ) - ax.text( - x_mean, y_text, feature_name, - fontsize=8, va="top", ha="center" - ) + ax.plot([x_begin, x_end], [y_line, y_line], color="black", linewidth=2) + ax.text(x_mean, y_text, feature_name, fontsize=8, va="top", ha="center") # Increase y-limit to include the feature indicators in the last line ax.set_ylim(y_text, 0) fig.tight_layout() diff --git a/doc/examples/scripts/sequence/sequencing/quality_control.py b/doc/examples/scripts/sequence/sequencing/quality_control.py index b488f1aeb..f7b769071 100644 --- a/doc/examples/scripts/sequence/sequencing/quality_control.py +++ b/doc/examples/scripts/sequence/sequencing/quality_control.py @@ -20,14 +20,13 @@ # License: BSD 3 clause # sphinx_gallery_thumbnail_number = 2 -import numpy as np -from scipy.stats import binom import matplotlib.pyplot as plt import matplotlib.ticker as ticker +import numpy as np +from scipy.stats import binom import biotite -import biotite.sequence as seq import biotite.application.sra as sra - +import biotite.sequence as seq FIG_SIZE = (8.0, 6.0) @@ -38,12 +37,10 @@ # Each run can have multiple reads per spot # by selecting index 0 we take only the first read for every spot sequences_and_scores = app.get_sequences_and_scores()[0] -sequence_codes = np.stack([ - sequence.code for sequence, _ in sequences_and_scores.values() -]) -scores = np.stack([ - scores for _, scores in sequences_and_scores.values() -]) +sequence_codes = np.stack( + [sequence.code for sequence, _ in sequences_and_scores.values()] +) +scores = np.stack([scores 
for _, scores in sequences_and_scores.values()]) seq_count = scores.shape[0] seq_length = scores.shape[1] positions = np.arange(1, seq_length + 1) @@ -56,20 +53,18 @@ # For the plot we need the first, second (the median) and third # quartile for each position. -first_quartile, median, third_quartile = np.quantile( - scores, (0.25, 0.5, 0.75), axis=0 -) +first_quartile, median, third_quartile = np.quantile(scores, (0.25, 0.5, 0.75), axis=0) fig, ax = plt.subplots(figsize=FIG_SIZE) ax.bar( positions, - bottom=first_quartile, height=third_quartile-first_quartile, width=1.0, - facecolor=biotite.colors["brightorange"], label="Lower/upper quartile" -) -ax.plot( - positions, median, - color=biotite.colors["dimorange"], label="Median" + bottom=first_quartile, + height=third_quartile - first_quartile, + width=1.0, + facecolor=biotite.colors["brightorange"], + label="Lower/upper quartile", ) +ax.plot(positions, median, color=biotite.colors["dimorange"], label="Median") ax.set_xlim(positions[0], positions[-1]) ax.set_xlabel("Position in read") ax.set_ylabel("Phred score") @@ -92,15 +87,13 @@ fig, ax = plt.subplots(figsize=FIG_SIZE) ax.hist( # Definition range of Sanger Phred scores is 0 to 40 - mean_scores, bins=np.linspace(0, 40, BIN_NUMBER), - color=biotite.colors["lightorange"] + mean_scores, + bins=np.linspace(0, 40, BIN_NUMBER), + color=biotite.colors["lightorange"], ) ax.set_xlabel("Mean Phred score of sequence") ax.set_ylabel("Sequence count") -ax.set_xlim( - np.floor(np.min(mean_scores)), - np.ceil( np.max(mean_scores)) -) +ax.set_xlim(np.floor(np.min(mean_scores)), np.ceil(np.max(mean_scores))) fig.tight_layout() ######################################################################## @@ -115,10 +108,9 @@ # as ambiguous bases might occur in some sequencing datasets alphabet = seq.NucleotideSequence.alphabet_amb -counts = np.stack([ - np.bincount(codes, minlength=len(alphabet)) - for codes in sequence_codes.T -], axis=-1) +counts = np.stack( + [np.bincount(codes, 
minlength=len(alphabet)) for codes in sequence_codes.T], axis=-1 +) frequencies = counts / seq_count * 100 fig, ax = plt.subplots(figsize=FIG_SIZE) @@ -141,38 +133,30 @@ # distribution. gc_count = np.count_nonzero( - (sequence_codes == alphabet.encode("G")) | - (sequence_codes == alphabet.encode("C")), - axis=1 + (sequence_codes == alphabet.encode("G")) | (sequence_codes == alphabet.encode("C")), + axis=1, ) at_count = np.count_nonzero( - (sequence_codes == alphabet.encode("A")) | - (sequence_codes == alphabet.encode("T")), - axis=1 + (sequence_codes == alphabet.encode("A")) | (sequence_codes == alphabet.encode("T")), + axis=1, ) gc_content = gc_count / (gc_count + at_count) # Exclusive range -> 0 to seq_length inclusive -number_of_gc = np.arange(seq_length+1) -exp_gc_content = binom.pmf( - k=number_of_gc, - n=seq_length, - p=np.mean(gc_content) -) +number_of_gc = np.arange(seq_length + 1) +exp_gc_content = binom.pmf(k=number_of_gc, n=seq_length, p=np.mean(gc_content)) fig, ax = plt.subplots(figsize=FIG_SIZE) # Due to finite sequence length, the distribution is discrete # -> use bar() instead of hist() values, counts = np.unique(gc_content, return_counts=True) bin_width = 100 / seq_length -ax.bar( - values * 100, counts, width=bin_width, - color=biotite.colors["brightorange"] -) +ax.bar(values * 100, counts, width=bin_width, color=biotite.colors["brightorange"]) ax.plot( number_of_gc / seq_length * 100, exp_gc_content * seq_count, - color=biotite.colors["dimorange"], linestyle=":" + color=biotite.colors["dimorange"], + linestyle=":", ) ax.set_xlim(0, 100) ax.set_xlabel("Sequence GC content (%)") @@ -201,11 +185,9 @@ duplications[code] = 1 duplication_level_count = np.bincount(list(duplications.values())) duplication_level_freq = ( - duplication_level_count - * np.arange(len(duplication_level_count)) - / seq_count * 100 + duplication_level_count * np.arange(len(duplication_level_count)) / seq_count * 100 ) -max_duplication = len(duplication_level_count)-1 
+max_duplication = len(duplication_level_count) - 1 print("Maximum duplication number:", max_duplication) fig, ax = plt.subplots(figsize=FIG_SIZE) @@ -213,7 +195,7 @@ np.arange(0, len(duplication_level_freq)), duplication_level_freq, width=0.6, - color=biotite.colors["dimorange"] + color=biotite.colors["dimorange"], ) ax.set_xlim(0.5, len(duplication_level_freq) + 0.5) ax.xaxis.set_major_locator(ticker.MaxNLocator(10)) @@ -228,4 +210,4 @@ # Usually one would expect, that most sequences occur only once and the # following duplication numbers become decreasingly likely. # However, in this case we have another peak at around 60 duplications. -# And one read is even repeated astonishing 161 times! \ No newline at end of file +# And one read is even repeated astonishing 161 times! diff --git a/doc/examples/scripts/sequence/sequencing/read_quality.py b/doc/examples/scripts/sequence/sequencing/read_quality.py index 2cc12d492..47b89ddc4 100644 --- a/doc/examples/scripts/sequence/sequencing/read_quality.py +++ b/doc/examples/scripts/sequence/sequencing/read_quality.py @@ -10,13 +10,11 @@ # License: BSD 3 clause from io import StringIO -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.sequence as seq import biotite.sequence.io.fastq as fastq - # Sample FASTQ file from https://en.wikipedia.org/wiki/FASTQ_format fastq_content = StringIO(""" @SEQ_ID @@ -30,8 +28,12 @@ sequence, scores = fastq.get_sequence(fastq_file, "SEQ_ID") figure, ax = plt.subplots(figsize=(8.0, 2.0)) ax.bar( - x=np.arange(len(sequence)), height=scores, color=biotite.colors["orange"], - width=1.0, linewidth=1, edgecolor="white" + x=np.arange(len(sequence)), + height=scores, + color=biotite.colors["orange"], + width=1.0, + linewidth=1, + edgecolor="white", ) # -1 to put space between Y-axis and sequence ax.set_xlim(-1, len(sequence)) @@ -44,6 +46,6 @@ # Show sequence as X-axis ticks ax.set_xticks(np.arange(len(sequence))) 
ax.set_xticklabels(sequence.symbols) -ax.xaxis.set_ticks_position("none") +ax.xaxis.set_ticks_position("none") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/adjacency_matrix.py b/doc/examples/scripts/structure/contacts/adjacency_matrix.py index 2f5b9594a..d51a423bb 100644 --- a/doc/examples/scripts/structure/contacts/adjacency_matrix.py +++ b/doc/examples/scripts/structure/contacts/adjacency_matrix.py @@ -12,13 +12,12 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap import biotite +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb -import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap - file_name = rcsb.fetch("1aki", "bcif", gettempdir()) array = strucio.load_structure(file_name) @@ -41,4 +40,4 @@ ax.set_ylabel("Residue number") ax.set_title("Adjacency matrix of the lysozyme crystal structure") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/contact_sites.py b/doc/examples/scripts/structure/contacts/contact_sites.py index e7673983c..1c3856a34 100644 --- a/doc/examples/scripts/structure/contacts/contact_sites.py +++ b/doc/examples/scripts/structure/contacts/contact_sites.py @@ -14,10 +14,9 @@ # License: BSD 3 clause import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # The maximum distance between an atom in the repressor and an atom in # the DNA for them to be considered 'in contact' @@ -30,15 +29,9 @@ # Separate structure into the DNA and the two identical protein chains -dna = structure[ - np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False) -] -protein_l = 
structure[ - (structure.chain_id == "L") & (structure.hetero == False) -] -protein_r = structure[ - (structure.chain_id == "R") & (structure.hetero == False) -] +dna = structure[np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False)] +protein_l = structure[(structure.chain_id == "L") & (structure.hetero == False)] +protein_r = structure[(structure.chain_id == "R") & (structure.hetero == False)] # Quick check if the two protein chains are really identical assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r)) diff --git a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py index 5e51c9a80..a4eb3622e 100644 --- a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py +++ b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py @@ -1,9 +1,8 @@ +import ammolite import numpy as np from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 550) @@ -15,10 +14,7 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Add bonds to structure and convert to PyMOL structure = structure[~struc.filter_solvent(structure)] @@ -31,32 +27,38 @@ pymol_obj.color("biotite_lightgreen", structure.chain_id == "R") # Set view -ammolite.cmd.set_view(( - -0.044524662, 0.767611504, 0.639355302, - 0.998693943, 0.018437184, 0.047413416, - 0.024606399, 0.640637815, -0.767439663, - 0.000000000, 0.000000000, -115.614288330, - 56.031833649, 23.317802429, 3.761308193, - 73.517341614, 157.711288452, -20.000000000 -)) +ammolite.cmd.set_view( + ( + -0.044524662, + 0.767611504, + 0.639355302, + 0.998693943, + 0.018437184, + 0.047413416, + 0.024606399, + 0.640637815, + -0.767439663, + 0.000000000, + 0.000000000, + -115.614288330, + 56.031833649, 
+ 23.317802429, + 3.761308193, + 73.517341614, + 157.711288452, + -20.000000000, + ) +) # Highlight contacts residue_mask = np.isin(structure.res_id, common_ids) -pymol_obj.show( - "sticks", - np.isin(structure.chain_id, ["L", "R"]) & residue_mask -) -for chain, color in zip( - ("L", "R"), - ("biotite_dimorange","biotite_darkgreen") -): +pymol_obj.show("sticks", np.isin(structure.chain_id, ["L", "R"]) & residue_mask) +for chain, color in zip(("L", "R"), ("biotite_dimorange", "biotite_darkgreen")): pymol_obj.color( color, - (structure.chain_id == chain) & - (structure.atom_name != "CA") & - residue_mask + (structure.chain_id == chain) & (structure.atom_name != "CA") & residue_mask, ) # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/contacts/disulfide_bonds.py b/doc/examples/scripts/structure/contacts/disulfide_bonds.py index 8d99a675d..e87e33647 100644 --- a/doc/examples/scripts/structure/contacts/disulfide_bonds.py +++ b/doc/examples/scripts/structure/contacts/disulfide_bonds.py @@ -19,28 +19,26 @@ import io from tempfile import gettempdir -import numpy as np -import matplotlib.pyplot as plt import matplotlib.patches as patches +import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb -def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, - dihedral=90, dihedral_tol=10): +def detect_disulfide_bonds( + structure, distance=2.05, distance_tol=0.05, dihedral=90, dihedral_tol=10 +): # Array where detected disulfide bonds are stored disulfide_bonds = [] # A mask that selects only S-gamma atoms of cysteins - sulfide_mask = (structure.res_name == "CYS") & \ - (structure.atom_name == "SG") + sulfide_mask = (structure.res_name == "CYS") & 
(structure.atom_name == "SG") # sulfides in adjacency to other sulfides are detected in an # efficient manner via a cell list cell_list = struc.CellList( - structure, - cell_size=distance+distance_tol, - selection=sulfide_mask + structure, cell_size=distance + distance_tol, selection=sulfide_mask ) # Iterate over every index corresponding to an S-gamma atom for sulfide_i in np.where(sulfide_mask)[0]: @@ -65,31 +63,34 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, # For dihedral angle measurement the corresponding # C-beta atoms are required, too cb1 = structure[ - (structure.chain_id == sg1.chain_id) & - (structure.res_id == sg1.res_id) & - (structure.atom_name == "CB") + (structure.chain_id == sg1.chain_id) + & (structure.res_id == sg1.res_id) + & (structure.atom_name == "CB") ] cb2 = structure[ - (structure.chain_id == sg2.chain_id) & - (structure.res_id == sg2.res_id) & - (structure.atom_name == "CB") + (structure.chain_id == sg2.chain_id) + & (structure.res_id == sg2.res_id) + & (structure.atom_name == "CB") ] # Measure distance and dihedral angle and check criteria bond_dist = struc.distance(sg1, sg2) bond_dihed = np.abs(np.rad2deg(struc.dihedral(cb1, sg1, sg2, cb2))) - if bond_dist > distance - distance_tol and \ - bond_dist < distance + distance_tol and \ - bond_dihed > dihedral - dihedral_tol and \ - bond_dihed < dihedral + dihedral_tol: - # Atom meet criteria -> we found a disulfide bond - # -> the indices of the bond S-gamma atoms - # are put into a tuple with the lower index first - bond_tuple = sorted((sulfide_i, sulfide_j)) - # Add bond to list of bonds, but each bond only once - if bond_tuple not in disulfide_bonds: - disulfide_bonds.append(bond_tuple) + if ( + bond_dist > distance - distance_tol + and bond_dist < distance + distance_tol + and bond_dihed > dihedral - dihedral_tol + and bond_dihed < dihedral + dihedral_tol + ): + # Atom meet criteria -> we found a disulfide bond + # -> the indices of the bond S-gamma atoms + 
# are put into a tuple with the lower index first + bond_tuple = sorted((sulfide_i, sulfide_j)) + # Add bond to list of bonds, but each bond only once + if bond_tuple not in disulfide_bonds: + disulfide_bonds.append(bond_tuple) return np.array(disulfide_bonds, dtype=int) + ######################################################################## # As test case a structure of a *cysteine knot* protein is used, # specifically the squash trypsin inhibitor *EETI-II* @@ -104,19 +105,15 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, # For later verification that the implemented function works correctly, # the disulfide bonds, that are removed, are printed out. -pdbx_file = pdbx.BinaryCIFFile.read( - rcsb.fetch("2IT7", "bcif", gettempdir()) -) +pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2IT7", "bcif", gettempdir())) knottin = pdbx.get_structure(pdbx_file, include_bonds=True, model=1) -sulfide_indices = np.where( - (knottin.res_name == "CYS") & (knottin.atom_name == "SG") -)[0] +sulfide_indices = np.where((knottin.res_name == "CYS") & (knottin.atom_name == "SG"))[0] for i, j, _ in knottin.bonds.as_array(): if i in sulfide_indices and j in sulfide_indices: print(knottin[i]) print(knottin[j]) print() - knottin.bonds.remove_bond(i,j) + knottin.bonds.remove_bond(i, j) ######################################################################## # Now the sanitized structure is put into the disulfide detection @@ -143,13 +140,11 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, figure = plt.figure(figsize=(4.0, 1.0)) ax = figure.gca() MARGIN = 0.2 -ax.set_xlim(1-MARGIN, len(sequence)+MARGIN) -ax.set_ylim(0, 1+MARGIN) -ax.set_xticks(np.arange(1, len(sequence)+1)) +ax.set_xlim(1 - MARGIN, len(sequence) + MARGIN) +ax.set_ylim(0, 1 + MARGIN) +ax.set_xticks(np.arange(1, len(sequence) + 1)) ax.set_xticklabels(str(sequence)) -ax.yaxis.set_tick_params( - left=False, right=False, labelleft=False, labelright=False -) 
+ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False) ax.xaxis.set_tick_params( bottom=True, top=False, labelbottom=True, labeltop=False, width=0 ) @@ -161,10 +156,16 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, ellipse_width = sg2_res_id - sg1_res_id # Height is 2 instead of 1, # because only the upper half of the ellipse is visible - ax.add_patch(patches.Ellipse( - xy=(ellipse_center, 0), width=ellipse_width, height=2, - facecolor="None", edgecolor="gold", linewidth=2 - )) + ax.add_patch( + patches.Ellipse( + xy=(ellipse_center, 0), + width=ellipse_width, + height=2, + facecolor="None", + edgecolor="gold", + linewidth=2, + ) + ) figure.tight_layout() ######################################################################## @@ -180,4 +181,4 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, pdbx.set_structure(out_file, knottin) out_file.write(io.BytesIO()) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/domain_hbonds.py b/doc/examples/scripts/structure/contacts/domain_hbonds.py index 03bbb75d7..93550583a 100644 --- a/doc/examples/scripts/structure/contacts/domain_hbonds.py +++ b/doc/examples/scripts/structure/contacts/domain_hbonds.py @@ -15,10 +15,9 @@ from tempfile import gettempdir import matplotlib.pyplot as plt import biotite +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb - file_name = rcsb.fetch("2KB1", "bcif", gettempdir()) stack = strucio.load_structure(file_name) @@ -35,19 +34,22 @@ # Create names of bonds label = "{d_resid}{d_resnm}-{d_a} -- {a_resid}{a_resnm}-{a_a}" -names = [label.format( - d_resid=chain_a.res_id[donor], - d_resnm=chain_a.res_name[donor], - d_a=chain_a.atom_name[donor], - a_resid=chain_a.res_id[acceptor], - a_resnm=chain_a.res_name[acceptor], - a_a=chain_a.atom_name[acceptor] - ) for donor, _, 
acceptor in triplets] - -plt.subplots(figsize=(11,4.5)) +names = [ + label.format( + d_resid=chain_a.res_id[donor], + d_resnm=chain_a.res_name[donor], + d_a=chain_a.atom_name[donor], + a_resid=chain_a.res_id[acceptor], + a_resnm=chain_a.res_name[acceptor], + a_a=chain_a.atom_name[acceptor], + ) + for donor, _, acceptor in triplets +] + +plt.subplots(figsize=(11, 4.5)) plt.bar(names, freq, color=biotite.colors["orange"]) plt.xlabel("Hydrogen bond") plt.ylabel("Hydrogen bond frequency") plt.xticks(rotation=45) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/leaflet.py b/doc/examples/scripts/structure/contacts/leaflet.py index 41c184e69..1a8655ecd 100644 --- a/doc/examples/scripts/structure/contacts/leaflet.py +++ b/doc/examples/scripts/structure/contacts/leaflet.py @@ -21,10 +21,10 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from tempfile import NamedTemporaryFile import warnings -import numpy as np +from tempfile import NamedTemporaryFile import networkx as nx +import numpy as np import biotite.structure as struc import biotite.structure.io as strucio @@ -33,8 +33,7 @@ PDB_FILE_PATH = "../../../download/dppc_n128.pdb" -def find_leaflets(structure, head_atom_mask, - cutoff_distance=15.0, periodic=False): +def find_leaflets(structure, head_atom_mask, cutoff_distance=15.0, periodic=False): """ Identify which lipids molecules belong to the same lipid bilayer leaflet. 
@@ -64,28 +63,29 @@ def find_leaflets(structure, head_atom_mask, """ cell_list = struc.CellList( - structure, cell_size=cutoff_distance, selection=head_atom_mask, - periodic=periodic + structure, + cell_size=cutoff_distance, + selection=head_atom_mask, + periodic=periodic, ) adjacency_matrix = cell_list.create_adjacency_matrix(cutoff_distance) graph = nx.Graph(adjacency_matrix) - head_leaflets = [sorted(c) for c in nx.connected_components(graph) - # A leaflet cannot consist of a single lipid - # This also removes all entries - # for atoms not in 'head_atom_mask' - if len(c) > 1] + head_leaflets = [ + sorted(c) + for c in nx.connected_components(graph) + # A leaflet cannot consist of a single lipid + # This also removes all entries + # for atoms not in 'head_atom_mask' + if len(c) > 1 + ] # 'leaflets' contains indices to head atoms # Broadcast each head atom index to all atoms in its corresponding # residue - leaflet_masks = np.empty( - (len(head_leaflets), structure.array_length()), - dtype=bool - ) + leaflet_masks = np.empty((len(head_leaflets), structure.array_length()), dtype=bool) for i, head_leaflet in enumerate(head_leaflets): - leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet) \ - .any(axis=0) + leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet).any(axis=0) return leaflet_masks @@ -100,7 +100,7 @@ def find_leaflets(structure, head_atom_mask, # periodicity should not matter leaflets = find_leaflets( structure, - head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P") + head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P"), ) # Bilayer -> Expect two leaflets assert len(leaflets) == 2 diff --git a/doc/examples/scripts/structure/contacts/leaflet_pymol.py b/doc/examples/scripts/structure/contacts/leaflet_pymol.py index 7678e53d5..59b2e98a0 100644 --- a/doc/examples/scripts/structure/contacts/leaflet_pymol.py +++ b/doc/examples/scripts/structure/contacts/leaflet_pymol.py @@ -1,9 +1,8 @@ 
+import ammolite import numpy as np from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 700) @@ -14,15 +13,10 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Remove hydrogen and water and convert to PyMOL -structure = structure[ - (structure.element != "H") & (structure.res_name != "TIP") -] +structure = structure[(structure.element != "H") & (structure.res_name != "TIP")] structure.bonds = struc.connect_via_distances(structure) pymol_obj = ammolite.PyMOLObject.from_structure(structure) @@ -33,16 +27,13 @@ # Configure lipid heads pymol_obj.color( - "biotite_darkgreen", - (structure.chain_id == "A") & (structure.atom_name == "P") + "biotite_darkgreen", (structure.chain_id == "A") & (structure.atom_name == "P") ) pymol_obj.color( - "biotite_dimorange", - (structure.chain_id == "B") & (structure.atom_name == "P") + "biotite_dimorange", (structure.chain_id == "B") & (structure.atom_name == "P") ) pymol_obj.show( - "spheres", - np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P") + "spheres", np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P") ) # Adjust camera @@ -52,4 +43,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/biological_assembly.py b/doc/examples/scripts/structure/misc/biological_assembly.py index ee9fe79ab..83586dcca 100644 --- a/doc/examples/scripts/structure/misc/biological_assembly.py +++ b/doc/examples/scripts/structure/misc/biological_assembly.py @@ -38,11 +38,10 @@ # License: BSD 3 clause from tempfile import NamedTemporaryFile +import biotite.database.rcsb as rcsb import biotite.structure as struc -import 
biotite.structure.io.pdbx as pdbx import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("3J31", "bcif")) @@ -77,4 +76,4 @@ # Visualization with PyMOL... # sphinx_gallery_ammolite_script = "biological_assembly_pymol.py" -temp.close() \ No newline at end of file +temp.close() diff --git a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py index 377143fbd..3175b4eb9 100644 --- a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py +++ b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py @@ -1,8 +1,6 @@ -import numpy as np -import matplotlib.pyplot as plt -import biotite.structure as struc import ammolite - +import matplotlib.pyplot as plt +import numpy as np PNG_SIZE = (1000, 1000) @@ -21,4 +19,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/diameter.py b/doc/examples/scripts/structure/misc/diameter.py index 2f7154cd8..e428ccbd3 100644 --- a/doc/examples/scripts/structure/misc/diameter.py +++ b/doc/examples/scripts/structure/misc/diameter.py @@ -11,9 +11,10 @@ from tempfile import gettempdir import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb + def get_diameter(pdb_id): file_name = rcsb.fetch(pdb_id, "bcif", gettempdir()) @@ -24,10 +25,11 @@ def get_diameter(pdb_id): # Calculate all pairwise difference vectors diff = coord[:, np.newaxis, :] - coord[np.newaxis, :, :] # Calculate absolute of difference vectors -> square distances - sq_dist = np.sum(diff*diff, axis=-1) + sq_dist = np.sum(diff * diff, axis=-1) # Maximum distance is diameter diameter = np.sqrt(np.max(sq_dist)) return 
diameter + # Example application -print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom") \ No newline at end of file +print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom") diff --git a/doc/examples/scripts/structure/misc/gap_bars.py b/doc/examples/scripts/structure/misc/gap_bars.py index 1ec9eb343..55024fd4e 100644 --- a/doc/examples/scripts/structure/misc/gap_bars.py +++ b/doc/examples/scripts/structure/misc/gap_bars.py @@ -16,11 +16,12 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb import matplotlib.pyplot as plt -from matplotlib.patches import Rectangle import numpy as np +from matplotlib.patches import Rectangle +import biotite.database.rcsb as rcsb +import biotite.structure.io as strucio + def plot_gaps(pdb_id, chain_id, ax): # Download and parse structure file @@ -32,7 +33,7 @@ def plot_gaps(pdb_id, chain_id, ax): states = np.zeros(atom_array.res_id[-1], dtype=int) for i in range(len(states)): # Get array for only one residue ID - residue = atom_array[atom_array.res_id == i+1] + residue = atom_array[atom_array.res_id == i + 1] if len(residue) == 0: # not existing states[i] = 0 @@ -52,7 +53,7 @@ def plot_gaps(pdb_id, chain_id, ax): curr_start = i curr_state = states[i] else: - if states[i] != states[i-1]: + if states[i] != states[i - 1]: state_intervals.append((curr_start, i, curr_state)) curr_start = i curr_state = states[i] @@ -69,8 +70,11 @@ def plot_gaps(pdb_id, chain_id, ax): color = "gold" elif state == 2: color = "forestgreen" - ax.add_patch(Rectangle((start+1-0.5, 0), stop-start, 1, - edgecolor="None", facecolor=color)) + ax.add_patch( + Rectangle( + (start + 1 - 0.5, 0), stop - start, 1, edgecolor="None", facecolor=color + ) + ) # Some other visual stuff ax.spines["left"].set_visible(False) ax.spines["bottom"].set_visible(False) @@ -88,6 +92,6 @@ def plot_gaps(pdb_id, chain_id, ax): ax = fig.add_subplot(212) ax.set_title("5w1r", loc="left") 
plot_gaps("5w1r", "A", ax) -ax.set_xlabel("$Residue \ number$") +ax.set_xlabel(r"$Residue \ number$") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/misc/glycan_visualization.py b/doc/examples/scripts/structure/misc/glycan_visualization.py index 43bcccaf0..bd55ca1f7 100644 --- a/doc/examples/scripts/structure/misc/glycan_visualization.py +++ b/doc/examples/scripts/structure/misc/glycan_visualization.py @@ -18,21 +18,21 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D import networkx as nx +import numpy as np +from matplotlib.lines import Line2D from networkx.drawing.nx_pydot import graphviz_layout +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # Adapted from "Mol*" Software # The dictionary maps residue names of saccharides to their common names SACCHARIDE_NAMES = { - res_name : common_name for common_name, res_names in [ + res_name: common_name + for common_name, res_names in [ ("Glc", ["GLC", "BGC", "Z8T", "TRE", "MLR"]), ("Man", ["MAN", "BMA"]), ("Gal", ["GLA", "GAL", "GZL", "GXL", "GIV"]), @@ -112,62 +112,51 @@ "All": ("o", "purple"), "Tal": ("o", "lightsteelblue"), "Ido": ("o", "chocolate"), - "GlcNAc": ("s", "royalblue"), "ManNAc": ("s", "forestgreen"), "GalNAc": ("s", "gold"), "GulNAc": ("s", "darkorange"), "AllNAc": ("s", "purple"), "IdoNAc": ("s", "chocolate"), - "GlcN": ("1", "royalblue"), "ManN": ("1", "forestgreen"), "GalN": ("1", "gold"), - "GlcA": ("v", "royalblue"), "ManA": ("v", "forestgreen"), "GalA": ("v", "gold"), "GulA": ("v", "darkorange"), "TalA": ("v", "lightsteelblue"), "IdoA": ("v", "chocolate"), - "Qui": ("^", "royalblue"), "Rha": ("^", "forestgreen"), "6dGul": ("^", "darkorange"), "Fuc": ("^", "crimson"), - "QuiNAc": ("P", "royalblue"), 
"FucNAc": ("P", "crimson"), - "Oli": ("X", "royalblue"), "Tyv": ("X", "forestgreen"), "Abe": ("X", "darkorange"), "Par": ("X", "pink"), "Dig": ("X", "purple"), - "Ara": ("*", "forestgreen"), "Lyx": ("*", "gold"), "Xyl": ("*", "darkorange"), "Rib": ("*", "pink"), - "Kdn": ("D", "forestgreen"), "Neu5Ac": ("D", "mediumvioletred"), "Neu5Gc": ("D", "turquoise"), - "LDManHep": ("H", "forestgreen"), "Kdo": ("H", "gold"), "DDManHep": ("H", "pink"), "MurNAc": ("H", "purple"), "Mur": ("H", "chocolate"), - "Api": ("p", "royalblue"), "Fru": ("p", "forestgreen"), "Tag": ("p", "gold"), "Sor": ("p", "darkorange"), "Psi": ("p", "pink"), - # Default representation - None: ("h", "black") + None: ("h", "black"), } ######################################################################### @@ -222,19 +211,22 @@ bonds = structure.bonds.as_array()[:, :2] # Convert indices pointing to connected atoms to indices pointing to the # starting atom of the respective residue -connected = struc.get_residue_starts_for( - structure, bonds.flatten() -).reshape(bonds.shape) +connected = struc.get_residue_starts_for(structure, bonds.flatten()).reshape( + bonds.shape +) # Omit bonds within the same residue -connected = connected[connected[:,0] != connected[:,1]] +connected = connected[connected[:, 0] != connected[:, 1]] # Add the residue connections to the graph graph.add_edges_from(connected) fig, ax = plt.subplots(figsize=(8.0, 8.0)) nx.draw( - graph, ax=ax, node_size=10, - node_color=["crimson" if is_glycan[atom_i] else "royalblue" - for atom_i in graph.nodes()] + graph, + ax=ax, + node_size=10, + node_color=[ + "crimson" if is_glycan[atom_i] else "royalblue" for atom_i in graph.nodes() + ], ) ######################################################################## @@ -260,7 +252,8 @@ # Get connected subgraphs containing glycans # -> any subgraph with more than one node glycan_graphs = [ - graph.subgraph(nodes).copy() for nodes in nx.connected_components(graph) + graph.subgraph(nodes).copy() + for 
nodes in nx.connected_components(graph) if len(nodes) > 1 ] @@ -297,14 +290,14 @@ # almost always an atom index that is lower than the saccharides # attached to it glycan_graph = nx.DiGraph( - [(min(atom_i, atom_j), max(atom_i, atom_j)) - for atom_i, atom_j in glycan_graph.edges()] + [ + (min(atom_i, atom_j), max(atom_i, atom_j)) + for atom_i, atom_j in glycan_graph.edges() + ] ) # The 'root' is the amino acid - root = [ - atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i] - ] + root = [atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i]] if len(root) == 0: # Saccharide is not attached to an amino acid -> Ignore glycan continue @@ -331,22 +324,20 @@ # Position the root at coordinate origin pos_array -= pos_array[nodes.index(root)] # Set vertical distances between nodes to 1 - pos_array[:,1] /= ( - pos_array[nodes.index(root_neighbor), 1] - - pos_array[nodes.index(root), 1] + pos_array[:, 1] /= ( + pos_array[nodes.index(root_neighbor), 1] - pos_array[nodes.index(root), 1] ) # Set minimum horizontal distances between nodes to 1 - non_zero_dist = np.abs(pos_array[(pos_array[:,0] != 0), 0]) + non_zero_dist = np.abs(pos_array[(pos_array[:, 0] != 0), 0]) if len(non_zero_dist) != 0: - pos_array[:,0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist) + pos_array[:, 0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist) # Move graph to residue ID position on x-axis - pos_array[:,0] += structure.res_id[root] + pos_array[:, 0] += structure.res_id[root] # Convert array back to dictionary pos = {node: tuple(coord) for node, coord in zip(nodes, pos_array)} nx.draw_networkx_edges( - glycan_graph, pos, ax=ax, - arrows=False, node_size=0, width=LINE_WIDTH + glycan_graph, pos, ax=ax, arrows=False, node_size=0, width=LINE_WIDTH ) # Draw each node individually @@ -359,14 +350,23 @@ common_name = SACCHARIDE_NAMES.get(structure.res_name[atom_i]) shape, color = SACCHARIDE_REPRESENTATION[common_name] ax.scatter( - pos[atom_i][0], pos[atom_i][1], - 
s=NODE_SIZE, marker=shape, facecolor=color, - edgecolor="black", linewidths=LINE_WIDTH + pos[atom_i][0], + pos[atom_i][1], + s=NODE_SIZE, + marker=shape, + facecolor=color, + edgecolor="black", + linewidths=LINE_WIDTH, ) legend_elements[common_name] = Line2D( - [0], [0], label=common_name, linestyle="None", - marker=shape, markerfacecolor=color, - markeredgecolor="black", markeredgewidth=LINE_WIDTH + [0], + [0], + label=common_name, + linestyle="None", + marker=shape, + markerfacecolor=color, + markeredgecolor="black", + markeredgewidth=LINE_WIDTH, ) @@ -381,9 +381,13 @@ ax.tick_params(axis="y", left=False, labelleft=False) ax.set_xticks(glycosylated_residue_ids) ax.set_xticklabels( - [symbol + str(res_id) for symbol, res_id - in zip(glycosylated_residue_symbols, glycosylated_residue_ids)], - rotation=45 + [ + symbol + str(res_id) + for symbol, res_id in zip( + glycosylated_residue_symbols, glycosylated_residue_ids + ) + ], + rotation=45, ) # Set the end of the axis to the last amino acid @@ -393,4 +397,4 @@ fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition.py b/doc/examples/scripts/structure/misc/homolog_superimposition.py index 2e0e03558..4db689581 100644 --- a/doc/examples/scripts/structure/misc/homolog_superimposition.py +++ b/doc/examples/scripts/structure/misc/homolog_superimposition.py @@ -13,20 +13,19 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause - +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb + def _extract_monomer(complex): complex = complex[struc.filter_amino_acids(complex)] # Get the monomer that belongs to the first atom in the structure return complex[struc.get_chain_masks(complex, [0])[0]] + avidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1vyo", "bcif")) -avidin = _extract_monomer( - 
pdbx.get_structure(avidin_file, model=1, include_bonds=True) -) +avidin = _extract_monomer(pdbx.get_structure(avidin_file, model=1, include_bonds=True)) streptavidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("6j6j", "bcif")) streptavidin = _extract_monomer( pdbx.get_structure(streptavidin_file, model=1, include_bonds=True) @@ -34,4 +33,4 @@ def _extract_monomer(complex): streptavidin, _, _, _ = struc.superimpose_homologs(avidin, streptavidin) # Visualization with PyMOL... -# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py" \ No newline at end of file +# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py" diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py index f9c204788..1760d527e 100644 --- a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py +++ b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py @@ -1,9 +1,6 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite -import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 750) @@ -13,10 +10,7 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL pymol_avidin = ammolite.PyMOLObject.from_structure(avidin) @@ -33,4 +27,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/pdb_statistics.py b/doc/examples/scripts/structure/misc/pdb_statistics.py index eaf7a3e05..ed8680eb8 100644 --- a/doc/examples/scripts/structure/misc/pdb_statistics.py +++ b/doc/examples/scripts/structure/misc/pdb_statistics.py @@ -10,12 +10,11 @@ # Code source: Patrick Kunzmann # License: BSD 3 
clause -import numpy as np +from datetime import datetime, time import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.database.rcsb as rcsb -from datetime import datetime, time - +import biotite.database.rcsb as rcsb years = np.arange(1990, datetime.today().year + 1) xray_count = np.zeros(len(years), dtype=int) @@ -28,20 +27,14 @@ # A query that comprises one year date_query = rcsb.FieldQuery( "rcsb_accession_info.initial_release_date", - range_closed = ( - datetime.combine(datetime(year, 1, 1), time.min), - datetime.combine(datetime(year, 12, 31), time.max) - ) - ) - xray_query = rcsb.FieldQuery( - "exptl.method", exact_match="X-RAY DIFFRACTION" - ) - nmr_query = rcsb.FieldQuery( - "exptl.method", exact_match="SOLUTION NMR" - ) - em_query = rcsb.FieldQuery( - "exptl.method", exact_match="ELECTRON MICROSCOPY" + range_closed=( + datetime.combine(datetime(year, 1, 1), time.min), + datetime.combine(datetime(year, 12, 31), time.max), + ), ) + xray_query = rcsb.FieldQuery("exptl.method", exact_match="X-RAY DIFFRACTION") + nmr_query = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR") + em_query = rcsb.FieldQuery("exptl.method", exact_match="ELECTRON MICROSCOPY") # Get the amount of structures, that were released in that year # AND were elucidated with the respective method xray_count[i], nmr_count[i], em_count[i] = [ @@ -53,27 +46,32 @@ fig, ax = plt.subplots(figsize=(8.0, 5.0)) ax.set_title("PDB release statistics") -ax.set_xlim(years[0]-1, years[-1]+1) +ax.set_xlim(years[0] - 1, years[-1] + 1) ax.set_xticks(years) ax.set_xticklabels([str(y) for y in years], rotation=45) ax.set_xlabel("Year") ax.set_ylabel("Released structures per year") +ax.bar(years, xray_count, color=biotite.colors["darkorange"], label="X-Ray") ax.bar( - years, xray_count, - color=biotite.colors["darkorange"], label="X-Ray" -) -ax.bar( - years, nmr_count, bottom=xray_count, - color=biotite.colors["orange"], label="Solution NMR" + years, + nmr_count, + 
bottom=xray_count, + color=biotite.colors["orange"], + label="Solution NMR", ) ax.bar( - years, em_count, bottom=xray_count + nmr_count, - color=biotite.colors["brightorange"], label="Electron Microscopy" + years, + em_count, + bottom=xray_count + nmr_count, + color=biotite.colors["brightorange"], + label="Electron Microscopy", ) ax.bar( - years, tot_count - xray_count - nmr_count - em_count, + years, + tot_count - xray_count - nmr_count - em_count, bottom=xray_count + nmr_count + em_count, - color="gray", label="Miscellaneous" + color="gray", + label="Miscellaneous", ) ax.legend(loc="upper left") fig.tight_layout() diff --git a/doc/examples/scripts/structure/modeling/docking.py b/doc/examples/scripts/structure/modeling/docking.py index 06492c242..eb9a3fcfa 100644 --- a/doc/examples/scripts/structure/modeling/docking.py +++ b/doc/examples/scripts/structure/modeling/docking.py @@ -28,22 +28,24 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from scipy.stats import spearmanr +import biotite.application.autodock as autodock +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.info as info import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb -import biotite.application.autodock as autodock - # Get the receptor structure # and the original 'correct' conformation of the ligand pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2RTG", "bcif")) structure = pdbx.get_structure( # Include formal charge for accurate partial charge calculation - pdbx_file, model=1, include_bonds=True, extra_fields=["charge"] + pdbx_file, + model=1, + include_bonds=True, + extra_fields=["charge"], ) # The asymmetric unit describes a streptavidin homodimer # However, we are only interested in a single monomer @@ -79,9 +81,7 @@ docked_ligand = struc.from_template(ligand, docked_coord) # As Vina discards all nonpolar hydrogen atoms, their respective # 
coordinates are NaN -> remove these atoms -docked_ligand = docked_ligand[ - ..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1) -] +docked_ligand = docked_ligand[..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1)] # For comparison of the docked pose with the experimentally determined @@ -142,9 +142,9 @@ # Vina only keeps polar hydrogens in the modeled structure # For consistency, remove all hydrogen atoms in the reference and # modelled structure -ref_ligand = ref_ligand[ref_ligand.element!= "H"] -docked_ligand = docked_ligand[docked_ligand.element!= "H"] +ref_ligand = ref_ligand[ref_ligand.element != "H"] +docked_ligand = docked_ligand[docked_ligand.element != "H"] # Visualization with PyMOL... # sphinx_gallery_thumbnail_number = 2 -# sphinx_gallery_ammolite_script = "docking_pymol.py" \ No newline at end of file +# sphinx_gallery_ammolite_script = "docking_pymol.py" diff --git a/doc/examples/scripts/structure/modeling/docking_pymol.py b/doc/examples/scripts/structure/modeling/docking_pymol.py index 349f93b39..8f9adc263 100644 --- a/doc/examples/scripts/structure/modeling/docking_pymol.py +++ b/doc/examples/scripts/structure/modeling/docking_pymol.py @@ -1,23 +1,17 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite -import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 400) # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL -pymol_receptor = ammolite.PyMOLObject.from_structure(receptor) -pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand) +pymol_receptor = ammolite.PyMOLObject.from_structure(receptor) +pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand) pymol_docked_ligand = ammolite.PyMOLObject.from_structure(docked_ligand) # Visualize receptor as surface @@ -53,4 +47,4 @@ # Save image 
ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/modeling/md_analysis.py b/doc/examples/scripts/structure/modeling/md_analysis.py index 3e36779b3..dfdbf573b 100644 --- a/doc/examples/scripts/structure/modeling/md_analysis.py +++ b/doc/examples/scripts/structure/modeling/md_analysis.py @@ -22,16 +22,16 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import matplotlib.pyplot as plt +import numpy as np import biotite import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.xtc as xtc -import numpy as np -import matplotlib.pyplot as plt # Put here the path of the downloaded files templ_file_path = "../../../download/lysozyme_md.pdb" -traj_file_path = "../../../download/lysozyme_md.xtc" +traj_file_path = "../../../download/lysozyme_md.xtc" # Gromacs does not set the element symbol in its PDB files, # but Biotite guesses the element names from the atom names, @@ -76,7 +76,7 @@ trajectory, _ = struc.superimpose(trajectory[0], trajectory) rmsd = struc.rmsd(trajectory[0], trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) ax.plot(time, rmsd, color=biotite.colors["dimorange"]) ax.set_xlim(time[0], time[-1]) @@ -97,7 +97,7 @@ radius = struc.gyration_radius(trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) ax.plot(time, radius, color=biotite.colors["dimorange"]) ax.set_xlim(time[0], time[-1]) @@ -129,10 +129,10 @@ ca_trajectory = trajectory[:, trajectory.atom_name == "CA"] rmsf = struc.rmsf(struc.average(ca_trajectory), ca_trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) res_count = struc.get_residue_count(trajectory) -ax.plot(np.arange(1, res_count+1), rmsf, color=biotite.colors["dimorange"]) 
+ax.plot(np.arange(1, res_count + 1), rmsf, color=biotite.colors["dimorange"]) ax.set_xlim(1, res_count) ax.set_ylim(0, 1.5) ax.set_xlabel("Residue") @@ -140,4 +140,4 @@ figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py index 4bc706de8..cf4d8612c 100644 --- a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py +++ b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py @@ -25,13 +25,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import os.path from tempfile import NamedTemporaryFile +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.xtc as xtc import biotite.structure.io.pdbx as pdbx -import numpy as np -import matplotlib.pyplot as plt -import os.path +import biotite.structure.io.xtc as xtc # Put here the path of the downloaded trajectory file xtc_file_path = "../../../download/lysozyme_md.xtc" @@ -53,14 +53,14 @@ ) for i, dim in enumerate(("x", "y", "z")): columns[f"coord_{dim}"] = pdbx.BinaryCIFData( - coord[:,:,i].flatten(), + coord[:, :, i].flatten(), encoding=[ pdbx.FixedPointEncoding(factor=100, src_type=np.float32), pdbx.DeltaEncoding(), # Encode the difference into two bytes pdbx.IntegerPackingEncoding(byte_count=2, is_unsigned=False), pdbx.ByteArrayEncoding(), - ] + ], ) category = pdbx.BinaryCIFCategory(columns) bcif_file = pdbx.BinaryCIFFile( @@ -77,15 +77,17 @@ figure = plt.figure() ax = figure.add_subplot(111) ax.bar( - [1,2], [xtc_size/1e+6, bcif_size/1e+6], width=0.3, + [1, 2], + [xtc_size / 1e6, bcif_size / 1e6], + width=0.3, color=[biotite.colors["dimgreen"], biotite.colors["dimorange"]], - linewidth=0 + linewidth=0, ) -ax.set_xticks([1,2]) +ax.set_xticks([1, 2]) ax.set_xticklabels(["XTC", "BinaryCIF"]) ax.set_xlim(0.5, 2.5) ax.set_ylim(0, 40) ax.yaxis.grid(True) ax.set_ylabel("File size (MB)") figure.tight_layout() 
-plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/normal_modes.py b/doc/examples/scripts/structure/modeling/normal_modes.py index 13c7eca3a..ac760c459 100644 --- a/doc/examples/scripts/structure/modeling/normal_modes.py +++ b/doc/examples/scripts/structure/modeling/normal_modes.py @@ -36,11 +36,10 @@ from tempfile import NamedTemporaryFile import numpy as np from numpy import newaxis +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # A CSV file containing the eigenvectors for the CA atoms VECTOR_FILE = "../../../download/glycosylase_anm_vectors.csv" @@ -64,8 +63,7 @@ # Filter first peptide chain protein_chain = structure[ - struc.filter_amino_acids(structure) - & (structure.chain_id == structure.chain_id[0]) + struc.filter_amino_acids(structure) & (structure.chain_id == structure.chain_id[0]) ] # Filter CA atoms ca = protein_chain[protein_chain.atom_name == "CA"] @@ -88,7 +86,7 @@ # Stepwise application of eigenvectors as smooth sine oscillation -time = np.linspace(0, 2*np.pi, FRAMES, endpoint=False) +time = np.linspace(0, 2 * np.pi, FRAMES, endpoint=False) deviation = np.sin(time)[:, newaxis, newaxis] * mode_vectors # Apply oscillation of CA atom to all atoms in the corresponding residue @@ -97,13 +95,14 @@ protein_chain, # The last array element will be the length of the atom array, # i.e. 
no valid index - add_exclusive_stop=True + add_exclusive_stop=True, ) -for i in range(len(residue_starts) -1): +for i in range(len(residue_starts) - 1): res_start = residue_starts[i] - res_stop = residue_starts[i+1] - oscillation[:, res_start:res_stop, :] \ - = protein_chain.coord[res_start:res_stop, :] + deviation[:, i:i+1, :] + res_stop = residue_starts[i + 1] + oscillation[:, res_start:res_stop, :] = ( + protein_chain.coord[res_start:res_stop, :] + deviation[:, i : i + 1, :] + ) # An atom array stack containing all frames oscillating_structure = struc.from_template(protein_chain, oscillation) @@ -112,4 +111,4 @@ strucio.save_structure(temp.name, oscillating_structure) # sphinx_gallery_static_image = "normal_modes.gif" -temp.close() \ No newline at end of file +temp.close() diff --git a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py index 5165510e9..1c0ad0e2c 100644 --- a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py +++ b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py @@ -1,6 +1,5 @@ +from os.path import isdir, join from pymol import cmd -from os.path import join, isdir - INPUT_STRUCTURE = "normal_modes.pdb" OUTPUT_DIR = "normal_modes" @@ -13,20 +12,34 @@ cmd.dss() # Define colors -cmd.set_color("biotite_lightgreen", [111/255, 222/255, 76/255]) +cmd.set_color("biotite_lightgreen", [111 / 255, 222 / 255, 76 / 255]) # Set overall colors cmd.color("biotite_lightgreen", "chain A") # Set view -cmd.set_view(( - 0.605540633, 0.363677770, -0.707855821, - -0.416691631, 0.902691007, 0.107316799, - 0.678002179, 0.229972601, 0.698157668, - 0.000000000, 0.000000000, -115.912551880, - 32.098876953, 31.005725861, 78.377349854, - 89.280677795, 142.544403076, -20.000000000 -)) +cmd.set_view( + ( + 0.605540633, + 0.363677770, + -0.707855821, + -0.416691631, + 0.902691007, + 0.107316799, + 0.678002179, + 0.229972601, + 0.698157668, + 0.000000000, + 0.000000000, + 
-115.912551880, + 32.098876953, + 31.005725861, + 78.377349854, + 89.280677795, + 142.544403076, + -20.000000000, + ) +) # Prepare output video frames cmd.mset() diff --git a/doc/examples/scripts/structure/modeling/rotamer_library.py b/doc/examples/scripts/structure/modeling/rotamer_library.py index 2087a08de..fa828eb1d 100644 --- a/doc/examples/scripts/structure/modeling/rotamer_library.py +++ b/doc/examples/scripts/structure/modeling/rotamer_library.py @@ -13,14 +13,11 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np -import networkx as nx import matplotlib.pyplot as plt +import numpy as np import biotite.structure as struc -import biotite.structure.io as strucio -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # 'CA' is not in backbone, # as we want to include the rotation between 'CA' and 'CB' @@ -73,14 +70,12 @@ bond_list_without_axis.remove_bond(atom_i, atom_j) # ...and these atoms are found by identifying the atoms that # are still connected to one of the two atoms involved - rotated_atom_indices = struc.find_connected( - bond_list_without_axis, root=atom_i - ) + rotated_atom_indices = struc.find_connected(bond_list_without_axis, root=atom_i) accepted = False while not accepted: # A random angle between 0 and 360 degrees - angle = np.random.rand() * 2*np.pi + angle = np.random.rand() * 2 * np.pi # Rotate coord[rotated_atom_indices] = struc.rotate_about_axis( coord[rotated_atom_indices], axis, angle, support @@ -91,9 +86,7 @@ # than the sum of their VdW radii, if they are not bonded to # each other accepted = True - distances = struc.distance( - coord[:, np.newaxis], coord[np.newaxis, :] - ) + distances = struc.distance(coord[:, np.newaxis], coord[np.newaxis, :]) clashed = distances < vdw_radii_mean for clash_atom1, clash_atom2 in zip(*np.where(clashed)): if clash_atom1 == clash_atom2: @@ -115,23 +108,28 @@ ### Visualize rotamers ### colors = 
np.zeros((residue.array_length(), 3)) -colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green -colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red +colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green +colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red # For consistency, each subplot has the same box size coord = rotamers.coord -size = np.array( - [coord[:, :, 0].max() - coord[:, :, 0].min(), - coord[:, :, 1].max() - coord[:, :, 1].min(), - coord[:, :, 2].max() - coord[:, :, 2].min()] -).max() * 0.5 +size = ( + np.array( + [ + coord[:, :, 0].max() - coord[:, :, 0].min(), + coord[:, :, 1].max() - coord[:, :, 1].min(), + coord[:, :, 2].max() - coord[:, :, 2].min(), + ] + ).max() + * 0.5 +) fig = plt.figure(figsize=(8.0, 8.0)) fig.suptitle("Rotamers of tyrosine", fontsize=20, weight="bold") for i, rotamer in enumerate(rotamers): - ax = fig.add_subplot(3, 3, i+1, projection="3d") + ax = fig.add_subplot(3, 3, i + 1, projection="3d") graphics.plot_atoms(ax, rotamer, colors, line_width=3, size=size, zoom=0.9) fig.tight_layout() @@ -139,4 +137,4 @@ ### Write rotamers to structure file ### -#strucio.save_structure("rotamers.pdb", rotamers) \ No newline at end of file +# strucio.save_structure("rotamers.pdb", rotamers) diff --git a/doc/examples/scripts/structure/modeling/solvation_shells.py b/doc/examples/scripts/structure/modeling/solvation_shells.py index cdd00be28..dcba8894d 100644 --- a/doc/examples/scripts/structure/modeling/solvation_shells.py +++ b/doc/examples/scripts/structure/modeling/solvation_shells.py @@ -25,16 +25,16 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import matplotlib.pyplot as plt import numpy as np import scipy.signal as signal -import matplotlib.pyplot as plt import biotite import 
biotite.structure as struc import biotite.structure.io as strucio # Put here the path of the downloaded files templ_file_path = "../../../download/waterbox_md.pdb" -traj_file_path = "../../../download/waterbox_md.xtc" +traj_file_path = "../../../download/waterbox_md.xtc" # Load the trajectory traj = strucio.load_structure(traj_file_path, template=templ_file_path) @@ -53,27 +53,19 @@ # Calculate the RDF of water molecules # centered on sodium or chloride ions, respectively N_BINS = 200 -bins, rdf_na = struc.rdf( - center=na, atoms=solvent, periodic=True, bins=N_BINS -) -bins, rdf_cl = struc.rdf( - center=cl, atoms=solvent, periodic=True, bins=N_BINS -) +bins, rdf_na = struc.rdf(center=na, atoms=solvent, periodic=True, bins=N_BINS) +bins, rdf_cl = struc.rdf(center=cl, atoms=solvent, periodic=True, bins=N_BINS) # Find peaks # This requires a bit trial and error on the parameters # The 'x' in '[x * N_BINS/10]' is the expected peak width in Å, # that is transformed into a peak width in amount of values -peak_indices_na = signal.find_peaks_cwt( - rdf_na, widths=[0.2 * N_BINS/10] -) -peak_indices_cl = signal.find_peaks_cwt( - rdf_cl, widths=[0.3 * N_BINS/10] -) +peak_indices_na = signal.find_peaks_cwt(rdf_na, widths=[0.2 * N_BINS / 10]) +peak_indices_cl = signal.find_peaks_cwt(rdf_cl, widths=[0.3 * N_BINS / 10]) peak_indices_na, peak_indices_cl = peak_indices_na[:3], peak_indices_cl[:3] # Create plots -fig, ax = plt.subplots(figsize=(8.0,3.0)) +fig, ax = plt.subplots(figsize=(8.0, 3.0)) # Plot average density in box ax.axhline(1, color="lightgray", linestyle="--") # Plot both RDFs @@ -81,19 +73,25 @@ ax.plot(bins, rdf_cl, color=biotite.colors["dimorange"], label="Cl") # The peak positions are shown as vertical lines ax.vlines( - bins[peak_indices_na], ymin=0, ymax=3, - color=biotite.colors["darkgreen"], linestyle=":" + bins[peak_indices_na], + ymin=0, + ymax=3, + color=biotite.colors["darkgreen"], + linestyle=":", ) ax.vlines( - bins[peak_indices_cl], ymin=0, ymax=3, - 
color=biotite.colors["dimorange"], linestyle=":" + bins[peak_indices_cl], + ymin=0, + ymax=3, + color=biotite.colors["dimorange"], + linestyle=":", ) ax.set_xticks(np.arange(0, 10.5, 0.5)) -ax.set_xlim(0,10) -ax.set_ylim(0,2.7) +ax.set_xlim(0, 10) +ax.set_ylim(0, 2.7) ax.set_xlabel("Radius (Å)") ax.set_ylabel("Relative density") ax.legend() fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/trajectory_sse.py b/doc/examples/scripts/structure/modeling/trajectory_sse.py index 5b33d2156..a0acf219c 100644 --- a/doc/examples/scripts/structure/modeling/trajectory_sse.py +++ b/doc/examples/scripts/structure/modeling/trajectory_sse.py @@ -14,20 +14,18 @@ # Code source: Daniel Bauer, Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D +import numpy as np from matplotlib import colors -import matplotlib as mpl +from matplotlib.lines import Line2D import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.xtc as xtc from biotite.application.dssp import DsspApp - # Put here the path of the downloaded files templ_file_path = "../../../download/lysozyme_md.pdb" -traj_file_path = "../../../download/lysozyme_md.xtc" +traj_file_path = "../../../download/lysozyme_md.xtc" xtc_file = xtc.XTCFile.read(traj_file_path) @@ -36,25 +34,28 @@ traj = traj[:, struc.filter_amino_acids(traj)] # DSSP does not assign an SSE to the last residue -> -1 -sse = np.empty((traj.shape[0], struc.get_residue_count(traj)-1), dtype='U1') +sse = np.empty((traj.shape[0], struc.get_residue_count(traj) - 1), dtype="U1") for idx, frame in enumerate(traj): app = DsspApp(traj[idx]) app.start() app.join() sse[idx] = app.get_sse() + # Matplotlib needs numbers to assign colors correctly def sse_to_num(sse): num = np.empty(sse.shape, dtype=int) - num[sse == 'C'] = 0 - num[sse == 'E'] = 1 - num[sse == 'B'] = 2 - num[sse == 'S'] = 3 
- num[sse == 'T'] = 4 - num[sse == 'H'] = 5 - num[sse == 'G'] = 6 - num[sse == 'I'] = 7 + num[sse == "C"] = 0 + num[sse == "E"] = 1 + num[sse == "B"] = 2 + num[sse == "S"] = 3 + num[sse == "T"] = 4 + num[sse == "H"] = 5 + num[sse == "G"] = 6 + num[sse == "I"] = 7 return num + + sse = sse_to_num(sse) @@ -68,24 +69,26 @@ def sse_to_num(sse): r"turn": "yellow", r"$\alpha$-helix": "blue", r"$3_{10}$-helix": "gray", - r"$\pi$-helix": "purple", + r"$\pi$-helix": "purple", } cmap = colors.ListedColormap(color_assign.values()) plt.figure(figsize=(8.0, 6.0)) -plt.imshow(sse.T, cmap=cmap, origin='lower') +plt.imshow(sse.T, cmap=cmap, origin="lower") plt.xlabel("Time / ps") plt.ylabel("Residue") ticks = np.arange(0, len(traj), 10) plt.xticks(ticks, time[ticks].astype(int)) # Custom legend below the DSSP plot -custom_lines = [ - Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign)) -] +custom_lines = [Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign))] plt.legend( - custom_lines, color_assign.keys(), loc="upper center", - bbox_to_anchor=(0.5, -0.15), ncol=len(color_assign), fontsize=8 + custom_lines, + color_assign.keys(), + loc="upper center", + bbox_to_anchor=(0.5, -0.15), + ncol=len(color_assign), + fontsize=8, ) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py index 31a92644a..d5566aac4 100644 --- a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py +++ b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py @@ -28,9 +28,9 @@ import matplotlib.pyplot as plt import numpy as np import biotite +import biotite.structure as struct import biotite.structure.io.gro as gro import biotite.structure.io.xtc as xtc -import biotite.structure as struct def water_in_prox(atoms, sele, cutoff): @@ -38,27 +38,28 @@ def water_in_prox(atoms, sele, cutoff): Get the 
atom indices of water oxygen atoms that are in vicinity of the selected atoms. """ - cell_list = struct.CellList(atoms, cell_size=5, - selection=atoms.atom_name == "OW") + cell_list = struct.CellList(atoms, cell_size=5, selection=atoms.atom_name == "OW") adjacent_atoms = cell_list.get_atoms(atoms[sele].coord, cutoff) adjacent_atoms = np.unique(adjacent_atoms.flatten()) adjacent_atoms = adjacent_atoms[adjacent_atoms > 0] return adjacent_atoms + def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)): """ Calculate the cumulative number of water molecules visiting the pore. """ - protein_sele = np.isin(traj.res_id, key_residues) \ - & ~np.isin(traj.atom_name, ["N", "O", "CA", "C"]) + protein_sele = np.isin(traj.res_id, key_residues) & ~np.isin( + traj.atom_name, ["N", "O", "CA", "C"] + ) water_count = np.zeros(traj.shape[0]) prev_counted_indices = [] for idx, frame in enumerate(traj): indices = water_in_prox(frame, protein_sele, cutoff) count = (~np.isin(indices, prev_counted_indices)).sum() if idx != 0: - count += water_count[idx-1] + count += water_count[idx - 1] water_count[idx] = count prev_counted_indices = indices return water_count @@ -83,35 +84,40 @@ def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)): # Linear fitting from pylab import polyfit + open_fit = polyfit(time, counts[0], 1) closed_fit = polyfit(time, counts[1], 1) fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot(time, counts[0], - label="open pore", color=biotite.colors["dimgreen"]) -ax.plot(time, open_fit[0]*time+open_fit[1], - linestyle="--", color="black", zorder=-1) -ax.plot(time, counts[1], - label="closed pore", color=biotite.colors["lightorange"]) -ax.plot(time, closed_fit[0]*time+closed_fit[1], - linestyle="--", color="black", zorder=-1) +ax.plot(time, counts[0], label="open pore", color=biotite.colors["dimgreen"]) +ax.plot( + time, open_fit[0] * time + open_fit[1], linestyle="--", color="black", zorder=-1 +) +ax.plot(time, counts[1], label="closed pore", 
color=biotite.colors["lightorange"]) +ax.plot( + time, closed_fit[0] * time + closed_fit[1], linestyle="--", color="black", zorder=-1 +) ax.set( - xlabel = "Time / ns", - ylabel = "Count", - title = "Cumulative count\nof individual water molecules visiting the pore" + xlabel="Time / ns", + ylabel="Count", + title="Cumulative count\nof individual water molecules visiting the pore", ) ax.legend() -ax.annotate(f"{open_fit[0]:.1f} per ns", - xy=(20, 20*open_fit[0]+open_fit[1]+100), - xytext=(20-5, 20*open_fit[0]+open_fit[1]+1300), - arrowprops=dict(facecolor=biotite.colors["darkgreen"]), - va="center") -ax.annotate(f"{closed_fit[0]:.1f} per ns", - xy=(30, 20*closed_fit[0]+closed_fit[1]+100), - xytext=(30+2, 20*closed_fit[0]+closed_fit[1]+1300), - arrowprops=dict(facecolor=biotite.colors["orange"]), - va="center") +ax.annotate( + f"{open_fit[0]:.1f} per ns", + xy=(20, 20 * open_fit[0] + open_fit[1] + 100), + xytext=(20 - 5, 20 * open_fit[0] + open_fit[1] + 1300), + arrowprops=dict(facecolor=biotite.colors["darkgreen"]), + va="center", +) +ax.annotate( + f"{closed_fit[0]:.1f} per ns", + xy=(30, 20 * closed_fit[0] + closed_fit[1] + 100), + xytext=(30 + 2, 20 * closed_fit[0] + closed_fit[1] + 1300), + arrowprops=dict(facecolor=biotite.colors["orange"]), + va="center", +) fig.savefig("water_exchange.png", bbox_inches="tight") -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/molecule/alkane_isomers.py b/doc/examples/scripts/structure/molecule/alkane_isomers.py index ed479a52f..c9ea8a265 100644 --- a/doc/examples/scripts/structure/molecule/alkane_isomers.py +++ b/doc/examples/scripts/structure/molecule/alkane_isomers.py @@ -24,12 +24,11 @@ opposed to one request per carbon number. 
""" -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.database.pubchem as pubchem -import biotite.structure.io.mol as mol import biotite.structure as struc - +import biotite.structure.io.mol as mol MAX_CARBON_COUNT = 12 PLOT_MAX_CARBON_COUNT = 6 @@ -37,13 +36,13 @@ carbon_numbers = [] alkane_cids = [] -for n_carbon in range(1, MAX_CARBON_COUNT+1): +for n_carbon in range(1, MAX_CARBON_COUNT + 1): formula = f"C{n_carbon}H{2 * n_carbon + 2}" print(formula) cids = np.array(pubchem.search(pubchem.FormulaQuery(formula))) carbon_numbers.extend([n_carbon] * len(cids)) alkane_cids.extend(cids) -carbon_numbers = np.array(carbon_numbers) +carbon_numbers = np.array(carbon_numbers) alkane_cids = np.array(alkane_cids) ######################################################################## @@ -58,15 +57,13 @@ # appropriate data type and used for filtering. # Finally, also the IUPAC name for each remaining compound is retrieved # to review the results. - + # Filter natural isotopes... 
n_isotopes = np.array( pubchem.fetch_property(alkane_cids, "IsotopeAtomCount"), dtype=int ) # ...and neutral compounds -charge = np.array( - pubchem.fetch_property(alkane_cids, "Charge"), dtype=int -) +charge = np.array(pubchem.fetch_property(alkane_cids, "Charge"), dtype=int) # Apply filter mask = (n_isotopes == 0) & (charge == 0) carbon_numbers = carbon_numbers[mask] @@ -85,7 +82,7 @@ # Remove compounds containing multiple molecules # (indicated by the ';' as separator between molecule names) -single_molecule_mask = np.array([not ";" in name for name in iupac_names]) +single_molecule_mask = np.array([";" not in name for name in iupac_names]) # Some compounds containing multiple molecules have no name at all single_molecule_mask &= np.array([len(name) != 0 for name in iupac_names]) carbon_numbers = carbon_numbers[single_molecule_mask] @@ -109,10 +106,7 @@ # for alkanes with zero carbon atoms, which does not make sense isomer_numbers = np.bincount(carbon_numbers)[1:] fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot( - np.arange(1, MAX_CARBON_COUNT+1), isomer_numbers, - marker="o", color="gray" -) +ax.plot(np.arange(1, MAX_CARBON_COUNT + 1), isomer_numbers, marker="o", color="gray") ax.set_xlim(left=0) ax.set_ylim(bottom=0) ax.set_xlabel("Number of carbon atoms") @@ -127,18 +121,18 @@ # xy-coordinates are plotted as skeletal formula. 
files = pubchem.fetch( - alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT], - as_structural_formula=True + alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT], as_structural_formula=True ) fig, axes = plt.subplots( nrows=np.max(isomer_numbers[:PLOT_MAX_CARBON_COUNT]), ncols=PLOT_MAX_CARBON_COUNT, figsize=(8.0, 6.0), - sharex=True, sharey=True + sharex=True, + sharey=True, ) fig.suptitle("Number of carbon atoms", fontsize=16) -for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT+1)): +for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT + 1)): axes[0, i].set_title(n_carbon, fontsize=12) indices_for_n_carbon = np.where(carbon_numbers == n_carbon)[0] for j, file_index in enumerate(indices_for_n_carbon): @@ -149,17 +143,13 @@ # Center atoms in origin atoms.coord -= struc.centroid(atoms) # Structural formula is 0 in z-dimension - coord = atoms.coord[:,:2] + coord = atoms.coord[:, :2] ax = axes[j, i] - ax.plot( - coord[:, 0], coord[:, 1], - color="black", linestyle="None", marker="o" - ) + ax.plot(coord[:, 0], coord[:, 1], color="black", linestyle="None", marker="o") for bond_i, bond_j, _ in atoms.bonds.as_array(): ax.plot( - coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1], - color="black" + coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1], color="black" ) for ax in axes.flatten(): @@ -171,4 +161,4 @@ plt.show() -# sphinx_gallery_thumbnail_number = 2 \ No newline at end of file +# sphinx_gallery_thumbnail_number = 2 diff --git a/doc/examples/scripts/structure/molecule/molecular_visualization.py b/doc/examples/scripts/structure/molecule/molecular_visualization.py index 70d77d837..883785167 100644 --- a/doc/examples/scripts/structure/molecule/molecular_visualization.py +++ b/doc/examples/scripts/structure/molecule/molecular_visualization.py @@ -16,13 +16,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.animation import FuncAnimation import 
biotite.structure as struc -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # Get an atom array for caffeine # Caffeine has the PDB reside name 'CFF' @@ -35,34 +34,37 @@ # Normal vector of ring plane normal = np.cross(n1.coord - n3.coord, n1.coord - n7.coord) # Align ring plane normal to z-axis -caffeine = struc.align_vectors(caffeine, normal, np.array([0,0,1])) +caffeine = struc.align_vectors(caffeine, normal, np.array([0, 0, 1])) # Caffeine should be colored by element colors = np.zeros((caffeine.array_length(), 3)) -colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green -colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red +colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green +colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="3d") graphics.plot_atoms( - ax, caffeine, colors, line_width=5, background_color="white", - zoom=1.5 + ax, caffeine, colors, line_width=5, background_color="white", zoom=1.5 ) fig.tight_layout() + # Create an animation that rotates the molecule about the x-axis def update(angle): ax.elev = angle + FPS = 50 DURATION = 4 angles = np.linspace(-180, 180, DURATION * FPS) # Start at 90 degrees -angles = np.concatenate([ - np.linspace( 90, 180, int(DURATION * FPS * 1/4)), - np.linspace(-180, 90, int(DURATION * FPS * 3/4)) -]) -animation = FuncAnimation(fig, update, angles, interval=int(1000/FPS)) -plt.show() \ No newline at end of file +angles = np.concatenate( + [ + np.linspace(90, 180, int(DURATION * FPS * 1 / 4)), + np.linspace(-180, 90, int(DURATION * FPS * 3 / 4)), + ] +) +animation = FuncAnimation(fig, update, angles, interval=int(1000 / FPS)) 
+plt.show() diff --git a/doc/examples/scripts/structure/molecule/peoe_visualization.py b/doc/examples/scripts/structure/molecule/peoe_visualization.py index d2dbaf66e..c38e51d98 100644 --- a/doc/examples/scripts/structure/molecule/peoe_visualization.py +++ b/doc/examples/scripts/structure/molecule/peoe_visualization.py @@ -13,15 +13,14 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np -from sklearn.decomposition import PCA import matplotlib.pyplot as plt -from matplotlib.colors import Normalize +import numpy as np from matplotlib.cm import ScalarMappable +from matplotlib.colors import Normalize +from sklearn.decomposition import PCA import biotite.structure as struc -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # Acetylsalicylic acid MOLECULE_NAME = "AIN" @@ -42,7 +41,6 @@ CMAP_NAME = "bwr_r" - # Get an atom array for the selected molecule molecule = info.residue(MOLECULE_NAME) @@ -71,17 +69,19 @@ colors = color_map(normalized_charges) # Ball size should be proportional to VdW radius of the respective atom -ball_sizes = np.array( - [info.vdw_radius_single(e) for e in molecule.element] -) * BALL_SCALE +ball_sizes = ( + np.array([info.vdw_radius_single(e) for e in molecule.element]) * BALL_SCALE +) # Gradient of ray strength # The ray size is proportional to the absolute charge value ray_full_sizes = ball_sizes + np.abs(charges) * RAY_SCALE -ray_sizes = np.array([ - np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False) - for i in range(molecule.array_length()) -]).T +ray_sizes = np.array( + [ + np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False) + for i in range(molecule.array_length()) + ] +).T # The plotting begins here @@ -92,32 +92,38 @@ # As 'axes.scatter()' uses sizes in points**2, # the VdW-radii as also squared graphics.plot_ball_and_stick_model( - ax, molecule, colors, ball_size=ball_sizes**2, line_width=3, 
- line_color=color_map(0.5), background_color=(.05, .05, .05), zoom=1.5 + ax, + molecule, + colors, + ball_size=ball_sizes**2, + line_width=3, + line_color=color_map(0.5), + background_color=(0.05, 0.05, 0.05), + zoom=1.5, ) # Plot the element labels for atom in molecule: ax.text( - *atom.coord, atom.element, - fontsize=ELEMENT_FONT_SIZE, color="black", - ha="center", va="center", zorder=100 + *atom.coord, + atom.element, + fontsize=ELEMENT_FONT_SIZE, + color="black", + ha="center", + va="center", + zorder=100, ) # Plot the rays for i in range(N_RAY_STEPS): ax.scatter( - *molecule.coord.T, s=ray_sizes[i]**2, c=colors, - linewidth=0, alpha=RAY_ALPHA + *molecule.coord.T, s=ray_sizes[i] ** 2, c=colors, linewidth=0, alpha=RAY_ALPHA ) # Plot the colorbar color_bar = fig.colorbar( - ScalarMappable( - norm=Normalize(vmin=-max_charge, vmax=max_charge), - cmap=color_map - ), - ax=ax + ScalarMappable(norm=Normalize(vmin=-max_charge, vmax=max_charge), cmap=color_map), + ax=ax, ) color_bar.set_label("Partial charge (e)", color="white") color_bar.ax.yaxis.set_tick_params(color="white") @@ -126,4 +132,4 @@ label.set_color("white") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/nucleotide/base_pairs.py b/doc/examples/scripts/structure/nucleotide/base_pairs.py index cde0b9b21..28681901e 100644 --- a/doc/examples/scripts/structure/nucleotide/base_pairs.py +++ b/doc/examples/scripts/structure/nucleotide/base_pairs.py @@ -10,15 +10,14 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite -import biotite.structure.io.pdb as pdb -import biotite.database.rcsb as rcsb -import biotite.structure as struc -import biotite.sequence.graphics as graphics import matplotlib.pyplot as plt import matplotlib.ticker as ticker -from matplotlib.patches import Arc import numpy as np +from matplotlib.patches import Arc +import biotite +import biotite.database.rcsb as rcsb +import biotite.structure as struc +import 
biotite.structure.io.pdb as pdb # Download the PDB file and read the structure pdb_file_path = rcsb.fetch("4p5j", "pdb", gettempdir()) @@ -44,10 +43,10 @@ # Setup the axis ax.set_xlim(0.5, len(residue_ids) + 0.5) -ax.set_ylim(0, len(residue_ids)/2 + 0.5) +ax.set_ylim(0, len(residue_ids) / 2 + 0.5) ax.set_aspect("equal") ax.xaxis.set_major_locator(ticker.MultipleLocator(3)) -ax.tick_params(axis='both', which='major', labelsize=8) +ax.tick_params(axis="both", which="major", labelsize=8) ax.set_yticks([]) # Remove the frame @@ -55,7 +54,7 @@ # Plot the residue names in order for residue_name, residue_id in zip(residue_names, residue_ids): - ax.text(residue_id, 0, residue_name, ha='center', fontsize=8) + ax.text(residue_id, 0, residue_name, ha="center", fontsize=8) # Compute the basepairs and pseudknot order (first result) base_pairs = struc.base_pairs(nucleotides) @@ -63,9 +62,7 @@ # Draw the arcs between base pairs for (base1, base2), order in zip(base_pairs, pseudoknot_order): - arc_center = ( - np.mean((nucleotides.res_id[base1],nucleotides.res_id[base2])), 1.5 - ) + arc_center = (np.mean((nucleotides.res_id[base1], nucleotides.res_id[base2])), 1.5) arc_diameter = abs(nucleotides.res_id[base2] - nucleotides.res_id[base1]) name1 = nucleotides.res_name[base1] name2 = nucleotides.res_name[base2] @@ -80,10 +77,16 @@ else: linestyle = ":" arc = Arc( - arc_center, arc_diameter, arc_diameter, theta1=0, theta2=180, - color=color, linewidth=1.5, linestyle=linestyle + arc_center, + arc_diameter, + arc_diameter, + theta1=0, + theta2=180, + color=color, + linewidth=1.5, + linestyle=linestyle, ) ax.add_patch(arc) # Display the plot -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py index a8a436f97..460dd573e 100644 --- a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py +++ 
b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py @@ -2,7 +2,7 @@ Leontis-Westhof Nomenclature ============================ -In this example we plot a secondary structure diagram annotated with +In this example we plot a secondary structure diagram annotated with Leontis-Westhof nomenclature :footcite:`Leontis2001` of the sarcin-ricin loop from E. coli (PDB ID: 6ZYB). """ @@ -11,14 +11,13 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.pdb as pdb import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.graphics as graphics -import matplotlib.pyplot as plt -import numpy as np - +import biotite.structure.io.pdb as pdb # Download the PDB file and read the structure pdb_file_path = rcsb.fetch("6ZYB", "pdb", gettempdir()) @@ -30,9 +29,9 @@ base_pairs = struc.base_pairs(nucleotides) glycosidic_bonds = struc.base_pairs_glycosidic_bond(nucleotides, base_pairs) edges = struc.base_pairs_edge(nucleotides, base_pairs) -base_pairs = struc.get_residue_positions( - nucleotides, base_pairs.flatten() -).reshape(base_pairs.shape) +base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape( + base_pairs.shape +) # Get the one-letter-codes of the bases base_labels = [] @@ -41,7 +40,7 @@ # Color canonical Watson-Crick base pairs with a darker orange and # non-canonical base pairs with a lighter orange -colors = np.full(base_pairs.shape[0], biotite.colors['brightorange']) +colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"]) for i, (base1, base2) in enumerate(base_pairs): name1 = base_labels[base1] name2 = base_labels[base2] @@ -68,34 +67,33 @@ # Plot the secondary structure graphics.plot_nucleotide_secondary_structure( - ax, base_labels, base_pairs, struc.get_residue_count(nucleotides), - bond_color=colors + ax, base_labels, base_pairs, struc.get_residue_count(nucleotides), 
bond_color=colors ) # Display the plot plt.show() ######################################################################## -# The sarcin-ricin loop is part of the 23s rRNA and is considered +# The sarcin-ricin loop is part of the 23s rRNA and is considered # crucial to the ribosome‘s activity. The incorporation of the -# Leontis-Westhof nomenclature into the 2D-plot shows how the individual -# base pairs are oriented and how their glycosidic bonds are oriented +# Leontis-Westhof nomenclature into the 2D-plot shows how the individual +# base pairs are oriented and how their glycosidic bonds are oriented # relative to each other. # -# This visualization enables one to see a pattern that cannot be -# communicated through the 2D structure alone. The upper part of the -# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds. -# All bases interact through their Watson-Crick edge (W). On the other -# hand, the lower part of the sarcin ricin loop looks strikingly -# different. The glycosidic bonds are oriented in cis (c) and trans (t) -# orientation. The bases interact through all three edges: Watson-Crick +# This visualization enables one to see a pattern that cannot be +# communicated through the 2D structure alone. The upper part of the +# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds. +# All bases interact through their Watson-Crick edge (W). On the other +# hand, the lower part of the sarcin ricin loop looks strikingly +# different. The glycosidic bonds are oriented in cis (c) and trans (t) +# orientation. The bases interact through all three edges: Watson-Crick # (W), Hoogsteen (H), and Sugar (S). -# -# Thus, it can be concluded that the upper part of the sarcin ricin loop -# represents a highly organized helix, while the lower part of the loop +# +# Thus, it can be concluded that the upper part of the sarcin ricin loop +# represents a highly organized helix, while the lower part of the loop # is comparatively unorganized. 
# # References # ---------- -# -# .. footbibliography:: \ No newline at end of file +# +# .. footbibliography:: diff --git a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py index 23823b3d9..5d238b346 100644 --- a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py +++ b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py @@ -2,7 +2,7 @@ Comparison of a tRNA-like-structure with a tRNA =============================================== -In this example we plot a secondary-structure diagram of a tRNA mimic +In this example we plot a secondary-structure diagram of a tRNA mimic (PDB ID: 4P5J) from the *turnip yellow mosaic virus* (TYMV) and compare it to a PHE-tRNA (PDB ID: 1EHZ). """ @@ -11,15 +11,16 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.pdb as pdb import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.graphics as graphics -import matplotlib.pyplot as plt -import numpy as np +import biotite.structure.io.pdb as pdb + -# Create a function to get the structures and compute information for +# Create a function to get the structures and compute information for # the plots. 
def plot_rna(pdb_id, axes): # Download the PDB file and read the structure @@ -30,15 +31,15 @@ def plot_rna(pdb_id, axes): # Compute the base pairs and their pseudoknot order base_pairs = struc.base_pairs(nucleotides) - base_pairs = struc.get_residue_positions( - nucleotides, base_pairs.flatten() - ).reshape(base_pairs.shape) + base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape( + base_pairs.shape + ) pseudoknot_order = struc.pseudoknots(base_pairs)[0] # Set the linestyle according to the pseudoknot order - linestyles = np.full(base_pairs.shape[0], '-', dtype=object) - linestyles[pseudoknot_order == 1] = '--' - linestyles[pseudoknot_order == 2] = ':' + linestyles = np.full(base_pairs.shape[0], "-", dtype=object) + linestyles[pseudoknot_order == 1] = "--" + linestyles[pseudoknot_order == 2] = ":" # Indicate canonical nucleotides with an upper case one-letter-code # and non-canonical nucleotides with a lower case one-letter-code @@ -52,7 +53,7 @@ def plot_rna(pdb_id, axes): # Color canonical Watson-Crick base pairs with a darker orange and # non-canonical base pairs with a lighter orange - colors = np.full(base_pairs.shape[0], biotite.colors['brightorange']) + colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"]) for i, (base1, base2) in enumerate(base_pairs): name1 = base_labels[base1] name2 = base_labels[base2] @@ -61,37 +62,45 @@ def plot_rna(pdb_id, axes): # Plot the secondary structure graphics.plot_nucleotide_secondary_structure( - axes, base_labels, base_pairs, struc.get_residue_count(nucleotides), - pseudoknot_order=pseudoknot_order, bond_linestyle=linestyles, + axes, + base_labels, + base_pairs, + struc.get_residue_count(nucleotides), + pseudoknot_order=pseudoknot_order, + bond_linestyle=linestyles, bond_color=colors, # Margin to compensate for reduced axis limits in shared axis - border=0.13 + border=0.13, ) # Use the PDB ID to label each plot axes.set_title(pdb_id, loc="left") + # Create a matplotlib pyplot 
fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(8.0, 16.0), + 2, + 1, + figsize=(8.0, 16.0), # Share both axes to ensure eqaul scaling of bath secondary structures - sharex=True, sharey=True + sharex=True, + sharey=True, ) # Plot the secondary structures -plot_rna('1EHZ', ax1) -plot_rna('4P5J', ax2) +plot_rna("1EHZ", ax1) +plot_rna("4P5J", ax2) fig.tight_layout() plt.show() ######################################################################## -# The generated plots show that both structures consist of four hairpin -# loops. Two of those loops, which are opposite to each other, interact -# through two pseudoknotted base pairs in the otherwise unpaired loop of -# the respective hairpin structures. The fact that this interaction was -# mimicked indicates functional importance. -# -# A third hairpin loop is folded towards the centre of the tRNA mimic. -# This is not the case for the phenylalanine tRNA and thus signifies a -# major difference between the structures. \ No newline at end of file +# The generated plots show that both structures consist of four hairpin +# loops. Two of those loops, which are opposite to each other, interact +# through two pseudoknotted base pairs in the otherwise unpaired loop of +# the respective hairpin structures. The fact that this interaction was +# mimicked indicates functional importance. +# +# A third hairpin loop is folded towards the centre of the tRNA mimic. +# This is not the case for the phenylalanine tRNA and thus signifies a +# major difference between the structures. 
diff --git a/doc/examples/scripts/structure/nucleotide/watson_crick.py b/doc/examples/scripts/structure/nucleotide/watson_crick.py index 5ac45ae82..00fbfd33c 100644 --- a/doc/examples/scripts/structure/nucleotide/watson_crick.py +++ b/doc/examples/scripts/structure/nucleotide/watson_crick.py @@ -9,13 +9,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx import biotite.structure.graphics as graphics -import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx # Structure of a DNA double helix pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1qxb", "bcif")) @@ -26,13 +25,15 @@ base_pairs = struc.base_pairs(nucleotides) for i, j in base_pairs: if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DG", "DC"): - guanine, cytosine = [nucleotides[mask] for mask - in struc.get_residue_masks(nucleotides, [i, j])] + guanine, cytosine = [ + nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j]) + ] break for i, j in base_pairs: if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DA", "DT"): - adenine, thymine = [nucleotides[mask] for mask - in struc.get_residue_masks(nucleotides, [i, j])] + adenine, thymine = [ + nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j]) + ] break pairs = [(guanine, cytosine), (adenine, thymine)] @@ -41,19 +42,18 @@ # Arrange bases for i, (purine, pyrimidine) in enumerate(pairs): - n1, n3, c5, c6 = [pyrimidine[pyrimidine.atom_name == name][0] - for name in ("N1", "N3", "C5", "C6")] + n1, n3, c5, c6 = [ + pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C5", "C6") + ] # Pyrimidine N3-C6 axis is aligned to x-axis purine, pyrimidine = [ - struc.align_vectors( - base, - n3.coord - c6.coord, - np.array([1, 0, 0]) - ) for base in (purine, pyrimidine) + 
struc.align_vectors(base, n3.coord - c6.coord, np.array([1, 0, 0])) + for base in (purine, pyrimidine) ] # Coords are changed -> update 'Atom' objects - n1, n3, c4, c5 = [pyrimidine[pyrimidine.atom_name == name][0] - for name in ("N1", "N3", "C4", "C5")] + n1, n3, c4, c5 = [ + pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C4", "C5") + ] # Pyrimidine base plane normal vector is aligned to z-axis # Furthermore, distance between bases is set purine, pyrimidine = [ @@ -61,10 +61,11 @@ base, np.cross(n3.coord - n1.coord, c5.coord - n1.coord), np.array([0, 0, 1]), - origin_position = struc.centroid(purine + pyrimidine), + origin_position=struc.centroid(purine + pyrimidine), # 10 Å separation between pairs - target_position = np.array([0, 10*i, 0]) - ) for base in (purine, pyrimidine) + target_position=np.array([0, 10 * i, 0]), + ) + for base in (purine, pyrimidine) ] pairs[i] = (purine, pyrimidine) @@ -73,14 +74,12 @@ atoms = pairs[0][0] + pairs[0][1] + pairs[1][0] + pairs[1][1] # Color by element colors = np.zeros((atoms.array_length(), 3)) -colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray -colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red -colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green -graphics.plot_atoms( - ax, atoms, colors, line_width=3, background_color="white", zoom=1.5 -) +colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray +colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red +colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green +graphics.plot_atoms(ax, atoms, colors, line_width=3, background_color="white", zoom=1.5) # Plot hydrogen bonds for purine, pyrimidine in pairs: @@ -102,14 +101,13 @@ for pair in pairs: for base in pair: label = base.res_name[0][1] - ring_center = struc.centroid(base[ - 
np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"]) - ]) + ring_center = struc.centroid( + base[np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"])] + ) x, y, z = ring_center ax.text( - x, y, z, label, - fontsize=20, fontweight="bold", va="center", ha="center" + x, y, z, label, fontsize=20, fontweight="bold", va="center", ha="center" ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py index 6145cedeb..9a3396ecf 100644 --- a/doc/examples/scripts/structure/protein/pb_alignment.py +++ b/doc/examples/scripts/structure/protein/pb_alignment.py @@ -27,15 +27,14 @@ # License: BSD 3 clause from tempfile import gettempdir -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # PB alphabet pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") @@ -84,8 +83,7 @@ # Fetch animal lysoyzme structures lyso_files = rcsb.fetch( - ["1REX", "1AKI", "1DKJ", "1GD6"], - format="bcif", target_path=gettempdir() + ["1REX", "1AKI", "1DKJ", "1GD6"], format="bcif", target_path=gettempdir() ) organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. 
mori"] @@ -106,25 +104,21 @@ # centered on the amino acid to calculate the PB for # Hence, the PBs are not defined for the two amino acids # at each terminus - pb_angles = np.full((len(phi)-4, 8), np.nan) - pb_angles[:, 0] = psi[ : -4] - pb_angles[:, 1] = phi[1 : -3] - pb_angles[:, 2] = psi[1 : -3] - pb_angles[:, 3] = phi[2 : -2] - pb_angles[:, 4] = psi[2 : -2] - pb_angles[:, 5] = phi[3 : -1] - pb_angles[:, 6] = psi[3 : -1] - pb_angles[:, 7] = phi[4 : ] + pb_angles = np.full((len(phi) - 4, 8), np.nan) + pb_angles[:, 0] = psi[:-4] + pb_angles[:, 1] = phi[1:-3] + pb_angles[:, 2] = psi[1:-3] + pb_angles[:, 3] = phi[2:-2] + pb_angles[:, 4] = psi[2:-2] + pb_angles[:, 5] = phi[3:-1] + pb_angles[:, 6] = psi[3:-1] + pb_angles[:, 7] = phi[4:] pb_angles = np.rad2deg(pb_angles) # Angle RMSD of all reference angles with all actual angles rmsda = np.sum( - ( - ( - ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180 - ) % 360 - 180 - )**2, - axis=-1 + ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2, + axis=-1, ) # Chose PB, where the RMSDA to the reference angle is lowest # Due to the definition of Biotite symbol codes @@ -139,7 +133,7 @@ matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) alignment, order, _, _ = align.align_multiple( - pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False + pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False ) # Visualize the alignment @@ -150,10 +144,15 @@ ax = fig.add_subplot(111) # The color scheme was generated with the 'Gecos' software graphics.plot_alignment_type_based( - ax, alignment, labels=labels, symbols_per_line=45, spacing=2, - show_numbers=True, color_scheme="flower" + ax, + alignment, + labels=labels, + symbols_per_line=45, + spacing=2, + show_numbers=True, + color_scheme="flower", ) # Organism names in italic -ax.set_yticklabels(ax.get_yticklabels(), 
fontdict={"fontstyle":"italic"}) +ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/peptide_assembly.py b/doc/examples/scripts/structure/protein/peptide_assembly.py index 4c07451ad..71ed3469e 100644 --- a/doc/examples/scripts/structure/protein/peptide_assembly.py +++ b/doc/examples/scripts/structure/protein/peptide_assembly.py @@ -21,19 +21,18 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from tempfile import NamedTemporaryFile import itertools +from tempfile import NamedTemporaryFile import numpy as np from numpy.linalg import norm import biotite.sequence as seq import biotite.structure as struc -import biotite.structure.io as strucio import biotite.structure.info as info +import biotite.structure.io as strucio - -C_N_LENGTH = 1.34 -N_CA_LENGTH = 1.46 -CA_C_LENGTH = 1.54 +C_N_LENGTH = 1.34 +N_CA_LENGTH = 1.46 +CA_C_LENGTH = 1.54 CA_C_N_ANGLE = 114 C_N_CA_ANGLE = 123 @@ -41,13 +40,15 @@ # Reference peptide bond atom coordinates taken from 1l2y: # CA, C, N, O, H -peptide_coord = np.array([ - [-8.608, 3.135, -1.618], - [-7.117, 2.964, -1.897], - [-6.379, 4.031, -2.228], - [-6.634, 1.849, -1.758], - [-6.821, 4.923, -2.394] -]) +peptide_coord = np.array( + [ + [-8.608, 3.135, -1.618], + [-7.117, 2.964, -1.897], + [-6.379, 4.031, -2.228], + [-6.634, 1.849, -1.758], + [-6.821, 4.923, -2.394], + ] +) def create_raw_backbone_coord(number_of_res): @@ -60,7 +61,7 @@ def create_raw_backbone_coord(number_of_res): range(len(coord)), itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]), itertools.cycle([1, -1]), - itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]) + itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]), ): if i == 0: coord[i] = [0, 0, 0] @@ -72,14 +73,14 @@ def create_raw_backbone_coord(number_of_res): # Calculate the coordinates of a new atoms by rotating the previous # bond by the given 
angle new_coord = struc.rotate_about_axis( - coord[i-2], - axis = rot_axis, - angle = np.deg2rad(angle), - support = coord[i-1] + coord[i - 2], + axis=rot_axis, + angle=np.deg2rad(angle), + support=coord[i - 1], ) # Scale bond to correct bond length - bond_vector = new_coord - coord[i-1] - coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector) + bond_vector = new_coord - coord[i - 1] + coord[i] = coord[i - 1] + bond_vector * length / norm(bond_vector) return coord @@ -98,25 +99,17 @@ def append_residue(chain, residue): # Remove atoms removed by peptide bond chain = chain[ - (chain.res_id != last_res_id) | - ~np.isin( - chain.atom_name, - ["OXT", "HXT"] - ) - ] - residue = residue[ - ~np.isin( - residue.atom_name, - ["H2", "H3"] - ) + (chain.res_id != last_res_id) | ~np.isin(chain.atom_name, ["OXT", "HXT"]) ] + residue = residue[~np.isin(residue.atom_name, ["H2", "H3"])] # Increment residue ID for attached residue residue.res_id[:] = last_res_id + 1 -C_N_LENGTH = 1.34 -N_CA_LENGTH = 1.46 -CA_C_LENGTH = 1.54 + +C_N_LENGTH = 1.34 +N_CA_LENGTH = 1.46 +CA_C_LENGTH = 1.54 CA_C_N_ANGLE = 114 C_N_CA_ANGLE = 123 @@ -124,13 +117,15 @@ def append_residue(chain, residue): # Reference peptide bond atom coordinates taken from 1l2y: # CA, C, N, O, H -peptide_coord = np.array([ - [-8.608, 3.135, -1.618], - [-7.117, 2.964, -1.897], - [-6.379, 4.031, -2.228], - [-6.634, 1.849, -1.758], - [-6.821, 4.923, -2.394] -]) +peptide_coord = np.array( + [ + [-8.608, 3.135, -1.618], + [-7.117, 2.964, -1.897], + [-6.379, 4.031, -2.228], + [-6.634, 1.849, -1.758], + [-6.821, 4.923, -2.394], + ] +) def create_raw_backbone_coord(number_of_res): @@ -143,7 +138,7 @@ def create_raw_backbone_coord(number_of_res): range(len(coord)), itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]), itertools.cycle([1, -1]), - itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]) + itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]), ): if i == 0: coord[i] = [0, 0, 0] @@ -155,14 +150,14 @@ def 
create_raw_backbone_coord(number_of_res): # Calculate the coordinates of a new atoms by rotating the # previous bond by the given angle new_coord = struc.rotate_about_axis( - coord[i-2], - axis = rot_axis, - angle = np.deg2rad(angle), - support = coord[i-1] + coord[i - 2], + axis=rot_axis, + angle=np.deg2rad(angle), + support=coord[i - 1], ) # Scale bond to correct bond length - bond_vector = new_coord - coord[i-1] - coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector) + bond_vector = new_coord - coord[i - 1] + coord[i] = coord[i - 1] + bond_vector * length / norm(bond_vector) return coord @@ -181,18 +176,9 @@ def append_residue(chain, residue): # Remove atoms removed by peptide bond chain = chain[ - (chain.res_id != last_res_id) | - ~np.isin( - chain.atom_name, - ["OXT", "HXT"] - ) - ] - residue = residue[ - ~np.isin( - residue.atom_name, - ["H2", "H3"] - ) + (chain.res_id != last_res_id) | ~np.isin(chain.atom_name, ["OXT", "HXT"]) ] + residue = residue[~np.isin(residue.atom_name, ["H2", "H3"])] # Increment residue ID for attached residue residue.res_id[:] = last_res_id + 1 @@ -203,9 +189,7 @@ def append_residue(chain, residue): # Add peptide bond index_prev_c = np.where(chain.atom_name == "C")[0][-2] index_curr_n = np.where(chain.atom_name == "N")[0][-1] - chain.bonds.add_bond( - index_prev_c, index_curr_n, struc.BondType.SINGLE - ) + chain.bonds.add_bond(index_prev_c, index_curr_n, struc.BondType.SINGLE) return chain @@ -213,15 +197,14 @@ def assemble_peptide(sequence): res_names = [seq.ProteinSequence.convert_letter_1to3(r) for r in sequence] backbone_coord = create_raw_backbone_coord(len(sequence)) - chain = struc.AtomArray(0) for i, res_name in enumerate(res_names): residue = info.residue(res_name) # Superimpose residue to corresponding backbone coordinates _, transformation = struc.superimpose( - backbone_coord[3*i : 3*i + 3], - residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])] + backbone_coord[3 * i : 3 * i + 3], + 
residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])], ) residue = transformation.apply(residue) @@ -238,8 +221,7 @@ def assemble_peptide(sequence): for atom_name in ["N", "H"] ] _, transformation = struc.superimpose( - chain.coord[[ca_i, c_i, n_i]], - peptide_coord[:3] + chain.coord[[ca_i, c_i, n_i]], peptide_coord[:3] ) chain.coord[[o_i, h_i]] = transformation.apply(peptide_coord[3:]) return chain diff --git a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py index 9ef7d7b2f..7afdc6a06 100644 --- a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py +++ b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py @@ -1,19 +1,14 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 400) # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL chain.bonds = struc.connect_via_distances(chain) @@ -21,14 +16,8 @@ # Visualize as stick model pymol_obj.show_as("sticks") -pymol_obj.color( - "biotite_lightgreen", - (chain.res_id % 2 == 0) & (chain.element == "C") -) -pymol_obj.color( - "biotite_dimgreen", - (chain.res_id % 2 != 0) & (chain.element == "C") -) +pymol_obj.color("biotite_lightgreen", (chain.res_id % 2 == 0) & (chain.element == "C")) +pymol_obj.color("biotite_dimgreen", (chain.res_id % 2 != 0) & (chain.element == "C")) ammolite.cmd.set("depth_cue", 0) # Adjust camera @@ -37,4 +26,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/protein/ramachandran.py b/doc/examples/scripts/structure/protein/ramachandran.py index 
021349d36..806ac283f 100644 --- a/doc/examples/scripts/structure/protein/ramachandran.py +++ b/doc/examples/scripts/structure/protein/ramachandran.py @@ -12,34 +12,29 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite.structure as struc -import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb import matplotlib.pyplot as plt import numpy as np -from matplotlib import colors -import scipy.stats as sts +import biotite.database.rcsb as rcsb +import biotite.structure as struc +import biotite.structure.io as strucio # Download and parse file file = rcsb.fetch("3vkh", "cif", gettempdir()) atom_array = strucio.load_structure(file) # Calculate backbone dihedral angles # from one of the two identical chains in the asymmetric unit -phi, psi, omega = struc.dihedral_backbone( - atom_array[atom_array.chain_id == "A"] -) +phi, psi, omega = struc.dihedral_backbone(atom_array[atom_array.chain_id == "A"]) # Conversion from radians into degree -phi *= 180/np.pi -psi *= 180/np.pi +phi *= 180 / np.pi +psi *= 180 / np.pi # Remove invalid values (NaN) at first and last position -phi= phi[1:-1] -psi= psi[1:-1] +phi = phi[1:-1] +psi = psi[1:-1] # Plot density figure = plt.figure() ax = figure.add_subplot(111) -h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200), - cmap="RdYlGn_r", cmin=1) +h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200), cmap="RdYlGn_r", cmin=1) cbar = figure.colorbar(image, orientation="vertical") cbar.set_label("Count") ax.set_aspect("equal") @@ -49,4 +44,4 @@ ax.set_ylabel(r"$\psi$") ax.set_title("Ramachandran plot of dynein motor domain") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/residue_chirality.py b/doc/examples/scripts/structure/protein/residue_chirality.py index 9d6d94061..92dd15b87 100644 --- a/doc/examples/scripts/structure/protein/residue_chirality.py +++ b/doc/examples/scripts/structure/protein/residue_chirality.py @@ 
-18,9 +18,9 @@ from tempfile import gettempdir import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb def get_enantiomer(n, ca, c, cb): @@ -29,16 +29,15 @@ def get_enantiomer(n, ca, c, cb): # the enantiomer: # L = 1 # D = -1 - n = np.cross(ca-n, ca-c) + n = np.cross(ca - n, ca - c) sign = np.sign(np.dot(cb - ca, n)) return sign + def analyze_chirality(array): # Filter backbone + CB array = array[struc.filter_amino_acids(array)] - array = array[ - (array.atom_name == "CB") | (struc.filter_peptide_backbone(array)) - ] + array = array[(array.atom_name == "CB") | (struc.filter_peptide_backbone(array))] # Iterate over each residue ids, names = struc.get_residues(array) enantiomers = np.zeros(len(ids), dtype=int) @@ -48,10 +47,10 @@ def analyze_chirality(array): # Glyine -> no chirality enantiomers[i] = 0 else: - enantiomers[i] = get_enantiomer(coord[0], coord[1], - coord[2], coord[3]) + enantiomers[i] = get_enantiomer(coord[0], coord[1], coord[2], coord[3]) return enantiomers + # Fetch and parse structure file file = rcsb.fetch("1l2y", "bcif", gettempdir()) stack = strucio.load_structure(file) @@ -62,5 +61,5 @@ def analyze_chirality(array): # Reflected structures have opposite enantiomers # Test via reflection at x-y-plane, z -> -z array_reflect = array.copy() -array_reflect.coord[:,2] *= -1 -print("1l2y (reflected)", analyze_chirality(array_reflect)) \ No newline at end of file +array_reflect.coord[:, 2] *= -1 +print("1l2y (reflected)", analyze_chirality(array_reflect)) diff --git a/doc/examples/scripts/structure/protein/sheet_arrangement.py b/doc/examples/scripts/structure/protein/sheet_arrangement.py index aea9f4e60..930c6a0e3 100644 --- a/doc/examples/scripts/structure/protein/sheet_arrangement.py +++ b/doc/examples/scripts/structure/protein/sheet_arrangement.py @@ -17,42 +17,45 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as 
np -import networkx as nx import matplotlib.pyplot as plt +import networkx as nx +import numpy as np from matplotlib.patches import FancyArrow import biotite -import biotite.structure.io.pdbx as pdbx import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx ##### OPTIONS ##### PDB_ID = "3AKO" SHEETS = ["A"] -FIG_SIZE = (8.0, 4.0) # Figure size in inches -Y_LIMIT = 2.0 # Vertical plot limits -SHEET_DISTANCE = 3.0 # Separation of strands in different sheets -ARROW_TAIL_WITH = 0.4 # Width of the arrow tails -ARROW_HEAD_WITH = 0.7 # Width of the arrow heads -ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads -ARROW_LINE_WIDTH = 1 # Width of the arrow edges -ARROW_COLORS = [ # Each chain is colored differently +FIG_SIZE = (8.0, 4.0) # Figure size in inches +Y_LIMIT = 2.0 # Vertical plot limits +SHEET_DISTANCE = 3.0 # Separation of strands in different sheets +ARROW_TAIL_WITH = 0.4 # Width of the arrow tails +ARROW_HEAD_WITH = 0.7 # Width of the arrow heads +ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads +ARROW_LINE_WIDTH = 1 # Width of the arrow edges +ARROW_COLORS = [ # Each chain is colored differently biotite.colors["darkgreen"], biotite.colors["dimorange"], biotite.colors["lightgreen"], biotite.colors["brightorange"], ] -CONNECTION_COLOR = "black" # Color of the connection lines -CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines -CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines -CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines -RES_ID_HEIGHT = -0.2 # The vertical distance of the residue ID labels from the arrow ends -RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels -RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels -ADAPTIVE_ARROW_LENGTHS = True # If true, the arrow length is proportional to the number of its residues -SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot -SHEET_NAME_FONT_SIZE = 14 # The font size of the 
sheet labels +CONNECTION_COLOR = "black" # Color of the connection lines +CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines +CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines +CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines +RES_ID_HEIGHT = ( + -0.2 +) # The vertical distance of the residue ID labels from the arrow ends +RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels +RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels +ADAPTIVE_ARROW_LENGTHS = ( + True # If true, the arrow length is proportional to the number of its residues +) +SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot +SHEET_NAME_FONT_SIZE = 14 # The font size of the sheet labels ##### SNOITPO ##### ######################################################################## @@ -73,19 +76,20 @@ if SHEETS is None: sele = np.full(sheet_order.row_count, True) else: - sele = np.array([ - sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array() - ]) + sele = np.array([sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()]) sheet_ids = sheet_order["sheet_id"].as_array()[sele] is_parallel_list = sheet_order["sense"].as_array()[sele] == "parallel" -adjacent_strands = np.array([ - (strand_i, strand_j) for strand_i, strand_j in zip( - sheet_order["range_id_1"].as_array()[sele], - sheet_order["range_id_2"].as_array()[sele] - ) -]) +adjacent_strands = np.array( + [ + (strand_i, strand_j) + for strand_i, strand_j in zip( + sheet_order["range_id_1"].as_array()[sele], + sheet_order["range_id_2"].as_array()[sele], + ) + ] +) print("Adjacent strands (sheet ID, strand ID):") for sheet_id, (strand_i, strand_j) in zip(sheet_ids, adjacent_strands): @@ -105,9 +109,7 @@ sheet_range = bcif_file.block["struct_sheet_range"] # Again, create a boolean mask that covers the selected sheets -sele = np.array([ - sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array() -]) +sele = np.array([sheet 
in sheet_ids for sheet in sheet_range["sheet_id"].as_array()]) strand_chain_ids = sheet_range["beg_auth_asym_id"].as_array()[sele] strand_res_id_begs = sheet_range["beg_auth_seq_id"].as_array(int)[sele] strand_res_id_ends = sheet_range["end_auth_seq_id"].as_array(int)[sele] @@ -127,19 +129,21 @@ # i.e. entries with the same chain ID and residue ID # Duplicate entries appear e.g. in beta-barrel structure files # Draw one of each duplicate as orphan -> no connections -non_duplicate_mask = (np.diff(strand_res_id_begs[order], prepend=[-1]) != 0) +non_duplicate_mask = np.diff(strand_res_id_begs[order], prepend=[-1]) != 0 connections = [] -non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask] +non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask] for i in range(len(non_duplicate_indices) - 1): current_i = non_duplicate_indices[i] - next_i = non_duplicate_indices[i+1] + next_i = non_duplicate_indices[i + 1] if sorted_chain_ids[current_i] != sorted_chain_ids[next_i]: # No connection between separate chains continue - connections.append(( - (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]), - (sorted_sheet_ids[next_i], sorted_strand_ids[next_i] ) - )) + connections.append( + ( + (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]), + (sorted_sheet_ids[next_i], sorted_strand_ids[next_i]), + ) + ) print("Connected strands (sheet ID, strand ID):") for strand_i, strand_j in connections: @@ -148,18 +152,17 @@ # Save the start and end residue IDs for each strand for labeling ranges = { (sheet_id, strand_id): (begin, end) - for sheet_id, strand_id, begin, end - in zip( - sorted_sheet_ids, sorted_strand_ids, - sorted_res_id_begs, sorted_res_id_ends + for sheet_id, strand_id, begin, end in zip( + sorted_sheet_ids, sorted_strand_ids, sorted_res_id_begs, sorted_res_id_ends ) } # Save the chains ID for each strand for coloring chain_ids = { (sheet_id, strand_id): chain_id - for sheet_id, strand_id, chain_id - in 
zip(sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids) + for sheet_id, strand_id, chain_id in zip( + sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids + ) } unique_chain_ids = np.unique(sorted_chain_ids) @@ -176,14 +179,15 @@ sheet_graphs = {} for sheet_id in np.unique(sheet_ids): # Select only strands from the current sheet - sheet_mask = (sheet_ids == sheet_id) - sheet_graphs[sheet_id] = nx.Graph([ - (strand_i, strand_j, {"is_parallel": is_parallel}) - for (strand_i, strand_j), is_parallel in zip( - adjacent_strands[sheet_mask], - is_parallel_list[sheet_mask] - ) - ]) + sheet_mask = sheet_ids == sheet_id + sheet_graphs[sheet_id] = nx.Graph( + [ + (strand_i, strand_j, {"is_parallel": is_parallel}) + for (strand_i, strand_j), is_parallel in zip( + adjacent_strands[sheet_mask], is_parallel_list[sheet_mask] + ) + ] + ) ######################################################################## # Another missing information is the direction of the plotted arrows, @@ -199,7 +203,7 @@ # The calculated arrow direction is stored as node attribute. 
for graph in sheet_graphs.values(): - initial_strand = adjacent_strands[0,0] + initial_strand = adjacent_strands[0, 0] graph.nodes[initial_strand]["is_upwards"] = True for strand in graph.nodes: if strand == initial_strand: @@ -212,21 +216,15 @@ # yet determined continue is_parallel = graph.edges[(strand, adj_strand)]["is_parallel"] - this_strand_is_upwards.append( - is_upwards ^ ~is_parallel - ) + this_strand_is_upwards.append(is_upwards ^ ~is_parallel) if len(this_strand_is_upwards) == 0: - raise ValueError( - "Cannot determine arrow direction from adjacent strands" - ) + raise ValueError("Cannot determine arrow direction from adjacent strands") elif all(this_strand_is_upwards): graph.nodes[strand]["is_upwards"] = True elif not any(this_strand_is_upwards): graph.nodes[strand]["is_upwards"] = False else: - raise ValueError( - "Conflicting arrow directions from adjacent strands" - ) + raise ValueError("Conflicting arrow directions from adjacent strands") ######################################################################## # No we have got all positioning information we need to start plotting. 
@@ -234,7 +232,7 @@ fig, ax = plt.subplots(figsize=FIG_SIZE) ### Plot arrows -MAX_ARROW_LENGTH = 2 # from y=-1 to y=1 +MAX_ARROW_LENGTH = 2 # from y=-1 to y=1 arrow_length_per_seq_length = MAX_ARROW_LENGTH / np.max( [end - beg + 1 for beg, end in ranges.values()] ) @@ -280,14 +278,17 @@ dy = -arrow_length ax.add_patch( FancyArrow( - x=pos, y=y, dx=0, dy=dy, + x=pos, + y=y, + dx=0, + dy=dy, length_includes_head=True, - width = ARROW_TAIL_WITH, - head_width = ARROW_HEAD_WITH, - head_length = ARROW_HEAD_LENGTH, - facecolor = ARROW_COLORS[color_index % len(ARROW_COLORS)], - edgecolor = CONNECTION_COLOR, - linewidth = ARROW_LINE_WIDTH, + width=ARROW_TAIL_WITH, + head_width=ARROW_HEAD_WITH, + head_length=ARROW_HEAD_LENGTH, + facecolor=ARROW_COLORS[color_index % len(ARROW_COLORS)], + edgecolor=CONNECTION_COLOR, + linewidth=ARROW_LINE_WIDTH, ) ) # Start and end coordinates of the respective arrow @@ -299,10 +300,12 @@ # Plot the short connections at low height # to decrease line intersections # -> sort connections by length of connection -order = np.argsort([ - np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0]) - for strand_i, strand_j in connections -]) +order = np.argsort( + [ + np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0]) + for strand_i, strand_j in connections + ] +) connections = [connections[i] for i in order] for i, (strand_i, strand_j) in enumerate(connections): horizontal_line_height = 1 + CONNECTION_HEIGHT + i * CONNECTION_SEPARATION @@ -311,17 +314,12 @@ if np.sign(coord_i_end[1]) == np.sign(coord_j_beg[1]): # Start and end are on the same side of the arrows - x = ( - coord_i_end[0], - coord_i_end[0], - coord_j_beg[0], - coord_j_beg[0] - ) + x = (coord_i_end[0], coord_i_end[0], coord_j_beg[0], coord_j_beg[0]) y = ( coord_i_end[1], np.sign(coord_i_end[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, - coord_j_beg[1] + coord_j_beg[1], ) else: # Start and end are on different sides @@ -332,7 +330,7 @@ 
coord_i_end[0] + offset, coord_i_end[0] + offset, coord_j_beg[0], - coord_j_beg[0] + coord_j_beg[0], ) y = ( coord_i_end[1], @@ -340,14 +338,15 @@ np.sign(coord_i_end[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, - coord_j_beg[1] + coord_j_beg[1], ) ax.plot( - x, y, - color = CONNECTION_COLOR, - linewidth = CONNECTION_LINE_WIDTH, + x, + y, + color=CONNECTION_COLOR, + linewidth=CONNECTION_LINE_WIDTH, # Avoid intersection of the line's end with the arrow - solid_capstyle = "butt" + solid_capstyle="butt", ) ### Plot residue ID labels @@ -358,16 +357,16 @@ coord[0], np.sign(coord[1]) * (np.abs(coord[1]) + RES_ID_HEIGHT), str(res_id), - ha="center", va="center", - fontsize=RES_ID_FONT_SIZE, weight=RES_ID_FONT_WEIGHT + ha="center", + va="center", + fontsize=RES_ID_FONT_SIZE, + weight=RES_ID_FONT_WEIGHT, ) ### Plot sheet names as x-axis ticks if SHOW_SHEET_NAMES: tick_pos = [ - np.mean([ - coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id - ]) + np.mean([coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id]) for sheet_id in sheet_ids ] ax.set_xticks(tick_pos) @@ -375,8 +374,11 @@ ax.set_frame_on(False) ax.yaxis.set_visible(False) ax.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=True, labeltop=False, - labelsize=SHEET_NAME_FONT_SIZE + bottom=False, + top=False, + labelbottom=True, + labeltop=False, + labelsize=SHEET_NAME_FONT_SIZE, ) else: ax.axis("off") @@ -385,4 +387,4 @@ ax.set_xlim(-1, current_position - SHEET_DISTANCE + 1) ax.set_ylim(-Y_LIMIT, Y_LIMIT) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/transketolase_sse.py b/doc/examples/scripts/structure/protein/transketolase_sse.py index 78f3ba546..5c0896e2b 100644 --- a/doc/examples/scripts/structure/protein/transketolase_sse.py +++ b/doc/examples/scripts/structure/protein/transketolase_sse.py @@ -14,25 +14,24 @@ # 
License: BSD 3 clause from tempfile import gettempdir -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle import biotite -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx +import biotite.application.dssp as dssp +import biotite.database.entrez as entrez +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb -import biotite.database.rcsb as rcsb -import biotite.database.entrez as entrez -import biotite.application.dssp as dssp - +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx # Create 'FeaturePlotter' subclasses # for drawing the scondary structure features -class HelixPlotter(graphics.FeaturePlotter): +class HelixPlotter(graphics.FeaturePlotter): def __init__(self): pass @@ -48,12 +47,12 @@ def matches(self, feature): def draw(self, axes, feature, bbox, loc, style_param): # Approx. 
1 turn per 3.6 residues to resemble natural helix n_turns = np.ceil((loc.last - loc.first + 1) / 3.6) - x_val = np.linspace(0, n_turns * 2*np.pi, 100) + x_val = np.linspace(0, n_turns * 2 * np.pi, 100) # Curve ranges from 0.3 to 0.7 - y_val = (-0.4*np.sin(x_val) + 1) / 2 + y_val = (-0.4 * np.sin(x_val) + 1) / 2 # Transform values for correct location in feature map - x_val *= bbox.width / (n_turns * 2*np.pi) + x_val *= bbox.width / (n_turns * 2 * np.pi) x_val += bbox.x0 y_val *= bbox.height y_val += bbox.y0 @@ -63,18 +62,14 @@ def draw(self, axes, feature, bbox, loc, style_param): bbox.p0, bbox.width, bbox.height, color="white", linewidth=0 ) axes.add_patch(background) - axes.plot( - x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"] - ) + axes.plot(x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"]) class SheetPlotter(graphics.FeaturePlotter): - def __init__(self, head_width=0.8, tail_width=0.5): self._head_width = head_width self._tail_width = tail_width - def matches(self, feature): if feature.key == "SecStr": if "sec_str_type" in feature.qual: @@ -84,39 +79,52 @@ def matches(self, feature): def draw(self, axes, feature, bbox, loc, style_param): x = bbox.x0 - y = bbox.y0 + bbox.height/2 + y = bbox.y0 + bbox.height / 2 dx = bbox.width dy = 0 - if loc.defect & seq.Location.Defect.MISS_RIGHT: + if loc.defect & seq.Location.Defect.MISS_RIGHT: # If the feature extends into the prevoius or next line # do not draw an arrow head draw_head = False else: draw_head = True - axes.add_patch(biotite.AdaptiveFancyArrow( - x, y, dx, dy, - self._tail_width*bbox.height, self._head_width*bbox.height, - # Create head with 90 degrees tip - # -> head width/length ratio = 1/2 - head_ratio=0.5, draw_head=draw_head, - color=biotite.colors["orange"], linewidth=0 - )) + axes.add_patch( + biotite.AdaptiveFancyArrow( + x, + y, + dx, + dy, + self._tail_width * bbox.height, + self._head_width * bbox.height, + # Create head with 90 degrees tip + # -> head width/length ratio = 
1/2 + head_ratio=0.5, + draw_head=draw_head, + color=biotite.colors["orange"], + linewidth=0, + ) + ) # Test our drawing functions with example annotation -annotation = seq.Annotation([ - seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type" : "helix"}), - seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type" : "sheet"}), -]) +annotation = seq.Annotation( + [ + seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type": "helix"}), + seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type": "sheet"}), + ] +) fig = plt.figure(figsize=(8.0, 0.8)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, multi_line=False, loc_range=(1,100), + ax, + annotation, + multi_line=False, + loc_range=(1, 100), # Register our drawing functions - feature_plotters=[HelixPlotter(), SheetPlotter()] + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() @@ -138,11 +146,14 @@ def draw(self, axes, feature, bbox, loc, style_param): fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, symbols_per_line=150, - show_numbers=True, show_line_position=True, + ax, + annotation, + symbols_per_line=150, + show_numbers=True, + show_line_position=True, # 'loc_range' takes exclusive stop -> length+1 is required - loc_range=(1,length+1), - feature_plotters=[HelixPlotter(), SheetPlotter()] + loc_range=(1, length + 1), + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() @@ -152,14 +163,17 @@ def draw(self, axes, feature, bbox, loc, style_param): # Converter for the DSSP secondary structure elements # to the classical ones -dssp_to_abc = {"I" : "c", - "S" : "c", - "H" : "a", - "E" : "b", - "G" : "c", - "B" : "b", - "T" : "c", - "C" : "c"} +dssp_to_abc = { + "I": "c", + "S": "c", + "H": "a", + "E": "b", + "G": "c", + "B": "b", + "T": "c", + "C": "c", +} + def visualize_secondary_structure(sse, first_id): """ @@ -176,7 +190,7 @@ def _add_sec_str(annotation, first, last, 
str_type): # coil return feature = seq.Feature( - "SecStr", [seq.Location(first, last)], {"sec_str_type" : str_type} + "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type} ) annotation.add_feature(feature) @@ -190,25 +204,29 @@ def _add_sec_str(annotation, first, last, str_type): curr_start = i curr_sse = sse[i] else: - if sse[i] != sse[i-1]: + if sse[i] != sse[i - 1]: _add_sec_str( - annotation, curr_start+first_id, i-1+first_id, curr_sse + annotation, curr_start + first_id, i - 1 + first_id, curr_sse ) curr_start = i curr_sse = sse[i] # Add last secondary structure element to annotation - _add_sec_str(annotation, curr_start+first_id, i-1+first_id, curr_sse) + _add_sec_str(annotation, curr_start + first_id, i - 1 + first_id, curr_sse) fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, symbols_per_line=150, - loc_range=(first_id, first_id+len(sse)), - show_numbers=True, show_line_position=True, - feature_plotters=[HelixPlotter(), SheetPlotter()] + ax, + annotation, + symbols_per_line=150, + loc_range=(first_id, first_id + len(sse)), + show_numbers=True, + show_line_position=True, + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() + # Fetch and load structure file_name = rcsb.fetch("1QGD", "bcif", gettempdir()) pdbx_file = pdbx.BinaryCIFFile.read(file_name) @@ -230,4 +248,4 @@ def _add_sec_str(annotation, first, last, str_type): sse = struc.annotate_sse(array, chain_id="A") visualize_secondary_structure(sse, tk_mono.res_id[0]) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/key.py b/doc/key.py index b517f37fb..b7a2b4334 100644 --- a/doc/key.py +++ b/doc/key.py @@ -8,4 +8,4 @@ def set_ncbi_api_key_from_env(*args, **kwargs): ncbi_api_key = os.environ.get("NCBI_API_KEY") if ncbi_api_key is not None and ncbi_api_key != "": - entrez.set_api_key(ncbi_api_key) \ No newline at end of file + entrez.set_api_key(ncbi_api_key) diff --git a/doc/scraper.py 
b/doc/scraper.py index ac60188e6..e21c52048 100644 --- a/doc/scraper.py +++ b/doc/scraper.py @@ -1,12 +1,11 @@ -import shutil import copy -import sys import os -from os.path import splitext, join, dirname, isfile -from sphinx_gallery.scrapers import figure_rst -from sphinx_gallery.py_source_parser import extract_file_config +import shutil +import sys +from os.path import dirname, isfile, join, splitext from sphinx.errors import ExtensionError - +from sphinx_gallery.py_source_parser import extract_file_config +from sphinx_gallery.scrapers import figure_rst STATIC_IMAGE_COMMAND = "static_image" PYMOL_IMAGE_COMMAND = "ammolite_script" @@ -19,7 +18,7 @@ def static_image_scraper(block, block_vars, gallery_conf): # Search for `sphinx_gallery_static_image` commands block_conf = extract_file_config(code) if STATIC_IMAGE_COMMAND not in block_conf: - return figure_rst([], gallery_conf['src_dir']) + return figure_rst([], gallery_conf["src_dir"]) image_sources = [ join(script_dir, image_name.strip()) @@ -29,7 +28,7 @@ def static_image_scraper(block, block_vars, gallery_conf): # Copy the images into the 'gallery' directory under a canonical # sphinx-gallery name image_destinations = [] - image_path_iterator = block_vars['image_path_iterator'] + image_path_iterator = block_vars["image_path_iterator"] for image in image_sources: suffix = splitext(image)[1] image_destination = image_path_iterator.next() @@ -40,7 +39,7 @@ def static_image_scraper(block, block_vars, gallery_conf): shutil.copy(image, image_destination) # Generate rST for detected image files - return figure_rst(image_destinations, gallery_conf['src_dir']) + return figure_rst(image_destinations, gallery_conf["src_dir"]) def pymol_scraper(block, block_vars, gallery_conf): @@ -48,7 +47,7 @@ def pymol_scraper(block, block_vars, gallery_conf): block_conf = extract_file_config(code) # Search for a `sphinx_gallery_ammolite_script` command if PYMOL_IMAGE_COMMAND not in block_conf: - return figure_rst([], 
gallery_conf['src_dir']) + return figure_rst([], gallery_conf["src_dir"]) script_dir = dirname(block_vars["src_file"]) pymol_script_path = join(script_dir, block_conf[PYMOL_IMAGE_COMMAND]) @@ -56,7 +55,7 @@ def pymol_scraper(block, block_vars, gallery_conf): # the example script # -> the image will be included in version control # -> Rendering with PyMOL is not necessary for building the docs - pymol_image_path = splitext(block_vars["src_file"])[0] + ".png" + pymol_image_path = splitext(block_vars["src_file"])[0] + ".png" if not isfile(pymol_script_path): raise ExtensionError( f"'{block_vars['src_file']}' has no corresponding " @@ -64,8 +63,8 @@ def pymol_scraper(block, block_vars, gallery_conf): ) try: - import pymol import ammolite + import pymol except ImportError: # If Ammolite is not installed, fall back to the image file, # if already existing @@ -82,7 +81,7 @@ def pymol_scraper(block, block_vars, gallery_conf): # to STDOUT or STDERR # -> Save original STDOUT/STDERR and point them # temporarily to DEVNULL - dev_null = open(os.devnull, 'w') + dev_null = open(os.devnull, "w") orig_stdout = sys.stdout orig_stderr = sys.stderr sys.stdout = dev_null @@ -100,13 +99,12 @@ def pymol_scraper(block, block_vars, gallery_conf): dev_null.close() if not isfile(pymol_image_path): raise ExtensionError( - "PyMOL script did not create an image " - "(at expected location)" + "PyMOL script did not create an image " "(at expected location)" ) # Copy the images into the 'gallery' directory under a canonical # sphinx-gallery name - image_path_iterator = block_vars['image_path_iterator'] + image_path_iterator = block_vars["image_path_iterator"] image_destination = image_path_iterator.next() shutil.copy(pymol_image_path, image_destination) - return figure_rst([image_destination], gallery_conf['src_dir']) + return figure_rst([image_destination], gallery_conf["src_dir"]) diff --git a/doc/switcher.py b/doc/switcher.py index 974715613..e8fcb7208 100644 --- a/doc/switcher.py +++ 
b/doc/switcher.py @@ -5,14 +5,13 @@ __author__ = "Patrick Kunzmann" __all__ = ["create_api_doc", "skip_non_methods"] -from dataclasses import dataclass -from pathlib import Path import json import re +from dataclasses import dataclass import requests import biotite -RELEASE_REQUEST = f"https://api.github.com/repos/biotite-dev/biotite/releases" +RELEASE_REQUEST = "https://api.github.com/repos/biotite-dev/biotite/releases" BIOTITE_URL = "https://www.biotite-python.org" SEMVER_TAG_REGEX = r"^v(\d+)\.(\d+)\.(\d+)" @@ -35,18 +34,17 @@ def __str__(self): return f"{self.major}.{self.minor}.{self.patch}" def __ge__(self, other): - return ( - (self.major, self.minor, self.patch) - >= (other.major, other.minor, other.patch) + return (self.major, self.minor, self.patch) >= ( + other.major, + other.minor, + other.patch, ) def _get_previous_versions(min_tag, n_versions): response = requests.get(RELEASE_REQUEST, params={"per_page": n_versions}) release_data = json.loads(response.text) - versions = [ - Version.from_tag(release["tag_name"]) for release in release_data - ] + versions = [Version.from_tag(release["tag_name"]) for release in release_data] return [version for version in versions if version >= Version.from_tag(min_tag)] @@ -69,17 +67,21 @@ def create_switcher_json(file_path, min_tag, n_versions): """ version_config = [] for version in _get_previous_versions(min_tag, n_versions)[::-1]: - version_config.append({ - "name": f"{version.major}.{version.minor}", - "version": str(version), - "url": f"{BIOTITE_URL}/{version}/", - }) + version_config.append( + { + "name": f"{version.major}.{version.minor}", + "version": str(version), + "url": f"{BIOTITE_URL}/{version}/", + } + ) current_version = _get_current_version() - version_config.append({ - "name": f"{current_version.major}.{current_version.minor}", - "version": str(current_version), - "url": f"{BIOTITE_URL}/{current_version}/", - "preferred": True - }) + version_config.append( + { + "name": 
f"{current_version.major}.{current_version.minor}", + "version": str(current_version), + "url": f"{BIOTITE_URL}/{current_version}/", + "preferred": True, + } + ) with open(file_path, "w") as file: json.dump(version_config, file, indent=4) diff --git a/doc/viewcode.py b/doc/viewcode.py index d828f960f..10f5d9870 100644 --- a/doc/viewcode.py +++ b/doc/viewcode.py @@ -10,10 +10,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["linkcode_resolve"] +import inspect from importlib import import_module -from os.path import dirname, join, isdir, splitext from os import listdir -import inspect +from os.path import dirname, isdir, join, splitext import biotite @@ -66,10 +66,13 @@ def _index_attributes(package_name, src_path): # Import all modules in directory and index attributes source_files = [ - file_name for file_name in directory_content - if file_name != "__init__.py" and ( + file_name + for file_name in directory_content + if file_name != "__init__.py" + and ( # Standard Python modules - file_name.endswith(".py") or + file_name.endswith(".py") + or # Extension modules file_name.endswith(".pyx") ) @@ -83,9 +86,7 @@ def _index_attributes(package_name, src_path): module = import_module(module_name) if not hasattr(module, "__all__"): - raise AttributeError( - f"Module {module_name} has not attribute '__all__'" - ) + raise AttributeError(f"Module {module_name} has not attribute '__all__'") # Only index attributes from modules that are available # via respective Biotite (sub-)package # If a the attribute is available, the module was imported in @@ -98,8 +99,7 @@ def _index_attributes(package_name, src_path): is_cython = source_file.endswith(".pyx") for attribute in module.__all__: - attribute_index[(package_name, attribute)] \ - = (module_name, is_cython) + attribute_index[(package_name, attribute)] = (module_name, is_cython) if is_cython: with open(join(src_path, source_file), "r") as cython_file: lines = cython_file.read().splitlines() @@ -150,7 +150,7 @@ def 
_index_cython_code(code_lines): cropped_line = stripped_line[3:].strip() # ...and determine the end of the name by finding the # subsequent '(' - cropped_line = cropped_line[:cropped_line.index("(")].strip() + cropped_line = cropped_line[: cropped_line.index("(")].strip() attr_name = cropped_line elif line.startswith(("class", "cdef class")): attr_type = "class" @@ -163,8 +163,11 @@ def _index_cython_code(code_lines): cropped_line = cropped_line[5:].strip() # ...and determine the end of the name by finding the # subsequent '(' or ':' - index = cropped_line.index("(") if "(" in cropped_line \ - else cropped_line.index(":") + index = ( + cropped_line.index("(") + if "(" in cropped_line + else cropped_line.index(":") + ) cropped_line = cropped_line[:index].strip() attr_name = cropped_line else: @@ -172,8 +175,8 @@ def _index_cython_code(code_lines): continue attr_line_start = i - attr_line_stop = i+1 - for j in range(i+1, len(code_lines)): + attr_line_stop = i + 1 + for j in range(i + 1, len(code_lines)): attr_line = code_lines[j] if len(attr_line.strip()) == 0 or attr_line.strip()[0] == "#": continue @@ -189,7 +192,7 @@ def _index_cython_code(code_lines): # 'One' based indexing attr_line_start + 1, # 'One' based indexing and inclusive stop - attr_line_stop + attr_line_stop, ) return line_index @@ -203,7 +206,7 @@ def _is_package(path): _attribute_index, _cython_line_index = _index_attributes( "biotite", # Directory to src/biotite - join(dirname(dirname(__file__)), "src", "biotite") + join(dirname(dirname(__file__)), "src", "biotite"), ) @@ -226,17 +229,11 @@ def linkcode_resolve(domain, info): if is_cython: if (package_name, attr_name) in _cython_line_index: first, last = _cython_line_index[(package_name, attr_name)] - return ( - base_url + - f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}" - ) + return base_url + f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}" else: # In case the attribute is not found # by the Cython code analyzer - return ( - 
base_url + - f"{module_name.replace('.', '/')}.pyx" - ) + return base_url + f"{module_name.replace('.', '/')}.pyx" else: module = import_module(module_name) @@ -255,7 +252,4 @@ def linkcode_resolve(domain, info): source_lines, first = inspect.getsourcelines(obj) last = first + len(source_lines) - 1 - return ( - base_url + - f"{module_name.replace('.', '/')}.py#L{first}-L{last}" - ) \ No newline at end of file + return base_url + f"{module_name.replace('.', '/')}.py#L{first}-L{last}" diff --git a/setup_ccd.py b/setup_ccd.py index 07218964d..fba5d3ab2 100644 --- a/setup_ccd.py +++ b/setup_ccd.py @@ -1,7 +1,7 @@ import gzip import logging -from io import StringIO from dataclasses import dataclass +from io import StringIO import numpy as np import requests from biotite.structure.io.pdbx import * @@ -28,6 +28,7 @@ class ColumnInfo: The name of an alternative column to use, if the original column contains masked values and no `fill_value` is given. """ + dtype: ... encoding: ... fill_value: ... 
= None @@ -37,67 +38,75 @@ class ColumnInfo: MAIN_COLUMNS = { "id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "name": ColumnInfo( str, - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "type": ColumnInfo( str, - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + 
data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "formula_weight": ColumnInfo( "f8", [ FixedPointEncoding(factor=1000, src_type=TypeCode.FLOAT64), - ByteArrayEncoding() + ByteArrayEncoding(), ], - fill_value=0 + fill_value=0, ), "one_letter_code": ColumnInfo( "U1", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )], - fill_value="" + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], + fill_value="", ), } @@ -105,148 +114,160 @@ class ColumnInfo: ATOM_COLUMNS = { "comp_id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique 
strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "type_symbol": ColumnInfo( "U2", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] - ), - "charge": ColumnInfo( - "i1", - [ByteArrayEncoding(type=TypeCode.INT8)], - fill_value=0 + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), + "charge": ColumnInfo("i1", [ByteArrayEncoding(type=TypeCode.INT8)], fill_value=0), "pdbx_model_Cartn_x_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_x" + alternative="model_Cartn_x", ), "pdbx_model_Cartn_y_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - 
ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_y" + alternative="model_Cartn_y", ), "pdbx_model_Cartn_z_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_z" + alternative="model_Cartn_z", ), } BOND_COLUMNS = { "comp_id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id_1": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id_2": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct 
pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "value_order": ColumnInfo( "U4", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], + ) + ], ), "pdbx_aromatic_flag": ColumnInfo( "U1", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], + ) + ], ), } @@ -282,18 +303,14 @@ def check_presence(pdbx_file, category_name, column_names): is_present = column_names[0] in category for name in column_names: if (name in category) != is_present: - raise ComponentException( - "Only some column names are missing" - ) + raise ComponentException("Only some column names are missing") if not is_present: return 
is_unmasked = category[column_names[0]].mask is None for name in column_names: if (category[name].mask is None) != is_unmasked: - raise ComponentException( - "Only some column names are masked" - ) + raise ComponentException("Only some column names are masked") def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): @@ -320,17 +337,12 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): for comp_id, block in pdbx_file.items(): try: if category_name not in block: - raise ComponentException( - f"Block has no category '{category_name}'" - ) + raise ComponentException(f"Block has no category '{category_name}'") chunk = {} category = block[category_name] for col_name, info in column_infos.items(): col = category.get(col_name) - if ( - col is None - or (col.mask is not None and info.fill_value is None) - ): + if col is None or (col.mask is not None and info.fill_value is None): # Some/all values are missing and there is no default # -> Try alternative if info.alternative is not None: @@ -353,13 +365,14 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): else: for col_name, data_array in chunk.items(): column_chunks[col_name].append(data_array) - return BinaryCIFCategory({ - col_name: BinaryCIFData( - array=np.concatenate(col_data), - encoding=column_infos[col_name].encoding - ) - for col_name, col_data in column_chunks.items() - }) + return BinaryCIFCategory( + { + col_name: BinaryCIFData( + array=np.concatenate(col_data), encoding=column_infos[col_name].encoding + ) + for col_name, col_data in column_chunks.items() + } + ) def extract_component_groups(type_dict, include, exclude, file_name): @@ -393,8 +406,8 @@ def extract_component_groups(type_dict, include, exclude, file_name): del type_dict[comp_id] # Write extracted components into output file logging.info( - f"Using the following types for '{file_name.name}':\n" + - ", ".join(types_for_group) + f"Using the following types for 
'{file_name.name}':\n" + + ", ".join(types_for_group) ) with open(file_name, "w") as file: for comp_id in comp_ids_for_group: @@ -412,12 +425,12 @@ def setup_ccd(target_diriectory): logging.info("Checking for consistent coordinates...") check_presence( - ccd_file, "chem_comp_atom", - ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"] + ccd_file, "chem_comp_atom", ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"] ) check_presence( - ccd_file, "chem_comp_atom", - ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"] + ccd_file, + "chem_comp_atom", + ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"], ) logging.info("Extracting component groups...") @@ -426,26 +439,25 @@ def setup_ccd(target_diriectory): for comp_id, block in ccd_file.items() } extract_component_groups( - type_dict, ["peptide", "amino"], ["peptide-like"], - target_diriectory / "amino_acids.txt" + type_dict, + ["peptide", "amino"], + ["peptide-like"], + target_diriectory / "amino_acids.txt", ) extract_component_groups( - type_dict, ["rna", "dna"], [], - target_diriectory / "nucleotides.txt" + type_dict, ["rna", "dna"], [], target_diriectory / "nucleotides.txt" ) extract_component_groups( - type_dict, ["saccharide"], [], - target_diriectory / "carbohydrates.txt" + type_dict, ["saccharide"], [], target_diriectory / "carbohydrates.txt" ) remaining_types = set(type_dict.values()) logging.info( - "The following types are not used in any group:\n" + - ", ".join(remaining_types) + "The following types are not used in any group:\n" + ", ".join(remaining_types) ) compressed_block = BinaryCIFBlock() for category_name, column_infos in [ - ("chem_comp", MAIN_COLUMNS), + ("chem_comp", MAIN_COLUMNS), ("chem_comp_atom", ATOM_COLUMNS), ("chem_comp_bond", BOND_COLUMNS), ]: @@ -459,5 +471,7 @@ def setup_ccd(target_diriectory): compressed_file["components"] = compressed_block compressed_file.write(target_diriectory / "components.bcif") + from pathlib import Path 
-setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") \ No newline at end of file + +setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") diff --git a/src/biotite/__init__.py b/src/biotite/__init__.py index f90e3b5ff..8b80e11ea 100644 --- a/src/biotite/__init__.py +++ b/src/biotite/__init__.py @@ -12,7 +12,7 @@ __name__ = "biotite" __author__ = "Patrick Kunzmann" -from .file import * from .copyable import * -from .visualize import * +from .file import * from .version import __version__, __version_tuple__ +from .visualize import * diff --git a/src/biotite/application/__init__.py b/src/biotite/application/__init__.py index 72ca3f96c..de09a3dbf 100644 --- a/src/biotite/application/__init__.py +++ b/src/biotite/application/__init__.py @@ -65,5 +65,5 @@ from .application import * from .localapp import * +from .msaapp import * from .webapp import * -from .msaapp import * \ No newline at end of file diff --git a/src/biotite/application/application.py b/src/biotite/application/application.py index 858658175..fb5d2c037 100644 --- a/src/biotite/application/application.py +++ b/src/biotite/application/application.py @@ -4,19 +4,26 @@ __name__ = "biotite.application" __author__ = "Patrick Kunzmann" -__all__ = ["Application", "AppStateError", "TimeoutError", "VersionError", - "AppState", "requires_state"] +__all__ = [ + "Application", + "AppStateError", + "TimeoutError", + "VersionError", + "AppState", + "requires_state", +] import abc import time -from functools import wraps from enum import Flag, auto +from functools import wraps class AppState(Flag): """ This enum type represents the app states of an application. """ + CREATED = auto() RUNNING = auto() FINISHED = auto() @@ -45,6 +52,7 @@ def requires_state(app_state): ... def function(self): ... 
pass """ + def decorator(func): @wraps(func) def wrapper(*args, **kwargs): @@ -52,16 +60,16 @@ def wrapper(*args, **kwargs): try: instance = args[0] except IndexError: - raise TypeError( - "This method must be called from a class instance" - ) + raise TypeError("This method must be called from a class instance") if not instance._state & app_state: raise AppStateError( f"The application is in {instance.get_app_state()} state, " f"but {app_state} state is required" ) return func(*args, **kwargs) + return wrapper + return decorator @@ -146,11 +154,10 @@ def join(self, timeout=None): """ time.sleep(self.wait_interval()) while self.get_app_state() != AppState.FINISHED: - if timeout is not None and time.time()-self._start_time > timeout: + if timeout is not None and time.time() - self._start_time > timeout: self.cancel() raise TimeoutError( - f"The application expired its timeout " - f"({timeout:.1f} s)" + f"The application expired its timeout " f"({timeout:.1f} s)" ) else: time.sleep(self.wait_interval()) @@ -249,6 +256,7 @@ class AppStateError(Exception): """ Indicate that the application lifecycle was violated. """ + pass @@ -256,6 +264,7 @@ class TimeoutError(Exception): """ Indicate that the application's timeout expired. """ + pass @@ -263,4 +272,5 @@ class VersionError(Exception): """ Indicate that the application's version is invalid. 
""" - pass \ No newline at end of file + + pass diff --git a/src/biotite/application/autodock/__init__.py b/src/biotite/application/autodock/__init__.py index 9d8aabe1e..756b6648c 100644 --- a/src/biotite/application/autodock/__init__.py +++ b/src/biotite/application/autodock/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.autodock" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/autodock/app.py b/src/biotite/application/autodock/app.py index c93cd3cc8..9b3bf6d72 100644 --- a/src/biotite/application/autodock/app.py +++ b/src/biotite/application/autodock/app.py @@ -9,12 +9,12 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...structure.io.pdbqt import PDBQTFile -from ...structure.residues import get_residue_starts_for, get_residue_masks from ...structure.bonds import find_connected from ...structure.error import BadStructureError +from ...structure.io.pdbqt import PDBQTFile +from ...structure.residues import get_residue_masks, get_residue_starts_for +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile class VinaApp(LocalApp): @@ -62,8 +62,8 @@ class VinaApp(LocalApp): ... flexible=(receptor.res_id == 2) | (receptor.res_id == 5) ... 
) """ - def __init__(self, ligand, receptor, center, size, flexible=None, - bin_path="vina"): + + def __init__(self, ligand, receptor, center, size, flexible=None, bin_path="vina"): super().__init__(bin_path) if ligand.bonds is None: @@ -83,23 +83,17 @@ def __init__(self, ligand, receptor, center, size, flexible=None, if self._is_flexible: flexible_indices = np.where(flexible)[0] - self._flex_res_starts = np.unique(get_residue_starts_for( - receptor, flexible_indices - )) - - self._ligand_file = NamedTemporaryFile( - "w", suffix=".pdbqt", delete=False - ) - self._receptor_file = NamedTemporaryFile( - "w", suffix=".pdbqt", delete=False - ) - self._receptor_flex_file = NamedTemporaryFile( + self._flex_res_starts = np.unique( + get_residue_starts_for(receptor, flexible_indices) + ) + + self._ligand_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False) + self._receptor_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False) + self._receptor_flex_file = NamedTemporaryFile( "w", suffix=".pdbqt", delete=False ) - self._out_file = NamedTemporaryFile( - "r", suffix=".pdbqt", delete=False - ) - + self._out_file = NamedTemporaryFile("r", suffix=".pdbqt", delete=False) + @requires_state(AppState.CREATED) def set_seed(self, seed): """ @@ -114,7 +108,7 @@ def set_seed(self, seed): The seed for the random number generator. """ self._seed = seed - + @requires_state(AppState.CREATED) def set_exhaustiveness(self, exhaustiveness): """ @@ -131,7 +125,7 @@ def set_exhaustiveness(self, exhaustiveness): Must be greater than 0. """ self._exhaustiveness = exhaustiveness - + @requires_state(AppState.CREATED) def set_max_number_of_models(self, number): """ @@ -147,7 +141,7 @@ def set_max_number_of_models(self, number): The maximum number of generated modes/models. 
""" self._number = number - + @requires_state(AppState.CREATED) def set_energy_range(self, energy_range): """ @@ -168,34 +162,31 @@ def run(self): # Use different atom ID ranges for atoms in ligand and receptor # for unambiguous assignment, if the receptor contains flexible # residues - self._ligand.set_annotation("atom_id", np.arange( - 1, - self._ligand.array_length() + 1 - )) - self._receptor.set_annotation("atom_id", np.arange( - self._ligand.array_length() + 1, - self._ligand.array_length() + self._receptor.array_length() + 1 - )) + self._ligand.set_annotation( + "atom_id", np.arange(1, self._ligand.array_length() + 1) + ) + self._receptor.set_annotation( + "atom_id", + np.arange( + self._ligand.array_length() + 1, + self._ligand.array_length() + self._receptor.array_length() + 1, + ), + ) ligand_file = PDBQTFile() - # Contains 'true' entries for all atoms that have not been + # Contains 'true' entries for all atoms that have not been # removed from ligand self._ligand_mask = ligand_file.set_structure( - self._ligand, - rotatable_bonds="all" + self._ligand, rotatable_bonds="all" ) ligand_file.write(self._ligand_file) self._ligand_file.flush() - + if self._is_flexible: - self._rigid_mask = np.ones( - self._receptor.array_length(), dtype=bool - ) - # Contains 'true' entries for all atoms that have not been + self._rigid_mask = np.ones(self._receptor.array_length(), dtype=bool) + # Contains 'true' entries for all atoms that have not been # removed from receptor in flexible side chains - self._receptor_mask = np.zeros( - self._receptor.array_length(), dtype=bool - ) + self._receptor_mask = np.zeros(self._receptor.array_length(), dtype=bool) for i, start in enumerate(self._flex_res_starts): flex_mask, rigid_mask, root = self._get_flexible_residue(start) self._rigid_mask &= rigid_mask @@ -207,7 +198,7 @@ def run(self): self._receptor[flex_mask], rotatable_bonds="all", root=root_in_flex_residue, - include_torsdof=False + include_torsdof=False, ) # Enclose each 
flexible residue # with BEGIN_RES and END_RES @@ -220,7 +211,7 @@ def run(self): receptor_file.set_structure( self._receptor[self._rigid_mask], rotatable_bonds=None, - include_torsdof=False + include_torsdof=False, ) receptor_file.write(self._receptor_file) self._receptor_file.flush() @@ -228,23 +219,30 @@ def run(self): else: receptor_file = PDBQTFile() receptor_file.set_structure( - self._receptor, - rotatable_bonds=None, - include_torsdof=False + self._receptor, rotatable_bonds=None, include_torsdof=False ) receptor_file.write(self._receptor_file) self._receptor_file.flush() arguments = [ - "--ligand", self._ligand_file.name, - "--receptor", self._receptor_file.name, - "--out", self._out_file.name, - "--center_x", f"{self._center[0]:.3f}", - "--center_y", f"{self._center[1]:.3f}", - "--center_z", f"{self._center[2]:.3f}", - "--size_x", f"{self._size[0]:.3f}", - "--size_y", f"{self._size[1]:.3f}", - "--size_z", f"{self._size[2]:.3f}", + "--ligand", + self._ligand_file.name, + "--receptor", + self._receptor_file.name, + "--out", + self._out_file.name, + "--center_x", + f"{self._center[0]:.3f}", + "--center_y", + f"{self._center[1]:.3f}", + "--center_z", + f"{self._center[2]:.3f}", + "--size_x", + f"{self._size[0]:.3f}", + "--size_y", + f"{self._size[1]:.3f}", + "--size_z", + f"{self._size[2]:.3f}", ] if self._seed is not None: arguments.extend(["--seed", str(self._seed)]) @@ -259,32 +257,32 @@ def run(self): self.set_arguments(arguments) super().run() - + def evaluate(self): super().evaluate() out_file = PDBQTFile.read(self._out_file) - + models = out_file.get_structure() n_ligand_atoms = np.count_nonzero(self._ligand_mask) self._ligand_models = models[..., :n_ligand_atoms] self._flex_models = models[..., n_ligand_atoms:] self._n_models = models.stack_depth() - + remarks = out_file.get_remarks() self._energies = np.array( # VINA RESULT: -5.8 0.000 0.000 # ^ [float(remark[12:].split()[0]) for remark in remarks] ) - + def clean_up(self): super().clean_up() 
cleanup_tempfile(self._ligand_file) cleanup_tempfile(self._receptor_file) cleanup_tempfile(self._receptor_flex_file) cleanup_tempfile(self._out_file) - + @requires_state(AppState.JOINED) def get_energies(self): """ @@ -302,7 +300,7 @@ def get_energies(self): @requires_state(AppState.JOINED) def get_ligand_models(self): """ - Get the ligand structure with the conformations for each + Get the ligand structure with the conformations for each generated binding mode. Returns @@ -312,7 +310,7 @@ def get_ligand_models(self): Each model corresponds to one binding mode. The models are sorted from best to worst predicted binding affinity. - + Notes ----- The returned structure may contain less atoms than the input @@ -338,12 +336,11 @@ def get_ligand_coord(self): atoms are set to *NaN*. """ coord = np.full( - (self._n_models, self._ligand.array_length(), 3), - np.nan, dtype=np.float32 + (self._n_models, self._ligand.array_length(), 3), np.nan, dtype=np.float32 ) coord[:, self._ligand_mask] = self._ligand_models.coord return coord - + @requires_state(AppState.JOINED) def get_flexible_residue_models(self): """ @@ -360,7 +357,7 @@ def get_flexible_residue_models(self): Each model corresponds to one binding mode. The models are sorted from best to worst predicted binding affinity. - + Notes ----- The returned structure may contain less atoms than the input @@ -385,7 +382,7 @@ def get_receptor_coord(self): affinity. Missing coordinates due to the removed nonpolar hydrogen atoms from flexible side chains are set to *NaN*. - + Notes ----- The output is only meaningful, if flexible side chains were @@ -394,8 +391,7 @@ def get_receptor_coord(self): of the input receptor coordinates. 
""" coord = np.repeat( - self._receptor.coord[np.newaxis, ...], - repeats=self._n_models, axis=0 + self._receptor.coord[np.newaxis, ...], repeats=self._n_models, axis=0 ) if self._is_flexible: # Replace original coordinates with modeled coordinates @@ -424,16 +420,16 @@ def _get_flexible_residue(self, residue_start): root_connect_indices, _ = self._receptor.bonds.get_bonds(root_index) connected_index = None try: - connected_index = root_connect_indices[np.isin( - self._receptor.atom_name[root_connect_indices], ("CB",) - )][0] + connected_index = root_connect_indices[ + np.isin(self._receptor.atom_name[root_connect_indices], ("CB",)) + ][0] except IndexError: # Residue has no appropriate connection (e.g. in glycine) # -> There is no atom in the flexible side chain flex_mask = np.zeros(self._receptor.array_length(), dtype=bool) rigid_mask = np.ones(self._receptor.array_length(), dtype=bool) return flex_mask, rigid_mask, root_index - + # Remove the root bond from the bond list # to find the atoms involved in the flexible part bonds = self._receptor.bonds.copy() @@ -442,7 +438,7 @@ def _get_flexible_residue(self, residue_start): if root_index in flexible_indices: raise BadStructureError( "There are multiple connections between the flexible and " - "rigid part, maybe a cyclic residue like proline was selected" + "rigid part, maybe a cyclic residue like proline was selected" ) flex_mask = np.zeros(self._receptor.array_length(), dtype=bool) @@ -452,7 +448,6 @@ def _get_flexible_residue(self, residue_start): flex_mask[root_index] = True return flex_mask, rigid_mask, root_index - @staticmethod def dock(ligand, receptor, center, size, flexible=None, bin_path="vina"): diff --git a/src/biotite/application/blast/__init__.py b/src/biotite/application/blast/__init__.py index 77caf3e64..65857b2b4 100644 --- a/src/biotite/application/blast/__init__.py +++ b/src/biotite/application/blast/__init__.py @@ -10,5 +10,5 @@ __name__ = "biotite.application.blast" __author__ = "Patrick 
Kunzmann" +from .alignment import * from .webapp import * -from .alignment import * \ No newline at end of file diff --git a/src/biotite/application/blast/alignment.py b/src/biotite/application/blast/alignment.py index dc5b31784..251520d2e 100644 --- a/src/biotite/application/blast/alignment.py +++ b/src/biotite/application/blast/alignment.py @@ -14,10 +14,10 @@ class BlastAlignment(Alignment): A specialized :class:`Alignment` class for alignments using the BLAST application. It stores additional data, like the E-value, the HSP position and a description of the hit sequence. - + Like its superclass, all attributes of a :class:`BlastAlignment` are public. The attributes are the same as the constructor parameters. - + Parameters ---------- sequences : list @@ -44,16 +44,25 @@ class BlastAlignment(Alignment): hit_definition : str The name of the hit sequence. """ - - def __init__(self, sequences, trace, score, e_value, - query_interval, hit_interval, hit_id, hit_definition): + + def __init__( + self, + sequences, + trace, + score, + e_value, + query_interval, + hit_interval, + hit_id, + hit_definition, + ): super().__init__(sequences, trace, score) self.e_value = e_value self.query_interval = query_interval self.hit_interval = hit_interval self.hit_id = hit_id self.hit_definition = hit_definition - + def __eq__(self, item): if not isinstance(item, BlastAlignment): return False @@ -68,7 +77,7 @@ def __eq__(self, item): if self.hit_definition != item.hit_definition: return False return super().__eq__(item) - + def __getitem__(self, index): super_alignment = super().__getitem__(index) return BlastAlignment( @@ -79,5 +88,5 @@ def __getitem__(self, index): self.query_interval, self.hit_interval, self.hit_id, - self.hit_definition - ) \ No newline at end of file + self.hit_definition, + ) diff --git a/src/biotite/application/blast/webapp.py b/src/biotite/application/blast/webapp.py index cf358ac23..58a98d269 100644 --- a/src/biotite/application/blast/webapp.py +++ 
b/src/biotite/application/blast/webapp.py @@ -6,26 +6,26 @@ __author__ = "Patrick Kunzmann" __all__ = ["BlastWebApp"] -from .alignment import BlastAlignment -from ..application import Application, requires_state, AppState -from ..webapp import WebApp, RuleViolationError -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.io.fasta.convert import get_sequence -from ...sequence.align.alignment import Alignment import time -import requests from xml.etree import ElementTree - +import requests +from ...sequence.align.alignment import Alignment +from ...sequence.io.fasta.convert import get_sequence +from ...sequence.io.fasta.file import FastaFile +from ...sequence.seqtypes import NucleotideSequence, ProteinSequence +from ...sequence.sequence import Sequence +from ..application import AppState, requires_state +from ..webapp import WebApp +from .alignment import BlastAlignment _ncbi_url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" + class BlastWebApp(WebApp): """ Perform a local alignment against a large sequence database using using the web-based BLAST application (by default NCBI BLAST). - + Parameters ---------- program : str @@ -35,7 +35,7 @@ class BlastWebApp(WebApp): The query sequence. If a string is provided, it is interpreted as path to a FASTA file, if the string contains a valid FASTA file extension, otherwise it is interpreted as a single letter - string representation of a sequence. + string representation of a sequence. database : str, optional The NCBI sequence database to blast against. By default it contains all sequences (`database`='nr'`). @@ -52,68 +52,71 @@ class BlastWebApp(WebApp): HTTP request. This allows the NCBI to contact you in case your application sends too many requests. 
""" - + _last_contact = 0 _last_request = 0 _contact_delay = 3 _request_delay = 60 - - def __init__(self, program, query, database="nr", - app_url=_ncbi_url, obey_rules=True, - mail="padix.key@gmail.com"): + + def __init__( + self, + program, + query, + database="nr", + app_url=_ncbi_url, + obey_rules=True, + mail="padix.key@gmail.com", + ): super().__init__(app_url, obey_rules) - + # 'megablast' is somehow not working # When entering the corresponding HTTPS request into a browser # you are redirected onto the blast mainpage - if program not in ["blastn", "blastp", - "blastx", "tblastn", "tblastx"]: + if program not in ["blastn", "blastp", "blastx", "tblastn", "tblastx"]: raise ValueError(f"'{program}' is not a valid BLAST program") self._program = program - - requires_protein = (program in ["blastp", "tblastn"]) - if isinstance(query, str) and query.endswith((".fa",".fst",".fasta")): + + requires_protein = program in ["blastp", "tblastn"] + if isinstance(query, str) and query.endswith((".fa", ".fst", ".fasta")): # If string has a file extension, it is interpreted as # FASTA file from which the sequence is taken file = FastaFile.read(query) # Get first entry in file and take the sequence - # (rather than header) + # (rather than header) self._query = str(get_sequence(file)) elif isinstance(query, Sequence): self._query = str(query) else: self._query = query - + # Check for unsuitable symbols in query string if requires_protein: ref_alphabet = ProteinSequence.alphabet else: ref_alphabet = NucleotideSequence.alphabet_amb for symbol in self._query: - if not symbol.upper() in ref_alphabet: - raise ValueError( - f"Query sequence contains unsuitable symbol {symbol}" - ) - + if symbol.upper() not in ref_alphabet: + raise ValueError(f"Query sequence contains unsuitable symbol {symbol}") + self._database = database - + self._gap_openining = None self._gap_extension = None self._word_size = None - + self._expect_value = None self._max_results = None self._entrez_query = None 
- + self._reward = None self._penalty = None - + self._matrix = None self._threshold = None - - self._mail=mail + + self._mail = mail self._rid = None - + @requires_state(AppState.CREATED) def set_entrez_query(self, query): """ @@ -126,7 +129,7 @@ def set_entrez_query(self, query): An NCBI Entrez query. """ self._entrez_query = str(query) - + @requires_state(AppState.CREATED) def set_max_results(self, number): """ @@ -138,30 +141,30 @@ def set_max_results(self, number): The maximum number of results. """ self._max_results = number - + @requires_state(AppState.CREATED) def set_max_expect_value(self, value): """ Set the threshold expectation value (E-value). No alignments with an E-value above this threshold will be considered. - + The E-Value is the expectation value for the number of random sequences of a similar sized database getting an equal or higher score by change when aligned with the query sequence. - + Parameters ---------- value : float The threshold E-value. """ self._expect_value = value - + @requires_state(AppState.CREATED) def set_gap_penalty(self, opening, extension): """ Set the affine gap penalty for the alignment. - + Parameters ---------- opening : float @@ -171,75 +174,75 @@ def set_gap_penalty(self, opening, extension): """ self._gap_openining = opening self._gap_extension = extension - + @requires_state(AppState.CREATED) def set_word_size(self, size): """ Set the word size for alignment seeds. - + Parameters ---------- size : int Word size. """ self._word_size = size - + @requires_state(AppState.CREATED) def set_match_reward(self, reward): """ Set the score of a symbol match in the alignment. - + Used only in 'blastn' and 'megablast'. - + Parameters ---------- reward : int Match reward. Must be positive. """ self._reward = reward - + @requires_state(AppState.CREATED) def set_mismatch_penalty(self, penalty): """ Set the penalty of a symbol mismatch in the alignment. - + Used only in 'blastn' and 'megablast'. 
- + Parameters ---------- penalty : int Mismatch penalty. Must be negative. """ self._penalty = penalty - + @requires_state(AppState.CREATED) def set_substitution_matrix(self, matrix_name): """ Set the penalty of a symbol mismatch in the alignment. - + Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'. - + Parameters ---------- matrix_name : str Name of the substitution matrix. Default is 'BLOSUM62'. """ self._matrix = matrix_name.upper() - + @requires_state(AppState.CREATED) def set_threshold(self, threshold): """ Set the threshold neighboring score for initial words. - + Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'. - + Parameters ---------- threshold : int Threshold value. Must be positve. """ self._threshold = threshold - + def run(self): param_dict = {} param_dict["tool"] = "Biotite" @@ -255,23 +258,24 @@ def run(self): if self._expect_value is not None: param_dict["EXPECT"] = self._expect_value if self._gap_openining is not None and self._gap_extension is not None: - param_dict["GAPCOSTS"] = "{:d} {:d}".format(self._gap_openining, - self._gap_extension) + param_dict["GAPCOSTS"] = "{:d} {:d}".format( + self._gap_openining, self._gap_extension + ) if self._word_size is not None: param_dict["WORD_SIZE"] = self._word_size - + if self._program in ["blastn", "megablast"]: if self._reward is not None: param_dict["NUCL_REWARD"] = self._reward if self._penalty is not None: param_dict["NUCL_PENALTY"] = self._penalty - + if self._program in ["blastp", "blastx", "tblastn", "tblastx"]: if self._matrix is not None: param_dict["MATRIX"] = self._matrix if self._threshold is not None: param_dict["THRESHOLD"] = self._threshold - + request = requests.get(self.app_url(), params=param_dict) if "Submitted URI too large" in request.text: raise ValueError("The URI is too large, try a shorter sequence") @@ -279,11 +283,9 @@ def run(self): self._request() info_dict = BlastWebApp._get_info(request.text) self._rid = info_dict["RID"] - + def is_finished(self): - 
data_dict = {"FORMAT_OBJECT" : "SearchInfo", - "RID" : self._rid, - "CMD" : "Get"} + data_dict = {"FORMAT_OBJECT": "SearchInfo", "RID": self._rid, "CMD": "Get"} request = requests.get(self.app_url(), params=data_dict) self._contact() info_dict = BlastWebApp._get_info(request.text) @@ -294,17 +296,17 @@ def is_finished(self): "(Server responsed status 'UNKNOWN')" ) return info_dict["Status"] == "READY" - + def wait_interval(self): # NCBI requires a 3 second delay between server contacts return BlastWebApp._contact_delay - + def clean_up(self): param_dict = {} param_dict["CMD"] = "Delete" param_dict["RID"] = self._rid request = requests.get(self.app_url(), params=param_dict) - + def evaluate(self): param_dict = {} param_dict["tool"] = "BiotiteClient" @@ -316,7 +318,7 @@ def evaluate(self): param_dict["NCBI_GI"] = "T" request = requests.get(self.app_url(), params=param_dict) self._contact() - + self._alignments = [] self._xml_response = request.text root = ElementTree.fromstring(self._xml_response) @@ -333,15 +335,14 @@ def evaluate(self): query_end = int(hsp.find("Hsp_query-to").text) hit_begin = int(hsp.find("Hsp_hit-from").text) hit_end = int(hsp.find("Hsp_hit-to").text) - + seq1_str = hsp.find("Hsp_qseq").text seq2_str = hsp.find("Hsp_hseq").text if self._program in ["blastn", "megablast"]: # NucleotideSequence/ProteinSequence do ignore gaps # Gaps are represented by the trace seq1, seq2 = [ - NucleotideSequence(s.replace("-", "")) - for s in (seq1_str, seq2_str) + NucleotideSequence(s.replace("-", "")) for s in (seq1_str, seq2_str) ] else: seq1, seq2 = [ @@ -349,18 +350,24 @@ def evaluate(self): for s in (seq1_str, seq2_str) ] trace = Alignment.trace_from_strings([seq1_str, seq2_str]) - - alignment = BlastAlignment( [seq1 ,seq2], trace, score, e_value, - (query_begin, query_end), - (hit_begin, hit_end), - hit_id, hit_definition ) + + alignment = BlastAlignment( + [seq1, seq2], + trace, + score, + e_value, + (query_begin, query_end), + (hit_begin, hit_end), + 
hit_id, + hit_definition, + ) self._alignments.append(alignment) @requires_state(AppState.JOINED) def get_xml_response(self): """ Get the raw XML response. - + Returns ------- response : str @@ -372,14 +379,14 @@ def get_xml_response(self): def get_alignments(self): """ Get the resulting local sequence alignments. - + Returns ------- alignment : list of BlastAlignment The local sequence alignments. """ return self._alignments - + @staticmethod def _get_info(text): """ @@ -399,7 +406,7 @@ def _get_info(text): pair = line.split("=") info_dict[pair[0].strip()] = pair[1].strip() return info_dict - + def _contact(self): """ Resets the time since the last server contact. Used for @@ -409,7 +416,7 @@ def _contact(self): if (contact - BlastWebApp._last_contact) < BlastWebApp._contact_delay: self.violate_rule("The server was contacted too often") BlastWebApp._last_contact = contact - + def _request(self): """ Resets the time since the last new alignment request. Used for diff --git a/src/biotite/application/clustalo/__init__.py b/src/biotite/application/clustalo/__init__.py index 1f3afebac..ba0f44704 100644 --- a/src/biotite/application/clustalo/__init__.py +++ b/src/biotite/application/clustalo/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.clustalo" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/clustalo/app.py b/src/biotite/application/clustalo/app.py index 778c613d8..a24112461 100644 --- a/src/biotite/application/clustalo/app.py +++ b/src/biotite/application/clustalo/app.py @@ -8,20 +8,16 @@ from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.align.alignment import Alignment from ...sequence.phylo.tree import Tree +from ..application import AppState, requires_state from ..localapp 
import cleanup_tempfile from ..msaapp import MSAApp -from ..application import AppState, requires_state class ClustalOmegaApp(MSAApp): """ Perform a multiple sequence alignment using Clustal-Omega. - + Parameters ---------- sequences : list of ProteinSequence or NucleotideSequence @@ -30,7 +26,7 @@ class ClustalOmegaApp(MSAApp): Path of the Custal-Omega binary. matrix : None This parameter is used for compatibility reasons and is ignored. - + Examples -------- @@ -48,34 +44,30 @@ class ClustalOmegaApp(MSAApp): -BISMITE --IQLITE """ - + def __init__(self, sequences, bin_path="clustalo", matrix=None): super().__init__(sequences, bin_path, None) self._seq_count = len(sequences) self._mbed = True self._dist_matrix = None self._tree = None - self._in_dist_matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._in_dist_matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) self._out_dist_matrix_file = NamedTemporaryFile( "r", suffix=".mat", delete=False ) - self._in_tree_file = NamedTemporaryFile( - "w", suffix=".tree", delete=False - ) - self._out_tree_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - + self._in_tree_file = NamedTemporaryFile("w", suffix=".tree", delete=False) + self._out_tree_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + def run(self): args = [ - "--in", self.get_input_file_path(), - "--out", self.get_output_file_path(), + "--in", + self.get_input_file_path(), + "--out", + self.get_output_file_path(), # The temporary files are already created # -> tell Clustal to overwrite these empty files "--force", - # Tree order for get_alignment_order() to work properly + # Tree order for get_alignment_order() to work properly "--output-order=tree-order", ] if self.get_seqtype() == "protein": @@ -87,28 +79,24 @@ def run(self): # as input and output# # -> Only request tree output when not tree is input args += [ - "--guidetree-out", self._out_tree_file.name, + "--guidetree-out", + 
self._out_tree_file.name, ] if not self._mbed: - args += [ - "--full", - "--distmat-out", self._out_dist_matrix_file.name - ] + args += ["--full", "--distmat-out", self._out_dist_matrix_file.name] if self._dist_matrix is not None: # Add the sequence names (0, 1, 2, 3 ...) as first column dist_matrix_with_index = np.concatenate( - ( - np.arange(self._seq_count)[:, np.newaxis], - self._dist_matrix - ), axis=1 + (np.arange(self._seq_count)[:, np.newaxis], self._dist_matrix), axis=1 ) np.savetxt( - self._in_dist_matrix_file.name, dist_matrix_with_index, + self._in_dist_matrix_file.name, + dist_matrix_with_index, # The first line contains the amount of sequences - comments = "", - header = str(self._seq_count), + comments="", + header=str(self._seq_count), # The sequence indices are integers, the rest are floats - fmt = ["%d"] + ["%.5f"] * self._seq_count + fmt=["%d"] + ["%.5f"] * self._seq_count, ) args += ["--distmat-in", self._in_dist_matrix_file.name] if self._tree is not None: @@ -117,15 +105,15 @@ def run(self): args += ["--guidetree-in", self._in_tree_file.name] self.set_arguments(args) super().run() - + def evaluate(self): super().evaluate() if not self._mbed: self._dist_matrix = np.loadtxt( self._out_dist_matrix_file.name, # The first row only contains the number of sequences - skiprows = 1, - dtype = float + skiprows=1, + dtype=float, ) # The first column contains only the name of the # sequences, in this case 0, 1, 2, 3 ... 
@@ -133,17 +121,15 @@ def evaluate(self): self._dist_matrix = self._dist_matrix[:, 1:] # Only read output tree if no tree was input if self._tree is None: - self._tree = Tree.from_newick( - self._out_tree_file.read().replace("\n", "") - ) - + self._tree = Tree.from_newick(self._out_tree_file.read().replace("\n", "")) + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_dist_matrix_file) cleanup_tempfile(self._out_dist_matrix_file) cleanup_tempfile(self._in_tree_file) cleanup_tempfile(self._out_tree_file) - + @requires_state(AppState.CREATED) def full_matrix_calculation(self): """ @@ -154,13 +140,13 @@ def full_matrix_calculation(self): default *mBed* heuristic. """ self._mbed = False - + @requires_state(AppState.CREATED) def set_distance_matrix(self, matrix): """ Set the pairwise sequence distances, the program should use to - calculate the guide tree. - + calculate the guide tree. + Parameters ---------- matrix : ndarray, shape=(n,n), dtype=float @@ -172,13 +158,13 @@ def set_distance_matrix(self, matrix): f"{self._seq_count} sequences" ) self._dist_matrix = matrix.astype(float, copy=False) - + @requires_state(AppState.JOINED) def get_distance_matrix(self): """ Get the pairwise sequence distances the program used to - calculate the guide tree. - + calculate the guide tree. + Returns ------- matrix : ndarray, shape=(n,n), dtype=float @@ -186,17 +172,16 @@ def get_distance_matrix(self): """ if self._mbed: raise ValueError( - "Getting the distance matrix requires " - "'full_matrix_calculation()'" + "Getting the distance matrix requires " "'full_matrix_calculation()'" ) return self._dist_matrix - + @requires_state(AppState.CREATED) def set_guide_tree(self, tree): """ Set the guide tree, the program should use for the progressive alignment. 
- + Parameters ---------- tree : Tree @@ -208,31 +193,31 @@ def set_guide_tree(self, tree): "{self._seq_count} sequences, must be equal" ) self._tree = tree - + @requires_state(AppState.JOINED) def get_guide_tree(self): """ Get the guide tree created for the progressive alignment. - + Returns ------- tree : Tree The guide tree. """ return self._tree - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return False diff --git a/src/biotite/application/dssp/__init__.py b/src/biotite/application/dssp/__init__.py index 93f8f17e2..b1d43758c 100644 --- a/src/biotite/application/dssp/__init__.py +++ b/src/biotite/application/dssp/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.dssp" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/dssp/app.py b/src/biotite/application/dssp/app.py index eb0974460..e4d84e87a 100644 --- a/src/biotite/application/dssp/app.py +++ b/src/biotite/application/dssp/app.py @@ -7,11 +7,11 @@ __all__ = ["DsspApp"] from tempfile import NamedTemporaryFile -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state +import numpy as np from ...structure.io.pdbx.cif import CIFFile from ...structure.io.pdbx.convert import set_structure -import numpy as np +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile class DsspApp(LocalApp): @@ -73,7 +73,7 @@ def __init__(self, atom_array, bin_path="mkdssp"): "occupancy", np.ones(self._array.array_length(), dtype=float) ) - self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False) self._out_file = NamedTemporaryFile("r", suffix=".dssp", delete=False) def run(self): 
@@ -81,9 +81,7 @@ def run(self): set_structure(in_file, self._array) in_file.write(self._in_file) self._in_file.flush() - self.set_arguments( - ["-i", self._in_file.name, "-o", self._out_file.name] - ) + self.set_arguments(["-i", self._in_file.name, "-o", self._out_file.name]) super().run() def evaluate(self): @@ -93,13 +91,12 @@ def evaluate(self): sse_start = None for i, line in enumerate(lines): if line.startswith(" # RESIDUE AA STRUCTURE"): - sse_start = i+1 + sse_start = i + 1 if sse_start is None: raise ValueError("DSSP file does not contain SSE records") # Remove "!" for missing residues lines = [ - line for line in lines[sse_start:] - if len(line) != 0 and line[13] != "!" + line for line in lines[sse_start:] if len(line) != 0 and line[13] != "!" ] self._sse = np.zeros(len(lines), dtype="U1") # Parse file for SSE letters diff --git a/src/biotite/application/localapp.py b/src/biotite/application/localapp.py index acfd1bd8b..d52dc0632 100644 --- a/src/biotite/application/localapp.py +++ b/src/biotite/application/localapp.py @@ -9,23 +9,24 @@ import abc import copy from os import chdir, getcwd, remove +from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired from .application import Application, AppState, AppStateError, requires_state -from subprocess import Popen, PIPE, SubprocessError, TimeoutExpired + class LocalApp(Application, metaclass=abc.ABCMeta): """ The base class for all locally installed applications, that are used via the command line. - + Internally this creates a :class:`Popen` instance, which handles the execution. - + Parameters ---------- bin_path : str Path of the application represented by this class. """ - + def __init__(self, bin_path): super().__init__() self._bin_path = bin_path @@ -35,28 +36,28 @@ def __init__(self, bin_path): self._process = None self._command = None self._stdin_file = None - + @requires_state(AppState.CREATED) def set_arguments(self, arguments): """ Set command line arguments for the application run. 
- + PROTECTED: Do not call from outside. - + Parameters ---------- arguments : list of str A list of strings representing the command line options. """ self._arguments = copy.copy(arguments) - + @requires_state(AppState.CREATED) def set_stdin(self, file): """ Set a file as standard input for the application run. - + PROTECTED: Do not call from outside. - + Parameters ---------- file : file object @@ -65,7 +66,7 @@ def set_stdin(self, file): such as `StringIO` are invalid. """ self._stdin_file = file - + @requires_state(AppState.CREATED) def add_additional_options(self, options): """ @@ -81,12 +82,12 @@ def add_additional_options(self, options): It is recommended to use this method only, when the respective :class:`LocalApp` subclass does not provide a method to set the desired option. - + Parameters ---------- options : list of str A list of strings representing the command line options. - + Notes ----- In order to see which options the command line execution used, @@ -114,27 +115,24 @@ def add_additional_options(self, options): clustalo --full --in ...fa --out ...fa --force --output-order=tree-order --seqtype Protein --guidetree-out ...tree """ self._options += options - + @requires_state( - AppState.RUNNING | \ - AppState.CANCELLED | \ - AppState.FINISHED | \ - AppState.JOINED + AppState.RUNNING | AppState.CANCELLED | AppState.FINISHED | AppState.JOINED ) def get_command(self): """ Get the executed command. Cannot be called until the application has been started. - + Returns ------- command : str The executed command. - + Examples -------- - + >>> seq1 = ProteinSequence("BIQTITE") >>> seq2 = ProteinSequence("TITANITE") >>> seq3 = ProteinSequence("BISMITE") @@ -146,72 +144,71 @@ def get_command(self): """ return " ".join(self._command) - @requires_state(AppState.CREATED) def set_exec_dir(self, exec_dir): """ Set the directory where the application should be executed. 
If not set, it will be executed in the working directory at the - time the application was created. - + time the application was created. + PROTECTED: Do not call from outside. - + Parameters ---------- exec_dir : str The execution directory. """ self._exec_dir = exec_dir - + @requires_state(AppState.RUNNING | AppState.FINISHED) def get_process(self): """ Get the `Popen` instance. - + PROTECTED: Do not call from outside. - + Returns ------- process : Popen The `Popen` instance """ return self._process - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_exit_code(self): """ Get the exit code of the process. - + PROTECTED: Do not call from outside. - + Returns ------- code : int The exit code. """ return self._process.returncode - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_stdout(self): """ Get the STDOUT pipe content of the process. - + PROTECTED: Do not call from outside. - + Returns ------- stdout : str The standard output. """ return self._stdout - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_stderr(self): """ Get the STDERR pipe content of the process. - + PROTECTED: Do not call from outside. 
- + Returns ------- stdout : str @@ -221,14 +218,17 @@ def get_stderr(self): def run(self): cwd = getcwd() - chdir(self._exec_dir) + chdir(self._exec_dir) self._command = [self._bin_path] + self._options + self._arguments self._process = Popen( - self._command, stdin=self._stdin_file, stdout=PIPE, stderr=PIPE, - encoding="UTF-8" + self._command, + stdin=self._stdin_file, + stdout=PIPE, + stderr=PIPE, + encoding="UTF-8", ) chdir(cwd) - + def is_finished(self): code = self._process.poll() if code == None: @@ -236,23 +236,19 @@ def is_finished(self): else: self._stdout, self._stderr = self._process.communicate() return True - + @requires_state(AppState.RUNNING | AppState.FINISHED) def join(self, timeout=None): # Override method as repetitive calls of 'is_finished()' # are not necessary as 'communicate()' already waits for the # finished application try: - self._stdout, self._stderr = self._process.communicate( - timeout=timeout - ) + self._stdout, self._stderr = self._process.communicate(timeout=timeout) except TimeoutExpired: self.cancel() - raise TimeoutError( - f"The application expired its timeout ({timeout:.1f} s)" - ) + raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)") self._state = AppState.FINISHED - + try: self.evaluate() except AppStateError: @@ -263,12 +259,11 @@ def join(self, timeout=None): else: self._state = AppState.JOINED self.clean_up() - - + def wait_interval(self): # Not used in this implementation of 'join()' raise NotImplementedError() - + def evaluate(self): super().evaluate() # Check if applicaion terminated correctly @@ -276,10 +271,9 @@ def evaluate(self): if exit_code != 0: err_msg = self.get_stderr().replace("\n", " ") raise SubprocessError( - f"'{self._bin_path}' returned with exit code {exit_code}: " - f"{err_msg}" + f"'{self._bin_path}' returned with exit code {exit_code}: " f"{err_msg}" ) - + def clean_up(self): if self.get_app_state() == AppState.CANCELLED: self._process.kill() @@ -290,7 +284,7 @@ def 
cleanup_tempfile(temp_file): Close a :class:`NamedTemporaryFile` and delete it manually, if `delete` is set to ``False``. This function is a small helper function intended for usage in - `LocalApp` subclasses. + `LocalApp` subclasses. The manual deletion is necessary, as Windows does not allow to open a :class:`NamedTemporaryFile` as second time @@ -303,4 +297,4 @@ def cleanup_tempfile(temp_file): """ temp_file.close() if not temp_file.delete: - remove(temp_file.name) \ No newline at end of file + remove(temp_file.name) diff --git a/src/biotite/application/mafft/__init__.py b/src/biotite/application/mafft/__init__.py index 52f86e0ac..19def8bad 100644 --- a/src/biotite/application/mafft/__init__.py +++ b/src/biotite/application/mafft/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.mafft" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/mafft/app.py b/src/biotite/application/mafft/app.py index 2d4a22530..562dcf1ae 100644 --- a/src/biotite/application/mafft/app.py +++ b/src/biotite/application/mafft/app.py @@ -6,25 +6,19 @@ __author__ = "Patrick Kunzmann" __all__ = ["MafftApp"] -import re import os -from ..msaapp import MSAApp -from ..application import AppState, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.align.alignment import Alignment +import re from ...sequence.phylo.tree import Tree - +from ..application import AppState, requires_state +from ..msaapp import MSAApp _prefix_pattern = re.compile(r"\d*_") - class MafftApp(MSAApp): """ Perform a multiple sequence alignment using MAFFT. - + Parameters ---------- sequences : list of Sequence @@ -33,7 +27,7 @@ class MafftApp(MSAApp): Path of the MUSCLE binary. matrix : SubstitutionMatrix, optional A custom substitution matrix. 
- + Examples -------- @@ -51,19 +45,19 @@ class MafftApp(MSAApp): -BISMITE --IQLITE """ - + def __init__(self, sequences, bin_path="mafft", matrix=None): super().__init__(sequences, bin_path, matrix) self._tree = None self._out_tree_file_name = self.get_input_file_path() + ".tree" - + def run(self): args = [ "--quiet", "--auto", "--treeout", # Get the reordered alignment in order for - # get_alignment_order() to work properly + # get_alignment_order() to work properly "--reorder", ] if self.get_seqtype() == "protein": @@ -75,7 +69,7 @@ def run(self): args += [self.get_input_file_path()] self.set_arguments(args) super().run() - + def evaluate(self): with open(self.get_output_file_path(), "w") as f: # MAFFT outputs alignment to stdout @@ -89,7 +83,7 @@ def evaluate(self): # -> remove the '_' prefix newick = re.sub(_prefix_pattern, "", raw_newick) self._tree = Tree.from_newick(newick) - + def clean_up(self): os.remove(self._out_tree_file_name) @@ -97,26 +91,26 @@ def clean_up(self): def get_guide_tree(self): """ Get the guide tree created for the progressive alignment. - + Returns ------- tree : Tree The guide tree. 
""" return self._tree - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return True - + @staticmethod def supports_custom_protein_matrix(): return True diff --git a/src/biotite/application/msaapp.py b/src/biotite/application/msaapp.py index bf490872e..d70a3012f 100644 --- a/src/biotite/application/msaapp.py +++ b/src/biotite/application/msaapp.py @@ -7,22 +7,22 @@ __all__ = ["MSAApp"] import abc -from tempfile import NamedTemporaryFile from collections import OrderedDict +from tempfile import NamedTemporaryFile import numpy as np -from .localapp import LocalApp, cleanup_tempfile -from .application import AppState, requires_state -from ..sequence.seqtypes import NucleotideSequence, ProteinSequence -from ..sequence.io.fasta.file import FastaFile from ..sequence.align.alignment import Alignment -from .util import map_sequence, map_matrix +from ..sequence.io.fasta.file import FastaFile +from ..sequence.seqtypes import NucleotideSequence, ProteinSequence +from .application import AppState, requires_state +from .localapp import LocalApp, cleanup_tempfile +from .util import map_matrix, map_sequence class MSAApp(LocalApp, metaclass=abc.ABCMeta): """ This is an abstract base class for multiple sequence alignment software. - + It handles conversion of :class:`Sequence` objects to FASTA input and FASTA output to an :class:`Alignment` object. Inheriting subclasses only need to incorporate the file path @@ -41,10 +41,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta): sequences are mapped back into the original sequence types. The mapping does not work, when the alphabet of the exotic sequences is larger than the amino acid alphabet. - + Internally this creates a :class:`Popen` instance, which handles the execution. 
- + Parameters ---------- sequences : iterable object of Sequence @@ -54,10 +54,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta): matrix : SubstitutionMatrix, optional A custom substitution matrix. """ - + def __init__(self, sequences, bin_path, matrix=None): super().__init__(bin_path) - + if len(sequences) < 2: raise ValueError("At least two sequences are required") # Check if all sequences share the same alphabet @@ -68,40 +68,39 @@ def __init__(self, sequences, bin_path, matrix=None): # Check matrix symmetry if matrix is not None and not matrix.is_symmetric(): raise ValueError( - "A symmetric matrix is required for " - "multiple sequence alignments" + "A symmetric matrix is required for " "multiple sequence alignments" ) - # Check whether the program supports the alignment for the given # sequence type - if ProteinSequence.alphabet.extends(alphabet) \ - and self.supports_protein(): - self._is_mapped = False - self._seqtype = "protein" - if matrix is not None: - if not self.supports_custom_protein_matrix(): - raise TypeError( - "The software does not support custom " - "substitution matrices for protein sequences" - ) - self._matrix = matrix - else: - self._matrix = None - - elif NucleotideSequence.alphabet_amb.extends(alphabet) \ - and self.supports_nucleotide(): - self._is_mapped = False - self._seqtype = "nucleotide" - if matrix is not None: - if not self.supports_custom_nucleotide_matrix(): - raise TypeError( - "The software does not support custom " - "substitution matrices for nucleotide sequences" - ) - self._matrix = matrix - else: - self._matrix = None + if ProteinSequence.alphabet.extends(alphabet) and self.supports_protein(): + self._is_mapped = False + self._seqtype = "protein" + if matrix is not None: + if not self.supports_custom_protein_matrix(): + raise TypeError( + "The software does not support custom " + "substitution matrices for protein sequences" + ) + self._matrix = matrix + else: + self._matrix = None + + elif ( + 
NucleotideSequence.alphabet_amb.extends(alphabet) + and self.supports_nucleotide() + ): + self._is_mapped = False + self._seqtype = "nucleotide" + if matrix is not None: + if not self.supports_custom_nucleotide_matrix(): + raise TypeError( + "The software does not support custom " + "substitution matrices for nucleotide sequences" + ) + self._matrix = matrix + else: + self._matrix = None else: # For all other sequence types, try to map the sequence into @@ -126,26 +125,16 @@ def __init__(self, sequences, bin_path, matrix=None): self._sequences = sequences # Sequence masquerades as protein self._seqtype = "protein" - self._mapped_sequences = [ - map_sequence(sequence) for sequence in sequences - ] + self._mapped_sequences = [map_sequence(sequence) for sequence in sequences] self._matrix = map_matrix(matrix) - self._sequences = sequences - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) - self._out_file = NamedTemporaryFile( - "r", suffix=".fa", delete=False - ) - self._matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) + self._out_file = NamedTemporaryFile("r", suffix=".fa", delete=False) + self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) def run(self): - sequences = self._sequences if not self._is_mapped \ - else self._mapped_sequences + sequences = self._sequences if not self._is_mapped else self._mapped_sequences sequences_file = FastaFile() for i, seq in enumerate(sequences): sequences_file[str(i)] = str(seq) @@ -155,7 +144,7 @@ def run(self): self._matrix_file.write(str(self._matrix)) self._matrix_file.flush() super().run() - + def evaluate(self): super().evaluate() alignment_file = FastaFile.read(self._out_file) @@ -169,26 +158,26 @@ def evaluate(self): # Also obtain original order self._order = np.zeros(len(seq_dict), dtype=int) for i, seq_index in enumerate(seq_dict): - self._order[i] = int(seq_index) - + 
self._order[i] = int(seq_index) + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) cleanup_tempfile(self._out_file) cleanup_tempfile(self._matrix_file) - + @requires_state(AppState.JOINED) def get_alignment(self): """ Get the resulting multiple sequence alignment. - + Returns ------- alignment : Alignment The global multiple sequence alignment. """ return self._alignment - + @requires_state(AppState.JOINED) def get_alignment_order(self): """ @@ -202,12 +191,12 @@ def get_alignment_order(self): order. This method returns the order of the sequences intended by the MSA software. - + Returns ------- order : ndarray, dtype=int The sequence order intended by the MSA software. - + Examples -------- Align sequences and restore the original order: @@ -220,39 +209,39 @@ def get_alignment_order(self): alignment = alignment[:, order] """ return self._order - + def get_input_file_path(self): """ Get input file path (FASTA format). - + PROTECTED: Do not call from outside. - + Returns ------- path : str Path of input file. """ return self._in_file.name - + def get_output_file_path(self): """ Get output file path (FASTA format). - + PROTECTED: Do not call from outside. - + Returns ------- path : str Path of output file. """ return self._out_file.name - + def get_matrix_file_path(self): """ Get file path for custom substitution matrix. - + PROTECTED: Do not call from outside. - + Returns ------- path : str or None @@ -260,7 +249,7 @@ def get_matrix_file_path(self): None if no matrix was given. """ return self._matrix_file.name if self._matrix is not None else None - + def get_seqtype(self): """ Get the type of aligned sequences. @@ -268,16 +257,16 @@ def get_seqtype(self): When a custom sequence type (neither nucleotide nor protein) is mapped onto a protein sequence, the return value is also ``'protein'``. - + PROTECTED: Do not call from outside. - + Returns ------- seqtype : {'nucleotide', 'protein'} Type of sequences to be aligned. 
""" return self._seqtype - + @staticmethod @abc.abstractmethod def supports_nucleotide(): @@ -289,11 +278,11 @@ def supports_nucleotide(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_protein(): @@ -305,11 +294,11 @@ def supports_protein(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_custom_nucleotide_matrix(): @@ -321,11 +310,11 @@ def supports_custom_nucleotide_matrix(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_custom_protein_matrix(): @@ -337,19 +326,19 @@ def supports_custom_protein_matrix(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @classmethod def align(cls, sequences, bin_path=None, matrix=None): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`MSAApp` execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -359,7 +348,7 @@ def align(cls, sequences, bin_path=None, matrix=None): path will be used. matrix : SubstitutionMatrix, optional A custom substitution matrix. 
- + Returns ------- alignment : Alignment diff --git a/src/biotite/application/muscle/__init__.py b/src/biotite/application/muscle/__init__.py index 644e7a118..c75f0f8be 100644 --- a/src/biotite/application/muscle/__init__.py +++ b/src/biotite/application/muscle/__init__.py @@ -10,4 +10,4 @@ __author__ = "Patrick Kunzmann" from .app3 import * -from .app5 import * \ No newline at end of file +from .app5 import * diff --git a/src/biotite/application/muscle/app3.py b/src/biotite/application/muscle/app3.py index 8df72ce65..86a883afa 100644 --- a/src/biotite/application/muscle/app3.py +++ b/src/biotite/application/muscle/app3.py @@ -6,25 +6,21 @@ __author__ = "Patrick Kunzmann" __all__ = ["MuscleApp"] -import re import numbers -import warnings +import re import subprocess +import warnings from tempfile import NamedTemporaryFile +from ...sequence.phylo.tree import Tree +from ..application import AppState, VersionError, requires_state from ..localapp import cleanup_tempfile from ..msaapp import MSAApp -from ..application import AppState, VersionError, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.align.matrix import SubstitutionMatrix -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree class MuscleApp(MSAApp): """ Perform a multiple sequence alignment using MUSCLE version 3. - + Parameters ---------- sequences : list of Sequence @@ -33,11 +29,11 @@ class MuscleApp(MSAApp): Path of the MUSCLE binary. matrix : SubstitutionMatrix, optional A custom substitution matrix. 
- + See also -------- Muscle5App - + Examples -------- @@ -55,34 +51,32 @@ class MuscleApp(MSAApp): BISM-ITE -IQL-ITE """ - + def __init__(self, sequences, bin_path="muscle", matrix=None): major_version = get_version(bin_path)[0] if major_version != 3: - raise VersionError( - f"Muscle 3 is required, got version {major_version}" - ) - + raise VersionError(f"Muscle 3 is required, got version {major_version}") + super().__init__(sequences, bin_path, matrix) self._gap_open = None self._gap_ext = None self._terminal_penalty = None self._tree1 = None self._tree2 = None - self._out_tree1_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - self._out_tree2_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - + self._out_tree1_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + self._out_tree2_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + def run(self): args = [ "-quiet", - "-in", self.get_input_file_path(), - "-out", self.get_output_file_path(), - "-tree1", self._out_tree1_file.name, - "-tree2", self._out_tree2_file.name, + "-in", + self.get_input_file_path(), + "-out", + self.get_output_file_path(), + "-tree1", + self._out_tree1_file.name, + "-tree2", + self._out_tree2_file.name, ] if self.get_seqtype() == "protein": args += ["-seqtype", "protein"] @@ -91,7 +85,7 @@ def run(self): if self.get_matrix_file_path() is not None: args += ["-matrix", self.get_matrix_file_path()] if self._gap_open is not None and self._gap_ext is not None: - args += ["-gapopen", f"{self._gap_open:.1f}"] + args += ["-gapopen", f"{self._gap_open:.1f}"] args += ["-gapextend", f"{self._gap_ext:.1f}"] # When the gap penalty is set, # use the penalty also for hydrophobic regions @@ -100,7 +94,7 @@ def run(self): args += ["-center", "0.0"] self.set_arguments(args) super().run() - + def evaluate(self): super().evaluate() @@ -108,23 +102,19 @@ def evaluate(self): if len(newick) > 0: self._tree1 = Tree.from_newick(newick) else: - warnings.warn( - 
"MUSCLE did not write a tree file from the first iteration" - ) - + warnings.warn("MUSCLE did not write a tree file from the first iteration") + newick = self._out_tree2_file.read().replace("\n", "") if len(newick) > 0: self._tree2 = Tree.from_newick(newick) else: - warnings.warn( - "MUSCLE did not write a tree file from the second iteration" - ) - + warnings.warn("MUSCLE did not write a tree file from the second iteration") + def clean_up(self): super().clean_up() cleanup_tempfile(self._out_tree1_file) cleanup_tempfile(self._out_tree2_file) - + @requires_state(AppState.CREATED) def set_gap_penalty(self, gap_penalty): """ @@ -145,20 +135,20 @@ def set_gap_penalty(self, gap_penalty): if gap_penalty > 0: raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty - self._gap_ext= gap_penalty + self._gap_ext = gap_penalty elif type(gap_penalty) == tuple: if gap_penalty[0] > 0 or gap_penalty[1] > 0: - raise ValueError("Gap penalty must be negative") + raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty[0] self._gap_ext = gap_penalty[1] else: raise TypeError("Gap penalty must be either float or tuple") - + @requires_state(AppState.JOINED) def get_guide_tree(self, iteration="identity"): """ Get the guide tree created for the progressive alignment. - + Parameters ---------- iteration : {'kmer', 'identity'} @@ -168,7 +158,7 @@ def get_guide_tree(self, iteration="identity"): If 'identity' the second iteration tree is returned. This tree uses distances based on the pairwise sequence identity after the first progressive alignment iteration. 
- + Returns ------- tree : Tree @@ -180,32 +170,31 @@ def get_guide_tree(self, iteration="identity"): return self._tree2 else: raise ValueError("Iteration must be 'kmer' or 'identity'") - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return True - + @classmethod - def align(cls, sequences, bin_path=None, matrix=None, - gap_penalty=None): + def align(cls, sequences, bin_path=None, matrix=None, gap_penalty=None): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`MuscleApp` execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -222,7 +211,7 @@ def align(cls, sequences, bin_path=None, matrix=None, The first value in the tuple is the gap opening penalty, the second value is the gap extension penalty. The values need to be negative. 
- + Returns ------- alignment : Alignment @@ -240,15 +229,11 @@ def align(cls, sequences, bin_path=None, matrix=None, def get_version(bin_path="muscle"): - output = subprocess.run( - [bin_path, "-version"], capture_output=True, text=True - ) + output = subprocess.run([bin_path, "-version"], capture_output=True, text=True) # Find matches for version string containing major and minor version - match = re.search("\d+\.\d+", output.stdout) + match = re.search(r"\d+\.\d+", output.stdout) if match is None: - raise subprocess.SubprocessError( - "Could not determine Muscle version" - ) + raise subprocess.SubprocessError("Could not determine Muscle version") version_string = match.group(0) splitted = version_string.split(".") - return int(splitted[0]), int(splitted[1]) \ No newline at end of file + return int(splitted[0]), int(splitted[1]) diff --git a/src/biotite/application/muscle/app5.py b/src/biotite/application/muscle/app5.py index 326c92227..94a1f54bf 100644 --- a/src/biotite/application/muscle/app5.py +++ b/src/biotite/application/muscle/app5.py @@ -6,31 +6,22 @@ __author__ = "Patrick Kunzmann" __all__ = ["Muscle5App"] -import numbers -import warnings -from tempfile import NamedTemporaryFile -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp from ..application import AppState, VersionError, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.align.matrix import SubstitutionMatrix -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree +from ..msaapp import MSAApp from .app3 import get_version class Muscle5App(MSAApp): """ Perform a multiple sequence alignment using MUSCLE version 5. - + Parameters ---------- sequences : list of Sequence The sequences to be aligned. bin_path : str, optional Path of the MUSCLE binary. 
- + See also -------- MuscleApp @@ -38,7 +29,7 @@ class Muscle5App(MSAApp): Notes ----- Alignment ensemble generation is not supported, yet. - + Examples -------- @@ -56,14 +47,14 @@ class Muscle5App(MSAApp): BI-SMITE -I-QLITE """ - + def __init__(self, sequences, bin_path="muscle"): major_version = get_version(bin_path)[0] if major_version < 5: raise VersionError( f"At least Muscle 5 is required, got version {major_version}" ) - + super().__init__(sequences, bin_path) self._mode = "align" self._consiters = None @@ -86,7 +77,7 @@ def set_iterations(self, consistency=None, refinement=None): self._consiters = consistency if refinement is not None: self._refineiters = refinement - + @requires_state(AppState.CREATED) def set_thread_number(self, number): """ @@ -110,48 +101,49 @@ def run(self): args = [ f"-{self._mode}", self.get_input_file_path(), - "-output", self.get_output_file_path(), + "-output", + self.get_output_file_path(), ] if self.get_seqtype() == "protein": args += ["-amino"] else: args += ["-nt"] if self._n_threads is not None: - args += ["-threads", str(self._n_threads)] + args += ["-threads", str(self._n_threads)] if self._consiters is not None: - args += ["-consiters", str(self._consiters)] + args += ["-consiters", str(self._consiters)] if self._refineiters is not None: - args += ["-refineiters", str(self._refineiters)] + args += ["-refineiters", str(self._refineiters)] self.set_arguments(args) super().run() - + def clean_up(self): super().clean_up() - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return False - + @classmethod def align(cls, sequences, bin_path="muscle"): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`Muscle5App` execution. 
- + Parameters ---------- sequences : iterable object of Sequence @@ -159,7 +151,7 @@ def align(cls, sequences, bin_path="muscle"): bin_path : str, optional Path of the MSA software binary. By default, the default path will be used. - + Returns ------- alignment : Alignment diff --git a/src/biotite/application/sra/__init__.py b/src/biotite/application/sra/__init__.py index d68a49d3e..f69fccde6 100644 --- a/src/biotite/application/sra/__init__.py +++ b/src/biotite/application/sra/__init__.py @@ -15,4 +15,4 @@ __name__ = "biotite.application.sra" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/sra/app.py b/src/biotite/application/sra/app.py index 6f5a20955..bda5be577 100644 --- a/src/biotite/application/sra/app.py +++ b/src/biotite/application/sra/app.py @@ -7,17 +7,16 @@ __all__ = ["FastaDumpApp", "FastqDumpApp"] import abc -from os.path import join -from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired import glob +from os.path import join +from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired from tempfile import TemporaryDirectory -from ..application import Application, AppState, AppStateError, \ - requires_state -from ...sequence.seqtypes import NucleotideSequence -from ...sequence.io.fastq.file import FastqFile +from ...sequence.io.fasta.convert import get_sequences from ...sequence.io.fasta.file import FastaFile from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores -from ...sequence.io.fasta.convert import get_sequences +from ...sequence.io.fastq.file import FastqFile +from ...sequence.seqtypes import NucleotideSequence +from ..application import Application, AppState, AppStateError, requires_state # Do not use LocalApp, as two programs are executed @@ -48,8 +47,13 @@ class _DumpApp(Application, metaclass=abc.ABCMeta): the score format. 
""" - def __init__(self, uid, output_path_prefix=None, - prefetch_path="prefetch", fasterq_dump_path="fasterq-dump"): + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): super().__init__() self._prefetch_path = prefetch_path self._fasterq_dump_path = fasterq_dump_path @@ -62,21 +66,16 @@ def __init__(self, uid, output_path_prefix=None, self._prefetch_process = None self._fasterq_dump_process = None - @requires_state(AppState.RUNNING | AppState.FINISHED) def join(self, timeout=None): # Override method as repetitive calls of 'is_finished()' # are not necessary as 'communicate()' already waits for the # finished application try: - _, self._stderr = self._process.communicate( - timeout=timeout - ) + _, self._stderr = self._process.communicate(timeout=timeout) except TimeoutExpired: self.cancel() - raise TimeoutError( - f"The application expired its timeout ({timeout:.1f} s)" - ) + raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)") self._state = AppState.FINISHED try: @@ -90,7 +89,6 @@ def join(self, timeout=None): self._state = AppState.JOINED self.clean_up() - def run(self): # Prefetch into a temp directory with file name equaling UID # This ensures that the ID in the header is not the temp prefix @@ -105,16 +103,14 @@ def run(self): command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8" ) - def is_finished(self): code = self._process.poll() if code == None: return False else: - _, self._stderr = self._process.communicate() + _, self._stderr = self._process.communicate() return True - def evaluate(self): super().evaluate() # Check if applicaion terminated correctly @@ -128,26 +124,24 @@ def evaluate(self): self._file_names = ( # For entries with one read per spot - glob.glob(self._prefix + ".fastq") + + glob.glob(self._prefix + ".fastq") + + # For entries with multiple reads per spot glob.glob(self._prefix + "_*.fastq") ) # Only load FASTQ files into memory 
when needed self._fastq_files = None - def wait_interval(self): # Not used in this implementation of 'join()' raise NotImplementedError() - def clean_up(self): if self.get_app_state() == AppState.CANCELLED: self._process.kill() # Directory with temp files does not need to be deleted, # as temp dir is automatically deleted upon object destruction - @requires_state(AppState.CREATED) def get_prefetch_options(self): """ @@ -176,7 +170,6 @@ def get_fastq_dump_options(self): """ return "" - @requires_state(AppState.JOINED) def get_file_paths(self): """ @@ -189,7 +182,6 @@ def get_file_paths(self): """ return self._file_names - @requires_state(AppState.JOINED) @abc.abstractmethod def get_sequences(self): @@ -236,15 +228,18 @@ class FastqDumpApp(_DumpApp): the score format. """ - def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump", offset="Sanger"): - super().__init__( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + offset="Sanger", + ): + super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path) self._offset = offset self._fastq_files = None - @requires_state(AppState.JOINED) def get_fastq(self): """ @@ -265,20 +260,16 @@ def get_fastq(self): ] return self._fastq_files - @requires_state(AppState.JOINED) def get_sequences(self): return [ { - header: NucleotideSequence( - seq_str.replace("U","T").replace("X","N") - ) + header: NucleotideSequence(seq_str.replace("U", "T").replace("X", "N")) for header, (seq_str, _) in fastq_file.items() } for fastq_file in self.get_fastq() ] - @requires_state(AppState.JOINED) def get_sequences_and_scores(self): """ @@ -294,15 +285,17 @@ def get_sequences_and_scores(self): Each item in the list is a dictionary mapping identifiers to its corresponding sequence and score values. 
""" - return [ - get_sequences_and_scores(fastq_file) - for fastq_file in self.get_fastq() - ] - + return [get_sequences_and_scores(fastq_file) for fastq_file in self.get_fastq()] @classmethod - def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump", offset="Sanger"): + def fetch( + cls, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + offset="Sanger", + ): """ Get the sequences belonging to the UID from the *NCBI sequence read archive* (SRA). @@ -338,9 +331,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", Each item in the list is a dictionary mapping identifiers to its corresponding sequence. """ - app = cls( - uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset - ) + app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset) app.start() app.join() return app.get_sequences() @@ -368,14 +359,16 @@ class FastaDumpApp(_DumpApp): respectively. 
""" - def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump"): - super().__init__( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): + super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path) self._fasta_files = None - @requires_state(AppState.CREATED) def get_prefetch_options(self): return @@ -383,12 +376,10 @@ def get_prefetch_options(self): # when https://github.com/ncbi/sra-tools/issues/883 is resolved # return "--eliminate-quals" - @requires_state(AppState.CREATED) def get_fastq_dump_options(self): return "--fasta" - @requires_state(AppState.JOINED) def get_fasta(self): """ @@ -404,20 +395,22 @@ def get_fasta(self): """ if self._fasta_files is None: self._fasta_files = [ - FastaFile.read(file_name) - for file_name in self.get_file_paths() + FastaFile.read(file_name) for file_name in self.get_file_paths() ] return self._fasta_files - @requires_state(AppState.JOINED) def get_sequences(self): return [get_sequences(fasta_file) for fasta_file in self.get_fasta()] - @classmethod - def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump"): + def fetch( + cls, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): """ Get the sequences belonging to the UID from the *NCBI sequence read archive* (SRA). @@ -448,9 +441,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", Each item in the list is a dictionary mapping identifiers to its corresponding sequence. 
""" - app = cls( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path) app.start() app.join() - return app.get_sequences() \ No newline at end of file + return app.get_sequences() diff --git a/src/biotite/application/tantan/__init__.py b/src/biotite/application/tantan/__init__.py index 6efc86610..7a829420a 100644 --- a/src/biotite/application/tantan/__init__.py +++ b/src/biotite/application/tantan/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.tantan" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/tantan/app.py b/src/biotite/application/tantan/app.py index 077a5cbdd..23416b83f 100644 --- a/src/biotite/application/tantan/app.py +++ b/src/biotite/application/tantan/app.py @@ -6,17 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["TantanApp"] -from collections.abc import Sequence as SequenceABC import io +from collections.abc import Sequence as SequenceABC from tempfile import NamedTemporaryFile import numpy as np -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence from ...sequence.alphabet import common_alphabet from ...sequence.io.fasta.file import FastaFile -from ..util import map_sequence, map_matrix - +from ...sequence.seqtypes import NucleotideSequence, ProteinSequence +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile MASKING_LETTER = "!" @@ -43,7 +41,7 @@ class TantanApp(LocalApp): References ---------- - + .. 
footbibliography:: Examples @@ -59,10 +57,10 @@ class TantanApp(LocalApp): True True True True True True True True False False False False False] >>> print(sequence, "\n" + "".join(["^" if e else " " for e in repeat_mask])) - GGCATCGATATATATATATAGTCAA - ^^^^^^^^^^^ + GGCATCGATATATATATATAGTCAA + ^^^^^^^^^^^ """ - + def __init__(self, sequence, matrix=None, bin_path="tantan"): super().__init__(bin_path) @@ -93,59 +91,43 @@ def __init__(self, sequence, matrix=None, bin_path="tantan"): ) self._is_protein = True else: - raise TypeError( - "A NucleotideSequence or ProteinSequence is required" - ) - + raise TypeError("A NucleotideSequence or ProteinSequence is required") + if matrix is None: self._matrix_file = None else: - common_alph = common_alphabet( - (seq.alphabet for seq in self._sequences) - ) + common_alph = common_alphabet((seq.alphabet for seq in self._sequences)) if common_alph is None: - raise ValueError( - "There is no common alphabet within the sequences" - ) + raise ValueError("There is no common alphabet within the sequences") if not matrix.get_alphabet1().extends(common_alph): raise ValueError( "The alphabet of the sequence(s) do not fit the matrix" ) if not matrix.is_symmetric(): raise ValueError("A symmetric matrix is required") - self._matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) self._matrix = matrix - - self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) def run(self): FastaFile.write_iter( self._in_file, - ( - (f"sequence_{i:d}", str(seq)) - for i, seq in enumerate(self._sequences) - ) + ((f"sequence_{i:d}", str(seq)) for i, seq in enumerate(self._sequences)), ) self._in_file.flush() if self._matrix is not None: self._matrix_file.write(str(self._matrix)) self._matrix_file.flush() - + args = [] if self._matrix is not None: args += ["-m", self._matrix_file.name] if 
self._is_protein: - args += ["-p"] - args += [ - "-x", MASKING_LETTER, - self._in_file.name - ] + args += ["-p"] + args += ["-x", MASKING_LETTER, self._in_file.name] self.set_arguments(args) super().run() - def evaluate(self): super().evaluate() @@ -154,18 +136,14 @@ def evaluate(self): self._masks = [] encoded_masking_letter = MASKING_LETTER.encode("ASCII")[0] for _, masked_seq_string in FastaFile.read_iter(out_file): - array = np.frombuffer( - masked_seq_string.encode("ASCII"), dtype=np.ubyte - ) + array = np.frombuffer(masked_seq_string.encode("ASCII"), dtype=np.ubyte) self._masks.append(array == encoded_masking_letter) - def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) if self._matrix_file is not None: cleanup_tempfile(self._matrix_file) - @requires_state(AppState.JOINED) def get_mask(self): @@ -186,7 +164,6 @@ def get_mask(self): else: return self._masks[0] - @staticmethod def mask_repeats(sequence, matrix=None, bin_path="tantan"): """ @@ -219,4 +196,4 @@ def mask_repeats(sequence, matrix=None, bin_path="tantan"): app = TantanApp(sequence, matrix, bin_path) app.start() app.join() - return app.get_mask() \ No newline at end of file + return app.get_mask() diff --git a/src/biotite/application/util.py b/src/biotite/application/util.py index ce544c417..af92a1354 100644 --- a/src/biotite/application/util.py +++ b/src/biotite/application/util.py @@ -8,15 +8,15 @@ import numpy as np -from ..sequence.seqtypes import ProteinSequence from ..sequence.align.matrix import SubstitutionMatrix +from ..sequence.seqtypes import ProteinSequence def map_sequence(sequence): """ Map a sequence with an arbitrary alphabet into a :class:`ProteinSequence`, in order to support arbitrary sequence - types in software that can handle protein sequences. + types in software that can handle protein sequences. 
""" if len(sequence.alphabet) > len(ProteinSequence.alphabet): # Cannot map into a protein sequence if the alphabet @@ -39,12 +39,11 @@ def map_matrix(matrix): Map a :class:`SubstitutionMatrix` with an arbitrary alphabet into a class:`SubstitutionMatrix` for protein sequences, in order to support arbitrary sequence types in software that can handle protein - sequences. + sequences. """ if matrix is None: raise TypeError( - "A substitution matrix must be provided for custom " - "sequence types" + "A substitution matrix must be provided for custom " "sequence types" ) # Create a protein substitution matrix with the values taken # from the original matrix @@ -54,6 +53,5 @@ def map_matrix(matrix): new_score_matrix = np.zeros((new_length, new_length)) new_score_matrix[:old_length, :old_length] = matrix.score_matrix() return SubstitutionMatrix( - ProteinSequence.alphabet, ProteinSequence.alphabet, - new_score_matrix - ) \ No newline at end of file + ProteinSequence.alphabet, ProteinSequence.alphabet, new_score_matrix + ) diff --git a/src/biotite/application/viennarna/rnaalifold.py b/src/biotite/application/viennarna/rnaalifold.py index aadc61b97..1eebe573e 100644 --- a/src/biotite/application/viennarna/rnaalifold.py +++ b/src/biotite/application/viennarna/rnaalifold.py @@ -9,11 +9,11 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile from ...sequence.io.fasta import FastaFile, set_alignment -from ...structure.dotbracket import base_pairs_from_dot_bracket from ...structure.bonds import BondList +from ...structure.dotbracket import base_pairs_from_dot_bracket +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile from .util import build_constraint_string @@ -45,9 +45,7 @@ def __init__(self, alignment, temperature=37, bin_path="RNAalifold"): self._temperature = str(temperature) self._constraints = None 
self._enforce = None - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) self._constraints_file = NamedTemporaryFile( "w+", suffix=".constraints", delete=False ) @@ -57,15 +55,17 @@ def run(self): # -> Extremely high value for characters per line fasta_file = FastaFile(chars_per_line=np.iinfo(np.int32).max) set_alignment( - fasta_file, self._alignment, - seq_names=[str(i) for i in range(len(self._alignment.sequences))] + fasta_file, + self._alignment, + seq_names=[str(i) for i in range(len(self._alignment.sequences))], ) fasta_file.write(self._in_file) self._in_file.flush() options = [ "--noPS", - "-T", self._temperature, + "-T", + self._temperature, ] if self._enforce is True: options.append("--enforceConstraint") @@ -78,7 +78,7 @@ def run(self): self.set_arguments(options + [self._in_file.name]) super().run() - + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) @@ -97,7 +97,7 @@ def evaluate(self): self._free_energy = float(energy_contributions[0]) self._covariance_energy = float(energy_contributions[1]) self._dotbracket = dotbracket - + @requires_state(AppState.CREATED) def set_temperature(self, temperature): """ @@ -110,10 +110,17 @@ def set_temperature(self, temperature): The temperature. """ self._temperature = str(temperature) - + @requires_state(AppState.CREATED) - def set_constraints(self, pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None, enforce=False): + def set_constraints( + self, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, + enforce=False, + ): """ Add constraints of known paired or unpaired bases to the folding algorithm. @@ -138,15 +145,14 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None, the respective base pairs must form. By default (false), a constraint does only forbid formation of a pair that would conflict with this constraint. 
- + Warnings -------- If a constraint is given for a gap position in the consensus sequence, the software may find no base pairs at all. """ self._constraints = build_constraint_string( - len(self._alignment), - pairs, paired, unpaired, downstream, upstream + len(self._alignment), pairs, paired, unpaired, downstream, upstream ) self._enforce = enforce @@ -160,19 +166,19 @@ def get_free_energy(self): ------- free_energy : float The free energy. - + Notes ----- The total energy of the secondary structure regarding the minimization objective is the sum of the free energy and the covariance term. - + See also -------- get_covariance_energy """ return self._free_energy - + @requires_state(AppState.JOINED) def get_covariance_energy(self): """ @@ -183,19 +189,19 @@ def get_covariance_energy(self): ------- covariance_energy : float The energy of the covariance term. - + Notes ----- The total energy of the secondary structure regarding the minimization objective is the sum of the free energy and the covariance term. 
- + See also -------- get_free_energy """ return self._covariance_energy - + @requires_state(AppState.JOINED) def get_consensus_sequence_string(self): """ @@ -265,7 +271,7 @@ def get_base_pairs(self, sequence_index=None): pair_list = pair_list[trace != -1] # Convert back to array of base pairs, # remove unused BondType column - base_pairs = pair_list.as_array()[:,:2] + base_pairs = pair_list.as_array()[:, :2] return base_pairs @staticmethod @@ -300,5 +306,5 @@ def compute_secondary_structure(alignment, bin_path="RNAalifold"): return ( app.get_dot_bracket(), app.get_free_energy(), - app.get_covariance_energy() + app.get_covariance_energy(), ) diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py index 38877f963..c636fb285 100644 --- a/src/biotite/application/viennarna/rnafold.py +++ b/src/biotite/application/viennarna/rnafold.py @@ -6,13 +6,12 @@ __author__ = "Tom David Müller, Patrick Kunzmann" __all__ = ["RNAfoldApp"] -import warnings from tempfile import NamedTemporaryFile import numpy as np -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile from ...sequence.io.fasta import FastaFile, set_sequence from ...structure.dotbracket import base_pairs_from_dot_bracket +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile from .util import build_constraint_string @@ -51,9 +50,7 @@ def __init__(self, sequence, temperature=37, bin_path="RNAfold"): self._temperature = str(temperature) self._constraints = None self._enforce = None - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) super().__init__(bin_path) def run(self): @@ -68,7 +65,8 @@ def run(self): options = [ "--noPS", - "-T", self._temperature, + "-T", + self._temperature, ] if self._enforce is True: options.append("--enforceConstraint") @@ -106,8 +104,15 @@ def 
set_temperature(self, temperature): self._temperature = str(temperature) @requires_state(AppState.CREATED) - def set_constraints(self, pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None, enforce=False): + def set_constraints( + self, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, + enforce=False, + ): """ Add constraints of known paired or unpaired bases to the folding algorithm. @@ -134,8 +139,7 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None, of a pair that would conflict with this constraint. """ self._constraints = build_constraint_string( - len(self._sequence), - pairs, paired, unpaired, downstream, upstream + len(self._sequence), pairs, paired, unpaired, downstream, upstream ) self._enforce = enforce diff --git a/src/biotite/application/viennarna/rnaplot.py b/src/biotite/application/viennarna/rnaplot.py index 7eedea7ee..acb23e74d 100644 --- a/src/biotite/application/viennarna/rnaplot.py +++ b/src/biotite/application/viennarna/rnaplot.py @@ -6,13 +6,14 @@ __author__ = "Tom David Müller" __all__ = ["RNAplotApp"] -import numpy as np -from tempfile import NamedTemporaryFile -from os import remove from enum import IntEnum -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state +from os import remove +from tempfile import NamedTemporaryFile +import numpy as np from ...structure.dotbracket import dot_bracket as dot_bracket_ +from ..application import AppState, requires_state +from ..localapp import LocalApp, cleanup_tempfile + class RNAplotApp(LocalApp): """ @@ -60,21 +61,28 @@ class Layout(IntEnum): This enum type represents the layout type of the plot according to the official *RNAplot* orientation. 
""" - RADIAL = 0, - NAVIEW = 1, - CIRCULAR = 2, - RNATURTLE = 3, + + RADIAL = (0,) + NAVIEW = (1,) + CIRCULAR = (2,) + RNATURTLE = (3,) RNAPUZZLER = 4 - def __init__(self, dot_bracket=None, base_pairs=None, length=None, - layout_type=Layout.NAVIEW, bin_path="RNAplot"): + def __init__( + self, + dot_bracket=None, + base_pairs=None, + length=None, + layout_type=Layout.NAVIEW, + bin_path="RNAplot", + ): super().__init__(bin_path) if dot_bracket is not None: self._dot_bracket = dot_bracket elif (base_pairs is not None) and (length is not None): self._dot_bracket = dot_bracket_( - base_pairs, length, max_pseudoknot_order = 0 + base_pairs, length, max_pseudoknot_order=0 )[0] else: raise ValueError( @@ -84,10 +92,10 @@ def __init__(self, dot_bracket=None, base_pairs=None, length=None, # Get the value of the enum type self._layout_type = str(int(layout_type)) - self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False) def run(self): - self._in_file.write("N"*len(self._dot_bracket) + "\n") + self._in_file.write("N" * len(self._dot_bracket) + "\n") self._in_file.write(self._dot_bracket) self._in_file.flush() self.set_arguments( @@ -146,8 +154,11 @@ def get_coordinates(self): @staticmethod def compute_coordinates( - dot_bracket=None, base_pairs=None, length=None, - layout_type=Layout.NAVIEW, bin_path="RNAplot" + dot_bracket=None, + base_pairs=None, + length=None, + layout_type=Layout.NAVIEW, + bin_path="RNAplot", ): """ Get coordinates for a 2D representation of any unknotted RNA @@ -179,9 +190,13 @@ def compute_coordinates( The 2D coordinates. Each row represents the *x* and *y* coordinates for a total sequence length of *n*. 
""" - app = RNAplotApp(dot_bracket=dot_bracket, base_pairs=base_pairs, - length=length, layout_type=layout_type, - bin_path=bin_path) + app = RNAplotApp( + dot_bracket=dot_bracket, + base_pairs=base_pairs, + length=length, + layout_type=layout_type, + bin_path=bin_path, + ) app.start() app.join() - return app.get_coordinates() \ No newline at end of file + return app.get_coordinates() diff --git a/src/biotite/application/viennarna/util.py b/src/biotite/application/viennarna/util.py index df6149a2b..fa9336e08 100644 --- a/src/biotite/application/viennarna/util.py +++ b/src/biotite/application/viennarna/util.py @@ -10,9 +10,14 @@ from ...structure.pseudoknots import pseudoknots -def build_constraint_string(sequence_length, - pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None): +def build_constraint_string( + sequence_length, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, +): """ Build a ViennaRNA constraint string. @@ -30,7 +35,7 @@ def build_constraint_string(sequence_length, Positions of bases that are paired with any downstream base. upstream : ndarray, shape=(n,), dtype=int or dtype=bool, optional Positions of bases that are paired with any upstream base. 
- + Returns ------- constraints : str @@ -45,21 +50,21 @@ def build_constraint_string(sequence_length, raise ValueError("Given pairs include pseudoknots") # Ensure the lower base comes first for each pair pairs = np.sort(pairs, axis=-1) - _set_constraints(constraints, pairs[:,0], "(") - _set_constraints(constraints, pairs[:,1], ")") + _set_constraints(constraints, pairs[:, 0], "(") + _set_constraints(constraints, pairs[:, 1], ")") _set_constraints(constraints, paired, "|") _set_constraints(constraints, unpaired, "x") _set_constraints(constraints, downstream, "<") _set_constraints(constraints, upstream, ">") - + return "".join(constraints) - + def _set_constraints(constraints, index, character): if index is None: return - + # Search for conflicts with other constraints potential_conflict_indices = np.where(constraints[index] != ".")[0] if len(potential_conflict_indices) > 0: @@ -68,5 +73,5 @@ def _set_constraints(constraints, index, character): f"Constraint '{character}' at position {conflict_i} " f"conflicts with existing constraint '{constraints[conflict_i]}'" ) - - constraints[index] = character \ No newline at end of file + + constraints[index] = character diff --git a/src/biotite/application/webapp.py b/src/biotite/application/webapp.py index afeaaddaf..d40e5ed22 100644 --- a/src/biotite/application/webapp.py +++ b/src/biotite/application/webapp.py @@ -13,16 +13,16 @@ class WebApp(Application, metaclass=abc.ABCMeta): """ The base class for all web based applications. - + It allows for getting and setting the URL of the app and raises an :class:`RuleViolationError` when a subclass calls :func:`violate_rule()` (e.g. when the server was contacted too often.) - + Be careful, when calling func:`get_app_state()`. This may involve a server contact and therefore frequent calls may raise a :class:`RuleViolationError`. 
- + Parameters ---------- app_url : str @@ -31,19 +31,19 @@ class WebApp(Application, metaclass=abc.ABCMeta): If true, the application raises an :class:`RuleViolationError`, if the server rules are violated. (Default: True) """ - + def __init__(self, app_url, obey_rules=True): super().__init__() self._obey_rules = obey_rules self._app_url = app_url - + def violate_rule(self, msg=None): """ Indicate that a server rule was violated, i.e. this raises a :class:`RuleViolationError` unless `obey_rules` is false. - + PROTECTED: Do not call from outside. - + Parameters ---------- msg : str, optional @@ -51,16 +51,14 @@ def violate_rule(self, msg=None): """ if self._obey_rules: if msg is None: - raise RuleViolationError( - "The user guidelines would be violated" - ) + raise RuleViolationError("The user guidelines would be violated") else: raise RuleViolationError(msg) - + def app_url(self): """ Get the URL of the web app. - + Returns ------- url : str @@ -74,4 +72,5 @@ class RuleViolationError(Exception): Indicates that the user guidelines of the web application would be violated, if the program continued. """ - pass \ No newline at end of file + + pass diff --git a/src/biotite/copyable.py b/src/biotite/copyable.py index d9c389b63..30d8a85d5 100644 --- a/src/biotite/copyable.py +++ b/src/biotite/copyable.py @@ -12,22 +12,22 @@ class Copyable(metaclass=abc.ABCMeta): """ Base class for all objects, that should be copyable. - + The public method `copy()` first creates a fresh instance of the class of the instance, that is copied via the `__copy_create__()` method. All variables, that could not be set via the constructor, are then copied via `__copy_fill__()`, starting with the method in the uppermost base class and ending with the class of the instance to be copied. - + This approach solves the problem of encapsulated variables in superclasses. """ - + def copy(self): """ Create a deep copy of this object. 
- + Returns ------- copy @@ -36,36 +36,36 @@ def copy(self): clone = self.__copy_create__() self.__copy_fill__(clone) return clone - + def __copy_create__(self): """ Instantiate a new object of this class. - + Only the constructor should be called in this method. All further attributes, that need to be copied are handled in `__copy_fill__()` - + Do not call the `super()` method here. - + This method must be overridden, if the constructor takes parameters. - + Returns ------- copy A freshly instantiated copy of *self*. """ return type(self)() - + def __copy_fill__(self, clone): """ Copy all necessary attributes to the new object. - + Always call the `super()` method as first statement. - + Parameters ---------- clone The freshly instantiated copy of *self*. """ - pass \ No newline at end of file + pass diff --git a/src/biotite/database/__init__.py b/src/biotite/database/__init__.py index 36c544065..d4b733cb8 100644 --- a/src/biotite/database/__init__.py +++ b/src/biotite/database/__init__.py @@ -20,4 +20,4 @@ __name__ = "biotite.database" __author__ = "Patrick Kunzmann" -from .error import * \ No newline at end of file +from .error import * diff --git a/src/biotite/database/entrez/__init__.py b/src/biotite/database/entrez/__init__.py index 2b5488ce4..a27d11338 100644 --- a/src/biotite/database/entrez/__init__.py +++ b/src/biotite/database/entrez/__init__.py @@ -11,5 +11,5 @@ from .dbnames import * from .download import * +from .key import * from .query import * -from .key import * \ No newline at end of file diff --git a/src/biotite/database/entrez/check.py b/src/biotite/database/entrez/check.py index 52bcd3fdc..063ecdf03 100644 --- a/src/biotite/database/entrez/check.py +++ b/src/biotite/database/entrez/check.py @@ -9,7 +9,6 @@ import json from ..error import RequestError - # Taken from https://github.com/kblin/ncbi-entrez-error-messages _error_messages = [ "Error reading from remote server", @@ -58,4 +57,4 @@ def check_for_errors(message): for error_msg in 
_error_messages: # Often whitespace is also replaced by '+' in error message if error_msg.replace(" ", "") in message_end: - raise RequestError(error_msg) \ No newline at end of file + raise RequestError(error_msg) diff --git a/src/biotite/database/entrez/dbnames.py b/src/biotite/database/entrez/dbnames.py index 2aa967a61..e17796648 100644 --- a/src/biotite/database/entrez/dbnames.py +++ b/src/biotite/database/entrez/dbnames.py @@ -88,4 +88,4 @@ def sanitize_database_name(db_name): # Is already E-utility database name return db_name else: - raise ValueError("Database '{db_name}' is not existing") \ No newline at end of file + raise ValueError("Database '{db_name}' is not existing") diff --git a/src/biotite/database/entrez/download.py b/src/biotite/database/entrez/download.py index d30ac41ea..e2239b925 100644 --- a/src/biotite/database/entrez/download.py +++ b/src/biotite/database/entrez/download.py @@ -6,22 +6,28 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch", "fetch_single_file"] -from os.path import isdir, isfile, join, getsize -import os -import glob import io +import os +from os.path import getsize, isdir, isfile, join import requests +from ..error import RequestError from .check import check_for_errors from .dbnames import sanitize_database_name from .key import get_api_key -from ..error import RequestError - _fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" -def fetch(uids, target_path, suffix, db_name, ret_type, - ret_mode="text", overwrite=False, verbose=False): +def fetch( + uids, + target_path, + suffix, + db_name, + ret_type, + ret_mode="text", + overwrite=False, + verbose=False, +): """ Download files from the NCBI Entrez database in various formats. @@ -111,31 +117,28 @@ def fetch(uids, target_path, suffix, db_name, ret_type, file = join(target_path, id + "." 
+ suffix) else: file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - param_dict = { - "db" : sanitize_database_name(db_name), - "id" : id, - "rettype" : ret_type, - "retmode" : ret_mode, - "tool" : "Biotite", - "mail" : "padix.key@gmail.com" - } - api_key = get_api_key() - if api_key is not None: - param_dict["api_key"] = api_key - r = requests.get(_fetch_url, params=param_dict) - content = r.text - check_for_errors(content) - if content.startswith(" Error"): - raise RequestError(content[8:]) - if file is None: - file = io.StringIO(content) - else: - with open(file, "w+") as f: - f.write(content) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + param_dict = { + "db": sanitize_database_name(db_name), + "id": id, + "rettype": ret_type, + "retmode": ret_mode, + "tool": "Biotite", + "mail": "padix.key@gmail.com", + } + api_key = get_api_key() + if api_key is not None: + param_dict["api_key"] = api_key + r = requests.get(_fetch_url, params=param_dict) + content = r.text + check_for_errors(content) + if content.startswith(" Error"): + raise RequestError(content[8:]) + if file is None: + file = io.StringIO(content) + else: + with open(file, "w+") as f: + f.write(content) files.append(file) if verbose: print("\nDone") @@ -146,8 +149,9 @@ def fetch(uids, target_path, suffix, db_name, ret_type, return files -def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text", - overwrite=False): +def fetch_single_file( + uids, file_name, db_name, ret_type, ret_mode="text", overwrite=False +): """ Almost the same as :func:`fetch()`, but the data for the given UIDs will be stored in a single file. 
@@ -188,24 +192,26 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text", -------- fetch """ - if file_name is not None \ - and os.path.isfile(file_name) \ - and getsize(file_name) > 0 \ - and not overwrite: - # Do no redownload the already existing file - return file_name + if ( + file_name is not None + and os.path.isfile(file_name) + and getsize(file_name) > 0 + and not overwrite + ): + # Do no redownload the already existing file + return file_name uid_list_str = "" for id in uids: uid_list_str += id + "," # Remove terminal comma uid_list_str = uid_list_str[:-1] param_dict = { - "db" : sanitize_database_name(db_name), - "id" : uid_list_str, - "rettype" : ret_type, - "retmode" : ret_mode, - "tool" : "Biotite", - "mail" : "padix.key@gmail.com" + "db": sanitize_database_name(db_name), + "id": uid_list_str, + "rettype": ret_type, + "retmode": ret_mode, + "tool": "Biotite", + "mail": "padix.key@gmail.com", } api_key = get_api_key() if api_key is not None: diff --git a/src/biotite/database/entrez/key.py b/src/biotite/database/entrez/key.py index 2427fd13a..83e56869c 100644 --- a/src/biotite/database/entrez/key.py +++ b/src/biotite/database/entrez/key.py @@ -41,4 +41,4 @@ def set_api_key(key): The API key. 
""" global _API_KEY - _API_KEY = key \ No newline at end of file + _API_KEY = key diff --git a/src/biotite/database/entrez/query.py b/src/biotite/database/entrez/query.py index 1626735f6..18b49d8fa 100644 --- a/src/biotite/database/entrez/query.py +++ b/src/biotite/database/entrez/query.py @@ -6,22 +6,23 @@ __author__ = "Patrick Kunzmann" __all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"] -import requests import abc from xml.etree import ElementTree +import requests +from ..error import RequestError from .check import check_for_errors from .dbnames import sanitize_database_name -from ..error import RequestError from .key import get_api_key - _search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + class Query(metaclass=abc.ABCMeta): """ Base class for a wrapper around a search term for the NCBI Entrez search service. """ + def __init__(self): pass @@ -85,7 +86,6 @@ def __str__(self): return "({:}) {:} ({:})".format(str(self._q1), self._op, self._q2) - class SimpleQuery(Query): """ A simple query for the NCBI Entrez search service without @@ -121,17 +121,59 @@ class SimpleQuery(Query): # Field identifiers are taken from # https://www.ncbi.nlm.nih.gov/books/NBK49540/ _fields = [ - "Accession", "All Fields", "Author", "EC/RN Number", "Feature Key", - "Filter", "Gene Name", "Genome Project", "Issue", "Journal", "Keyword", - "Modification Date", "Molecular Weight", "Organism", "Page Number", - "Primary Accession", "Properties", "Protein Name", "Publication Date", - "SeqID String", "Sequence Length", "Substance Name", "Text Word", - "Title", "Volume", + "Accession", + "All Fields", + "Author", + "EC/RN Number", + "Feature Key", + "Filter", + "Gene Name", + "Genome Project", + "Issue", + "Journal", + "Keyword", + "Modification Date", + "Molecular Weight", + "Organism", + "Page Number", + "Primary Accession", + "Properties", + "Protein Name", + "Publication Date", + "SeqID String", + "Sequence Length", + "Substance Name", + "Text Word", + 
"Title", + "Volume", # Abbreviations - "ACCN", "ALL", "AU", "AUTH", "ECNO", "FKEY", "FILT", "SB", "GENE", - "ISS", "JOUR", "KYWD", "MDAT", "MOLWT", "ORGN", "PAGE", "PACC", - "PORGN", "PROP", "PROT", "PDAT", "SQID", "SLEN", "SUBS", "WORD", "TI", - "TITL" "VOL" + "ACCN", + "ALL", + "AU", + "AUTH", + "ECNO", + "FKEY", + "FILT", + "SB", + "GENE", + "ISS", + "JOUR", + "KYWD", + "MDAT", + "MOLWT", + "ORGN", + "PAGE", + "PACC", + "PORGN", + "PROP", + "PROT", + "PDAT", + "SQID", + "SLEN", + "SUBS", + "WORD", + "TI", + "TITL" "VOL", ] def __init__(self, term, field=None): @@ -139,12 +181,9 @@ def __init__(self, term, field=None): if field is not None: if field not in SimpleQuery._fields: raise ValueError(f"Unknown field identifier '{field}'") - for invalid_string in \ - ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]: - if invalid_string in term: - raise ValueError( - f"Query contains illegal term {invalid_string}" - ) + for invalid_string in ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]: + if invalid_string in term: + raise ValueError(f"Query contains illegal term {invalid_string}") if " " in term: # Encapsulate in quotes if spaces are in search term term = f'"{term}"' diff --git a/src/biotite/database/error.py b/src/biotite/database/error.py index 577e6ce73..271aa37e0 100644 --- a/src/biotite/database/error.py +++ b/src/biotite/database/error.py @@ -12,4 +12,5 @@ class RequestError(Exception): Indicates that the database returned a response with an error message or other malformed content. 
""" - pass \ No newline at end of file + + pass diff --git a/src/biotite/database/pubchem/__init__.py b/src/biotite/database/pubchem/__init__.py index 73c3a296d..30c4813bb 100644 --- a/src/biotite/database/pubchem/__init__.py +++ b/src/biotite/database/pubchem/__init__.py @@ -18,4 +18,4 @@ from .download import * from .query import * -from .throttle import * \ No newline at end of file +from .throttle import * diff --git a/src/biotite/database/pubchem/download.py b/src/biotite/database/pubchem/download.py index e7f1c22ed..bc9e97d0d 100644 --- a/src/biotite/database/pubchem/download.py +++ b/src/biotite/database/pubchem/download.py @@ -6,24 +6,29 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch", "fetch_property"] +import io import numbers -import requests -from os.path import isdir, isfile, join, getsize import os -import io -import numpy as np -from .throttle import ThrottleStatus -from .error import parse_error_details +from os.path import getsize, isdir, isfile, join +import requests from ..error import RequestError - +from .error import parse_error_details +from .throttle import ThrottleStatus _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" _binary_formats = ["png", "asnb"] -def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, - overwrite=False, verbose=False, - throttle_threshold=0.5, return_throttle_status=False): +def fetch( + cids, + format="sdf", + target_path=None, + as_structural_formula=False, + overwrite=False, + verbose=False, + throttle_threshold=0.5, + return_throttle_status=False, +): """ Download structure files from *PubChem* in various formats. 
@@ -109,8 +114,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, raise TypeError("CIDs must be given as integers, not as string") # Verbose output if verbose: - print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...", - end="\r") + print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...", end="\r") # Fetch file from database if target_path is not None: @@ -119,36 +123,33 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - record_type = "2d" if as_structural_formula else "3d" - r = requests.get( - _base_url + f"compound/cid/{cid}/{format.upper()}", - params={"record_type": record_type} - ) - if not r.ok: - raise RequestError(parse_error_details(r.text)) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + record_type = "2d" if as_structural_formula else "3d" + r = requests.get( + _base_url + f"compound/cid/{cid}/{format.upper()}", + params={"record_type": record_type}, + ) + if not r.ok: + raise RequestError(parse_error_details(r.text)) - if format.lower() in _binary_formats: - content = r.content - else: - content = r.text + if format.lower() in _binary_formats: + content = r.content + else: + content = r.text - if file is None: - if format in _binary_formats: - file = io.BytesIO(content) - else: - file = io.StringIO(content) + if file is None: + if format in _binary_formats: + file = io.BytesIO(content) else: - mode = "wb+" if format in _binary_formats else "w+" - with open(file, mode) as f: - f.write(content) + file = io.StringIO(content) + else: + mode = "wb+" if format in _binary_formats else "w+" + with open(file, mode) as f: + f.write(content) - throttle_status = ThrottleStatus.from_response(r) - if throttle_threshold is not None: - throttle_status.wait_if_busy(throttle_threshold) + throttle_status = 
ThrottleStatus.from_response(r) + if throttle_threshold is not None: + throttle_status.wait_if_busy(throttle_threshold) files.append(file) if verbose: @@ -164,8 +165,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, return return_value -def fetch_property(cids, name, - throttle_threshold=0.5, return_throttle_status=False): +def fetch_property(cids, name, throttle_threshold=0.5, return_throttle_status=False): """ Download the given property for the given CID(s). @@ -230,15 +230,13 @@ def fetch_property(cids, name, # Property names may only contain letters and numbers if not name.isalnum(): - raise ValueError( - f"Property '{name}' contains invalid characters" - ) + raise ValueError(f"Property '{name}' contains invalid characters") # Use TXT format instead of CSV to avoid issues with ',' characters # within table elements r = requests.post( _base_url + f"compound/cid/property/{name}/TXT", - data={"cid": ','.join([str(cid) for cid in cids])} + data={"cid": ",".join([str(cid) for cid in cids])}, ) if not r.ok: raise RequestError(parse_error_details(r.text)) diff --git a/src/biotite/database/pubchem/error.py b/src/biotite/database/pubchem/error.py index cbbdc0dcd..963fac865 100644 --- a/src/biotite/database/pubchem/error.py +++ b/src/biotite/database/pubchem/error.py @@ -15,6 +15,6 @@ def parse_error_details(response_text): for message_line_indicator in ["Detail: ", "Message: "]: for line in response_text.splitlines(): if line.startswith(message_line_indicator): - return line[len(message_line_indicator):] + return line[len(message_line_indicator) :] # No 'Detail: ...' 
or 'Message: ' line found - return "Unknown error" \ No newline at end of file + return "Unknown error" diff --git a/src/biotite/database/pubchem/query.py b/src/biotite/database/pubchem/query.py index bb6eec92d..9d54d4f1c 100644 --- a/src/biotite/database/pubchem/query.py +++ b/src/biotite/database/pubchem/query.py @@ -4,20 +4,28 @@ __name__ = "biotite.database.pubchem" __author__ = "Patrick Kunzmann" -__all__ = ["Query", "NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery", - "FormulaQuery", "SuperstructureQuery", "SubstructureQuery", - "SimilarityQuery", "IdentityQuery", - "search"] +__all__ = [ + "Query", + "NameQuery", + "SmilesQuery", + "InchiQuery", + "InchiKeyQuery", + "FormulaQuery", + "SuperstructureQuery", + "SubstructureQuery", + "SimilarityQuery", + "IdentityQuery", + "search", +] -import copy import abc import collections +import copy import requests +from ...structure.io.mol.mol import MOLFile +from ..error import RequestError from .error import parse_error_details from .throttle import ThrottleStatus -from ..error import RequestError -from ...structure.io.mol.mol import MOLFile - _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" @@ -258,9 +266,10 @@ def get_params(self): # Only set maximum number, if provided by the user # The PubChem default value for this might change over time if self._number is not None: - params["MaxRecords"] = self._number + params["MaxRecords"] = self._number return params + def _format_element(element, count): if count == 1: return element.capitalize() @@ -318,8 +327,8 @@ def __init__(self, **kwargs): ) if not query_key_found: raise TypeError( - "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' " - "or 'cid'") + "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' " "or 'cid'" + ) if "number" in kwargs: self._number = kwargs["number"] del kwargs["number"] @@ -346,14 +355,10 @@ def from_atoms(cls, atoms, *args, **kwargs): mol_file.set_structure(atoms) # Every MOL string with "$$$$" is a valid SDF 
string # Important: USE MS-style new lines - return cls( - *args, - sdf = "\r\n".join(mol_file.lines) + "\r\n$$$$\r\n", - **kwargs - ) + return cls(*args, sdf="\r\n".join(mol_file.lines) + "\r\n$$$$\r\n", **kwargs) def get_input_url_path(self): - input_string = f"compound/{self.search_type()}/{self._query_key}" + input_string = f"compound/{self.search_type()}/{self._query_key}" if self._query_key == "cid": # Put CID in URL and not in POST payload, # as PubChem is confused otherwise @@ -370,7 +375,7 @@ def get_params(self): # Only set maximum number, if provided by the user # The PubChem default value for this might change over time if self._number is not None: - params["MaxRecords"] = self._number + params["MaxRecords"] = self._number for key, val in self.search_options().items(): # Convert 'snake case' Python parameters # to 'camel case' request parameters @@ -472,13 +477,13 @@ class SuperOrSubstructureQuery(StructureQuery, metaclass=abc.ABCMeta): """ _option_defaults = { - "match_charges" : False, - "match_tautomers" : False, - "rings_not_embedded" : False, - "single_double_bonds_match" : True, - "chains_match_rings" : True, - "strip_hydrogen" : False, - "stereo" : "ignore", + "match_charges": False, + "match_tautomers": False, + "rings_not_embedded": False, + "single_double_bonds_match": True, + "chains_match_rings": True, + "strip_hydrogen": False, + "stereo": "ignore", } def __init__(self, **kwargs): @@ -706,7 +711,7 @@ def search_type(self): return f"fastsimilarity_{dim}" def search_options(self): - return {"threshold" : int(round(self._threshold * 100))} + return {"threshold": int(round(self._threshold * 100))} class IdentityQuery(StructureQuery): @@ -766,8 +771,6 @@ def get_params(self): return params - - def search(query, throttle_threshold=0.5, return_throttle_status=False): """ Get all CIDs that meet the given query requirements, @@ -812,7 +815,7 @@ def search(query, throttle_threshold=0.5, return_throttle_status=False): r = requests.post( _base_url + 
query.get_input_url_path() + "/cids/TXT", data=query.get_params(), - files=files + files=files, ) if not r.ok: raise RequestError(parse_error_details(r.text)) diff --git a/src/biotite/database/pubchem/throttle.py b/src/biotite/database/pubchem/throttle.py index 27cb09084..171c1a484 100644 --- a/src/biotite/database/pubchem/throttle.py +++ b/src/biotite/database/pubchem/throttle.py @@ -7,8 +7,8 @@ __all__ = ["ThrottleStatus"] -from dataclasses import dataclass import time +from dataclasses import dataclass @dataclass(frozen=True) @@ -67,8 +67,7 @@ def from_response(response): """ throttle_control = response.headers["X-Throttling-Control"] throttle_status = [ - substring.split(")")[0] for substring - in throttle_control.split("(")[1:] + substring.split(")")[0] for substring in throttle_control.split("(")[1:] ] # Remove '%' sign and convert to int count_status, time_status, service_status = [ @@ -96,4 +95,4 @@ def wait_if_busy(self, threshold=0.5, wait_time=1.0): threshold is exceeded. """ if self.count > threshold or self.time > threshold: - time.sleep(wait_time) \ No newline at end of file + time.sleep(wait_time) diff --git a/src/biotite/database/rcsb/__init__.py b/src/biotite/database/rcsb/__init__.py index c36dfb2b8..0e5faf41c 100644 --- a/src/biotite/database/rcsb/__init__.py +++ b/src/biotite/database/rcsb/__init__.py @@ -10,4 +10,4 @@ __author__ = "Patrick Kunzmann" from .download import * -from .query import * \ No newline at end of file +from .query import * diff --git a/src/biotite/database/rcsb/download.py b/src/biotite/database/rcsb/download.py index e24255672..2af19a3e5 100644 --- a/src/biotite/database/rcsb/download.py +++ b/src/biotite/database/rcsb/download.py @@ -6,14 +6,12 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch"] -import requests -from os.path import isdir, isfile, join, getsize -import os -import glob import io +import os +from os.path import getsize, isfile, join +import requests from ..error import RequestError - _standard_url = 
"https://files.rcsb.org/download/" _bcif_url = "https://models.rcsb.org/" _fasta_url = "https://www.rcsb.org/fasta/entry/" @@ -93,8 +91,7 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): for i, id in enumerate(pdb_ids): # Verbose output if verbose: - print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...", - end="\r") + print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: @@ -103,38 +100,35 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - if format == "pdb": - r = requests.get(_standard_url + id + ".pdb") - content = r.text - _assert_valid_file(content, id) - elif format in ["cif", "mmcif", "pdbx"]: - r = requests.get(_standard_url + id + ".cif") - content = r.text - _assert_valid_file(content, id) - elif format in ["bcif"]: - r = requests.get(_bcif_url + id + ".bcif") - content = r.content - _assert_valid_file(r.text, id) - elif format == "fasta": - r = requests.get(_fasta_url + id) - content = r.text - _assert_valid_file(content, id) - else: - raise ValueError(f"Format '{format}' is not supported") - - if file is None: - if format in _binary_formats: - file = io.BytesIO(content) - else: - file = io.StringIO(content) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + if format == "pdb": + r = requests.get(_standard_url + id + ".pdb") + content = r.text + _assert_valid_file(content, id) + elif format in ["cif", "mmcif", "pdbx"]: + r = requests.get(_standard_url + id + ".cif") + content = r.text + _assert_valid_file(content, id) + elif format in ["bcif"]: + r = requests.get(_bcif_url + id + ".bcif") + content = r.content + _assert_valid_file(r.text, id) + elif format == "fasta": + r = requests.get(_fasta_url + id) + content = r.text + 
_assert_valid_file(content, id) + else: + raise ValueError(f"Format '{format}' is not supported") + + if file is None: + if format in _binary_formats: + file = io.BytesIO(content) else: - mode = "wb+" if format in _binary_formats else "w+" - with open(file, mode) as f: - f.write(content) + file = io.StringIO(content) + else: + mode = "wb+" if format in _binary_formats else "w+" + with open(file, mode) as f: + f.write(content) files.append(file) if verbose: @@ -153,10 +147,13 @@ def _assert_valid_file(response_text, pdb_id): """ # Structure file and FASTA file retrieval # have different error messages - if len(response_text) == 0 or any(err_msg in response_text for err_msg in [ - "404 Not Found", - "RCSB Protein Data Bank Error Page", - "No fasta files were found.", - "No valid PDB IDs were submitted.", - ]): + if len(response_text) == 0 or any( + err_msg in response_text + for err_msg in [ + "404 Not Found", + "RCSB Protein Data Bank Error Page", + "No fasta files were found.", + "No valid PDB IDs were submitted.", + ] + ): raise RequestError("PDB ID {:} is invalid".format(pdb_id)) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 7f131f3ee..155ab25d8 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -4,28 +4,38 @@ __name__ = "biotite.database.rcsb" __author__ = "Patrick Kunzmann, Maximilian Dombrowsky" -__all__ = ["Query", "SingleQuery", "CompositeQuery", - "BasicQuery", "FieldQuery", - "SequenceQuery", "StructureQuery", "MotifQuery", - "Sorting", - "Grouping", "DepositGrouping", "IdentityGrouping", "UniprotGrouping", - "search", "count"] +__all__ = [ + "Query", + "SingleQuery", + "CompositeQuery", + "BasicQuery", + "FieldQuery", + "SequenceQuery", + "StructureQuery", + "MotifQuery", + "Sorting", + "Grouping", + "DepositGrouping", + "IdentityGrouping", + "UniprotGrouping", + "search", + "count", +] import abc -import json import copy +import json from datetime import datetime import 
numpy as np import requests from ...sequence.seqtypes import NucleotideSequence from ..error import RequestError - _search_url = "https://search.rcsb.org/rcsbsearch/v2/query" _scope_to_target = { "protein": "pdb_protein_sequence", - "rna": "pdb_rna_sequence", - "dna": "pdb_dna_sequence" + "rna": "pdb_rna_sequence", + "dna": "pdb_dna_sequence", } @@ -35,6 +45,7 @@ class Query(metaclass=abc.ABCMeta): This is the abstract base class for all queries. """ + @abc.abstractmethod def get_content(self): """ @@ -58,7 +69,6 @@ def __or__(self, query): return CompositeQuery([self, query], "or") - class SingleQuery(Query, metaclass=abc.ABCMeta): """ A terminal query node for the RCSB search API. @@ -69,6 +79,7 @@ class SingleQuery(Query, metaclass=abc.ABCMeta): This is the abstract base class for all queries that are terminal nodes. """ + @abc.abstractmethod def get_content(self): return {"parameters": {}} @@ -91,12 +102,11 @@ class CompositeQuery(Query): operator : {'or', 'and'} The type of combination. 
""" + def __init__(self, queries, operator): self._queries = queries if operator not in ("or", "and"): - raise ValueError( - f"Operator must be 'or' or 'and', not '{operator}'" - ) + raise ValueError(f"Operator must be 'or' or 'and', not '{operator}'") self._operator = operator def get_content(self): @@ -113,12 +123,11 @@ def get_content(self): content = { "type": "group", "logical_operator": self._operator, - "nodes": [query.get_content() for query in self._queries] + "nodes": [query.get_content() for query in self._queries], } return content - class BasicQuery(SingleQuery): """ A text query for searching for a given term across all available @@ -141,6 +150,7 @@ class BasicQuery(SingleQuery): >>> print(sorted(search(query))) ['1L2Y', '8ANG', '8ANH', '8ANI', '8ANM'] """ + def __init__(self, term): super().__init__() self._term = term @@ -212,7 +222,10 @@ class FieldQuery(SingleQuery): >>> print(sorted(search(query))) ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H'] """ - def __init__(self, field, molecular_definition=False, case_sensitive=False, **kwargs): + + def __init__( + self, field, molecular_definition=False, case_sensitive=False, **kwargs + ): super().__init__() self._negation = False self._field = field @@ -231,20 +244,25 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw if self._operator not in [ "exact_match", - "contains_words", "contains_phrase", - "greater", "less", "greater_or_equal", "less_or_equal", "equals", - "range", "range_closed", + "contains_words", + "contains_phrase", + "greater", + "less", + "greater_or_equal", + "less_or_equal", + "equals", + "range", + "range_closed", "is_in", - "exists" + "exists", ]: raise TypeError( - f"Constructor got an unexpected keyword argument " - f"'{self._operator}'" + f"Constructor got an unexpected keyword argument " f"'{self._operator}'" ) # Convert dates into ISO 8601 if isinstance(self._value, datetime): - self._value = _to_isoformat(self._value) + 
self._value = _to_isoformat(self._value) elif isinstance(self._value, (tuple, list, np.ndarray)): self._value = [ _to_isoformat(val) if isinstance(val, datetime) else val @@ -257,14 +275,14 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw "from": self._value[0], "include_lower": False, "to": self._value[1], - "include_upper": False + "include_upper": False, } elif self._operator == "range_closed": self._value = { "from": self._value[0], "include_lower": True, "to": self._value[1], - "include_upper": True + "include_upper": True, } # Rename operators to names used in API @@ -332,8 +350,8 @@ class SequenceQuery(SingleQuery): >>> print(sorted(search(query))) ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2LL5', '2MJ9', '3UC7', '3UC8'] """ - def __init__(self, sequence, scope, - min_identity=0.0, max_expect_value=10000000.0): + + def __init__(self, sequence, scope, min_identity=0.0, max_expect_value=10000000.0): super().__init__() self._target = _scope_to_target.get(scope.lower()) if self._target is None: @@ -381,6 +399,7 @@ class MotifQuery(SingleQuery): ... "protein" ... 
) """ + def __init__(self, pattern, pattern_type, scope): super().__init__() self._pattern = pattern @@ -424,27 +443,20 @@ class StructureQuery(SingleQuery): >>> print(sorted(search(query))) ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS'] """ + def __init__(self, pdb_id, chain=None, assembly=None, strict=True): super().__init__() - if (chain is None and assembly is None) \ - or (chain is not None and assembly is not None): - raise TypeError( - "Either the chain ID or assembly ID must be set" - ) + if (chain is None and assembly is None) or ( + chain is not None and assembly is not None + ): + raise TypeError("Either the chain ID or assembly ID must be set") elif chain is None: - self._value = { - "entry_id": pdb_id, - "asssembly_id": assembly - } + self._value = {"entry_id": pdb_id, "asssembly_id": assembly} else: - self._value = { - "entry_id": pdb_id, - "asym_id": chain - } + self._value = {"entry_id": pdb_id, "asym_id": chain} - self._operator = "strict_shape_match" if strict \ - else "relaxed_shape_match" + self._operator = "strict_shape_match" if strict else "relaxed_shape_match" def get_content(self): content = super().get_content() @@ -455,10 +467,7 @@ def get_content(self): return content - - class Sorting: - def __init__(self, field, descending=True): self._field = field self._descending = descending @@ -487,12 +496,7 @@ def get_content(self): ``'ranking_criteria_type'`` attributes. """ direction = "desc" if self._descending else "asc" - return { - "sort_by" : self._field, - "direction" : direction - } - - + return {"sort_by": self._field, "direction": direction} class Grouping(metaclass=abc.ABCMeta): @@ -539,7 +543,7 @@ def get_content(self): The content dictionary for the ``'group_by'`` attributes. 
""" if self._sorting is not None: - return {"ranking_criteria_type" : self._sorting.get_content()} + return {"ranking_criteria_type": self._sorting.get_content()} else: return {} @@ -627,6 +631,7 @@ class IdentityGrouping(Grouping): To choose the order a :class:`Sorting` object needs to be provided. """ + def __init__(self, similarity_cutoff, sort_by=None): super().__init__(sort_by) if similarity_cutoff not in (100, 95, 90, 70, 50, 30): @@ -677,11 +682,7 @@ def is_compatible_return_type(self, return_type): return return_type == "polymer_entity" - - - -def count(query, return_type="entry", group_by=None, - content_types=("experimental",)): +def count(query, return_type="entry", group_by=None, content_types=("experimental",)): """ Count PDB entries that meet the given query requirements, via the RCSB search API. @@ -737,9 +738,7 @@ def count(query, return_type="entry", group_by=None, >>> print(sorted(ids)) ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H'] """ - query_dict = _initialize_query_dict( - query, return_type, group_by, content_types - ) + query_dict = _initialize_query_dict(query, return_type, group_by, content_types) query_dict["request_options"]["return_counts"] = True @@ -761,8 +760,15 @@ def count(query, return_type="entry", group_by=None, raise RequestError(f"Error {r.status_code}") -def search(query, return_type="entry", range=None, sort_by=None, group_by=None, - return_groups=False, content_types=("experimental",)): +def search( + query, + return_type="entry", + range=None, + sort_by=None, + group_by=None, + return_groups=False, + content_types=("experimental",), +): """ Get all PDB IDs that meet the given query requirements, via the RCSB search API. @@ -864,17 +870,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, ... 
)) {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']} """ - query_dict = _initialize_query_dict( - query, return_type, group_by, content_types - ) + query_dict = _initialize_query_dict(query, return_type, group_by, content_types) if group_by is not None: if return_groups: - query_dict["request_options"]["group_by_return_type"] \ - = "groups" + query_dict["request_options"]["group_by_return_type"] = "groups" else: - query_dict["request_options"]["group_by_return_type"] \ - = "representatives" + query_dict["request_options"]["group_by_return_type"] = "representatives" if sort_by is not None: if isinstance(sort_by, Sorting): @@ -890,7 +892,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, else: query_dict["request_options"]["paginate"] = { "start": int(range[0]), - "rows": int(range[1]) - int(range[0]) + "rows": int(range[1]) - int(range[0]), } r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) @@ -900,7 +902,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, return [result["identifier"] for result in r.json()["result_set"]] else: return { - group["identifier"] : [ + group["identifier"]: [ result["identifier"] for result in group["result_set"] ] for group in r.json()["group_set"] @@ -922,8 +924,11 @@ def _initialize_query_dict(query, return_type, group_by, content_types): `count()` and `search()` have in common. 
""" if return_type not in [ - "entry", "polymer_instance", "assembly", - "polymer_entity", "non_polymer_entity", + "entry", + "polymer_instance", + "assembly", + "polymer_entity", + "non_polymer_entity", ]: raise ValueError(f"'{return_type}' is an invalid return type") @@ -947,7 +952,7 @@ def _initialize_query_dict(query, return_type, group_by, content_types): query_dict = { "query": query.get_content(), "return_type": return_type, - "request_options": request_options + "request_options": request_options, } return query_dict @@ -956,4 +961,4 @@ def _to_isoformat(object): """ Convert a datetime into the specifc ISO 8601 format required by the RCSB. """ - return object.strftime("%Y-%m-%dT%H:%M:%SZ") \ No newline at end of file + return object.strftime("%Y-%m-%dT%H:%M:%SZ") diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py index 4b00845d2..bbd2db470 100644 --- a/src/biotite/database/uniprot/check.py +++ b/src/biotite/database/uniprot/check.py @@ -27,6 +27,9 @@ def assert_valid_response(response_status_code): raise RequestError("Gone. The resource you requested was removed.") elif response_status_code == 500: raise RequestError( - "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team.") + "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team." + ) elif response_status_code == 503: - raise RequestError("Service not available. The server is being updated, try again later.") + raise RequestError( + "Service not available. The server is being updated, try again later." 
+ ) diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py index 7faf37954..42d3cca58 100644 --- a/src/biotite/database/uniprot/download.py +++ b/src/biotite/database/uniprot/download.py @@ -6,9 +6,9 @@ __author__ = "Maximilian Greil" __all__ = ["fetch"] -from os.path import isdir, isfile, join, getsize -import os import io +import os +from os.path import getsize, isdir, isfile, join import requests from .check import assert_valid_response @@ -36,8 +36,7 @@ def _get_database_name(id): return "uniprotkb" -def fetch(ids, format, target_path=None, - overwrite=False, verbose=False): +def fetch(ids, format, target_path=None, overwrite=False, verbose=False): """ Download files from the UniProt in various formats. @@ -101,18 +100,14 @@ def fetch(ids, format, target_path=None, db_name = _get_database_name(id) # Verbose output if verbose: - print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", - end="\r") + print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: file = join(target_path, id + "." + format) else: # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]: r = requests.get(_fetch_url + db_name + "/" + id + "." 
+ format) content = r.text diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py index 95e6f391d..922749626 100644 --- a/src/biotite/database/uniprot/query.py +++ b/src/biotite/database/uniprot/query.py @@ -6,11 +6,10 @@ __author__ = "Maximilian Greil" __all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"] -import requests import abc +import requests from .check import assert_valid_response - _base_url = "https://rest.uniprot.org/uniprotkb/search/" @@ -122,22 +121,114 @@ class SimpleQuery(Query): # Field identifiers are taken from # https://www.uniprot.org/help/query-fields _fields = [ - "accession", "active", "ft_init_met", "ft_signal", "ft_transit", "ft_propep", "ft_chain", "ft_peptide", - "ft_topo_dom", "ft_transmem", "ft_intramem", "ft_domain", "ft_repeat", "ft_zn_fing", "ft_dna_bind", - "ft_region", "ft_coiled", "ft_motif", "ft_compbias", "ft_act_site", "ft_binding", "ft_site", "ft_non_std", - "ft_mod_res", "ft_lipid", "ft_carbohyd", "ft_disulfid", "ft_crosslnk", "ft_var_seq", "ft_variant", - "ft_mutagen", "ft_unsure", "ft_conflict", "ft_non_cons", "ft_non_ter", "ft_helix", "ft_turn", "ft_strand", - "lit_author", "protein_name", "chebi", "citation", "uniref_cluster_90", "xrefcount_pdb", "date_created", - "database", "xref", "ec", "cc_function", "cc_catalytic_activity", "cc_cofactor", "cc_activity_regulation", - "cc_biophysicochemical_properties", "cc_subunit", "cc_pathway", "cc_scl_term", "cc_tissue_specificity", - "cc_developmental_stage", "cc_induction", "cc_domain", "cc_ptm cc_rna_editing", "cc_mass_spectrometry", - "cc_polymorphism", "cc_disease", "cc_disruption_phenotype", "cc_allergen", "cc_toxic_dose", "cc_biotechnology", - "cc_pharmaceutical", "cc_miscellaneous", "cc_similarity", "cc_caution", "cc_sequence_caution", - "existence", "family", "fragment", "gene", "gene_exact", "go", "virus_host_name", "virus_host_id", - "accession_id", "inchikey", "protein_name", "interactor", "keyword", "length", "lineage", 
"mass", - "cc_mass_spectrometry", "date_modified", "protein_name", "organelle", "organism_name", "organism_id", - "plasmid", "proteome", "proteomecomponent", "sec_acc", "reviewed", "scope", "sequence", - "date_sequence_modified", "strain", "taxonomy_name", "taxonomy_id", "tissue", "cc_webresource" + "accession", + "active", + "ft_init_met", + "ft_signal", + "ft_transit", + "ft_propep", + "ft_chain", + "ft_peptide", + "ft_topo_dom", + "ft_transmem", + "ft_intramem", + "ft_domain", + "ft_repeat", + "ft_zn_fing", + "ft_dna_bind", + "ft_region", + "ft_coiled", + "ft_motif", + "ft_compbias", + "ft_act_site", + "ft_binding", + "ft_site", + "ft_non_std", + "ft_mod_res", + "ft_lipid", + "ft_carbohyd", + "ft_disulfid", + "ft_crosslnk", + "ft_var_seq", + "ft_variant", + "ft_mutagen", + "ft_unsure", + "ft_conflict", + "ft_non_cons", + "ft_non_ter", + "ft_helix", + "ft_turn", + "ft_strand", + "lit_author", + "protein_name", + "chebi", + "citation", + "uniref_cluster_90", + "xrefcount_pdb", + "date_created", + "database", + "xref", + "ec", + "cc_function", + "cc_catalytic_activity", + "cc_cofactor", + "cc_activity_regulation", + "cc_biophysicochemical_properties", + "cc_subunit", + "cc_pathway", + "cc_scl_term", + "cc_tissue_specificity", + "cc_developmental_stage", + "cc_induction", + "cc_domain", + "cc_ptm cc_rna_editing", + "cc_mass_spectrometry", + "cc_polymorphism", + "cc_disease", + "cc_disruption_phenotype", + "cc_allergen", + "cc_toxic_dose", + "cc_biotechnology", + "cc_pharmaceutical", + "cc_miscellaneous", + "cc_similarity", + "cc_caution", + "cc_sequence_caution", + "existence", + "family", + "fragment", + "gene", + "gene_exact", + "go", + "virus_host_name", + "virus_host_id", + "accession_id", + "inchikey", + "protein_name", + "interactor", + "keyword", + "length", + "lineage", + "mass", + "cc_mass_spectrometry", + "date_modified", + "protein_name", + "organelle", + "organism_name", + "organism_id", + "plasmid", + "proteome", + "proteomecomponent", + "sec_acc", + 
"reviewed", + "scope", + "sequence", + "date_sequence_modified", + "strain", + "taxonomy_name", + "taxonomy_id", + "tissue", + "cc_webresource", ] def __init__(self, field, term): @@ -146,14 +237,11 @@ def __init__(self, field, term): raise ValueError(f"Unknown field identifier '{field}'") if not _check_brackets(term): raise ValueError( - f"Query term contains illegal number of round brackets ( ) and/or square brackets [ ]" + "Query term contains illegal number of round brackets ( ) and/or square brackets [ ]" ) - for invalid_string in \ - ['"', "AND", "OR", "NOT", "\t", "\n"]: + for invalid_string in ['"', "AND", "OR", "NOT", "\t", "\n"]: if invalid_string in term: - raise ValueError( - f"Query contains illegal term {invalid_string}" - ) + raise ValueError(f"Query contains illegal term {invalid_string}") if " " in term: term = f'"{term}"' self._field = field @@ -198,12 +286,8 @@ def search(query, number=500): ['P12345'] """ - params = { - 'query': str(query), - 'format': 'list', - 'size': str(number) - } + params = {"query": str(query), "format": "list", "size": str(number)} r = requests.get(_base_url, params=params) content = r.text assert_valid_response(r.status_code) - return content.split('\n')[:-1] + return content.split("\n")[:-1] diff --git a/src/biotite/file.py b/src/biotite/file.py index fa1963b6a..fc7a8f7aa 100644 --- a/src/biotite/file.py +++ b/src/biotite/file.py @@ -4,16 +4,19 @@ __name__ = "biotite" __author__ = "Patrick Kunzmann" -__all__ = ["File", "TextFile", "InvalidFileError", - "SerializationError", "DeserializationError"] +__all__ = [ + "File", + "TextFile", + "InvalidFileError", + "SerializationError", + "DeserializationError", +] import abc +import copy import io -import warnings from os import PathLike - from .copyable import Copyable -import copy class File(Copyable, metaclass=abc.ABCMeta): @@ -185,12 +188,14 @@ class InvalidFileError(Exception): either because the file does not contain the required data or because the file is malformed. 
""" + pass class SerializationError(Exception): pass + class DeserializationError(Exception): pass @@ -205,7 +210,7 @@ def wrap_string(text, width): """ lines = [] for i in range(0, len(text), width): - lines.append(text[i : i+width]) + lines.append(text[i : i + width]) return lines diff --git a/src/biotite/sequence/__init__.py b/src/biotite/sequence/__init__.py index afda0ab34..005a7c88c 100644 --- a/src/biotite/sequence/__init__.py +++ b/src/biotite/sequence/__init__.py @@ -76,9 +76,9 @@ __author__ = "Patrick Kunzmann" from .alphabet import * +from .annotation import * +from .codon import * +from .profile import * from .search import * from .seqtypes import * from .sequence import * -from .codon import * -from .annotation import * -from .profile import * diff --git a/src/biotite/sequence/align/__init__.py b/src/biotite/sequence/align/__init__.py index d548b11a3..7e90c32ad 100644 --- a/src/biotite/sequence/align/__init__.py +++ b/src/biotite/sequence/align/__init__.py @@ -191,8 +191,8 @@ from .buckets import * from .cigar import * from .kmeralphabet import * -from .kmertable import * from .kmersimilarity import * +from .kmertable import * from .localgapped import * from .localungapped import * from .matrix import * @@ -200,4 +200,4 @@ from .pairwise import * from .permutation import * from .selector import * -from .statistics import * \ No newline at end of file +from .statistics import * diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index 7d97d15a8..b416cba43 100644 --- a/src/biotite/sequence/align/alignment.py +++ b/src/biotite/sequence/align/alignment.py @@ -5,16 +5,21 @@ __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -import numpy as np import numbers -import copy import textwrap +import numpy as np from ..alphabet import LetterAlphabet - -__all__ = ["Alignment", "get_codes", "get_symbols", - "get_sequence_identity", "get_pairwise_sequence_identity", - "score", "find_terminal_gaps", 
"remove_terminal_gaps"] +__all__ = [ + "Alignment", + "get_codes", + "get_symbols", + "get_sequence_identity", + "get_pairwise_sequence_identity", + "score", + "find_terminal_gaps", + "remove_terminal_gaps", +] class Alignment(object): @@ -95,8 +100,10 @@ def __init__(self, sequences, trace, score=None): def __repr__(self): """Represent Alignment a string for debugging.""" - return f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], " \ - f"np.{np.array_repr(self.trace)}, score={self.score})" + return ( + f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], " + f"np.{np.array_repr(self.trace)}, score={self.score})" + ) def _gapped_str(self, seq_index): seq_str = "" @@ -148,17 +155,18 @@ def __getitem__(self, index): if isinstance(index, tuple): if len(index) > 2: raise IndexError("Only 1D or 2D indices are allowed") - if isinstance(index[0], numbers.Integral) or \ - isinstance(index[0], numbers.Integral): - raise IndexError( - "Integers are invalid indices for alignments, " - "a single sequence or alignment column cannot be " - "selected" - ) + if isinstance(index[0], numbers.Integral) or isinstance( + index[0], numbers.Integral + ): + raise IndexError( + "Integers are invalid indices for alignments, " + "a single sequence or alignment column cannot be " + "selected" + ) return Alignment( Alignment._index_sequences(self.sequences, index[1]), self.trace[index], - self.score + self.score, ) else: return Alignment(self.sequences, self.trace[index], self.score) @@ -182,17 +190,16 @@ def __eq__(self, item): @staticmethod def _index_sequences(sequences, index): - if isinstance(index, (list, tuple)) or \ - (isinstance(index, np.ndarray) and index.dtype != bool): - return [sequences[i] for i in index] + if isinstance(index, (list, tuple)) or ( + isinstance(index, np.ndarray) and index.dtype != bool + ): + return [sequences[i] for i in index] elif isinstance(index, np.ndarray) and index.dtype == bool: return [seq for seq, mask in 
zip(sequences, index) if mask] if isinstance(index, slice): return sequences[index] else: - raise IndexError( - f"Invalid alignment index type '{type(index).__name__}'" - ) + raise IndexError(f"Invalid alignment index type '{type(index).__name__}'") @staticmethod def trace_from_strings(seq_str_list): @@ -212,12 +219,9 @@ def trace_from_strings(seq_str_list): The created trace. """ if len(seq_str_list) < 2: - raise ValueError( - "An alignment must contain at least two sequences" - ) + raise ValueError("An alignment must contain at least two sequences") seq_i = np.zeros(len(seq_str_list)) - trace = np.full(( len(seq_str_list[0]), len(seq_str_list) ), - -1, dtype=int) + trace = np.full((len(seq_str_list[0]), len(seq_str_list)), -1, dtype=int) # Get length of string (same length for all strings) # rather than length of list for pos_i in range(len(seq_str_list[0])): @@ -275,7 +279,7 @@ def get_codes(alignment): # of the sequence code is used # (https://numpy.org/neps/nep-0050-scalar-promotion.html) codes[i] = np.where( - trace[:,i] != -1, sequences[i].code[trace[:,i]], np.int64(-1) + trace[:, i] != -1, sequences[i].code[trace[:, i]], np.int64(-1) ) return np.stack(codes) @@ -366,7 +370,7 @@ def get_sequence_identity(alignment, mode="not_terminal"): # Count matches matches = 0 for i in range(codes.shape[1]): - column = codes[:,i] + column = codes[:, i] # One unique value -> all symbols match unique_symbols = np.unique(column) if len(unique_symbols) == 1 and unique_symbols[0] != -1: @@ -430,9 +434,11 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): # Count matches # Calculate at which positions the sequences are identical # and are not gaps - equality_matrix = (codes[:, np.newaxis, :] == codes[np.newaxis, :, :]) \ - & (codes[:, np.newaxis, :] != -1) \ - & (codes[np.newaxis, :, :] != -1) \ + equality_matrix = ( + (codes[:, np.newaxis, :] == codes[np.newaxis, :, :]) + & (codes[:, np.newaxis, :] != -1) + & (codes[np.newaxis, :, :] != -1) + ) # Sum these 
positions up matches = np.count_nonzero(equality_matrix, axis=-1) @@ -444,21 +450,20 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): for i in range(n_seq): for j in range(n_seq): # Find latest start and earliest stop of all sequences - start, stop = find_terminal_gaps(alignment[:, [i,j]]) + start, stop = find_terminal_gaps(alignment[:, [i, j]]) if stop <= start: raise ValueError( "Cannot calculate non-terminal identity, " "as the two sequences have no overlap" ) - length[i,j] = stop - start + length[i, j] = stop - start elif mode == "shortest": length = np.zeros((n_seq, n_seq)) for i in range(n_seq): for j in range(n_seq): - length[i,j] = min([ - len(alignment.sequences[i]), - len(alignment.sequences[j]) - ]) + length[i, j] = min( + [len(alignment.sequences[i]), len(alignment.sequences[j])] + ) else: raise ValueError(f"'{mode}' is an invalid calculation mode") @@ -506,7 +511,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True): # Do not count self-similarity # and do not count similarity twice (not S(i,j) and S(j,i)) for i in range(codes.shape[0]): - for j in range(i+1, codes.shape[0]): + for j in range(i + 1, codes.shape[0]): code_i = column[i] code_j = column[j] # Ignore gaps @@ -593,11 +598,11 @@ def find_terminal_gaps(alignment): """ trace = alignment.trace # Find for each sequence the positions of non-gap symbols - no_gap_pos = [np.where(trace[:,i] != -1)[0] for i in range(trace.shape[1])] + no_gap_pos = [np.where(trace[:, i] != -1)[0] for i in range(trace.shape[1])] # Find for each sequence the positions of the sequence start and end # in the alignment - firsts = [no_gap_pos[i][0 ] for i in range(trace.shape[1])] - lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])] + firsts = [no_gap_pos[i][0] for i in range(trace.shape[1])] + lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])] # The terminal gaps are before all sequences start and after any # sequence ends # Use exclusive stop -> -1 @@ -658,4 +663,4 
@@ def remove_terminal_gaps(alignment): "Cannot remove terminal gaps, since at least two sequences have " "no overlap and the resulting alignment would be empty" ) - return alignment[start : stop] \ No newline at end of file + return alignment[start:stop] diff --git a/src/biotite/sequence/align/buckets.py b/src/biotite/sequence/align/buckets.py index 79a1afadd..5b99ef890 100644 --- a/src/biotite/sequence/align/buckets.py +++ b/src/biotite/sequence/align/buckets.py @@ -6,11 +6,12 @@ __author__ = "Patrick Kunzmann" __all__ = ["bucket_number"] -from os.path import realpath, dirname, join +from os.path import dirname, join, realpath import numpy as np - _primes = None + + def bucket_number(n_kmers, load_factor=0.8): """ Find an appropriate number of buckets for a :class:`BucketKmerTable` @@ -54,16 +55,17 @@ def bucket_number(n_kmers, load_factor=0.8): """ global _primes if _primes is None: - with open( - join(dirname(realpath(__file__)), "primes.txt") - ) as file: - _primes = np.array([ - int(line) for line in file.read().splitlines() - if len(line) != 0 and line[0] != "#" - ]) + with open(join(dirname(realpath(__file__)), "primes.txt")) as file: + _primes = np.array( + [ + int(line) + for line in file.read().splitlines() + if len(line) != 0 and line[0] != "#" + ] + ) number = int(n_kmers / load_factor) index = np.searchsorted(_primes, number, side="left") if index == len(_primes): raise ValueError("Number of buckets too large") - return _primes[index] \ No newline at end of file + return _primes[index] diff --git a/src/biotite/sequence/align/cigar.py b/src/biotite/sequence/align/cigar.py index abe76cae6..2bd0de6b2 100644 --- a/src/biotite/sequence/align/cigar.py +++ b/src/biotite/sequence/align/cigar.py @@ -15,6 +15,7 @@ class CigarOp(enum.IntEnum): """ An enum for the different CIGAR operations. 
""" + MATCH = 0 INSERTION = 1 DELETION = 2 @@ -46,23 +47,23 @@ def from_cigar_symbol(symbol): def to_cigar_symbol(self): return _op_to_str[self] + _str_to_op = { - "M" : CigarOp.MATCH, - "I" : CigarOp.INSERTION, - "D" : CigarOp.DELETION, - "N" : CigarOp.INTRON, - "S" : CigarOp.SOFT_CLIP, - "H" : CigarOp.HARD_CLIP, - "P" : CigarOp.PADDING, - "=" : CigarOp.EQUAL, - "X" : CigarOp.DIFFERENT, - "B" : CigarOp.BACK - } + "M": CigarOp.MATCH, + "I": CigarOp.INSERTION, + "D": CigarOp.DELETION, + "N": CigarOp.INTRON, + "S": CigarOp.SOFT_CLIP, + "H": CigarOp.HARD_CLIP, + "P": CigarOp.PADDING, + "=": CigarOp.EQUAL, + "X": CigarOp.DIFFERENT, + "B": CigarOp.BACK, +} _op_to_str = {v: k for k, v in _str_to_op.items()} -def read_alignment_from_cigar(cigar, position, - reference_sequence, segment_sequence): +def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence): """ Create an :class:`Alignment` from a CIGAR string. @@ -147,20 +148,16 @@ def read_alignment_from_cigar(cigar, position, else: operations = np.asarray(cigar, dtype=int) if operations.ndim != 2: - raise ValueError( - "Expected array with shape (n,2)" - ) + raise ValueError("Expected array with shape (n,2)") if operations.shape[1] != 2: - raise ValueError( - "Expected (operation, length) pairs" - ) + raise ValueError("Expected (operation, length) pairs") if len(operations) == 0: return Alignment( [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int) ) - trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int) + trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int) clip_mask = np.ones(trace.shape[0], dtype=bool) i = 0 @@ -187,19 +184,23 @@ def read_alignment_from_cigar(cigar, position, elif op == CigarOp.HARD_CLIP: clip_mask[i : i + length] = False else: - raise ValueError( - f"CIGAR operation {op} is not implemented" - ) + raise ValueError(f"CIGAR operation {op} is not implemented") i += length # Remove clipped positions trace = trace[clip_mask] return 
Alignment([reference_sequence, segment_sequence], trace) -def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, - introns=(), distinguish_matches=False, - hard_clip=False, include_terminal_gaps=False, - as_string=True): +def write_alignment_to_cigar( + alignment, + reference_index=0, + segment_index=1, + introns=(), + distinguish_matches=False, + hard_clip=False, + include_terminal_gaps=False, + as_string=True, +): """ Convert an :class:`Alignment` into a CIGAR string. @@ -305,8 +306,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, seg_trace = alignment.trace[:, segment_index] operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int) - insertion_mask = (ref_trace == -1) - deletion_mask = (seg_trace == -1) + insertion_mask = ref_trace == -1 + deletion_mask = seg_trace == -1 if np.any(insertion_mask & deletion_mask): raise ValueError( "Alignment contains insertion and deletion at the same position" @@ -318,35 +319,27 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, intron_mask = np.zeros(operations.shape[0], dtype=bool) for start, stop in introns: if start >= stop: - raise ValueError( - "Intron start must be smaller than intron stop" - ) + raise ValueError("Intron start must be smaller than intron stop") if start < 0: - raise ValueError( - "Intron start must not be negative" - ) + raise ValueError("Intron start must not be negative") intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True if np.any(intron_mask & ~deletion_mask): - raise ValueError( - "Introns must be within gaps in the reference sequence" - ) + raise ValueError("Introns must be within gaps in the reference sequence") operations[intron_mask] = CigarOp.INTRON if distinguish_matches: symbol_codes = get_codes(alignment) ref_codes = symbol_codes[reference_index, :] seg_codes = symbol_codes[segment_index, :] - equal_mask = (ref_codes == seg_codes) - match_mask = (operations == CigarOp.MATCH) + 
equal_mask = ref_codes == seg_codes + match_mask = operations == CigarOp.MATCH operations[equal_mask & match_mask] = CigarOp.EQUAL operations[~equal_mask & match_mask] = CigarOp.DIFFERENT op_tuples = _aggregate_consecutive(operations) clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP - start_clip_length, end_clip_length = _find_clipped_bases( - alignment, segment_index - ) + start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index) if start_clip_length != 0: start_clip = [(clip_op, start_clip_length)] else: @@ -386,9 +379,7 @@ def _find_clipped_bases(alignment, segment_index): # all previous bases are clipped... start_clip_length = seg_trace[0] # ...and the same applies for the last base - end_clip_length = ( - len(alignment.sequences[segment_index]) - seg_trace[-1] - 1 - ) + end_clip_length = len(alignment.sequences[segment_index]) - seg_trace[-1] - 1 return start_clip_length, end_clip_length @@ -431,4 +422,4 @@ def _op_tuples_from_cigar(cigar): op = CigarOp.from_cigar_symbol(char) op_tuples.append((op, count)) count = "" - return np.array(op_tuples, dtype=int) \ No newline at end of file + return np.array(op_tuples, dtype=int) diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 7f7d4f9eb..f53b2f223 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -5,11 +5,9 @@ __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -from ..sequence import Sequence -from ..seqtypes import NucleotideSequence, ProteinSequence -from ..alphabet import Alphabet -import numpy as np import os +import numpy as np +from ..seqtypes import NucleotideSequence, ProteinSequence __all__ = ["SubstitutionMatrix"] @@ -21,54 +19,54 @@ class SubstitutionMatrix(object): A :class:`SubstitutionMatrix` maps each possible pairing of a symbol of a first alphabet with a symbol of a second alphabet to a score (integer). 
- + The class uses a 2-D (m x n) :class:`ndarray` (dtype=:attr:`numpy.int32`), where each element stores the score for a symbol pairing, indexed by the symbol codes of the respective symbols in an *m*-length alphabet 1 and an *n*-length alphabet 2. - + There are 3 ways to creates instances: - + At first a 2-D :class:`ndarray` containing the scores can be directly provided. - + Secondly a dictionary can be provided, where the keys are pairing tuples and values are the corresponding scores. The pairing tuples consist of a symbol of alphabet 1 as first element and a symbol of alphabet 2 as second element. Parings have to be provided for each possible combination. - + At last a valid matrix name can be given, which is loaded from the internal matrix database. The following matrices are avaliable: - + - Nucleotide substitution matrices from NCBI database - **NUC** - Also usable with ambiguous alphabet - + - Protein substitution matrices from NCBI database - + - **PAM** - **BLOSUM** - **MATCH** - Only differentiates between match and mismatch - **IDENTITY** - Strongly penalizes mismatches - **GONNET** - Not usable with default protein alphabet - **DAYHOFF** - + - Corrected protein substitution matrices :footcite:`Hess2016`, **** is the BLOCKS version, the matrix is based on - + - **BLOSUM_** - **RBLOSUM_** - **CorBLOSUM_** - + A list of all available matrix names is returned by :meth:`list_db()`. - + Since this class can handle two different alphabets, it is possible to align two different types of sequences. - + Objects of this class are immutable. - + Parameters ---------- alphabet1 : Alphabet, length=m @@ -79,23 +77,23 @@ class SubstitutionMatrix(object): Either a symbol code indexed :class:`ndarray` containing the scores, or a dictionary mapping the symbol pairing to scores, or a string referencing a matrix in the internal database. - + Raises ------ KeyError If the matrix dictionary misses a symbol given in the alphabet. - + References ---------- - + .. 
footbibliography:: - + Examples -------- - + Creating a matrix for two different (nonsense) alphabets via a matrix dictionary: - + >>> alph1 = Alphabet(["foo","bar"]) >>> alph2 = Alphabet([1,2,3]) >>> matrix_dict = {("foo",1):5, ("foo",2):10, ("foo",3):15, @@ -119,17 +117,16 @@ class SubstitutionMatrix(object): C 0 1 0 0 G 0 0 1 0 T 0 0 0 1 - + Creating a matrix via database name: - + >>> alph = ProteinSequence.alphabet >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50") """ - + # Directory of matrix files - _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "matrix_data") - + _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data") + def __init__(self, alphabet1, alphabet2, score_matrix): self._alph1 = alphabet1 self._alph2 = alphabet2 @@ -147,16 +144,19 @@ def __init__(self, alphabet1, alphabet2, score_matrix): matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix) self._fill_with_matrix_dict(matrix_dict) else: - raise TypeError("Matrix must be either a dictionary, " - "an 2-D ndarray or a string") + raise TypeError( + "Matrix must be either a dictionary, " "an 2-D ndarray or a string" + ) # This class is immutable and has a getter function for the # score matrix -> make the score matrix read-only self._matrix.setflags(write=False) def __repr__(self): """Represent SubstitutionMatrix as a string for debugging.""" - return f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " \ - f"np.{np.array_repr(self._matrix)})" + return ( + f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " + f"np.{np.array_repr(self._matrix)})" + ) def __eq__(self, item): if not isinstance(item, SubstitutionMatrix): @@ -173,40 +173,39 @@ def __ne__(self, item): return not self == item def _fill_with_matrix_dict(self, matrix_dict): - self._matrix = np.zeros(( len(self._alph1), len(self._alph2) ), - dtype=np.int32) + self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32) for 
i in range(len(self._alph1)): for j in range(len(self._alph2)): sym1 = self._alph1.decode(i) sym2 = self._alph2.decode(j) - self._matrix[i,j] = int(matrix_dict[sym1, sym2]) - + self._matrix[i, j] = int(matrix_dict[sym1, sym2]) + def get_alphabet1(self): """ - Get the first alphabet. - + Get the first alphabet. + Returns ------- alphabet : Alphabet The first alphabet. """ return self._alph1 - + def get_alphabet2(self): """ - Get the second alphabet. - + Get the second alphabet. + Returns ------- alphabet : Alphabet The second alphabet. """ return self._alph2 - + def score_matrix(self): """ Get the 2-D :class:`ndarray` containing the score values. - + Returns ------- matrix : ndarray, shape=(m,n), dtype=np.int32 @@ -214,12 +213,12 @@ def score_matrix(self): The array is read-only. """ return self._matrix - + def transpose(self): """ Get a copy of this instance, where the alphabets are interchanged. - + Returns ------- transposed : SubstitutionMatrix @@ -229,7 +228,7 @@ def transpose(self): new_alph2 = self._alph1 new_matrix = np.transpose(self._matrix) return SubstitutionMatrix(new_alph1, new_alph2, new_matrix) - + def is_symmetric(self): """ Check whether the substitution matrix is symmetric, @@ -242,35 +241,36 @@ def is_symmetric(self): True, if both alphabets are identical and the score matrix is symmetric, false otherwise. """ - return self._alph1 == self._alph2 \ - and np.array_equal(self._matrix, np.transpose(self._matrix)) - + return self._alph1 == self._alph2 and np.array_equal( + self._matrix, np.transpose(self._matrix) + ) + def get_score_by_code(self, code1, code2): """ Get the substitution score of two symbols, represented by their code. - + Parameters ---------- code1, code2 : int Symbol codes of the two symbols to be aligned. - + Returns ------- score : int The substitution / alignment score. """ return self._matrix[code1, code2] - + def get_score(self, symbol1, symbol2): """ Get the substitution score of two symbols. 
- + Parameters ---------- symbol1, symbol2 : object Symbols to be aligned. - + Returns ------- score : int @@ -279,19 +279,19 @@ def get_score(self, symbol1, symbol2): code1 = self._alph1.encode(symbol1) code2 = self._alph2.encode(symbol2) return self._matrix[code1, code2] - + def shape(self): """ Get the shape (i.e. the length of both alphabets) of the subsitution matrix. - + Returns ------- shape : tuple Matrix shape. """ return (len(self._alph1), len(self._alph2)) - + def __str__(self): # Create matrix in NCBI format string = " " @@ -306,18 +306,18 @@ def __str__(self): # Remove terminal line break string = string[:-1] return string - + @staticmethod def dict_from_str(string): """ Create a matrix dictionary from a string in NCBI matrix format. - + Symbols of the first alphabet are taken from the left column, symbols of the second alphabet are taken from the top row. - + The keys of the dictionary consist of tuples containing the aligned symbols and the values are the corresponding scores. - + Returns ------- matrix_dict : dict @@ -329,22 +329,22 @@ def dict_from_str(string): symbols2 = [e for e in lines[0].split()] scores = np.array([line.split()[1:] for line in lines[1:]]).astype(int) scores = np.transpose(scores) - + matrix_dict = {} for i in range(len(symbols1)): for j in range(len(symbols2)): - matrix_dict[(symbols1[i], symbols2[j])] = scores[i,j] + matrix_dict[(symbols1[i], symbols2[j])] = scores[i, j] return matrix_dict - + @staticmethod def dict_from_db(matrix_name): """ Create a matrix dictionary from a valid matrix name in the internal matrix database. - + The keys of the dictionary consist of tuples containing the aligned symbols and the values are the corresponding scores. 
- + Returns ------- matrix_dict : dict @@ -353,12 +353,12 @@ def dict_from_db(matrix_name): filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat" with open(filename, "r") as f: return SubstitutionMatrix.dict_from_str(f.read()) - + @staticmethod def list_db(): """ List all matrix names in the internal database. - + Returns ------- db_list : list @@ -367,27 +367,26 @@ def list_db(): files = os.listdir(SubstitutionMatrix._db_dir) # Remove '.mat' from files return [file[:-4] for file in sorted(files)] - - + @staticmethod def std_protein_matrix(): """ Get the default :class:`SubstitutionMatrix` for protein sequence alignments, which is BLOSUM62. - + Returns ------- matrix : SubstitutionMatrix Default matrix. """ return _matrix_blosum62 - + @staticmethod def std_nucleotide_matrix(): """ Get the default :class:`SubstitutionMatrix` for DNA sequence alignments. - + Returns ------- matrix : SubstitutionMatrix @@ -395,11 +394,11 @@ def std_nucleotide_matrix(): """ return _matrix_nuc -# Preformatted BLOSUM62 and NUC substitution matrix from NCBI -_matrix_blosum62 = SubstitutionMatrix(ProteinSequence.alphabet, - ProteinSequence.alphabet, - "BLOSUM62") -_matrix_nuc = SubstitutionMatrix(NucleotideSequence.alphabet_amb, - NucleotideSequence.alphabet_amb, - "NUC") +# Preformatted BLOSUM62 and NUC substitution matrix from NCBI +_matrix_blosum62 = SubstitutionMatrix( + ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" +) +_matrix_nuc = SubstitutionMatrix( + NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" +) diff --git a/src/biotite/sequence/align/statistics.py b/src/biotite/sequence/align/statistics.py index 19a8c9aba..a62eae224 100644 --- a/src/biotite/sequence/align/statistics.py +++ b/src/biotite/sequence/align/statistics.py @@ -29,7 +29,7 @@ class EValueEstimator: of random sequence alignments in :meth:`from_samples()` :footcite:`Altschul1986`, which may be time consuming. 
If these parameters are known, the constructor can be used instead. - + Based on the sampled parameters, the decadic logarithm of the E-value can be quickly calculated via :meth:`log_evalue()`. @@ -39,7 +39,7 @@ class EValueEstimator: The :math:`\lambda` parameter. k : float The :math:`K` parameter. - + Notes ----- The calculated E-value is a rough estimation that gets more @@ -102,8 +102,9 @@ def __init__(self, lam, k): self._k = k @staticmethod - def from_samples(alphabet, matrix, gap_penalty, frequencies, - sample_length=1000, sample_size=1000): + def from_samples( + alphabet, matrix, gap_penalty, frequencies, sample_length=1000, sample_size=1000 + ): r""" Create an :class:`EValueEstimator` with :math:`\lambda` and :math:`K` estimated via sampling alignments of random sequences @@ -137,13 +138,13 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, The number of sampled sequences. The accuracy of the estimated parameters and E-values, but also the runtime increases with the sample size. - + Returns ------- estimator : EValueEstimator A :class:`EValueEstimator` with sampled :math:`\lambda` and :math:`K` parameters. 
- + Notes ----- The sampling process generates random sequences based on @@ -167,15 +168,15 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, raise ValueError("A symmetric substitution matrix is required") if not matrix.get_alphabet1().extends(alphabet): raise ValueError( - "The substitution matrix is not compatible " - "with the given alphabet" + "The substitution matrix is not compatible " "with the given alphabet" ) - score_matrix = matrix.score_matrix()[:len(alphabet), :len(alphabet)] - if np.sum( - score_matrix \ - * frequencies[np.newaxis, :] \ - * frequencies[:, np.newaxis] - ) >= 0: + score_matrix = matrix.score_matrix()[: len(alphabet), : len(alphabet)] + if ( + np.sum( + score_matrix * frequencies[np.newaxis, :] * frequencies[:, np.newaxis] + ) + >= 0 + ): raise ValueError( "Invalid substitution matrix, the expected similarity " "score between two random symbols is not negative" @@ -183,9 +184,7 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, # Generate the sequence code for the random sequences random_sequence_code = np.random.choice( - len(alphabet), - size=(sample_size, 2, sample_length), - p=frequencies + len(alphabet), size=(sample_size, 2, sample_length), p=frequencies ) # Sample the alignments of random sequences @@ -193,28 +192,27 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, for i in range(sample_size): seq1 = GeneralSequence(alphabet) seq2 = GeneralSequence(alphabet) - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_scores[i] = align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=gap_penalty, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=gap_penalty, max_number=1 )[0].score - + # Use method of moments to estimate parameters lam = np.pi / np.sqrt(6 * np.var(sample_scores)) u = np.mean(sample_scores) - np.euler_gamma / lam k = np.exp(lam * u) / sample_length**2 
- + return EValueEstimator(lam, k) @property def lam(self): return self._lam - + @property def k(self): return self._k - + def log_evalue(self, score, seq1_length, seq2_length): r""" Calculate the decadic logarithm of the E-value for a given @@ -223,11 +221,11 @@ def log_evalue(self, score, seq1_length, seq2_length): The E-value and the logarithm of the E-value is calculated as .. math:: - + E = Kmn e^{-\lambda s} \log_{10} E = (\log_{10} Kmn) - \frac{\lambda s}{\ln 10}, - + where :math:`s` is the similarity score and :math:`m` and :math:`n` are the lengths of the aligned sequences. @@ -245,12 +243,12 @@ def log_evalue(self, score, seq1_length, seq2_length): this is usually either the combined length of all sequences in the database or the length of the hit sequence multiplied by the number of sequences in the database. - + Returns ------- log_e : float The decadic logarithm of the E-value. - + Notes ----- This method returns the logarithm of the E-value instead of @@ -261,5 +259,6 @@ def log_evalue(self, score, seq1_length, seq2_length): seq1_length = np.asarray(seq1_length) seq2_length = np.asarray(seq2_length) - return np.log10(self._k * seq1_length * seq2_length) \ - - self._lam * score / np.log(10) \ No newline at end of file + return np.log10( + self._k * seq1_length * seq2_length + ) - self._lam * score / np.log(10) diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py index 4b9fe9683..39c82a752 100644 --- a/src/biotite/sequence/alphabet.py +++ b/src/biotite/sequence/alphabet.py @@ -4,14 +4,19 @@ __name__ = "biotite.sequence" __author__ = "Patrick Kunzmann" -__all__ = ["Alphabet", "LetterAlphabet", "AlphabetMapper", "AlphabetError", - "common_alphabet"] +__all__ = [ + "Alphabet", + "LetterAlphabet", + "AlphabetMapper", + "AlphabetError", + "common_alphabet", +] import copy -from numbers import Integral import string +from numbers import Integral import numpy as np -from .codec import encode_chars, decode_to_chars, map_sequence_code 
+from .codec import decode_to_chars, encode_chars, map_sequence_code class Alphabet(object): @@ -107,7 +112,7 @@ def __init__(self, symbols): def __repr__(self): """Represent Alphabet as a string for debugging.""" - return f'Alphabet({self._symbols})' + return f"Alphabet({self._symbols})" def get_symbols(self): """ @@ -139,8 +144,7 @@ def extends(self, alphabet): elif len(alphabet) > len(self): return False else: - return alphabet.get_symbols() \ - == self.get_symbols()[:len(alphabet)] + return alphabet.get_symbols() == self.get_symbols()[: len(alphabet)] def encode(self, symbol): """ @@ -164,9 +168,7 @@ def encode(self, symbol): try: return self._symbol_dict[symbol] except KeyError: - raise AlphabetError( - f"Symbol {repr(symbol)} is not in the alphabet" - ) + raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet") def decode(self, code): """ @@ -238,9 +240,8 @@ def is_letter_alphabet(self): have length 1 and are printable. """ for symbol in self: - if not isinstance(symbol, (str, bytes)) \ - or len(symbol) > 1: - return False + if not isinstance(symbol, (str, bytes)) or len(symbol) > 1: + return False if isinstance(symbol, str): symbol = symbol.encode("ASCII") if symbol not in LetterAlphabet.PRINATBLES: @@ -292,8 +293,9 @@ class LetterAlphabet(Alphabet): in this list. 
""" - PRINATBLES = (string.digits + string.ascii_letters + string.punctuation) \ - .encode("ASCII") + PRINATBLES = (string.digits + string.ascii_letters + string.punctuation).encode( + "ASCII" + ) def __init__(self, symbols): if len(symbols) == 0: @@ -312,13 +314,12 @@ def __init__(self, symbols): # Direct 'astype' conversion is not allowed by numpy # -> frombuffer() self._symbols = np.frombuffer( - np.array(self._symbols, dtype="|S1"), - dtype=np.ubyte + np.array(self._symbols, dtype="|S1"), dtype=np.ubyte ) def __repr__(self): """Represent LetterAlphabet as a string for debugging.""" - return f'LetterAlphabet({self.get_symbols()})' + return f"LetterAlphabet({self.get_symbols()})" def extends(self, alphabet): if alphabet is self: @@ -326,9 +327,7 @@ def extends(self, alphabet): elif type(alphabet) == LetterAlphabet: if len(alphabet._symbols) > len(self._symbols): return False - return np.all( - alphabet._symbols == self._symbols[:len(alphabet._symbols)] - ) + return np.all(alphabet._symbols == self._symbols[: len(alphabet._symbols)]) else: return super().extends(alphabet) @@ -341,17 +340,14 @@ def get_symbols(self): symbols : list Copy of the internal list of symbols. 
""" - return [symbol.decode("ASCII") for symbol - in self._symbols_as_bytes()] + return [symbol.decode("ASCII") for symbol in self._symbols_as_bytes()] def encode(self, symbol): if not isinstance(symbol, (str, bytes)) or len(symbol) > 1: raise AlphabetError(f"Symbol '{symbol}' is not a single letter") indices = np.where(self._symbols == ord(symbol))[0] if len(indices) == 0: - raise AlphabetError( - f"Symbol {repr(symbol)} is not in the alphabet" - ) + raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet") return indices[0].item() def decode(self, code, as_bytes=False): @@ -382,13 +378,10 @@ def encode_multiple(self, symbols, dtype=None): elif isinstance(symbols, bytes): symbols = np.frombuffer(symbols, dtype=np.ubyte) elif isinstance(symbols, np.ndarray): - symbols = np.frombuffer( - symbols.astype(dtype="|S1"), dtype=np.ubyte - ) + symbols = np.frombuffer(symbols.astype(dtype="|S1"), dtype=np.ubyte) else: symbols = np.frombuffer( - np.array(list(symbols), dtype="|S1"), - dtype=np.ubyte + np.array(list(symbols), dtype="|S1"), dtype=np.ubyte ) return encode_chars(alphabet=self._symbols, symbols=symbols) @@ -435,7 +428,6 @@ def _symbols_as_bytes(self): return np.frombuffer(self._symbols, dtype="|S1") - class AlphabetMapper(object): """ This class is used for symbol code conversion from a source @@ -486,8 +478,7 @@ def __init__(self, source_alphabet, target_alphabet): else: self._necessary_mapping = True self._mapper = np.zeros( - len(source_alphabet), - dtype=AlphabetMapper._dtype(len(target_alphabet)) + len(source_alphabet), dtype=AlphabetMapper._dtype(len(target_alphabet)) ) for old_code in range(len(source_alphabet)): symbol = source_alphabet.decode(old_code) @@ -500,26 +491,25 @@ def __getitem__(self, code): return self._mapper[code] else: return code - if not isinstance(code, np.ndarray) \ - or code.dtype not in (np.uint8, np.uint16, np.uint32, np.uint64): - code = np.array(code, dtype=np.uint64) + if not isinstance(code, np.ndarray) or code.dtype 
not in ( + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ): + code = np.array(code, dtype=np.uint64) if self._necessary_mapping: mapped_code = np.empty(len(code), dtype=self._mapper.dtype) - map_sequence_code( - self._mapper, - code, - mapped_code - ) + map_sequence_code(self._mapper, code, mapped_code) return mapped_code else: return code - @staticmethod def _dtype(alphabet_size): - _size_uint8 = np.iinfo(np.uint8 ).max +1 - _size_uint16 = np.iinfo(np.uint16).max +1 - _size_uint32 = np.iinfo(np.uint32).max +1 + _size_uint8 = np.iinfo(np.uint8).max + 1 + _size_uint16 = np.iinfo(np.uint16).max + 1 + _size_uint32 = np.iinfo(np.uint32).max + 1 if alphabet_size <= _size_uint8: return np.uint8 elif alphabet_size <= _size_uint16: @@ -535,6 +525,7 @@ class AlphabetError(Exception): This exception is raised, when a code or a symbol is not in an :class:`Alphabet`. """ + pass @@ -563,4 +554,4 @@ def common_alphabet(alphabets): common_alphabet = alphabet else: return None - return common_alphabet \ No newline at end of file + return common_alphabet diff --git a/src/biotite/sequence/annotation.py b/src/biotite/sequence/annotation.py index cb2a9267e..21d10768e 100644 --- a/src/biotite/sequence/annotation.py +++ b/src/biotite/sequence/annotation.py @@ -6,17 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Location", "Feature", "Annotation", "AnnotatedSequence"] -import numbers import copy +import numbers import sys -from enum import Flag, Enum, auto +from enum import Enum, Flag, auto import numpy as np -from .sequence import Sequence from ..copyable import Copyable -from .seqtypes import NucleotideSequence -class Location(): +class Location: """ A :class:`Location` defines at which base(s)/residue(s) a feature is located. @@ -63,24 +61,25 @@ class Defect(Flag): - **BETWEEN** - The position is between to consecutive bases/residues. 
""" - NONE = 0 - MISS_LEFT = auto() - MISS_RIGHT = auto() - BEYOND_LEFT = auto() + + NONE = 0 + MISS_LEFT = auto() + MISS_RIGHT = auto() + BEYOND_LEFT = auto() BEYOND_RIGHT = auto() - UNK_LOC = auto() - BETWEEN = auto() + UNK_LOC = auto() + BETWEEN = auto() class Strand(Enum): """ This enum type describes the strand of the feature location. This is not relevant for protein sequence features. """ + FORWARD = auto() REVERSE = auto() - def __init__(self, first, last, strand=Strand.FORWARD, - defect=Defect.NONE): + def __init__(self, first, last, strand=Strand.FORWARD, defect=Defect.NONE): if first > last: raise ValueError( "The first position cannot be higher than the last position" @@ -92,8 +91,10 @@ def __init__(self, first, last, strand=Strand.FORWARD, def __repr__(self): """Represent Location as a string for debugging.""" - return f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, ' \ - f'defect={"Location." + str(self._defect)})' + return ( + f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, ' + f'defect={"Location." 
+ str(self._defect)})' + ) @property def first(self): @@ -122,10 +123,12 @@ def __str__(self): def __eq__(self, item): if not isinstance(item, Location): return False - return ( self.first == item.first - and self.last == item.last - and self.strand == item.strand - and self.defect == item.defect) + return ( + self.first == item.first + and self.last == item.last + and self.strand == item.strand + and self.defect == item.defect + ) def __hash__(self): return hash((self._first, self._last, self._strand, self._defect)) @@ -208,9 +211,11 @@ def get_location_range(self): def __eq__(self, item): if not isinstance(item, Feature): return False - return ( self._key == item._key - and self._locs == item._locs - and self._qual == item._qual) + return ( + self._key == item._key + and self._locs == item._locs + and self._qual == item._qual + ) def __lt__(self, item): if not isinstance(item, Feature): @@ -223,7 +228,7 @@ def __lt__(self, item): return True elif first > it_first: return False - else: # First is equal + else: # First is equal return last > it_last def __gt__(self, item): @@ -237,7 +242,7 @@ def __gt__(self, item): return True elif first < it_first: return False - else: # First is equal + else: # First is equal return last < it_last @property @@ -253,7 +258,7 @@ def qual(self): return copy.copy(self._qual) def __hash__(self): - return hash(( self._key, self._locs, frozenset(self._qual.items()) )) + return hash((self._key, self._locs, frozenset(self._qual.items()))) class Annotation(Copyable): @@ -350,7 +355,9 @@ def __init__(self, features=None): def __repr__(self): """Represent Annotation as a string for debugging.""" - return f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])' + return ( + f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])' + ) def __copy_create__(self): return Annotation(self._features) @@ -403,7 +410,7 @@ def get_location_range(self): if loc.last > last: last = loc.last # Exclusive stop -> +1 - 
return first, last+1 + return first, last + 1 def del_feature(self, feature): """ @@ -475,9 +482,7 @@ def __getitem__(self, index): if loc.last > i_last: defect |= Location.Defect.MISS_RIGHT last = i_last - locs_in_scope.append(Location( - first, last, loc.strand, defect - )) + locs_in_scope.append(Location(first, last, loc.strand, defect)) if len(locs_in_scope) > 0: # The feature is present in the new annotation # if any of the original locations is in the new @@ -488,15 +493,12 @@ def __getitem__(self, index): sub_annot.add_feature(new_feature) return sub_annot else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __delitem__(self, item): if not isinstance(item, Feature): raise TypeError( - f"Only 'Feature' objects are supported, " - f"not {type(item).__name__}" + f"Only 'Feature' objects are supported, " f"not {type(item).__name__}" ) self.del_feature(item) @@ -626,8 +628,10 @@ def __init__(self, annotation, sequence, sequence_start=1): def __repr__(self): """Represent AnnotatedSequence as a string for debugging.""" - return f'AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, ' \ - f'sequence_start={self._seqstart})' + return ( + f"AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, " + f"sequence_start={self._seqstart})" + ) @property def sequence_start(self): @@ -643,7 +647,8 @@ def annotation(self): def __copy_create__(self): return AnnotatedSequence( - self._annotation.copy(), self._sequence.copy, self._seqstart) + self._annotation.copy(), self._sequence.copy, self._seqstart + ) def reverse_complement(self, sequence_start=1): """ @@ -676,10 +681,12 @@ def reverse_complement(self, sequence_start=1): # (seq_len-1) -> last sequence index # (loc.last-self._seqstart) -> location to index # ... 
+ rev_seqstart -> index to location - rev_loc_first \ - = (seq_len-1) - (loc.last-self._seqstart) + rev_seqstart - rev_loc_last \ - = (seq_len-1) - (loc.first-self._seqstart) + rev_seqstart + rev_loc_first = ( + (seq_len - 1) - (loc.last - self._seqstart) + rev_seqstart + ) + rev_loc_last = ( + (seq_len - 1) - (loc.first - self._seqstart) + rev_seqstart + ) if loc.strand == Location.Strand.FORWARD: rev_loc_strand = Location.Strand.REVERSE @@ -700,17 +707,14 @@ def reverse_complement(self, sequence_start=1): if loc.defect & Location.Defect.BETWEEN: rev_loc_defect |= Location.Defect.BETWEEN - rev_locs.append(Location( - rev_loc_first, rev_loc_last, - rev_loc_strand, rev_loc_defect - )) - rev_features.append(Feature( - feature.key, rev_locs, feature.qual - )) + rev_locs.append( + Location( + rev_loc_first, rev_loc_last, rev_loc_strand, rev_loc_defect + ) + ) + rev_features.append(Feature(feature.key, rev_locs, feature.qual)) - return AnnotatedSequence( - Annotation(rev_features), rev_sequence, rev_seqstart - ) + return AnnotatedSequence(Annotation(rev_features), rev_sequence, rev_seqstart) def __getitem__(self, index): if isinstance(index, Feature): @@ -730,24 +734,20 @@ def __getitem__(self, index): pass elif strand is None: strand = loc.strand - else: # loc.strand != strand + else: # loc.strand != strand raise ValueError( "All locations of the feature must have the same " "strand direction" ) if strand == Location.Strand.FORWARD: - sorted_locs = sorted( - locs, key=lambda loc: loc.first - ) + sorted_locs = sorted(locs, key=lambda loc: loc.first) else: - sorted_locs = sorted( - locs, key=lambda loc: loc.last, reverse=True - ) + sorted_locs = sorted(locs, key=lambda loc: loc.last, reverse=True) # Merge the sequences corresponding to the ordered locations for loc in sorted_locs: slice_start = loc.first - self._seqstart # +1 due to exclusive stop - slice_stop = loc.last - self._seqstart +1 + slice_stop = loc.last - self._seqstart + 1 add_seq = 
self._sequence[slice_start:slice_stop] if loc.strand == Location.Strand.REVERSE: add_seq = add_seq.reverse().complement() @@ -775,17 +775,17 @@ def __getitem__(self, index): rel_seq_start = self._seqstart else: rel_seq_start = index.start - return AnnotatedSequence(self._annotation[index], - self._sequence[seq_start:seq_stop], - rel_seq_start) + return AnnotatedSequence( + self._annotation[index], + self._sequence[seq_start:seq_stop], + rel_seq_start, + ) elif isinstance(index, numbers.Integral): return self._sequence[index - self._seqstart] else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __setitem__(self, index, item): if isinstance(index, Feature): @@ -796,10 +796,11 @@ def __setitem__(self, index, item): for loc in index.locs: slice_start = loc.first - self._seqstart # +1 due to exclusive stop - slice_stop = loc.last - self._seqstart +1 + slice_stop = loc.last - self._seqstart + 1 interval_size = slice_stop - slice_start - self._sequence[slice_start:slice_stop] \ - = sub_seq[sub_seq_i : sub_seq_i + interval_size] + self._sequence[slice_start:slice_stop] = sub_seq[ + sub_seq_i : sub_seq_i + interval_size + ] sub_seq_i += interval_size elif isinstance(index, slice): # Sequence start correction @@ -817,13 +818,13 @@ def __setitem__(self, index, item): # Item is a symbol self._sequence[index - self._seqstart] = item else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __eq__(self, item): if not isinstance(item, AnnotatedSequence): return False - return ( self.annotation == item.annotation - and self.sequence == item.sequence - and self._seqstart == item._seqstart) + return ( + self.annotation == item.annotation + and self.sequence == item.sequence + and self._seqstart == item._seqstart + ) diff --git a/src/biotite/sequence/codon.py 
b/src/biotite/sequence/codon.py index fe50c791f..5380706a3 100644 --- a/src/biotite/sequence/codon.py +++ b/src/biotite/sequence/codon.py @@ -7,12 +7,11 @@ __all__ = ["CodonTable"] import copy -from os.path import join, dirname, realpath -import numpy as np from numbers import Integral +from os.path import dirname, join, realpath +import numpy as np from .seqtypes import NucleotideSequence, ProteinSequence - # Abbreviations _NUC_ALPH = NucleotideSequence.alphabet_unamb _PROT_ALPH = ProteinSequence.alphabet @@ -20,7 +19,7 @@ # Multiplier array that converts a codon in code representation # into a unique integer _radix = len(_NUC_ALPH) -_radix_multiplier = np.array([_radix**n for n in (2,1,0)], dtype=int) +_radix_multiplier = np.array([_radix**n for n in (2, 1, 0)], dtype=int) class CodonTable(object): @@ -29,14 +28,14 @@ class CodonTable(object): amino acid. It also defines start codons. A :class:`CodonTable` takes/outputs either the symbols or code of the codon/amino acid. - + Furthermore, this class is able to give a list of codons that corresponds to a given amino acid. - + The :func:`load()` method allows loading of NCBI codon tables. - + Objects of this class are immutable. - + Parameters ---------- codon_dict : dict of (str -> str) @@ -47,27 +46,27 @@ class CodonTable(object): starts : iterable object of str The start codons. Each entry must be a string of length 3 (all upper case). 
- + Examples -------- - + Get the amino acid coded by a given codon (symbol and code): - + >>> table = CodonTable.default_table() >>> print(table["ATG"]) M >>> print(table[(1,2,3)]) 14 - + Get the codons coding for a given amino acid (symbol and code): - + >>> table = CodonTable.default_table() >>> print(table["M"]) ('ATG',) >>> print(table[14]) ((0, 2, 0), (0, 2, 2), (1, 2, 0), (1, 2, 1), (1, 2, 2), (1, 2, 3)) """ - + # For efficient mapping of codon codes to amino acid codes, # especially in in the 'map_codon_codes()' function, the class # maps each possible codon into a unique number using a radix based @@ -77,7 +76,7 @@ class CodonTable(object): # file for builtin codon tables from NCBI _table_file = join(dirname(realpath(__file__)), "codon_tables.txt") - + def __init__(self, codon_dict, starts): # Check if 'starts' is iterable object of length 3 string for start in starts: @@ -100,12 +99,10 @@ def __init__(self, codon_dict, starts): if (self._codons == -1).any(): # Find the missing codon missing_index = np.where(self._codons == -1)[0][0] - codon_code = CodonTable._to_codon(missing_index) + codon_code = CodonTable._to_codon(missing_index) codon = _NUC_ALPH.decode_multiple(codon_code) codon_str = "".join(codon) - raise ValueError( - f"Codon dictionary does not contain codon '{codon_str}'" - ) + raise ValueError(f"Codon dictionary does not contain codon '{codon_str}'") def __repr__(self): """Represent CodonTable as a string for debugging.""" @@ -131,8 +128,10 @@ def __getitem__(self, item): codon_numbers = np.where(self._codons == aa_code)[0] codon_codes = CodonTable._to_codon(codon_numbers) codons = tuple( - ["".join(_NUC_ALPH.decode_multiple(codon_code)) - for codon_code in codon_codes] + [ + "".join(_NUC_ALPH.decode_multiple(codon_code)) + for codon_code in codon_codes + ] ) return codons elif len(item) == 3: @@ -155,30 +154,28 @@ def __getitem__(self, item): # Code for codon as any iterable object # Code for codon -> return corresponding amino acid codes if 
len(item) != 3: - raise ValueError( - f"{item} is an invalid sequence code for a codon" - ) + raise ValueError(f"{item} is an invalid sequence code for a codon") codon_number = CodonTable._to_number(item) aa_code = self._codons[codon_number] return aa_code - + def map_codon_codes(self, codon_codes): """ Efficiently map multiple codons to the corresponding amino acids. - + Parameters ---------- codon_codes : ndarray, dtype=int, shape=(n,3) The codons to be translated into amino acids. The codons are given as symbol codes. *n* is the amount of codons. - + Returns ------- aa_codes : ndarray, dtype=int, shape=(n,) The amino acids as symbol codes. - + Examples -------- >>> dna = NucleotideSequence("ATGGTTTAA") @@ -209,46 +206,50 @@ def map_codon_codes(self, codon_codes): codon_numbers = CodonTable._to_number(codon_codes) aa_codes = self._codons[codon_numbers] return aa_codes - + def codon_dict(self, code=False): """ Get the codon to amino acid mappings dictionary. - + Parameters ---------- code : bool If true, the dictionary contains keys and values as code. Otherwise, the dictionary contains strings for codons and amino acid. (Default: False) - + Returns ------- codon_dict : dict The dictionary mapping codons to amino acids. 
""" if code: - return {tuple(CodonTable._to_codon(codon_number)): aa_code - for codon_number, aa_code in enumerate(self._codons)} + return { + tuple(CodonTable._to_codon(codon_number)): aa_code + for codon_number, aa_code in enumerate(self._codons) + } else: - return {"".join(_NUC_ALPH.decode_multiple(codon_code)): - _PROT_ALPH.decode(aa_code) - for codon_code, aa_code - in self.codon_dict(code=True).items()} - + return { + "".join(_NUC_ALPH.decode_multiple(codon_code)): _PROT_ALPH.decode( + aa_code + ) + for codon_code, aa_code in self.codon_dict(code=True).items() + } + def is_start_codon(self, codon_codes): codon_numbers = CodonTable._to_number(codon_codes) return np.isin(codon_numbers, self._starts) - + def start_codons(self, code=False): """ Get the start codons of the codon table. - + Parameters ---------- code : bool If true, the code will be returned instead of strings. (Default: False) - + Returns ------- start_codons : tuple @@ -257,25 +258,29 @@ def start_codons(self, code=False): """ if code: return tuple( - [tuple(CodonTable._to_codon(codon_number)) - for codon_number in self._starts] + [ + tuple(CodonTable._to_codon(codon_number)) + for codon_number in self._starts + ] ) else: return tuple( - ["".join(_NUC_ALPH.decode_multiple(codon_code)) - for codon_code in self.start_codons(code=True)] + [ + "".join(_NUC_ALPH.decode_multiple(codon_code)) + for codon_code in self.start_codons(code=True) + ] ) - + def with_start_codons(self, starts): """ Create an new :class:`CodonTable` with the same codon mappings, but changed start codons. - + Parameters ---------- starts : iterable object of str The new start codons. - + Returns ------- new_table : CodonTable @@ -288,17 +293,17 @@ def with_start_codons(self, starts): ) new_table._starts = CodonTable._to_number(start_codon_codes) return new_table - + def with_codon_mappings(self, codon_dict): """ Create an new :class:`CodonTable` with partially changed codon mappings. 
- + Parameters ---------- codon_dict : dict of (str -> str) The changed codon mappings. - + Returns ------- new_table : CodonTable @@ -329,9 +334,9 @@ def __str__(self): else: string += " " # Add space for next codon - string += " "*3 + string += " " * 3 # Remove terminal space - string = string [:-6] + string = string[:-6] # Jump to next line string += "\n" # Add empty line @@ -354,10 +359,10 @@ def _to_codon(numbers): if not isinstance(numbers, np.ndarray): numbers = np.array(list(numbers), dtype=int) codons = np.zeros(numbers.shape + (3,), dtype=int) - for n in (2,1,0): + for n in (2, 1, 0): val = _radix**n digit = numbers // val - codons[..., -(n+1)] = digit + codons[..., -(n + 1)] = digit numbers = numbers - digit * val return codons @@ -365,14 +370,14 @@ def _to_codon(numbers): def load(table_name): """ Load a NCBI codon table. - + Parameters ---------- table_name : str or int If a string is given, it is interpreted as official NCBI codon table name (e.g. "Vertebrate Mitochondrial"). An integer is interpreted as NCBI codon table ID. 
- + Returns ------- table : CodonTable @@ -381,7 +386,7 @@ def load(table_name): # Loads codon tables from codon_tables.txt with open(CodonTable._table_file, "r") as f: lines = f.read().split("\n") - + # Extract data for codon table from file table_found = False aa = None @@ -405,7 +410,7 @@ def load(table_name): table_found = True if table_found: if line.startswith("AA"): - #Remove identifier + # Remove identifier aa = line[5:].strip() elif line.startswith("Init"): init = line[5:].strip() @@ -415,19 +420,24 @@ def load(table_name): base2 = line[5:].strip() elif line.startswith("Base3"): base3 = line[5:].strip() - + # Create codon table from data - if aa is not None and init is not None \ - and base1 is not None and base2 is not None and base3 is not None: - symbol_dict = {} - starts = [] - # aa, init and baseX all have the same length - for i in range(len(aa)): - codon = base1[i] + base2[i] + base3[i] - if init[i] == "i": - starts.append(codon) - symbol_dict[codon] = aa[i] - return CodonTable(symbol_dict, starts) + if ( + aa is not None + and init is not None + and base1 is not None + and base2 is not None + and base3 is not None + ): + symbol_dict = {} + starts = [] + # aa, init and baseX all have the same length + for i in range(len(aa)): + codon = base1[i] + base2[i] + base3[i] + if init[i] == "i": + starts.append(codon) + symbol_dict[codon] = aa[i] + return CodonTable(symbol_dict, starts) else: raise ValueError(f"Codon table '{table_name}' was not found") @@ -435,7 +445,7 @@ def load(table_name): def table_names(): """ The possible codon table names for :func:`load()`. - + Returns ------- names : list of str @@ -448,14 +458,14 @@ def table_names(): if line.startswith("name"): names.extend([name.strip() for name in line[4:].split(";")]) return names - + @staticmethod def default_table(): """ The default codon table. The table is equal to the NCBI "Standard" codon table, with the difference that only "ATG" is a start codon. 
- + Returns ------- table : CodonTable diff --git a/src/biotite/sequence/graphics/__init__.py b/src/biotite/sequence/graphics/__init__.py index b1dbbf051..4b0b39b9f 100644 --- a/src/biotite/sequence/graphics/__init__.py +++ b/src/biotite/sequence/graphics/__init__.py @@ -29,5 +29,5 @@ from .colorschemes import * from .dendrogram import * from .features import * -from .plasmid import * from .logo import * +from .plasmid import * diff --git a/src/biotite/sequence/graphics/alignment.py b/src/biotite/sequence/graphics/alignment.py index 45b44e326..aeaaf5b24 100644 --- a/src/biotite/sequence/graphics/alignment.py +++ b/src/biotite/sequence/graphics/alignment.py @@ -4,10 +4,17 @@ __name__ = "biotite.sequence.graphics" __author__ = "Patrick Kunzmann" -__all__ = ["SymbolPlotter", "LetterPlotter", "LetterSimilarityPlotter", - "LetterTypePlotter","ArrayPlotter", - "plot_alignment", "plot_alignment_similarity_based", - "plot_alignment_type_based","plot_alignment_array"] +__all__ = [ + "SymbolPlotter", + "LetterPlotter", + "LetterSimilarityPlotter", + "LetterTypePlotter", + "ArrayPlotter", + "plot_alignment", + "plot_alignment_similarity_based", + "plot_alignment_type_based", + "plot_alignment_array", +] import abc import numpy as np @@ -81,8 +88,7 @@ class LetterPlotter(SymbolPlotter, metaclass=abc.ABCMeta): :class:`matplotlib.Text` instance of each symbol. 
""" - def __init__(self, axes, color_symbols=False, - font_size=None, font_param=None): + def __init__(self, axes, color_symbols=False, font_size=None, font_param=None): super().__init__(axes) self._color_symbols = color_symbols self._font_size = font_size @@ -101,9 +107,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): box = Rectangle(bbox.p0, bbox.width, bbox.height) self.axes.add_patch(box) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param) + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, + ) text.set_clip_on(True) if self._color_symbols: @@ -196,17 +208,16 @@ class LetterSimilarityPlotter(LetterPlotter): because *a* does also occur in *b*\ :sub:`i`. """ - def __init__(self, axes, matrix=None, color_symbols=False, - font_size=None, font_param=None): - + def __init__( + self, axes, matrix=None, color_symbols=False, font_size=None, font_param=None + ): super().__init__(axes, color_symbols, font_size, font_param) if matrix is not None: self._matrix = matrix.score_matrix() else: self._matrix = None # Default colormap - self._cmap = self._generate_colormap(colors["dimgreen"], - self._color_symbols) + self._cmap = self._generate_colormap(colors["dimgreen"], self._color_symbols) def set_color(self, color=None, cmap=None): """ @@ -257,8 +268,7 @@ def get_color(self, alignment, column_i, seq_i): similarities[i] = 0 else: code2 = alignment.sequences[i].code[index2] - similarities[i] = self._get_similarity(self._matrix, - code1, code2) + similarities[i] = self._get_similarity(self._matrix, code1, code2) # Delete self-similarity similarities = np.delete(similarities, seq_i) similarity = np.average(similarities) @@ -283,14 +293,18 @@ def _generate_colormap(color, to_black): if to_black: # From color to black cmap_val = np.stack( - 
[np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0]) + for i in range(len(color)) + ] ).transpose() else: # From white to color cmap_val = np.stack( - [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) + for i in range(len(color)) + ] ).transpose() return ListedColormap(cmap_val) @@ -325,8 +339,15 @@ class LetterTypePlotter(LetterPlotter): :class:`matplotlib.Text` instance of each symbol. """ - def __init__(self, axes, alphabet, color_scheme=None, color_symbols=False, - font_size=None, font_param=None): + def __init__( + self, + axes, + alphabet, + color_scheme=None, + color_symbols=False, + font_size=None, + font_param=None, + ): super().__init__(axes, color_symbols, font_size, font_param) if color_scheme is None: @@ -346,7 +367,7 @@ def get_color(self, alignment, column_i, seq_i): class ArrayPlotter(LetterPlotter): - ''' + """ This :class:`SymbolPlotter` quantitatively decorates sequences alignments, with molecular recognition data obtained from e.g. microarrays. Symbols are visualized as characters on a colored background box. The color of a given box represents the recognition @@ -371,15 +392,14 @@ class ArrayPlotter(LetterPlotter): Additional parameters that is given to the :class:`matplotlib.Text` instance of each symbol. 
- ''' - def __init__(self, axes, fl_score, color_symbols=False, - font_size=None, font_param=None): + """ + def __init__( + self, axes, fl_score, color_symbols=False, font_size=None, font_param=None + ): super().__init__(axes, color_symbols, font_size, font_param) self.fl_score = fl_score - self._cmap = self._generate_colormap(colors["dimorange"], - self._color_symbols) - + self._cmap = self._generate_colormap(colors["dimorange"], self._color_symbols) def get_color(self, alignment, column_i, seq_i): index1 = alignment.trace[column_i, seq_i] @@ -389,7 +409,6 @@ def get_color(self, alignment, column_i, seq_i): spot_signal = self._get_signal(self.fl_score, column_i, seq_i) return self._cmap(spot_signal) - def _get_signal(self, fl_score, column_i, seq_i): if fl_score is None: signal = 0.0 @@ -400,7 +419,6 @@ def _get_signal(self, fl_score, column_i, seq_i): def get_cmap(self): return self._cmap - def plot_symbol(self, bbox, alignment, column_i, seq_i): from matplotlib.patches import Rectangle @@ -422,9 +440,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): box = Rectangle(bbox.p0, bbox.width, bbox.height) self.axes.add_patch(box) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param) + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, + ) text.set_clip_on(True) if self._color_symbols: @@ -455,11 +479,20 @@ def _generate_colormap(color, to_black): return ListedColormap(cmap_val) -def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, - show_numbers=False, number_size=None, number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, symbol_spacing=None): +def plot_alignment( + axes, + alignment, + symbol_plotter, + symbols_per_line=50, + show_numbers=False, + number_size=None, + 
number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + symbol_spacing=None, +): """ Plot a pairwise or multiple sequence alignment. @@ -545,7 +578,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, ) for i, func in enumerate(number_functions): if func is None: - number_functions[i] = (lambda x: x + 1) + number_functions[i] = lambda x: x + 1 seq_num = alignment.trace.shape[1] seq_len = alignment.trace.shape[0] @@ -573,7 +606,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, for i in range(seq_len): y = y_start for j in range(seq_num): - bbox = Bbox([[x, y], [x+1, y+1]]) + bbox = Bbox([[x, y], [x + 1, y + 1]]) symbol_plotter.plot_symbol(bbox, alignment, i, j) y += 1 line_pos += 1 @@ -583,8 +616,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, y_start += seq_num + spacing else: x += 1 - if (symbol_spacing - and (i + 1) % symbol_spacing == 0): + if symbol_spacing and (i + 1) % symbol_spacing == 0: line_pos += 1 x += 1 @@ -613,14 +645,12 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, y = 0.5 for i in range(line_count): for j in range(seq_num): - if i == line_count-1: + if i == line_count - 1: # Last line -> get number of last column in trace trace_pos = len(alignment.trace) - 1 else: - trace_pos = (i+1) * symbols_per_line - 1 - seq_index = _get_last_valid_index( - alignment, trace_pos, j - ) + trace_pos = (i + 1) * symbols_per_line - 1 + seq_index = _get_last_valid_index(alignment, trace_pos, j) # if -1 -> terminal gap # -> skip number for this sequence in this line if seq_index != -1: @@ -636,18 +666,14 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, axes.set_xlim(0, symbols_to_print) # Y-axis starts from top - lim = seq_num*line_count + spacing*(line_count-1) + lim = seq_num * line_count + spacing * (line_count - 1) axes.set_ylim(lim, 0) number_axes.set_ylim(lim, 0) 
axes.set_frame_on(False) number_axes.set_frame_on(False) # Remove ticks and set label and number size - axes.yaxis.set_tick_params( - left=False, right=False, labelsize=label_size - ) - number_axes.yaxis.set_tick_params( - left=False, right=False, labelsize=number_size - ) + axes.yaxis.set_tick_params(left=False, right=False, labelsize=label_size) + number_axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size) if show_line_position: axes.xaxis.set_tick_params( @@ -659,15 +685,25 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, ) -def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, - color=None, cmap=None, matrix=None, - color_symbols=False, symbol_spacing=None, - symbol_size=None, symbol_param=None): +def plot_alignment_similarity_based( + axes, + alignment, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color=None, + cmap=None, + matrix=None, + color_symbols=False, + symbol_spacing=None, + symbol_size=None, + symbol_param=None, +): r""" Plot a pairwise or multiple sequence alignment highlighting the similarity per alignment column. @@ -788,31 +824,47 @@ def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50, because *a* does also occur in *b*\ :sub:`i`. 
""" symbol_plotter = LetterSimilarityPlotter( - axes, matrix=matrix, font_size=symbol_size, font_param=symbol_param, - color_symbols=color_symbols + axes, + matrix=matrix, + font_size=symbol_size, + font_param=symbol_param, + color_symbols=color_symbols, ) if color is not None or cmap is not None: symbol_plotter.set_color(color=color, cmap=cmap) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) -def plot_alignment_type_based(axes, alignment, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, - color_scheme=None, color_symbols=False, - symbol_size=None, symbol_param=None, - symbol_spacing=None): +def plot_alignment_type_based( + axes, + alignment, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color_scheme=None, + color_symbols=False, + symbol_size=None, + symbol_param=None, + symbol_spacing=None, +): """ Plot a pairwise or multiple sequence alignment coloring each symbol based on the symbol type. 
@@ -897,27 +949,48 @@ def plot_alignment_type_based(axes, alignment, symbols_per_line=50, """ alphabet = alignment.sequences[0].get_alphabet() symbol_plotter = LetterTypePlotter( - axes, alphabet, font_size=symbol_size, font_param=symbol_param, - color_symbols=color_symbols, color_scheme=color_scheme + axes, + alphabet, + font_size=symbol_size, + font_param=symbol_param, + color_symbols=color_symbols, + color_scheme=color_scheme, ) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) -def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, labels=None, label_size=None, - show_line_position=False, spacing=1, color=None, - cmap=None, symbol_spacing=None, - symbol_size=None, symbol_param=None): - ''' +def plot_alignment_array( + axes, + alignment, + fl_score, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color=None, + cmap=None, + symbol_spacing=None, + symbol_size=None, + symbol_param=None, +): + """ Plot a pairwise sequence alignment using an :class:`ArrayPlotter` instance. 
@@ -995,19 +1068,27 @@ def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50, A '*' represents a sequence match on the alignment A '-' represents a sequence gap on the alignment - ''' + """ symbol_plotter = ArrayPlotter( - axes, fl_score = fl_score, font_size = symbol_size, font_param = symbol_param, + axes, + fl_score=fl_score, + font_size=symbol_size, + font_param=symbol_param, ) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) diff --git a/src/biotite/sequence/graphics/colorschemes.py b/src/biotite/sequence/graphics/colorschemes.py index 049cddbb4..88044e0f4 100644 --- a/src/biotite/sequence/graphics/colorschemes.py +++ b/src/biotite/sequence/graphics/colorschemes.py @@ -6,11 +6,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_color_scheme", "list_color_scheme_names", "load_color_scheme"] -import numpy as np -import json -from os.path import join, dirname, realpath import glob +import json import os +from os.path import dirname, join, realpath from ..alphabet import Alphabet @@ -26,13 +25,13 @@ def load_color_scheme(file_name): ---------- file_name : str The file name of the JSON file containing the scheme. - + Returns ------- scheme : dict A dictionary representing the color scheme, It contains the following keys, if the input file is proper: - + - **name** - Name of the scheme. - **alphabet** - :class:`Alphabet` instance describing the type of sequence the scheme can be used for. 
@@ -71,7 +70,7 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"): default : str or tuple, optional A *Matplotlib* compatible color that is used for symbols that have no defined color in the scheme. - + Returns ------- colors : list @@ -99,11 +98,10 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"): if scheme["name"] == name and scheme["alphabet"].extends(alphabet): colors = scheme["colors"] # Replace None values with default color - colors = [color if color is not None else default - for color in colors] + colors = [color if color is not None else default for color in colors] # Only return colors that are in scope of this alphabet # and not the extended alphabet - return colors[:len(alphabet)] + return colors[: len(alphabet)] raise ValueError(f"Unkown scheme '{name}' for given alphabet") @@ -117,7 +115,7 @@ def list_color_scheme_names(alphabet): The alphbet to get the color scheme names for. The alphabet of the scheme must equal or extend this parameter, to be included in the list. - + Returns ------- schemes : list of str @@ -136,4 +134,4 @@ def list_color_scheme_names(alphabet): for file_name in glob.glob(_scheme_dir + os.sep + "*.json"): scheme = load_color_scheme(file_name) - _color_schemes.append(scheme) \ No newline at end of file + _color_schemes.append(scheme) diff --git a/src/biotite/sequence/graphics/dendrogram.py b/src/biotite/sequence/graphics/dendrogram.py index f351c891f..254702443 100644 --- a/src/biotite/sequence/graphics/dendrogram.py +++ b/src/biotite/sequence/graphics/dendrogram.py @@ -8,9 +8,18 @@ import numpy as np -def plot_dendrogram(axes, tree, orientation="left", use_distances=True, - labels=None, label_size=None, color="black", - show_distance=True, **kwargs): + +def plot_dendrogram( + axes, + tree, + orientation="left", + use_distances=True, + labels=None, + label_size=None, + color="black", + show_distance=True, + **kwargs, +): """ Plot a dendrogram from a (phylogenetic) tree. 
@@ -24,7 +33,7 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, If true, the `distance` attribute of the :class:`TreeNode` objects are used as distance measure. Otherwise the topological distance is used. - labels : list of str, optional + labels : list of str, optional The leaf node labels. The label of a leaf node is the entry at the position of its `index` attribute. @@ -40,9 +49,9 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, Additional parameters that are used to draw the dendrogram lines. """ - + indices = tree.root.get_indices() - leaf_dict = {indices[i] : i for i in indices} + leaf_dict = {indices[i]: i for i in indices} # Required for setting the plot limits max_distance = 0 @@ -50,12 +59,12 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, def _plot_node(node, distance): """ Draw the lines from the given node to its children. - + Parameters ---------- dist : float the distance of the node from root - + Returns ------- pos : float @@ -88,31 +97,43 @@ def _plot_node(node, distance): if orientation in ["left", "right"]: # Line connecting the childs axes.plot( - [distance, distance], [child_pos[0], child_pos[-1]], - color=color, marker="None", **kwargs + [distance, distance], + [child_pos[0], child_pos[-1]], + color=color, + marker="None", + **kwargs, ) # Lines depicting the distances of the childs for child_dist, pos in zip(child_distances, child_pos): axes.plot( - [distance, child_dist], [pos, pos], - color=color, marker="None", **kwargs + [distance, child_dist], + [pos, pos], + color=color, + marker="None", + **kwargs, ) elif orientation in ["bottom", "top"]: # Line connecting the childs axes.plot( - [child_pos[0], child_pos[-1]], [distance, distance], - color=color, marker="None", **kwargs + [child_pos[0], child_pos[-1]], + [distance, distance], + color=color, + marker="None", + **kwargs, ) # Lines depicting the distances of the childs for child_dist, pos in 
zip(child_distances, child_pos): axes.plot( - [pos, pos], [distance, child_dist], - color=color, marker="None", **kwargs + [pos, pos], + [distance, child_dist], + color=color, + marker="None", + **kwargs, ) else: raise ValueError(f"'{orientation}' is not a valid orientation") return center_pos - + _plot_node(tree.root, 0) if labels is not None: @@ -133,12 +154,18 @@ def _plot_node(node, distance): axes.set_yticks(np.arange(0, len(indices))) axes.set_yticklabels(labels) axes.yaxis.set_tick_params( - left=False, right=False, labelleft=False, labelright=True, - labelsize=label_size + left=False, + right=False, + labelleft=False, + labelright=True, + labelsize=label_size, ) axes.xaxis.set_tick_params( - bottom=True, top=False, labelbottom=show_distance, labeltop=False, - labelsize=label_size + bottom=True, + top=False, + labelbottom=show_distance, + labeltop=False, + labelsize=label_size, ) elif orientation == "right": axes.set_xlim(max_distance, zero_limit) @@ -146,12 +173,18 @@ def _plot_node(node, distance): axes.set_yticks(np.arange(0, len(indices))) axes.set_yticklabels(labels) axes.yaxis.set_tick_params( - left=False, right=False, labelleft=True, labelright=False, - labelsize=label_size + left=False, + right=False, + labelleft=True, + labelright=False, + labelsize=label_size, ) axes.xaxis.set_tick_params( - bottom=True, top=False, labelbottom=show_distance, labeltop=False, - labelsize=label_size + bottom=True, + top=False, + labelbottom=show_distance, + labeltop=False, + labelsize=label_size, ) elif orientation == "bottom": axes.set_ylim(zero_limit, max_distance) @@ -159,12 +192,18 @@ def _plot_node(node, distance): axes.set_xticks(np.arange(0, len(indices))) axes.set_xticklabels(labels) axes.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=False, labeltop=True, - labelsize=label_size + bottom=False, + top=False, + labelbottom=False, + labeltop=True, + labelsize=label_size, ) axes.yaxis.set_tick_params( - left=True, right=False, 
labelleft=show_distance, labelright=False, - labelsize=label_size + left=True, + right=False, + labelleft=show_distance, + labelright=False, + labelsize=label_size, ) elif orientation == "top": axes.set_ylim(max_distance, zero_limit) @@ -172,13 +211,19 @@ def _plot_node(node, distance): axes.set_xticks(np.arange(0, len(indices))) axes.set_xticklabels(labels) axes.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=True, labeltop=False, - labelsize=label_size + bottom=False, + top=False, + labelbottom=True, + labeltop=False, + labelsize=label_size, ) axes.yaxis.set_tick_params( - left=True, right=False, labelleft=show_distance, labelright=False, - labelsize=label_size + left=True, + right=False, + labelleft=show_distance, + labelright=False, + labelsize=label_size, ) else: raise ValueError(f"'{orientation}' is not a valid orientation") - axes.set_frame_on(False) \ No newline at end of file + axes.set_frame_on(False) diff --git a/src/biotite/sequence/graphics/features.py b/src/biotite/sequence/graphics/features.py index e3c6711ee..031ab04dc 100644 --- a/src/biotite/sequence/graphics/features.py +++ b/src/biotite/sequence/graphics/features.py @@ -4,22 +4,35 @@ __name__ = "biotite.sequence.graphics" __author__ = "Patrick Kunzmann" -__all__ = ["plot_feature_map", "FeaturePlotter", "MiscFeaturePlotter", - "CodingPlotter", "PromoterPlotter", "TerminatorPlotter", - "RBSPlotter"] +__all__ = [ + "plot_feature_map", + "FeaturePlotter", + "MiscFeaturePlotter", + "CodingPlotter", + "PromoterPlotter", + "TerminatorPlotter", + "RBSPlotter", +] -import copy import abc -import numpy as np -from ...visualize import colors, AdaptiveFancyArrow -from ..annotation import Annotation, Feature, Location - - -def plot_feature_map(axes, annotation, loc_range=None, - multi_line=True, symbols_per_line=1000, - show_numbers=False, number_size=None, line_width=0.05, - show_line_position=False, spacing=0.25, - feature_plotters=None, style_param=None): +from ...visualize import 
AdaptiveFancyArrow, colors +from ..annotation import Location + + +def plot_feature_map( + axes, + annotation, + loc_range=None, + multi_line=True, + symbols_per_line=1000, + show_numbers=False, + number_size=None, + line_width=0.05, + show_line_position=False, + spacing=0.25, + feature_plotters=None, + style_param=None, +): """ Plot a sequence annotation, by showing the range of each feature on one or multiple position depicting line(s). @@ -87,8 +100,8 @@ def plot_feature_map(axes, annotation, loc_range=None, features. When two features overlap, their drawing area does also overlap. """ - from matplotlib.transforms import Bbox from matplotlib.patches import Rectangle + from matplotlib.transforms import Bbox if loc_range is None: loc_range = annotation.get_location_range() @@ -98,13 +111,13 @@ def plot_feature_map(axes, annotation, loc_range=None, else: # Line length covers the entire location range symbols_per_line = loc_range_length - + plotters = [ PromoterPlotter(), TerminatorPlotter(), RBSPlotter(), CodingPlotter(), - MiscFeaturePlotter() + MiscFeaturePlotter(), ] if feature_plotters is not None: plotters = list(feature_plotters) + plotters @@ -116,7 +129,6 @@ def plot_feature_map(axes, annotation, loc_range=None, if loc_range_length % symbols_per_line != 0: line_count += 1 - ### Draw lines ### remaining_symbols = loc_range_length y = 0.5 @@ -127,14 +139,19 @@ def plot_feature_map(axes, annotation, loc_range=None, else: # Last line -> Line spans to end of annotation line_length = remaining_symbols - axes.add_patch(Rectangle( - (0, y-line_width/2), line_length, line_width, - color="gray", linewidth=0 - )) + axes.add_patch( + Rectangle( + (0, y - line_width / 2), + line_length, + line_width, + color="gray", + linewidth=0, + ) + ) # Increment by spacing and width (=1) of feature y += spacing + 1 remaining_symbols -= symbols_per_line - + ### Draw features ### line_start_loc = loc_range[0] y = 0 @@ -160,15 +177,12 @@ def plot_feature_map(axes, annotation, 
loc_range=None, width = loc_len height = 1 bbox = Bbox.from_bounds(x, y, width, height) - plotter.draw( - axes, feature, bbox, loc, - style_param=style_param - ) + plotter.draw(axes, feature, bbox, loc, style_param=style_param) # Increment by spacing and width (=1) of feature y += spacing + 1 remaining_symbols += symbols_per_line line_start_loc += symbols_per_line - + ### Draw position numbers ### ticks = [] tick_labels = [] @@ -176,11 +190,11 @@ def plot_feature_map(axes, annotation, loc_range=None, # Numbers at center height of each feature line -> 0.5 y = 0.5 for i in range(line_count): - if i == line_count-1: + if i == line_count - 1: # Last line -> get number of last column in trace - loc = loc_range[1] -1 + loc = loc_range[1] - 1 else: - loc = loc_range[0] + ((i+1) * symbols_per_line) -1 + loc = loc_range[0] + ((i + 1) * symbols_per_line) - 1 ticks.append(y) tick_labels.append(str(loc)) # Increment by spacing and width of feature (1) @@ -188,20 +202,17 @@ def plot_feature_map(axes, annotation, loc_range=None, axes.set_yticks(ticks) axes.set_yticklabels(tick_labels) - axes.set_xlim(0, symbols_per_line) # Y-axis starts from top - axes.set_ylim(1*line_count + spacing*(line_count-1), 0) + axes.set_ylim(1 * line_count + spacing * (line_count - 1), 0) axes.set_frame_on(False) # Draw location numbers on right side axes.get_yaxis().set_tick_params( left=False, right=False, labelleft=False, labelright=True ) # Remove ticks and set number font size - axes.yaxis.set_tick_params( - left=False, right=False, labelsize=number_size - ) - + axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size) + if show_line_position: axes.xaxis.set_tick_params( top=False, bottom=True, labeltop=False, labelbottom=True @@ -236,7 +247,7 @@ def matches(self, feature): ---------- feature : Feature The sequence feature to be checked. - + Returns ------- compatibility : bool @@ -244,7 +255,7 @@ def matches(self, feature): false otherwise. 
""" pass - + @abc.abstractmethod def draw(self, axes, feature, bbox, location, style_param): """ @@ -284,7 +295,7 @@ class CodingPlotter(FeaturePlotter): The width of the arrow head as fraction of the feature drawing area height. """ - + def __init__(self, tail_width=0.5, head_width=0.8): self._tail_width = tail_width self._head_width = head_width @@ -294,9 +305,9 @@ def matches(self, feature): return True else: return False - + def draw(self, axes, feature, bbox, loc, style_param): - y = bbox.y0 + bbox.height/2 + y = bbox.y0 + bbox.height / 2 dy = 0 if loc.strand == Location.Strand.FORWARD: x = bbox.x0 @@ -304,25 +315,35 @@ def draw(self, axes, feature, bbox, loc, style_param): else: x = bbox.x1 dx = -bbox.width - - if ( - loc.strand == Location.Strand.FORWARD - and loc.defect & Location.Defect.MISS_RIGHT - ) or ( - loc.strand == Location.Strand.REVERSE - and loc.defect & Location.Defect.MISS_LEFT - ): - # If the feature extends into the prevoius or next line - # do not draw an arrow head - draw_head = False + + if ( + loc.strand == Location.Strand.FORWARD + and loc.defect & Location.Defect.MISS_RIGHT + ) or ( + loc.strand == Location.Strand.REVERSE + and loc.defect & Location.Defect.MISS_LEFT + ): + # If the feature extends into the prevoius or next line + # do not draw an arrow head + draw_head = False else: - draw_head = True - + draw_head = True + # Create head with 90 degrees tip -> head width/length ratio = 1/2 - axes.add_patch(AdaptiveFancyArrow( - x, y, dx, dy, self._tail_width, self._head_width, head_ratio=0.5, - draw_head=draw_head, color=colors["dimgreen"], linewidth=0 - )) + axes.add_patch( + AdaptiveFancyArrow( + x, + y, + dx, + dy, + self._tail_width, + self._head_width, + head_ratio=0.5, + draw_head=draw_head, + color=colors["dimgreen"], + linewidth=0, + ) + ) if feature.key == "CDS": if "product" not in feature.qual: @@ -332,17 +353,23 @@ def draw(self, axes, feature, bbox, loc, style_param): else: label = feature.qual["product"] elif feature.key 
== "gene": - if "gene" not in feature.qual: + if "gene" not in feature.qual: label = None else: label = feature.qual["gene"] - + if label is not None: - center_x = bbox.x0 + bbox.width/2 - center_y = bbox.y0 + bbox.height/2 + center_x = bbox.x0 + bbox.width / 2 + center_y = bbox.y0 + bbox.height / 2 axes.text( - center_x, center_y, label, color="black", - ha="center", va="center", size=11) + center_x, + center_y, + label, + color="black", + ha="center", + va="center", + size=11, + ) class MiscFeaturePlotter(FeaturePlotter): @@ -363,17 +390,20 @@ def __init__(self, height=0.4): def matches(self, feature): return True - + def draw(self, axes, feature, bbox, loc, style_param): from matplotlib.patches import Rectangle rect = Rectangle( - (bbox.x0, bbox.y0 + bbox.height/2 * (1-self._height)), - bbox.width, bbox.height*self._height, - color=colors["dimorange"], linewidth=0 + (bbox.x0, bbox.y0 + bbox.height / 2 * (1 - self._height)), + bbox.width, + bbox.height * self._height, + color=colors["dimorange"], + linewidth=0, ) axes.add_patch(rect) + class PromoterPlotter(FeaturePlotter): """ A plotter for *regulatory* features with the *promoter* or @@ -394,8 +424,7 @@ class PromoterPlotter(FeaturePlotter): as fraction of the halffeature drawing area height. 
""" - def __init__(self, line_width=2, head_width=2, - head_length=6, head_height=0.8): + def __init__(self, line_width=2, head_width=2, head_length=6, head_height=0.8): self._line_width = line_width self._head_width = head_width self._head_length = head_length @@ -404,43 +433,42 @@ def __init__(self, line_width=2, head_width=2, def matches(self, feature): if feature.key == "regulatory": if "regulatory_class" in feature.qual: - if feature.qual["regulatory_class"] in ["promoter","TATA_box"]: + if feature.qual["regulatory_class"] in ["promoter", "TATA_box"]: return True return False - + def draw(self, axes, feature, bbox, loc, style_param): - from matplotlib.patches import FancyArrowPatch, ArrowStyle + from matplotlib.patches import ArrowStyle, FancyArrowPatch from matplotlib.path import Path - x_center = bbox.x0 + bbox.width/2 - y_center = bbox.y0 + bbox.height/2 + x_center = bbox.x0 + bbox.width / 2 + y_center = bbox.y0 + bbox.height / 2 path = Path( vertices=[ (bbox.x0, y_center), - (bbox.x0, y_center - bbox.height/2 * self._head_height), - (bbox.x1, y_center - bbox.height/2 * self._head_height), + (bbox.x0, y_center - bbox.height / 2 * self._head_height), + (bbox.x1, y_center - bbox.height / 2 * self._head_height), ], - codes=[ - Path.MOVETO, - Path.CURVE3, - Path.CURVE3 - ] + codes=[Path.MOVETO, Path.CURVE3, Path.CURVE3], ) style = ArrowStyle.CurveFilledB( head_width=self._head_width, head_length=self._head_length ) arrow = FancyArrowPatch( - path=path, arrowstyle=style, linewidth=self._line_width, - color="black" + path=path, arrowstyle=style, linewidth=self._line_width, color="black" ) axes.add_patch(arrow) - + if "note" in feature.qual: axes.text( - x_center, y_center + bbox.height/4, feature.qual["note"], - color="black", ha="center", va="center", - size=9 + x_center, + y_center + bbox.height / 4, + feature.qual["note"], + color="black", + ha="center", + va="center", + size=9, ) @@ -465,14 +493,17 @@ def matches(self, feature): if 
feature.qual["regulatory_class"] == "terminator": return True return False - - def draw(self, axes, feature, bbox, loc, style_param): - x = bbox.x0 + bbox.width/2 + def draw(self, axes, feature, bbox, loc, style_param): + x = bbox.x0 + bbox.width / 2 axes.plot( - (x, x), (bbox.y0, bbox.y1), color="black", - linestyle="-", linewidth=self._bar_width, marker="None" + (x, x), + (bbox.y0, bbox.y1), + color="black", + linestyle="-", + linewidth=self._bar_width, + marker="None", ) @@ -499,12 +530,15 @@ def matches(self, feature): if feature.qual["regulatory_class"] == "ribosome_binding_site": return True return False - + def draw(self, axes, feature, bbox, loc, style_param): from matplotlib.patches import Ellipse ellipse = Ellipse( - (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2), - bbox.width, self._height*bbox.height, - color=colors["dimorange"], linewidth=0) - axes.add_patch(ellipse) \ No newline at end of file + (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2), + bbox.width, + self._height * bbox.height, + color=colors["dimorange"], + linewidth=0, + ) + axes.add_patch(ellipse) diff --git a/src/biotite/sequence/graphics/logo.py b/src/biotite/sequence/graphics/logo.py index 7de7d0c39..42995814a 100644 --- a/src/biotite/sequence/graphics/logo.py +++ b/src/biotite/sequence/graphics/logo.py @@ -10,9 +10,6 @@ from ...visualize import set_font_size_in_coord from ..alphabet import LetterAlphabet from .colorschemes import get_color_scheme -import warnings -from ..align import Alignment -from .. 
import SequenceProfile def plot_sequence_logo(axes, profile, scheme=None, **kwargs): @@ -61,10 +58,10 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs): # 'color' and 'size' property is not passed on to text kwargs.pop("color", None) - kwargs.pop("size", None) + kwargs.pop("size", None) frequencies, entropies, max_entropy = _get_entropy(profile) - stack_heights = (max_entropy - entropies) + stack_heights = max_entropy - entropies symbols_heights = stack_heights[:, np.newaxis] * frequencies index_order = np.argsort(symbols_heights, axis=1) for i in range(symbols_heights.shape[0]): @@ -73,21 +70,25 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs): start_height = 0 for j in index_order[i]: # Stack the symbols at position on top of the preceeding one - height = symbols_heights[i,j] + height = symbols_heights[i, j] if height > 0: symbol = alphabet.decode(j) text = axes.text( - i+0.5, start_height, symbol, - ha="left", va="bottom", color=colors[j], + i + 0.5, + start_height, + symbol, + ha="left", + va="bottom", + color=colors[j], # Best results are obtained with this font size size=1, - **kwargs + **kwargs, ) text.set_clip_on(True) set_font_size_in_coord(text, width=1, height=height) start_height += height - axes.set_xlim(0.5, len(profile.symbols)+0.5) + axes.set_xlim(0.5, len(profile.symbols) + 0.5) axes.set_ylim(0, max_entropy) @@ -97,8 +98,7 @@ def _get_entropy(profile): # 0 * log2(0) = 0 -> Convert NaN to 0 no_zeros = freq != 0 pre_entropies = np.zeros(freq.shape) - pre_entropies[no_zeros] \ - = freq[no_zeros] * np.log2(freq[no_zeros]) + pre_entropies[no_zeros] = freq[no_zeros] * np.log2(freq[no_zeros]) entropies = -np.sum(pre_entropies, axis=1) max_entropy = np.log2(len(profile.alphabet)) - return freq, entropies, max_entropy \ No newline at end of file + return freq, entropies, max_entropy diff --git a/src/biotite/sequence/graphics/plasmid.py b/src/biotite/sequence/graphics/plasmid.py index 8527dc8d7..3869c36d2 100644 --- 
a/src/biotite/sequence/graphics/plasmid.py +++ b/src/biotite/sequence/graphics/plasmid.py @@ -6,20 +6,29 @@ __author__ = "Patrick Kunzmann" __all__ = ["plot_plasmid_map"] -import copy +import re import warnings -import abc import numpy as np -import re from ...visualize import colors -from ..annotation import Annotation, Feature, Location - - -def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, - tick_step=200, ring_width=0.01, feature_width=0.06, - spacing=0.01, arrow_head_length=0.04, label=None, - face_properties=None, label_properties=None, - omit_oversized_labels=True, feature_formatter=None): +from ..annotation import Feature, Location + + +def plot_plasmid_map( + axes, + annotation, + plasmid_size, + tick_length=0.02, + tick_step=200, + ring_width=0.01, + feature_width=0.06, + spacing=0.01, + arrow_head_length=0.04, + label=None, + face_properties=None, + label_properties=None, + omit_oversized_labels=True, + feature_formatter=None, +): """ Plot a plasmid map using the sequence features in the given :class:`Annotation`. @@ -84,26 +93,26 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, the following tuple: - *directional* : bool - + True, if the direction of the feature should be indicated by an arrow. Otherwise, the feature is plotted is arc. - + - *face_color* : tuple or str, optional - + A *Matplotlib* compatible color for the feature arrow/arc. - + - *label_color* : tuple or str, optional - + A *Matplotlib* compatible color for the feature label. - + - *label* : str or None - + The label to be displayed for this feature. None, if no label should be displayed. 
""" from matplotlib.projections.polar import PolarAxes - + if not isinstance(axes, PolarAxes): raise TypeError("The given axes must be a 'PolarAxes'") @@ -118,16 +127,13 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, if feature_formatter is None: feature_formatter = _default_feature_formatter - ### Setup matplotlib ### # The x-coordinate is given as angle (rad) # Full circle -> 2*pi - axes.set_xlim(0, 2*np.pi) + axes.set_xlim(0, 2 * np.pi) axes.set_ylim(0, 1) axes.yaxis.set_visible(False) - axes.xaxis.set_tick_params( - bottom=False, labelbottom=True - ) + axes.xaxis.set_tick_params(bottom=False, labelbottom=True) axes.set_theta_zero_location("N") axes.set_theta_direction("clockwise") axes.spines["polar"].set_visible(False) @@ -142,32 +148,39 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, axes.xaxis.set_ticks([_loc_to_rad(tick, plasmid_size) for tick in ticks]) axes.xaxis.set_ticklabels(tick_labels) ### Draw plasmid ring with ticks and central label ### - + # Plasmid ring # Use 'barh()' instead of a Rectangle patch to ensure that the axes # is properly initialized # Otherwise the feature rectangles are not curved, but straight axes.barh( - 1-ring_width-tick_length, 2*np.pi, ring_width, - align="edge", color="black" + 1 - ring_width - tick_length, 2 * np.pi, ring_width, align="edge", color="black" ) - + # Ticks (ticks itself, not the tick labels) for tick in ticks: angle = _loc_to_rad(tick, plasmid_size) axes.plot( - (angle, angle), (1-tick_length, 1), - color="black", linewidth=1, linestyle="-" + (angle, angle), + (1 - tick_length, 1), + color="black", + linewidth=1, + linestyle="-", ) - + # Central plasmid label if label is not None: axes.text( - 0, 0, label, ha="center", va="center", - color="black", size=32, fontweight="bold" + 0, + 0, + label, + ha="center", + va="center", + color="black", + size=32, + fontweight="bold", ) - ### Draw plasmid interior ### inner_radius = 1 - ring_width - tick_length features = 
sorted( @@ -177,28 +190,51 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, ], # Features are sorted by the length of their location range # The shortest come first - key = lambda feature: np.diff(feature.get_location_range())[0], - reverse = True + key=lambda feature: np.diff(feature.get_location_range())[0], + reverse=True, + ) + axes.add_artist( + PlasmidMap( + axes, + 0, + features, + plasmid_size, + inner_radius, + feature_width, + spacing, + arrow_head_length, + label, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ) ) - axes.add_artist(PlasmidMap( - axes, 0, features, plasmid_size, inner_radius, feature_width, spacing, - arrow_head_length, label, face_properties, label_properties, - omit_oversized_labels, feature_formatter - )) try: # Only create these classes when matplotlib is installed from matplotlib.artist import Artist + from matplotlib.patches import Polygon, Rectangle from matplotlib.transforms import Bbox - from matplotlib.patches import Rectangle, Polygon - class PlasmidMap(Artist): - def __init__(self, axes, zorder, features, plasmid_size, radius, - feature_width, spacing, arrow_head_length, label, - face_properties, label_properties, omit_oversized_labels, - feature_formatter): + def __init__( + self, + axes, + zorder, + features, + plasmid_size, + radius, + feature_width, + spacing, + arrow_head_length, + label, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ): super().__init__() self._axes = axes self.zorder = zorder @@ -212,30 +248,36 @@ def __init__(self, axes, zorder, features, plasmid_size, radius, for feature in features: indicators_for_feature = [] for loc in feature.locs: - # Set proper positions in 'draw()' method + # Set proper positions in 'draw()' method bbox = Bbox.from_extents(0, 0, 0, 0) # Draw features as curved arrows (feature indicator) - indicator = axes.add_artist(Feature_Indicator( - axes, self.zorder + 1, feature, loc, 
bbox, - arrow_head_length, face_properties, label_properties, - omit_oversized_labels, feature_formatter - )) + indicator = axes.add_artist( + Feature_Indicator( + axes, + self.zorder + 1, + feature, + loc, + bbox, + arrow_head_length, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ) + ) indicators_for_feature.append(indicator) self._all_indicators.append(indicators_for_feature) - def draw(self, renderer, *args, **kwargs): # Find the maximum amount of feature rows # (used for overlapping features) - row_count = int( - self._radius // (self._feature_width + self._spacing) - ) + row_count = int(self._radius // (self._feature_width + self._spacing)) # Tracks the location ranges of feature that were added to # a row in order to check if that row is occupied ranges_in_row = [[] for i in range(row_count)] # Stores the bottom coordinate (radius) for each row row_bottoms = [ - self._radius - (row+1) * (self._feature_width + self._spacing) + self._radius - (row + 1) * (self._feature_width + self._spacing) for row in range(row_count) ] @@ -258,11 +300,13 @@ def draw(self, renderer, *args, **kwargs): # 'Normal feature' if first <= curr_last and last >= curr_first: is_occupied = True - else: # first < 1 + else: # first < 1 # Location is over periodic boundary - if first + self._plasmid_size <= curr_last \ - or last >= curr_first: - is_occupied = True + if ( + first + self._plasmid_size <= curr_last + or last >= curr_first + ): + is_occupied = True if not is_occupied: # Row is not occupied by another feature # in the location range of the new feature @@ -273,12 +317,10 @@ def draw(self, renderer, *args, **kwargs): else: # Location is over periodic boundary # Split into 'end' and 'start' part - ranges_in_row[row_i].append(( - first + self._plasmid_size, self._plasmid_size - )) - ranges_in_row[row_i].append(( - 1, last - )) + ranges_in_row[row_i].append( + (first + self._plasmid_size, self._plasmid_size) + ) + 
ranges_in_row[row_i].append((1, last)) row_bottom = row_bottoms[row_i] break if row_bottom is None: @@ -288,24 +330,31 @@ def draw(self, renderer, *args, **kwargs): "radius or decrease the feature width or spacing" ) else: - for loc, indicator in zip( - feature.locs, indicators_for_feature - ): + for loc, indicator in zip(feature.locs, indicators_for_feature): # Calculate arrow shape parameters - row_center = row_bottom + self._feature_width/2 + row_center = row_bottom + self._feature_width / 2 row_top = row_bottom + self._feature_width start_ang = _loc_to_rad(loc.first, self._plasmid_size) - stop_ang = _loc_to_rad(loc.last, self._plasmid_size) + stop_ang = _loc_to_rad(loc.last, self._plasmid_size) bbox = Bbox.from_extents( start_ang, row_bottom, stop_ang, row_top ) indicator.set_bbox(bbox) - class Feature_Indicator(Artist): - def __init__(self, axes, zorder, feature, loc, bbox, head_length, - arrow_properties, label_properties, omit_oversized_labels, - feature_formatter): + def __init__( + self, + axes, + zorder, + feature, + loc, + bbox, + head_length, + arrow_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ): super().__init__() self._axes = axes self.zorder = zorder @@ -313,44 +362,59 @@ def __init__(self, axes, zorder, feature, loc, bbox, head_length, self._bbox = bbox self._head_length = head_length self._omit_oversized_labels = omit_oversized_labels - + # Determine how to draw the feature - directional, face_color, label_color, label \ - = feature_formatter(feature) - + directional, face_color, label_color, label = feature_formatter(feature) + # Draw arrow as composition of a rectangle and a triangle, # as FancyArrow does not properly work for polar plots - self._arrow_tail = axes.add_patch(Rectangle( - # Set positions in 'draw()' method - (0, 0), 0, 0, - # Line width is set to 1 to avoid strange artifact in - # the transition from rectangle (tail) to polygon (head) - color=face_color, linewidth=1, zorder = self.zorder + 1, - 
**arrow_properties - )) - + self._arrow_tail = axes.add_patch( + Rectangle( + # Set positions in 'draw()' method + (0, 0), + 0, + 0, + # Line width is set to 1 to avoid strange artifact in + # the transition from rectangle (tail) to polygon (head) + color=face_color, + linewidth=1, + zorder=self.zorder + 1, + **arrow_properties, + ) + ) + if directional: # Only draw any arrow head when feature has a direction, # otherwise simply draw the tail (rectangle) - self._arrow_head = axes.add_patch(Polygon( - # Set positions in 'draw()' method - [(0, 0), (0, 0), (0, 0)], - color=face_color, linewidth=1, zorder = self.zorder + 1, - **arrow_properties - )) + self._arrow_head = axes.add_patch( + Polygon( + # Set positions in 'draw()' method + [(0, 0), (0, 0), (0, 0)], + color=face_color, + linewidth=1, + zorder=self.zorder + 1, + **arrow_properties, + ) + ) else: self._arrow_head = None if label is not None: label_properties["color"] = label_color - self._label = axes.add_artist(CurvedText( - # Set positions in 'draw()' method - axes, self.zorder + 1, 0, 0, label, label_properties - )) + self._label = axes.add_artist( + CurvedText( + # Set positions in 'draw()' method + axes, + self.zorder + 1, + 0, + 0, + label, + label_properties, + ) + ) else: self._label = None - def set_bbox(self, bbox): self._bbox = bbox @@ -359,7 +423,6 @@ def set_bbox(self, bbox): if self._label is not None: self._label.set_position(center_x, center_y) - def draw(self, renderer, *args, **kwargs): bbox = self._bbox center_x = (bbox.x0 + bbox.x1) / 2 @@ -369,7 +432,7 @@ def draw(self, renderer, *args, **kwargs): # irrespective of the radius in the polar plot # Calculate actual angle from given absolute width head_length = self._head_length / center_y - + # Check if the head should be drawn if self._arrow_head is None: head_length = 0 @@ -382,39 +445,38 @@ def draw(self, renderer, *args, **kwargs): rect_pos = (bbox.x0, bbox.y0) # (x0, y0), (x1, y1), (x2, y2) triangle_coord = [ - (bbox.x1 - head_length, 
bbox.y0), # base 1 - (bbox.x1 - head_length, bbox.y1), # base 2 - (bbox.x1, center_y) # tip + (bbox.x1 - head_length, bbox.y0), # base 1 + (bbox.x1 - head_length, bbox.y1), # base 2 + (bbox.x1, center_y), # tip ] else: - rect_pos = (bbox.x0+head_length, bbox.y0) + rect_pos = (bbox.x0 + head_length, bbox.y0) triangle_coord = [ - (bbox.x0 + head_length, bbox.y0), # base 1 - (bbox.x0 + head_length, bbox.y1), # base 2 - (bbox.x0, center_y) # tip + (bbox.x0 + head_length, bbox.y0), # base 1 + (bbox.x0 + head_length, bbox.y1), # base 2 + (bbox.x0, center_y), # tip ] - + # Update coordinates of sub-artists self._arrow_tail.set_xy(rect_pos) - self._arrow_tail.set_width(bbox.width-head_length) + self._arrow_tail.set_width(bbox.width - head_length) self._arrow_tail.set_height(bbox.height) if self._arrow_head is not None: self._arrow_head.set_xy(triangle_coord) - + if self._label is not None: # Do not draw the labels if it is larger than the # indicator - if self._omit_oversized_labels \ - and self._label.get_total_angle(renderer) > bbox.width: - self._label.set_visible(False) + if ( + self._omit_oversized_labels + and self._label.get_total_angle(renderer) > bbox.width + ): + self._label.set_visible(False) else: self._label.set_visible(True) - - class CurvedText(Artist): - def __init__(self, axes, zorder, angle, radius, string, - text_properties): + def __init__(self, axes, zorder, angle, radius, string, text_properties): super().__init__() self._axes = axes self.zorder = zorder @@ -425,35 +487,34 @@ def __init__(self, axes, zorder, angle, radius, string, for word in _split_into_words(string): text = axes.text( # Set position in 'draw()' method - 0, 0, + 0, + 0, word, - ha="center", va="center", + ha="center", + va="center", zorder=self.zorder + 1, **text_properties, ) self._texts.append(text) - def set_visible(self, visible): super().set_visible(visible) for text in self._texts: text.set_visible(visible) - def set_position(self, angle, radius): self._angle = angle 
self._radius = radius - def get_total_angle(self, renderer): return np.sum(self.get_word_angles(renderer)) - def get_word_angles(self, renderer): ax_px_radius = self._axes.get_window_extent(renderer).width / 2 ax_unit_radius = self._axes.get_ylim()[1] - circle_px_circumference = ax_px_radius * 2*np.pi \ - * (self._radius / ax_unit_radius) + circle_px_circumference = ( + ax_px_radius * 2 * np.pi * (self._radius / ax_unit_radius) + ) rad_angle = 360 - np.rad2deg(self._angle) # Avoid to draw the text upside down, when drawn on the @@ -462,7 +523,7 @@ def get_word_angles(self, renderer): turn_around = True else: turn_around = False - + angles = [] for text in self._texts: orig_rot = text.get_rotation() @@ -477,14 +538,12 @@ def get_word_angles(self, renderer): # In this case, assign a fixed width if np.isnan(word_px_width): word_px_width = 5.0 - word_angle \ - = 2*np.pi * word_px_width / circle_px_circumference + word_angle = 2 * np.pi * word_px_width / circle_px_circumference angles.append(word_angle) # Restore text.set_rotation(orig_rot) text.set_visible(orig_visible) return angles - def draw(self, renderer, *args, **kwargs): angles = self.get_word_angles(renderer) @@ -497,7 +556,7 @@ def draw(self, renderer, *args, **kwargs): turn_around = True else: turn_around = False - + # Now that the angle for each word is known, # the appropriate position and rotation can be set if turn_around: @@ -526,20 +585,18 @@ def draw(self, renderer, *args, **kwargs): pass - - def _loc_to_rad(loc, plasmid_size): if loc > plasmid_size: raise ValueError( f"Location {loc} is larger then the plasmid size of {plasmid_size}" ) # Location starts at 1 -> (loc-1) - return ((loc-1) / plasmid_size) * 2*np.pi + return ((loc - 1) / plasmid_size) * 2 * np.pi def _rad_to_loc(rad, plasmid_size): # Location starts at 1 -> + 1 - return rad / (2*np.pi) * plasmid_size + 1 + return rad / (2 * np.pi) * plasmid_size + 1 def _merge_over_periodic_boundary(feature, plasmid_size): @@ -547,7 +604,7 @@ def 
_merge_over_periodic_boundary(feature, plasmid_size): # Only one location -> no merge possible return feature first_loc = None - last_loc = None + last_loc = None # Find total first location of the feature for loc in feature.locs: if first_loc is None or loc.first < first_loc.first: @@ -558,38 +615,43 @@ def _merge_over_periodic_boundary(feature, plasmid_size): last_loc = loc # If the first and last location meet at the periodic boundary of # the plasmid -> merge them - if first_loc.first == 1 and last_loc.last == plasmid_size \ - and first_loc.strand == last_loc.strand: - new_locs = set(feature.locs) - new_locs.remove(first_loc) - new_locs.remove(last_loc) - new_locs.add(Location( + if ( + first_loc.first == 1 + and last_loc.last == plasmid_size + and first_loc.strand == last_loc.strand + ): + new_locs = set(feature.locs) + new_locs.remove(first_loc) + new_locs.remove(last_loc) + new_locs.add( + Location( # the fist base is now at negative location # by shifting by one plasmid 'period' - first = last_loc.first - plasmid_size, - last = first_loc.last, - strand = first_loc.strand, - defect = first_loc.defect | last_loc.defect - )) - return Feature(feature.key, new_locs, feature.qual) + first=last_loc.first - plasmid_size, + last=first_loc.last, + strand=first_loc.strand, + defect=first_loc.defect | last_loc.defect, + ) + ) + return Feature(feature.key, new_locs, feature.qual) else: return feature # ' ', '-' and '_' are word delimiters separators = re.compile(r"\s|_|-") + + def _split_into_words(string): - match_indices = sorted( - [match.start() for match in separators.finditer(string)] - ) + match_indices = sorted([match.start() for match in separators.finditer(string)]) current_index = 0 words = [] for i in match_indices: # Add word up to delimiter - words.append(string[current_index : i]) + words.append(string[current_index:i]) # Add delimiter - words.append(string[i : i+1]) - current_index = i+1 + words.append(string[i : i + 1]) + current_index = i + 1 # If 
there is a word after the last delimiter, add it too if current_index < len(string): words.append(string[current_index:]) @@ -618,44 +680,43 @@ def _default_feature_formatter(f): else: label = None return False, "black", "white", label - + # Origin of Replication elif f.key == "rep_origin": - return False, "indigo", "white", \ - f.qual.get("standard_name", "ori") - + return False, "indigo", "white", f.qual.get("standard_name", "ori") + # Coding sequences elif f.key in ["gene", "CDS", "rRNA"]: label = f.qual.get("product") if label is None: label = f.qual.get("gene") return True, colors["orange"], "black", label - + elif f.key == "regulatory": # Promoters if f.qual.get("regulatory_class") in [ "promoter", "TATA_box", "minus_35_signal", - "minus_10_signal" + "minus_10_signal", ]: return True, colors["dimgreen"], "black", f.qual.get("note") - + # Terminators elif f.qual.get("regulatory_class") in "terminator": return False, "firebrick", "white", f.qual.get("note") - + # RBS elif f.qual.get("regulatory_class") == "ribosome_binding_site": return False, colors["brightorange"], "white", None - + # Primers elif f.key == "primer_bind": return True, "royalblue", "black", f.qual.get("note") - + # Binding proteins elif f.key == "protein_bind": return False, colors["lightgreen"], "black", f.qual.get("note") - + # Misc - return True, "dimgray", "white", f.qual.get("note") \ No newline at end of file + return True, "dimgray", "white", f.qual.get("note") diff --git a/src/biotite/sequence/io/fasta/__init__.py b/src/biotite/sequence/io/fasta/__init__.py index 5aa14febe..8fad54b21 100644 --- a/src/biotite/sequence/io/fasta/__init__.py +++ b/src/biotite/sequence/io/fasta/__init__.py @@ -18,5 +18,5 @@ __name__ = "biotite.sequence.io.fasta" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py index 
0e8ca854a..2cf33f198 100644 --- a/src/biotite/sequence/io/fasta/convert.py +++ b/src/biotite/sequence/io/fasta/convert.py @@ -7,13 +7,18 @@ import warnings from collections import OrderedDict -from ...sequence import Sequence +from ...align.alignment import Alignment from ...alphabet import AlphabetError, LetterAlphabet from ...seqtypes import NucleotideSequence, ProteinSequence -from ...align.alignment import Alignment -__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences", - "get_alignment", "set_alignment"] +__all__ = [ + "get_sequence", + "get_sequences", + "set_sequence", + "set_sequences", + "get_alignment", + "set_alignment", +] def get_sequence(fasta_file, header=None, seq_type=None): @@ -180,8 +185,10 @@ def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None): for i, seq_str in enumerate(seq_strings): seq_strings[i] = seq_str.replace(char, "-") # Remove gaps for creation of sequences - sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type) - for seq_str in seq_strings] + sequences = [ + _convert_to_sequence(seq_str.replace("-", ""), seq_type) + for seq_str in seq_strings + ] trace = Alignment.trace_from_strings(seq_strings) return Alignment(sequences, trace, score=None) @@ -212,18 +219,15 @@ def set_alignment(fasta_file, alignment, seq_names): def _convert_to_sequence(seq_str, seq_type=None): - # Define preprocessing of preimplemented sequence types # Replace selenocysteine with cysteine # and pyrrolysine with lysine - process_protein_sequence = ( - lambda x : x.upper().replace("U", "C").replace("O", "K") - ) + process_protein_sequence = lambda x: x.upper().replace("U", "C").replace("O", "K") # For nucleotides uracil is represented by thymine and there is only # one letter for completely unknown nucleotides process_nucleotide_sequence = ( - lambda x : x.upper().replace("U","T").replace("X","N") + lambda x: x.upper().replace("U", "T").replace("X", "N") ) # Set manually selected sequence type @@ -259,15 
+263,19 @@ def _convert_to_sequence(seq_str, seq_type=None): ) return prot_seq except AlphabetError: - raise ValueError("FASTA data cannot be converted either to " - "'NucleotideSequence' nor to 'ProteinSequence'") + raise ValueError( + "FASTA data cannot be converted either to " + "'NucleotideSequence' nor to 'ProteinSequence'" + ) def _convert_to_string(sequence, as_rna): if not isinstance(sequence.get_alphabet(), LetterAlphabet): - raise ValueError("Only sequences using single letter alphabets " - "can be stored in a FASTA file") + raise ValueError( + "Only sequences using single letter alphabets " + "can be stored in a FASTA file" + ) if isinstance(sequence, NucleotideSequence) and as_rna: - return(str(sequence).replace("T", "U")) + return str(sequence).replace("T", "U") else: - return(str(sequence)) + return str(sequence) diff --git a/src/biotite/sequence/io/fasta/file.py b/src/biotite/sequence/io/fasta/file.py index 89eab5398..ab465c0e9 100644 --- a/src/biotite/sequence/io/fasta/file.py +++ b/src/biotite/sequence/io/fasta/file.py @@ -6,21 +6,21 @@ __author__ = "Patrick Kunzmann" __all__ = ["FastaFile"] -from ....file import TextFile, InvalidFileError, wrap_string from collections import OrderedDict from collections.abc import MutableMapping +from ....file import InvalidFileError, TextFile, wrap_string class FastaFile(TextFile, MutableMapping): """ This class represents a file in FASTA format. - + A FASTA file contains so called *header* lines, beginning with ``>``, that describe following sequence. The corresponding sequence starts at the line after the header line and ends at the next header line or at the end of file. The header along with its sequence forms an entry. - + This class is used in a dictionary like manner, implementing the :class:`MutableMapping` interface: Headers (without the leading ``>``) are used as keys, @@ -35,10 +35,10 @@ class FastaFile(TextFile, MutableMapping): after which a line break is inserted. 
Only relevant, when adding sequences to a file. Default is 80. - + Examples -------- - + >>> import os.path >>> file = FastaFile() >>> file["seq1"] = "ATACT" @@ -61,17 +61,17 @@ class FastaFile(TextFile, MutableMapping): {'seq2': 'AAAATT'} >>> file.write(os.path.join(path_to_directory, "test.fasta")) """ - + def __init__(self, chars_per_line=80): super().__init__() self._chars_per_line = chars_per_line self._entries = OrderedDict() - + @classmethod def read(cls, file, chars_per_line=80): """ Read a FASTA file. - + Parameters ---------- file : file-like object or str @@ -82,7 +82,7 @@ def read(cls, file, chars_per_line=80): after which a line break is inserted. Only relevant, when adding sequences to a file. Default is 80. - + Returns ------- file_object : FastaFile @@ -90,24 +90,23 @@ def read(cls, file, chars_per_line=80): """ file = super().read(file, chars_per_line) # Filter out empty and comment lines - file.lines = [line for line in file.lines - if len(line.strip()) != 0 and line[0] != ";"] + file.lines = [ + line for line in file.lines if len(line.strip()) != 0 and line[0] != ";" + ] if len(file.lines) == 0: raise InvalidFileError("File is empty or contains only comments") file._find_entries() return file - + def __setitem__(self, header, seq_str): if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings as keys" - ) + raise IndexError("'FastaFile' only supports header strings as keys") if not isinstance(seq_str, str): - raise TypeError("'FastaFile' only supports sequence strings " - "as values") + raise TypeError("'FastaFile' only supports sequence strings " "as values") # Create lines for new header and sequence (with line breaks) - new_lines = [">" + header.replace("\n","").strip()] + \ - wrap_string(seq_str, width=self._chars_per_line) + new_lines = [">" + header.replace("\n", "").strip()] + wrap_string( + seq_str, width=self._chars_per_line + ) if header in self: # Delete lines of entry corresponding to the header, 
# if existing @@ -118,83 +117,75 @@ def __setitem__(self, header, seq_str): # Simply append lines # Add entry in a more efficient way than '_find_entries()' # for this simple case - self._entries[header] = ( - len(self.lines), - len(self.lines) + len(new_lines) - ) + self._entries[header] = (len(self.lines), len(self.lines) + len(new_lines)) self.lines += new_lines - + def __getitem__(self, header): if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings as keys" - ) + raise IndexError("'FastaFile' only supports header strings as keys") start, stop = self._entries[header] # Concatenate sequence string from following lines - seq_string = "".join( - [line.strip() for line in self.lines[start+1 : stop]] - ) + seq_string = "".join([line.strip() for line in self.lines[start + 1 : stop]]) return seq_string - + def __delitem__(self, header): start, stop = self._entries[header] del self.lines[start:stop] del self._entries[header] self._find_entries() - + def __len__(self): return len(self._entries) - + def __iter__(self): return self._entries.__iter__() - + def __contains__(self, identifer): return identifer in self._entries - + def _find_entries(self): if len(self.lines) > 0 and self.lines[0][0] != ">": raise InvalidFileError( f"File starts with '{self.lines[0][0]}' instead of '>'" ) - + header_i = [] for i, line in enumerate(self.lines): if line[0] == ">": header_i.append(i) - + self._entries = OrderedDict() for j in range(len(header_i)): # Remove leading '>' from header header = self.lines[header_i[j]].strip()[1:] start = header_i[j] - if j < len(header_i) -1: + if j < len(header_i) - 1: # Header in mid or start of file # -> stop is start of next header - stop = header_i[j+1] + stop = header_i[j + 1] else: # Last header -> entry stops at end of file stop = len(self.lines) self._entries[header] = (start, stop) - @staticmethod def read_iter(file): """ Create an iterator over each sequence of the given FASTA file. 
- + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Yields ------ header : str The header of the current sequence. seq_str : str The current sequence as string. - + Notes ----- This approach gives the same results as @@ -221,7 +212,6 @@ def read_iter(file): # Yield final entry if header is not None: yield header, "".join(seq_str_list) - @staticmethod def write_iter(file, items, chars_per_line=80): @@ -235,7 +225,7 @@ def write_iter(file, items, chars_per_line=80): Hence, this static method may save a large amount of memory if a large file should be written, especially if the `items` are provided as generator. - + Parameters ---------- file : file-like object or str @@ -256,23 +246,20 @@ def write_iter(file, items, chars_per_line=80): This method does not test, whether the given identifiers are unambiguous. """ + def line_generator(): for item in items: header, seq_str = item if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings" - ) + raise IndexError("'FastaFile' only supports header strings") if not isinstance(seq_str, str): - raise TypeError( - "'FastaFile' only supports sequence strings" - ) - + raise TypeError("'FastaFile' only supports sequence strings") + # Yield header line - yield ">" + header.replace("\n","").strip() + yield ">" + header.replace("\n", "").strip() # Yield sequence line(s) for line in wrap_string(seq_str, width=chars_per_line): yield line - - TextFile.write_iter(file, line_generator()) \ No newline at end of file + + TextFile.write_iter(file, line_generator()) diff --git a/src/biotite/sequence/io/fastq/__init__.py b/src/biotite/sequence/io/fastq/__init__.py index d763198b1..cff2e7097 100644 --- a/src/biotite/sequence/io/fastq/__init__.py +++ b/src/biotite/sequence/io/fastq/__init__.py @@ -15,5 +15,5 @@ __name__ = "biotite.sequence.io.fastq" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from 
.convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/fastq/convert.py b/src/biotite/sequence/io/fastq/convert.py index 868536c6e..0ccc97c22 100644 --- a/src/biotite/sequence/io/fastq/convert.py +++ b/src/biotite/sequence/io/fastq/convert.py @@ -6,10 +6,7 @@ __author__ = "Patrick Kunzmann" from collections import OrderedDict -from ...sequence import Sequence -from ...alphabet import AlphabetError, LetterAlphabet from ...seqtypes import NucleotideSequence -from ...align.alignment import Alignment __all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"] @@ -17,7 +14,7 @@ def get_sequence(fastq_file, header=None): """ Get a sequence and quality scores from a `FastqFile` instance. - + Parameters ---------- fastq_file : FastqFile @@ -25,7 +22,7 @@ def get_sequence(fastq_file, header=None): header : str, optional The identifier to get the sequence and scores from. By default, the first sequence of the file is returned. - + Returns ------- sequence : NucleotideSequence @@ -43,7 +40,7 @@ def get_sequence(fastq_file, header=None): break if seq_str is None: raise ValueError("File does not contain any sequences") - processed_seq_str = seq_str.replace("U","T").replace("X","N") + processed_seq_str = seq_str.replace("U", "T").replace("X", "N") return NucleotideSequence(processed_seq_str), scores @@ -51,12 +48,12 @@ def get_sequences(fastq_file): """ Get a dictionary from a `FastqFile` instance, where identifiers are keys and sequence-score-tuples are values. - + Parameters ---------- fastq_file : FastqFile The `Fastqile` to be accessed. 
- + Returns ------- seq_dict : dict @@ -65,7 +62,7 @@ def get_sequences(fastq_file): """ seq_dict = OrderedDict() for header, (seq_str, scores) in fastq_file.items(): - processed_seq_str = seq_str.replace("U","T").replace("X","N") + processed_seq_str = seq_str.replace("U", "T").replace("X", "N") seq_dict[header] = NucleotideSequence(processed_seq_str), scores return seq_dict @@ -73,7 +70,7 @@ def get_sequences(fastq_file): def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False): """ Set a sequence and a quality score array in a `FastqFile` instance. - + Parameters ---------- fastq_file : FastqFile @@ -96,7 +93,7 @@ def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False): def set_sequences(fastq_file, sequence_dict, as_rna=False): """ Set sequences in a `FastqFile` instance from a dictionary. - + Parameters ---------- fastq_file : FastqFile @@ -115,6 +112,6 @@ def set_sequences(fastq_file, sequence_dict, as_rna=False): def _convert_to_string(sequence, as_rna): if as_rna: - return(str(sequence).replace("T", "U")) + return str(sequence).replace("T", "U") else: - return(str(sequence)) \ No newline at end of file + return str(sequence) diff --git a/src/biotite/sequence/io/fastq/file.py b/src/biotite/sequence/io/fastq/file.py index c90da37cd..5ac85c7b1 100644 --- a/src/biotite/sequence/io/fastq/file.py +++ b/src/biotite/sequence/io/fastq/file.py @@ -5,23 +5,21 @@ __name__ = "biotite.sequence.io.fastq" __author__ = "Patrick Kunzmann" -import warnings -from numbers import Integral from collections import OrderedDict from collections.abc import MutableMapping +from numbers import Integral import numpy as np -from ....file import TextFile, InvalidFileError, wrap_string -from ...seqtypes import NucleotideSequence +from ....file import InvalidFileError, TextFile, wrap_string __all__ = ["FastqFile"] _OFFSETS = { - "Sanger" : 33, - "Solexa" : 64, - "Illumina-1.3" : 64, - "Illumina-1.5" : 64, - "Illumina-1.8" : 33, + "Sanger": 33, + "Solexa": 
64, + "Illumina-1.3": 64, + "Illumina-1.5": 64, + "Illumina-1.8": 33, } @@ -151,13 +149,10 @@ def get_seq_string(self, identifier): The sequence corresponding to the identifier. """ if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports identifier strings as keys" - ) - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] + raise IndexError("'FastqFile' only supports identifier strings as keys") + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] # Concatenate sequence string from the sequence lines - seq_str = "".join(self.lines[seq_start : seq_stop]) + seq_str = "".join(self.lines[seq_start:seq_stop]) return seq_str def get_quality(self, identifier): @@ -175,15 +170,11 @@ def get_quality(self, identifier): The quality scores corresponding to the identifier. """ if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports identifier strings as keys" - ) - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] + raise IndexError("'FastqFile' only supports identifier strings as keys") + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] # Concatenate sequence string from the score lines return _score_str_to_scores( - "".join(self.lines[score_start : score_stop]), - self._offset + "".join(self.lines[score_start:score_stop]), self._offset ) def __setitem__(self, identifier, item): @@ -194,9 +185,7 @@ def __setitem__(self, identifier, item): f"but score length is {len(scores)}" ) if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports strings as identifier" - ) + raise IndexError("'FastqFile' only supports strings as identifier") # Delete lines of entry corresponding to the identifier, # if already existing if identifier in self: @@ -204,14 +193,14 @@ def __setitem__(self, identifier, item): # Create new lines # Start with identifier line - new_lines = ["@" + identifier.replace("\n","").strip()] + 
new_lines = ["@" + identifier.replace("\n", "").strip()] # Append new lines with sequence string (with line breaks) seq_start_i = len(new_lines) if self._chars_per_line is None: new_lines.append(str(sequence)) else: new_lines += wrap_string(sequence, width=self._chars_per_line) - seq_stop_i =len(new_lines) + seq_stop_i = len(new_lines) # Append sequence-score separator new_lines += ["+"] # Append scores @@ -237,7 +226,7 @@ def __setitem__(self, identifier, item): len(self.lines) + seq_start_i, len(self.lines) + seq_stop_i, len(self.lines) + score_start_i, - len(self.lines) + score_stop_i + len(self.lines) + score_stop_i, ) self.lines += new_lines @@ -245,9 +234,8 @@ def __getitem__(self, identifier): return self.get_seq_string(identifier), self.get_quality(identifier) def __delitem__(self, identifier): - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] - del self.lines[seq_start-1 : score_stop] + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] + del self.lines[seq_start - 1 : score_stop] del self._entries[identifier] self._find_entries() @@ -278,7 +266,7 @@ def _find_entries(self): if not in_scores and not in_sequence and line[0] == "@": # Identifier line identifier = line[1:] - seq_start_i = i+1 + seq_start_i = i + 1 # Next line is sequence in_sequence = True # Reset @@ -290,7 +278,7 @@ def _find_entries(self): in_sequence = False in_scores = True seq_stop_i = i - score_start_i = i+1 + score_start_i = i + 1 else: # Still in sequence seq_len += len(line) @@ -306,9 +294,12 @@ def _find_entries(self): in_scores = False # Record this entry self._entries[identifier] = ( - seq_start_i, seq_stop_i, score_start_i, score_stop_i + seq_start_i, + seq_stop_i, + score_start_i, + score_stop_i, ) - else: # score_len > seq_len + else: # score_len > seq_len raise InvalidFileError( f"The amount of scores is not equal to the sequence " f"length for the sequence in line {seq_start_i+1} " @@ -320,7 +311,6 @@ def _find_entries(self): 
if in_sequence or in_scores: raise InvalidFileError("The last entry in the file is incomplete") - @staticmethod def read_iter(file, offset): """ @@ -398,20 +388,15 @@ def read_iter(file, offset): # -> End of entry in_scores = False # yield this entry - scores = _score_str_to_scores( - "".join(score_str_list), - offset - ) + scores = _score_str_to_scores("".join(score_str_list), offset) yield identifier, ("".join(seq_str_list), scores) - else: # score_len > seq_len + else: # score_len > seq_len raise InvalidFileError( - f"The amount of scores is not equal to the sequence " - f"length" + "The amount of scores is not equal to the sequence " "length" ) else: - raise InvalidFileError(f"FASTQ file is invalid") - + raise InvalidFileError("FASTQ file is invalid") @staticmethod def write_iter(file, items, offset, chars_per_line=None): @@ -463,12 +448,10 @@ def line_generator(): f"but score length is {len(scores)}" ) if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports strings as identifier" - ) + raise IndexError("'FastqFile' only supports strings as identifier") # Yield identifier line - yield "@" + identifier.replace("\n","").strip() + yield "@" + identifier.replace("\n", "").strip() # Yield sequence line(s) if chars_per_line is None: @@ -495,15 +478,11 @@ def _score_str_to_scores(score_str, offset): """ Convert an ASCII string into actual score values. """ - scores = np.frombuffer( - bytearray( - score_str, encoding="ascii" - ), - dtype=np.int8 - ) + scores = np.frombuffer(bytearray(score_str, encoding="ascii"), dtype=np.int8) scores -= offset return scores + def _scores_to_score_str(scores, offset): """ Convert score values into an ASCII string. 
@@ -511,6 +490,7 @@ def _scores_to_score_str(scores, offset): scores = np.asarray(scores) + offset return scores.astype(np.int8, copy=False).tobytes().decode("ascii") + def _convert_offset(offset_val_or_string): """ If the given offset is a string return the corresponding numerical @@ -519,9 +499,9 @@ def _convert_offset(offset_val_or_string): if isinstance(offset_val_or_string, Integral): return offset_val_or_string elif isinstance(offset_val_or_string, str): - return _OFFSETS[offset_val_or_string] + return _OFFSETS[offset_val_or_string] else: raise TypeError( f"The offset must be either an integer or a string " f"indicating the format, not {type(offset_val_or_string).__name__}" - ) \ No newline at end of file + ) diff --git a/src/biotite/sequence/io/genbank/__init__.py b/src/biotite/sequence/io/genbank/__init__.py index bccb3feab..11f745f10 100644 --- a/src/biotite/sequence/io/genbank/__init__.py +++ b/src/biotite/sequence/io/genbank/__init__.py @@ -11,7 +11,7 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann" -from .file import * from .annotation import * +from .file import * +from .metadata import * from .sequence import * -from .metadata import * \ No newline at end of file diff --git a/src/biotite/sequence/io/genbank/annotation.py b/src/biotite/sequence/io/genbank/annotation.py index fcd5e072b..1a28ae3af 100644 --- a/src/biotite/sequence/io/genbank/annotation.py +++ b/src/biotite/sequence/io/genbank/annotation.py @@ -14,8 +14,6 @@ import warnings from ....file import InvalidFileError from ...annotation import Annotation, Feature, Location -from .file import GenBankFile - _KEY_START = 5 _QUAL_START = 21 @@ -46,7 +44,6 @@ def get_annotation(gb_file, include_only=None): raise InvalidFileError("File has multiple 'FEATURES' fields") lines, _ = fields[0] - ### Parse all lines to create an index of features, # i.e. 
pairs of the feature key # and the text belonging to the respective feature @@ -60,13 +57,12 @@ def get_annotation(gb_file, include_only=None): # Store old feature key and value feature_list.append((feature_key, feature_value)) # Track new key - feature_key = line[_KEY_START : _QUAL_START-1].strip() + feature_key = line[_KEY_START : _QUAL_START - 1].strip() feature_value = "" feature_value += line[_QUAL_START:] + " " # Store last feature key and value (loop already exited) feature_list.append((feature_key, feature_value)) - ### Process only relevant features and put them into an Annotation annotation = Annotation() # Regex to separate qualifiers from each other @@ -114,7 +110,7 @@ def get_annotation(gb_file, include_only=None): # -> split at whitespaces, # as keys do not contain whitespaces for subpart in part.split(): - if not "=" in subpart: + if "=" not in subpart: # Qualifier without value, e.g. '/pseudo' # -> store immediately # Remove "/" -> subpart[1:] @@ -147,11 +143,11 @@ def get_annotation(gb_file, include_only=None): def _parse_locs(loc_str): locs = [] if loc_str.startswith(("join", "order")): - str_list = loc_str[loc_str.index("(")+1:loc_str.rindex(")")].split(",") + str_list = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")].split(",") for s in str_list: locs.extend(_parse_locs(s.strip())) elif loc_str.startswith("complement"): - compl_str = loc_str[loc_str.index("(")+1:loc_str.rindex(")")] + compl_str = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")] compl_locs = [ Location(loc.first, loc.last, Location.Strand.REVERSE, loc.defect) for loc in _parse_locs(compl_str) @@ -214,8 +210,6 @@ def _set_qual(qual_dict, key, val): qual_dict[key] = val - - def set_annotation(gb_file, annotation): """ Set the *FEATURES* field of a GenBank file with an annotation. 
@@ -236,12 +230,12 @@ def set_annotation(gb_file, annotation): for key, values in feature.qual.items(): if values is None: line = " " * _QUAL_START - line += f'/{key}' + line += f"/{key}" lines.append(line) else: for val in values.split("\n"): line = " " * _QUAL_START - line += f'/{key}="{val}"' + line += f'/{key}="{val}"' lines.append(line) gb_file.set_field("FEATURES", lines) @@ -254,11 +248,11 @@ def _convert_to_loc_string(locs): if len(locs) == 1: loc = list(locs)[0] loc_first_str = str(loc.first) - loc_last_str = str(loc.last) + loc_last_str = str(loc.last) if loc.defect & Location.Defect.BEYOND_LEFT: loc_first_str = "<" + loc_first_str if loc.defect & Location.Defect.BEYOND_RIGHT: - loc_last_str = ">" + loc_last_str + loc_last_str = ">" + loc_last_str if loc.first == loc.last: loc_string = loc_first_str elif loc.defect & Location.Defect.UNK_LOC: @@ -270,8 +264,6 @@ def _convert_to_loc_string(locs): if loc.strand == Location.Strand.REVERSE: loc_string = f"complement({loc_string})" else: - loc_string = ",".join( - [_convert_to_loc_string([loc]) for loc in locs] - ) + loc_string = ",".join([_convert_to_loc_string([loc]) for loc in locs]) loc_string = f"join({loc_string})" return loc_string diff --git a/src/biotite/sequence/io/genbank/file.py b/src/biotite/sequence/io/genbank/file.py index 72a225647..4bbbaff02 100644 --- a/src/biotite/sequence/io/genbank/file.py +++ b/src/biotite/sequence/io/genbank/file.py @@ -6,14 +6,16 @@ __author__ = "Patrick Kunzmann" __all__ = ["GenBankFile", "MultiFile"] -#import textwrap +# import textwrap import copy -#import re + +# import re import io -from ....file import TextFile, InvalidFileError from collections import OrderedDict -#from ...annotation import Location, Feature, Annotation, AnnotatedSequence -#from ...seqtypes import NucleotideSequence, ProteinSequence +from ....file import InvalidFileError, TextFile + +# from ...annotation import Location, Feature, Annotation, AnnotatedSequence +# from ...seqtypes import 
NucleotideSequence, ProteinSequence class GenBankFile(TextFile): @@ -33,7 +35,7 @@ class GenBankFile(TextFile): Some fields may occur multiple times, e.g. the *REFERENCE* field. A sample GenBank file can be viewed at ``_. - + This class provides a low-level interface for parsing, editing and writing GenBank files. It works like a list of field entries, where a field consists of the @@ -47,7 +49,7 @@ class GenBankFile(TextFile): The subfields are represented by a dictionary, with subfield names being keys and the corresponding lines being values. The *FEATURES* and *ORIGIN* fields have no subfields. - + Every entry can be obtained, set and deleted via the index operator. Notes @@ -55,7 +57,7 @@ class GenBankFile(TextFile): This class does not support location identifiers with references to other Entrez database entries, e.g. ``join(1..100,J00194.1:100..202)``. - + Examples -------- Create a GenBank file from scratch: @@ -79,9 +81,9 @@ class GenBankFile(TextFile): ['One line', 'A second line'] >>> print(subfields) OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])]) - + Adding an additional field: - + >>> file.insert(0, "OTHERFIELD", ["Another line"]) >>> print(len(file)) 2 @@ -174,18 +176,18 @@ def __init__(self): # and names of categories self._field_pos = [] self._find_field_indices() - + @classmethod def read(cls, file): """ Read a GenBank file. - + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Returns ------- file_object : GenBankFile @@ -194,16 +196,16 @@ def read(cls, file): file = super().read(file) file._find_field_indices() return file - + def get_fields(self, name): """ Get all *GenBank* fields associated with a given field name. - + Parameters ---------- name : str The field name. 
- + Returns ------- fields : list of (list of str, OrderedDict of str -> str) @@ -218,17 +220,17 @@ def get_fields(self, name): indices = self.get_indices(name) # Omit the field name return [self[i][1:] for i in indices] - + def get_indices(self, name): """ Get the indices to all *GenBank* fields associated with a given field name. - + Parameters ---------- name : str The field name. - + Returns ------- fields : list of int @@ -242,7 +244,7 @@ def get_indices(self, name): if fname == name: indices.append(i) return indices - + def set_field(self, name, content, subfield_dict=None): """ Set a *GenBank* field with the given content. @@ -250,7 +252,7 @@ def set_field(self, name, content, subfield_dict=None): If the field already exists in the file, the field is overwritten, otherwise a new field is created at the end of the file. - + Parameters ---------- name : str @@ -261,7 +263,7 @@ def set_field(self, name, content, subfield_dict=None): The subfields of the field. The dictionary maps subfield names to the content lines of the respective subfield. 
- + Raises ------ InvalidFileError @@ -283,13 +285,13 @@ def set_field(self, name, content, subfield_dict=None): def __getitem__(self, index): index = self._translate_idx(index) start, stop, name = self._field_pos[index] - + if name in ["FEATURES", "ORIGIN"]: # For those two fields return the complete lines, # beginning with the line after the field name - content = self._get_field_content(start+1, stop, indent=0) + content = self._get_field_content(start + 1, stop, indent=0) subfield_dict = OrderedDict() - + else: # For all metadata fields use the # standard GenBank indentation (=12) @@ -297,7 +299,7 @@ def __getitem__(self, index): subfield_dict = OrderedDict() subfield_start = None first_subfield_start = None - for i in range(start+1, stop): + for i in range(start + 1, stop): line = self.lines[i] # Check if line contains a new subfield # (Header beginning from first column) @@ -320,12 +322,10 @@ def __getitem__(self, index): # that are not part of a subfield if first_subfield_start is not None: stop = first_subfield_start - content = self._get_field_content( - start, stop, indent=12 - ) - + content = self._get_field_content(start, stop, indent=12) + return name, content, subfield_dict - + def __setitem__(self, index, item): index = self._translate_idx(index) if not isinstance(item, tuple): @@ -342,7 +342,7 @@ def __setitem__(self, index, item): "Expected a tuple of name, content and optionally subfields" ) inserted_lines = self._to_lines(name, content, subfields) - + # Stop of field to be replaced is start of new field start, old_stop, _ = self._field_pos[index] # If not the last element is set, @@ -355,12 +355,12 @@ def __setitem__(self, index, item): # Shift the start/stop indices of the following fields # by the amount of created fields shift = len(inserted_lines) - (old_stop - start) - for i in range(index+1, len(self._field_pos)): + for i in range(index + 1, len(self._field_pos)): old_start, old_stop, fname = self._field_pos[i] - self._field_pos[i] = 
old_start+shift, old_stop+shift, fname + self._field_pos[i] = old_start + shift, old_stop + shift, fname # Add new entry - self._field_pos[index] = start, start+len(inserted_lines), name.upper() - + self._field_pos[index] = start, start + len(inserted_lines), name.upper() + def __delitem__(self, index): index = self._translate_idx(index) start, stop, _ = self._field_pos[index] @@ -369,17 +369,17 @@ def __delitem__(self, index): shift = stop - start for i in range(index, len(self._field_pos)): old_start, old_stop, name = self._field_pos[i] - self._field_pos[i] = old_start-shift, old_stop-shift, name - del self.lines[start : stop] + self._field_pos[i] = old_start - shift, old_stop - shift, name + del self.lines[start:stop] del self._field_pos[index] - + def __len__(self): return len(self._field_pos) def insert(self, index, name, content, subfields=None): """ Insert a *GenBank* field at the given position. - + Parameters ---------- index : int @@ -398,12 +398,12 @@ def insert(self, index, name, content, subfields=None): """ index = self._translate_idx(index, length_exclusive=False) inserted_lines = self._to_lines(name, content, subfields) - + # Stop of previous field is start of new field if index == 0: start = 0 else: - _, start, _ = self._field_pos[index-1] + _, start, _ = self._field_pos[index - 1] # If the new lines are not inserted at the end, # the following lines need to be added, too if start is not len(self.lines): @@ -416,17 +416,16 @@ def insert(self, index, name, content, subfields=None): shift = len(inserted_lines) for i in range(index, len(self._field_pos)): old_start, old_stop, fname = self._field_pos[i] - self._field_pos[i] = old_start+shift, old_stop+shift, fname + self._field_pos[i] = old_start + shift, old_stop + shift, fname # Add new entry self._field_pos.insert( - index, - (start, start+len(inserted_lines), name.upper()) + index, (start, start + len(inserted_lines), name.upper()) ) - + def append(self, name, content, subfields=None): """ Create a 
new *GenBank* field at the end of the file. - + Parameters ---------- name : str @@ -440,7 +439,6 @@ def append(self, name, content, subfields=None): """ self.insert(len(self), name, content, subfields) - def _find_field_indices(self): """ Identify the start and exclusive stop indices of lines @@ -469,10 +467,10 @@ def _find_field_indices(self): def _get_field_content(self, start, stop, indent): if indent == 0: - return self.lines[start : stop] + return self.lines[start:stop] else: - return [line[12:] for line in self.lines[start : stop]] - + return [line[12:] for line in self.lines[start:stop]] + def _to_lines(self, name, content, subfields): """ Convert the field name, field content und subfield dictionary @@ -480,22 +478,22 @@ def _to_lines(self, name, content, subfields): """ if subfields is None: subfields = {} - + name = name.strip().upper() if len(name) == 0: - raise ValueError(f"Must give a non emtpy name") - subfields = OrderedDict({ - subfield_name.upper().strip() : subfield_lines - for subfield_name, subfield_lines in subfields.items() - }) - + raise ValueError("Must give a non emtpy name") + subfields = OrderedDict( + { + subfield_name.upper().strip(): subfield_lines + for subfield_name, subfield_lines in subfields.items() + } + ) + # Create lines for new field if name == "FEATURES": # Header line plus all actual feature lines lines = copy.copy(content) - lines.insert( - 0, "FEATURES" + " "*13 + "Location/Qualifiers" - ) + lines.insert(0, "FEATURES" + " " * 13 + "Location/Qualifiers") elif name == "ORIGIN": # Header line plus all actual sequence lines lines = copy.copy(content) @@ -504,19 +502,19 @@ def _to_lines(self, name, content, subfields): name_column = [] content_column = [] # Create a line for the field name and empty lines - # for each additional line required by the content - name_column += [name] + [""] * (len(content)-1) + # for each additional line required by the content + name_column += [name] + [""] * (len(content) - 1) content_column += 
content for subfield_name, subfield_lines in subfields.items(): - name_column += [" " + subfield_name] \ - + [""] * (len(subfield_lines)-1) + name_column += [" " + subfield_name] + [""] * (len(subfield_lines) - 1) content_column += subfield_lines - lines = [f"{n_col:12}{c_col}" for n_col, c_col - in zip(name_column, content_column)] - + lines = [ + f"{n_col:12}{c_col}" + for n_col, c_col in zip(name_column, content_column) + ] + return lines - def _translate_idx(self, index, length_exclusive=True): """ Check index boundaries and convert negative index to positive @@ -539,15 +537,15 @@ class MultiFile(TextFile): """ This class represents a file in *GenBank* or *GenPept* format, that contains multiple entries, for more than one UID. - + The information for each UID are appended to each other in such a file. Objects of this class can be iterated to obtain a :class:`GenBankFile` for each entry in the file. - + Examples -------- - + >>> import os.path >>> file_name = fetch_single_file( ... ["1L2Y_A", "3O5R_A", "5UGO_A"], @@ -568,8 +566,8 @@ def __iter__(self): line = self.lines[i] if line.strip() == "//": # Create file with lines corresponding to that file - file_content = "\n".join(self.lines[start_i : i+1]) + file_content = "\n".join(self.lines[start_i : i + 1]) file = GenBankFile.read(io.StringIO(file_content)) # Reset file start index start_i = i - yield file \ No newline at end of file + yield file diff --git a/src/biotite/sequence/io/genbank/metadata.py b/src/biotite/sequence/io/genbank/metadata.py index f4d25004f..8654e42c4 100644 --- a/src/biotite/sequence/io/genbank/metadata.py +++ b/src/biotite/sequence/io/genbank/metadata.py @@ -8,17 +8,24 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann, Natasha Jaffe" -__all__ = ["get_locus", "get_definition", "get_accession", "get_version", - "get_gi", "get_db_link", "get_source", - "set_locus"] +__all__ = [ + "get_locus", + "get_definition", + "get_accession", + "get_version", + "get_gi", + 
"get_db_link", + "get_source", + "set_locus", +] from ....file import InvalidFileError -from .file import GenBankFile + def get_locus(gb_file): """ Parse the *LOCUS* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile @@ -39,10 +46,10 @@ def get_locus(gb_file): The GenBank division to which the file belongs. date : str, optional The date of last modification. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> name, length, mol_type, is_circular, division, date = get_locus(file) @@ -68,59 +75,57 @@ def get_locus(gb_file): # The first field will always be the ID name = fields[0] - # The second field will always be the length followed + # The second field will always be the length followed # by units (eg 1224 aa) length = int(fields[1]) - # The third field *should* be the molecular type + # The third field *should* be the molecular type # but sometimes this is missing. This gets tricky # because sometimes the next field, circular/linear, # is missing, too. The field after that, division, # is a 3 letter all caps token. Unfortunately, mol_type - # is also often a 3 letter all caps token (eg DNA)! + # is also often a 3 letter all caps token (eg DNA)! # Fortunately, GenBank publishes the set list of divisions # here: https://www.ncbi.nlm.nih.gov/genbank/samplerecord , # so we can check against that set when determining whether # the current token represents the molecular type. 
divisions = ( - 'PRI', # primate sequences - 'ROD', # rodent sequences - 'MAM', # other mammalian sequences - 'VRT', # other vertebrate sequences - 'INV', # invertebrate sequences - 'PLN', # plant, fungal, and algal sequences - 'BCT', # bacterial sequences - 'VRL', # viral sequences - 'PHG', # bacteriophage sequences - 'SYN', # synthetic sequences - 'UNA', # unannotated sequences - 'EST', # EST sequences (expressed sequence tags) - 'PAT', # patent sequences - 'STS', # STS sequences (sequence tagged sites) - 'GSS', # GSS sequences (genome survey sequences) - 'HTG', # HTG sequences (high-throughput genomic sequences) - 'HTC', # unfinished high-throughput cDNA sequencing - 'ENV', # environmental sampling sequences - 'CON', + "PRI", # primate sequences + "ROD", # rodent sequences + "MAM", # other mammalian sequences + "VRT", # other vertebrate sequences + "INV", # invertebrate sequences + "PLN", # plant, fungal, and algal sequences + "BCT", # bacterial sequences + "VRL", # viral sequences + "PHG", # bacteriophage sequences + "SYN", # synthetic sequences + "UNA", # unannotated sequences + "EST", # EST sequences (expressed sequence tags) + "PAT", # patent sequences + "STS", # STS sequences (sequence tagged sites) + "GSS", # GSS sequences (genome survey sequences) + "HTG", # HTG sequences (high-throughput genomic sequences) + "HTC", # unfinished high-throughput cDNA sequencing + "ENV", # environmental sampling sequences + "CON", ) - # NOTE: Remember that fields[2] is the unit for length, + # NOTE: Remember that fields[2] is the unit for length, # eg bp or aa, so we move to fields[3] here. 
- if fields[3] not in ('linear', 'circular') \ - and fields[3] not in divisions: + if fields[3] not in ("linear", "circular") and fields[3] not in divisions: mol_type = fields[3] next_idx = 4 else: mol_type = None next_idx = 3 - - # The next field should be the token 'linear' or 'circular', + # The next field should be the token 'linear' or 'circular', # but sometimes this is missing - if 'linear' == fields[next_idx]: + if "linear" == fields[next_idx]: is_circular = False next_idx += 1 - elif 'circular' == fields[next_idx]: + elif "circular" == fields[next_idx]: is_circular = True next_idx += 1 else: @@ -136,23 +141,24 @@ def get_locus(gb_file): return name, length, mol_type, is_circular, division, date + def get_definition(gb_file): """ Parse the *DEFINITION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *DEFINITION* field from. - + Returns ------- definition : str Content of the *DEFINITION* field. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> print(get_definition(file)) @@ -161,23 +167,24 @@ def get_definition(gb_file): lines, _ = _expect_single_field(gb_file, "DEFINITION") return " ".join([line.strip() for line in lines]) + def get_accession(gb_file): """ Parse the *ACCESSION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *ACCESSION* field from. - + Returns ------- accession : str The accession ID of the file. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> print(get_accession(file)) @@ -187,16 +194,17 @@ def get_accession(gb_file): # 'ACCESSION' field has only one line return lines[0] + def get_version(gb_file): """ Parse the version from the *VERSION* field of a GenBank or GenPept file. 
- + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *VERSION* field from. - + Returns ------- version : str @@ -206,16 +214,17 @@ def get_version(gb_file): # 'VERSION' field has only one line return lines[0].split()[0] + def get_gi(gb_file): """ Parse the GI from the *VERSION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *VERSION* field from. - + Returns ------- gi : str @@ -229,24 +238,25 @@ def get_gi(gb_file): # Truncate GI return int(version_info[1][3:]) + def get_db_link(gb_file): """ Parse the *DBLINK* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *DBLINK* field from. - + Returns ------- link_dict : dict A dictionary storing the database links, with the database name as key, and the corresponding ID as value. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> for key, val in get_db_link(file).items(): @@ -265,12 +275,12 @@ def get_db_link(gb_file): def get_source(gb_file): """ Parse the *SOURCE* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *SOURCE* field from. - + Returns ------- accession : str @@ -290,12 +300,12 @@ def _expect_single_field(gb_file, name): return fields[0] - -def set_locus(gb_file, name, length, mol_type=None, is_circular=False, - division=None, date=None): +def set_locus( + gb_file, name, length, mol_type=None, is_circular=False, division=None, date=None +): """ Set the *LOCUS* field of a GenBank file. 
- + Parameters ---------- gb_file : GenBankFile @@ -319,6 +329,8 @@ def set_locus(gb_file, name, length, mol_type=None, is_circular=False, circularity = "circular" if is_circular else "linear" division = "" if division is None else division date = "" if date is None else date - line = f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " \ - f"{circularity:8} {division:3} {date:11}" - gb_file.set_field("LOCUS", [line]) \ No newline at end of file + line = ( + f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " + f"{circularity:8} {division:3} {date:11}" + ) + gb_file.set_field("LOCUS", [line]) diff --git a/src/biotite/sequence/io/genbank/sequence.py b/src/biotite/sequence/io/genbank/sequence.py index 26ec645bb..1504bd9bf 100644 --- a/src/biotite/sequence/io/genbank/sequence.py +++ b/src/biotite/sequence/io/genbank/sequence.py @@ -8,17 +8,20 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann" -__all__ = ["get_raw_sequence", "get_sequence", "get_annotated_sequence", - "set_sequence", "set_annotated_sequence"] +__all__ = [ + "get_raw_sequence", + "get_sequence", + "get_annotated_sequence", + "set_sequence", + "set_annotated_sequence", +] import re from ....file import InvalidFileError -from ...seqtypes import ProteinSequence, NucleotideSequence from ...annotation import AnnotatedSequence -from .file import GenBankFile +from ...seqtypes import NucleotideSequence, ProteinSequence from .annotation import get_annotation, set_annotation - _SYMBOLS_PER_CHUNK = 10 _SEQ_CHUNKS_PER_LINE = 6 _SYMBOLS_PER_LINE = _SYMBOLS_PER_CHUNK * _SEQ_CHUNKS_PER_LINE @@ -112,7 +115,7 @@ def _convert_seq_str(seq_str, format): if len(seq_str) == 0: raise InvalidFileError("The file's 'ORIGIN' field is empty") if format == "gb": - return NucleotideSequence(seq_str.replace("U","T").replace("X","N")) + return NucleotideSequence(seq_str.replace("U", "T").replace("X", "N")) elif format == "gp": return ProteinSequence(seq_str.replace("U", "C").replace("O", "K")) else: @@ 
-125,8 +128,6 @@ def _get_seq_start(origin_content): return int(origin_content[0].split()[0]) - - def set_sequence(gb_file, sequence, sequence_start=1): """ Set the *ORIGIN* field of a GenBank file with a sequence. @@ -167,6 +168,4 @@ def set_annotated_sequence(gb_file, annot_sequence): The annotated sequence that is put into the GenBank file. """ set_annotation(gb_file, annot_sequence.annotation) - set_sequence( - gb_file, annot_sequence.sequence, annot_sequence.sequence_start - ) \ No newline at end of file + set_sequence(gb_file, annot_sequence.sequence, annot_sequence.sequence_start) diff --git a/src/biotite/sequence/io/general.py b/src/biotite/sequence/io/general.py index 09b7c2722..54ed5bf4c 100644 --- a/src/biotite/sequence/io/general.py +++ b/src/biotite/sequence/io/general.py @@ -9,31 +9,27 @@ __name__ = "biotite.sequence.io" __author__ = "Patrick Kunzmann" -__all__ = ["load_sequence", "save_sequence", - "load_sequences", "save_sequences"] +__all__ = ["load_sequence", "save_sequence", "load_sequences", "save_sequences"] -import itertools import os.path -import io from collections import OrderedDict import numpy as np -from ..seqtypes import NucleotideSequence, ProteinSequence -from ..alphabet import Alphabet +from ..seqtypes import NucleotideSequence def load_sequence(file_path): """ Load a sequence from a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the file extension. - + Parameters ---------- file_path : str The path to the sequence file. 
- + Returns ------- sequence : Sequence @@ -43,10 +39,12 @@ def load_sequence(file_path): filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: from .fasta import FastaFile, get_sequence + file = FastaFile.read(file_path) return get_sequence(file) elif suffix in [".fastq", ".fq"]: from .fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile.read(file_path, offset="Sanger") @@ -57,6 +55,7 @@ def load_sequence(file_path): return sequence elif suffix in [".gb", ".gbk", ".gp"]: from .genbank import GenBankFile, get_sequence + format = "gp" if suffix == ".gp" else "gb" file = GenBankFile.read(file_path) return get_sequence(file, format) @@ -68,10 +67,10 @@ def save_sequence(file_path, sequence): """ Save a sequence into a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the given file extension. - + Parameters ---------- file_path : str @@ -83,11 +82,13 @@ def save_sequence(file_path, sequence): filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: from .fasta import FastaFile, set_sequence + file = FastaFile() set_sequence(file, sequence) file.write(file_path) elif suffix in [".fastq", ".fq"]: from .fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile(offset="Sanger") @@ -97,6 +98,7 @@ def save_sequence(file_path, sequence): file.write(file_path) elif suffix in [".gb", ".gbk", ".gp"]: from .genbank import GenBankFile, set_locus, set_sequence + file = GenBankFile() set_locus(file, "sequence", len(sequence)) set_sequence(file, sequence) @@ -109,37 +111,42 @@ def load_sequences(file_path): """ Load multiple sequences from a sequence file without the need to manually instantiate a :class:`File` object. 
- + Internally this function uses a :class:`File` object, based on the file extension. - + Parameters ---------- file_path : str The path to the sequence file. - + Returns ------- sequences : dict of (str, Sequence) The sequences in the file. This dictionary maps each header name to - the respective sequence. + the respective sequence. """ # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: from .fasta import FastaFile, get_sequences + file = FastaFile.read(file_path) return get_sequences(file) elif suffix in [".fastq", ".fq"]: from .fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile.read(file_path, offset="Sanger") - return {identifier : NucleotideSequence(seq_str) - for identifier, (seq_str, scores) in file.items()} + return { + identifier: NucleotideSequence(seq_str) + for identifier, (seq_str, scores) in file.items() + } elif suffix in [".gb", ".gbk", ".gp"]: from .genbank import MultiFile, get_definition, get_sequence + file = MultiFile.read(file_path) format = "gp" if suffix == ".gp" else "gb" sequences = OrderedDict() @@ -154,10 +161,10 @@ def save_sequences(file_path, sequences): """ Save multiple sequences into a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the given file extension. 
- + Parameters ---------- file_path : str @@ -170,11 +177,13 @@ def save_sequences(file_path, sequences): filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: from .fasta import FastaFile, set_sequences + file = FastaFile() set_sequences(file, sequences) file.write(file_path) elif suffix in [".fastq", ".fq"]: from .fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile(offset="Sanger") diff --git a/src/biotite/sequence/io/gff/__init__.py b/src/biotite/sequence/io/gff/__init__.py index f544a0ddd..52bac129c 100644 --- a/src/biotite/sequence/io/gff/__init__.py +++ b/src/biotite/sequence/io/gff/__init__.py @@ -14,7 +14,7 @@ GFF 3 files. This means, that you cannot directly access the the parent or child of a feature. However, the ``Id`` and ``Name`` attributes are stored in the - qualifiers of the created :class:`Feature` objects. + qualifiers of the created :class:`Feature` objects. Hence, it is possible to implement such a data structure from this information. """ @@ -22,5 +22,5 @@ __name__ = "biotite.sequence.io.gff" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/gff/convert.py b/src/biotite/sequence/io/gff/convert.py index 9c8782f65..51971d799 100644 --- a/src/biotite/sequence/io/gff/convert.py +++ b/src/biotite/sequence/io/gff/convert.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_annotation", "set_annotation"] -from ...annotation import Location, Feature, Annotation +from ...annotation import Annotation, Feature, Location def get_annotation(gff_file): @@ -22,12 +22,12 @@ def get_annotation(gff_file): Thus, for entries with the same ``ID``, the *type* and *attributes* are only parsed once and the locations are aggregated from each entry. 
- + Parameters ---------- gff_file : GFFFile The file tro extract the :class:`Annotation` object from. - + Returns ------- annotation : Annotation @@ -45,9 +45,7 @@ def get_annotation(gff_file): # (beginning of the file) if current_key is not None: # Beginning of new feature -> Save previous feature - annot.add_feature( - Feature(current_key, current_locs, current_qual) - ) + annot.add_feature(Feature(current_key, current_locs, current_qual)) # Track new feature current_key = type current_locs = [Location(start, end, strand)] @@ -61,15 +59,14 @@ def get_annotation(gff_file): return annot -def set_annotation(gff_file, annotation, - seqid=None, source=None, is_stranded=True): +def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=True): """ Write an :class:`Annotation` object into a GFF3 file. Each feature will get one entry for each location it has. :class:`Feature` objects with multiple locations require the ``ID`` qualifier in its :attr:`Feature.qual` attribute. 
- + Parameters ---------- gff_file : GFFFile @@ -87,14 +84,13 @@ def set_annotation(gff_file, annotation, for feature in sorted(annotation): if len(feature.locs) > 1 and "ID" not in feature.qual: raise ValueError( - "The 'Id' qualifier is required " - "for features with multiple locations" + "The 'Id' qualifier is required " "for features with multiple locations" ) ## seqid ## if seqid is not None and " " in seqid: raise ValueError("The 'seqid' must not contain whitespaces") ## source ## - #Nothing to be done + # Nothing to be done ## type ## type = feature.key ## strand ## @@ -128,6 +124,5 @@ def set_annotation(gff_file, annotation, else: phase = None gff_file.append( - seqid, source, type, start, end, - score, strand, phase, attributes - ) \ No newline at end of file + seqid, source, type, start, end, score, strand, phase, attributes + ) diff --git a/src/biotite/sequence/io/gff/file.py b/src/biotite/sequence/io/gff/file.py index f708712d2..dff00822d 100644 --- a/src/biotite/sequence/io/gff/file.py +++ b/src/biotite/sequence/io/gff/file.py @@ -6,19 +6,17 @@ __author__ = "Patrick Kunzmann" __all__ = ["GFFFile"] -import copy import string -from urllib.parse import quote, unquote import warnings -from ....file import TextFile, InvalidFileError +from urllib.parse import quote, unquote +from ....file import InvalidFileError, TextFile from ...annotation import Location - # All punctuation characters except # percent, semicolon, equals, ampersand, comma -_NOT_QUOTED = "".join( - [char for char in string.punctuation if char not in "%;=&,"] -) + " " +_NOT_QUOTED = ( + "".join([char for char in string.punctuation if char not in "%;=&,"]) + " " +) class GFFFile(TextFile): @@ -61,7 +59,7 @@ class GFFFile(TextFile): The content after the ``##FASTA`` directive is simply ignored. 
Please provide the sequence via a separate file or read the FASTA data directly via the :attr:`lines` attribute: - + >>> import os.path >>> from io import StringIO >>> gff_file = GFFFile.read(os.path.join(path_to_sequences, "indexing_test.gff3")) @@ -121,7 +119,7 @@ class GFFFile(TextFile): ##Example directive param1 param2 SomeSeqID Biotite CDS 1 99 . + 0 ID=FeatureID;product=A protein """ - + def __init__(self): super().__init__() # Maps entry indices to line indices @@ -132,18 +130,18 @@ def __init__(self): self._has_fasta = None self._index_entries() self.append_directive("gff-version", "3") - + @classmethod def read(cls, file): """ Read a GFF3 file. - + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Returns ------- file_object : GFFFile @@ -152,18 +150,29 @@ def read(cls, file): file = super().read(file) file._index_entries() return file - - def insert(self, index, seqid, source, type, start, end, - score, strand, phase, attributes=None): + + def insert( + self, + index, + seqid, + source, + type, + start, + end, + score, + strand, + phase, + attributes=None, + ): """ Insert an entry at the given index. - + Parameters ---------- index : int Index where the entry is inserted. If the index is equal to the length of the file, the entry - is appended at the end of the file. + is appended at the end of the file. seqid : str The ID of the reference sequence. source : str @@ -184,22 +193,23 @@ def insert(self, index, seqid, source, type, start, end, Additional properties of the feature. 
""" if index == len(self): - self.append(seqid, source, type, start, end, - score, strand, phase, attributes) + self.append( + seqid, source, type, start, end, score, strand, phase, attributes + ) else: line_index = self._entries[index] line = GFFFile._create_line( - seqid, source, type, start, end, - score, strand, phase, attributes + seqid, source, type, start, end, score, strand, phase, attributes ) self.lines.insert(line_index, line) self._index_entries() - - def append(self, seqid, source, type, start, end, - score, strand, phase, attributes=None): + + def append( + self, seqid, source, type, start, end, score, strand, phase, attributes=None + ): """ Append an entry to the end of the file. - + Parameters ---------- seqid : str @@ -232,11 +242,11 @@ def append(self, seqid, source, type, start, end, self.lines.append(line) # Fast update of entry index by adding last line self._entries.append(len(self.lines) - 1) - + def append_directive(self, directive, *args): """ Append a directive line to the end of the file. - + Parameters ---------- directive : str @@ -245,13 +255,13 @@ def append_directive(self, directive, *args): Optional parameters for the directive. Each argument is simply appended to the directive, separated by a single space character. - + Raises ------ NotImplementedError If the ``##FASTA`` directive is used, which is not supported. - + Examples -------- @@ -262,17 +272,15 @@ def append_directive(self, directive, *args): ##Example directive param1 param2 """ if directive.startswith("FASTA"): - raise NotImplementedError( - "Adding FASTA information is not supported" - ) + raise NotImplementedError("Adding FASTA information is not supported") directive_line = "##" + directive + " " + " ".join(args) self._directives.append((directive_line[2:], len(self.lines))) self.lines.append(directive_line) - + def directives(self): """ Get the directives in the file. 
- + Returns ------- directives : list of tuple(str, int) @@ -283,7 +291,7 @@ def directives(self): """ # Sort in line order return sorted(self._directives, key=lambda directive: directive[1]) - + def __setitem__(self, index, item): seqid, source, type, start, end, score, strand, phase, attrib = item line = GFFFile._create_line( @@ -292,15 +300,13 @@ def __setitem__(self, index, item): line_index = self._entries[index] self.lines[line_index] = line - def __getitem__(self, index): - if (index >= 0 and index >= len(self)) or \ - (index < 0 and -index > len(self)): - raise IndexError( - f"Index {index} is out of range for GFFFile with " - f"{len(self)} entries" - ) - + if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)): + raise IndexError( + f"Index {index} is out of range for GFFFile with " + f"{len(self)} entries" + ) + line_index = self._entries[index] # Columns are tab separated s = self.lines[line_index].strip().split("\t") @@ -324,15 +330,15 @@ def __getitem__(self, index): attrib = GFFFile._parse_attributes(attrib) return seqid, source, type, start, end, score, strand, phase, attrib - + def __delitem__(self, index): line_index = self._entries[index] del self.lines[line_index] self._index_entries() - + def __len__(self): return len(self._entries) - + def _index_entries(self): """ Parse the file for comment and directive lines. @@ -374,15 +380,12 @@ def _index_entries(self): self._entries = self._entries[:entry_counter] @staticmethod - def _create_line(seqid, source, type, start, end, - score, strand, phase, attributes): + def _create_line(seqid, source, type, start, end, score, strand, phase, attributes): """ Create a line for a newly created entry. """ - seqid = quote(seqid.strip(), safe=_NOT_QUOTED) \ - if seqid is not None else "." - source = quote(source.strip(), safe=_NOT_QUOTED) \ - if source is not None else "." + seqid = quote(seqid.strip(), safe=_NOT_QUOTED) if seqid is not None else "." 
+ source = quote(source.strip(), safe=_NOT_QUOTED) if source is not None else "." type = type.strip() # Perform checks @@ -394,7 +397,7 @@ def _create_line(seqid, source, type, start, end, raise ValueError("'type' must not be empty") if seqid[0] == ">": raise ValueError("'seqid' must not start with '>'") - + score = str(score) if score is not None else "." if strand == Location.Strand.FORWARD: strand = "+" @@ -403,16 +406,31 @@ def _create_line(seqid, source, type, start, end, else: strand = "." phase = str(phase) if phase is not None else "." - attributes = ";".join( - [quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED) - for key, val in attributes.items()] - ) if attributes is not None and len(attributes) > 0 else "." + attributes = ( + ";".join( + [ + quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED) + for key, val in attributes.items() + ] + ) + if attributes is not None and len(attributes) > 0 + else "." + ) return "\t".join( - [seqid, source, type, str(start), str(end), - str(score), strand, phase, attributes] + [ + seqid, + source, + type, + str(start), + str(end), + str(score), + strand, + phase, + attributes, + ] ) - + @staticmethod def _parse_attributes(attributes): """ @@ -426,9 +444,7 @@ def _parse_attributes(attributes): for entry in attrib_entries: compounds = entry.split("=") if len(compounds) != 2: - raise InvalidFileError( - f"Attribute entry '{entry}' is invalid" - ) + raise InvalidFileError(f"Attribute entry '{entry}' is invalid") key, val = compounds attrib_dict[unquote(key)] = unquote(val) - return attrib_dict \ No newline at end of file + return attrib_dict diff --git a/src/biotite/sequence/phylo/__init__.py b/src/biotite/sequence/phylo/__init__.py index d70caa681..5d29f1a9e 100644 --- a/src/biotite/sequence/phylo/__init__.py +++ b/src/biotite/sequence/phylo/__init__.py @@ -31,6 +31,6 @@ __name__ = "biotite.sequence.phylo" __author__ = "Patrick Kunzmann" +from .nj import * from .tree import * from .upgma import * 
-from .nj import * \ No newline at end of file diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py index 1a140e1f9..d8320107b 100644 --- a/src/biotite/sequence/profile.py +++ b/src/biotite/sequence/profile.py @@ -4,9 +4,9 @@ import warnings import numpy as np -from .seqtypes import NucleotideSequence, ProteinSequence, GeneralSequence -from .alphabet import LetterAlphabet from .align.alignment import get_codes +from .alphabet import LetterAlphabet +from .seqtypes import GeneralSequence, NucleotideSequence, ProteinSequence __name__ = "biotite.sequence" __author__ = "Maximilian Greil" @@ -73,7 +73,7 @@ class SequenceProfile(object): be created from an indefinite number of aligned sequences. With :meth:`sequence_probability_from_matrix()` the probability of a - sequence can be calculated based on the before calculated position + sequence can be calculated based on the before calculated position probability matrix of this instance of object SequenceProfile. With :meth:`sequence_score_from_matrix()` the score of a sequence @@ -154,8 +154,10 @@ def gaps(self, new_gaps): def __repr__(self): """Represent SequenceProfile as a string for debugging.""" - return f"SequenceProfile(np.{np.array_repr(self.symbols)}, " \ - f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))" + return ( + f"SequenceProfile(np.{np.array_repr(self.symbols)}, " + f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))" + ) def __eq__(self, item): if not isinstance(item, SequenceProfile): @@ -204,16 +206,16 @@ def from_alignment(alignment, alphabet=None): for alph in (seq.alphabet for seq in alignment.sequences): if not alphabet.extends(alph): raise ValueError( - f"The given alphabet is incompatible with a least one " + "The given alphabet is incompatible with a least one " "alphabet of the given sequences" ) symbols = np.zeros((len(sequences[0]), len(alphabet)), dtype=int) gaps = np.zeros(len(sequences[0]), dtype=int) sequences = np.transpose(sequences) for i in 
range(len(sequences)): - row = np.where(sequences[i, ] == -1, len(alphabet), sequences[i, ]) + row = np.where(sequences[i,] == -1, len(alphabet), sequences[i,]) count = np.bincount(row, minlength=len(alphabet) + 1) - symbols[i, ] = count[0:len(alphabet)] + symbols[i,] = count[0 : len(alphabet)] gaps[i] = count[-1] return SequenceProfile(symbols, gaps, alphabet) @@ -248,10 +250,21 @@ def to_consensus(self, as_general=False): def _dna_to_consensus(self): codes = { - (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'T', - (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M', - (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V', - (0, 1, 2, 3): 'N' + (0,): "A", + (1,): "C", + (2,): "G", + (3,): "T", + (0, 2): "R", + (1, 3): "Y", + (1, 2): "S", + (0, 3): "W", + (2, 3): "K", + (0, 1): "M", + (1, 2, 3): "B", + (0, 2, 3): "D", + (0, 1, 3): "H", + (0, 1, 2): "V", + (0, 1, 2, 3): "N", } consensus = "" maxes = np.max(self.symbols, axis=1) @@ -261,10 +274,21 @@ def _dna_to_consensus(self): def _rna_to_consensus(self): codes = { - (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'U', - (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M', - (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V', - (0, 1, 2, 3): 'N' + (0,): "A", + (1,): "C", + (2,): "G", + (3,): "U", + (0, 2): "R", + (1, 3): "Y", + (1, 2): "S", + (0, 3): "W", + (2, 3): "K", + (0, 1): "M", + (1, 2, 3): "B", + (0, 2, 3): "D", + (0, 1, 3): "H", + (0, 1, 2): "V", + (0, 1, 2, 3): "N", } consensus = "" maxes = np.max(self.symbols, axis=1) @@ -307,7 +331,7 @@ def probability_matrix(self, pseudocount=0): .. math:: P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p} - + :math:`S`: The symbol. :math:`C_S`: The count of symbol :math:`S` at the sequence @@ -330,11 +354,10 @@ def probability_matrix(self, pseudocount=0): The calculated the position probability matrix. """ if pseudocount < 0: - raise ValueError( - f"Pseudocount can not be smaller than zero." 
- ) - return (self.symbols + pseudocount / self.symbols.shape[1]) / \ - (np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount) + raise ValueError("Pseudocount can not be smaller than zero.") + return (self.symbols + pseudocount / self.symbols.shape[1]) / ( + np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount + ) def log_odds_matrix(self, background_frequencies=None, pseudocount=0): r""" @@ -346,7 +369,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0): .. math:: W(S) = \log_2 \left( \frac{P(S)}{B_S} \right) - + :math:`S`: The symbol. :math:`P(S)`: The probability of symbol :math:`S` at the @@ -363,7 +386,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0): background_frequencies: ndarray, shape=(k,), dtype=float, optional The background frequencies for each symbol in the alphabet. By default, a uniform distribution is assumed. - + Returns ------- pwm: ndarray, dtype=float, shape=(n,k) @@ -383,7 +406,7 @@ def sequence_probability(self, sequence, pseudocount=0): Calculate probability of a sequence based on the position probability matrix (PPM). - The sequence probability is the product of the probability of + The sequence probability is the product of the probability of the respective symbol over all sequence positions. Parameters @@ -419,7 +442,7 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0): Calculate score of a sequence based on the position weight matrix (PWM). - The score is the sum of weights (log-odds scores) of + The score is the sum of weights (log-odds scores) of the respective symbol over all sequence positions. 
Parameters @@ -442,7 +465,9 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0): """ if background_frequencies is None: background_frequencies = 1 / len(self.alphabet) - pwm = self.log_odds_matrix(background_frequencies=background_frequencies, pseudocount=pseudocount) + pwm = self.log_odds_matrix( + background_frequencies=background_frequencies, pseudocount=pseudocount + ) if len(sequence) != len(pwm): raise ValueError( f"The given sequence has a different length ({len(sequence)}) than " diff --git a/src/biotite/sequence/search.py b/src/biotite/sequence/search.py index c57e7d119..96af23d03 100644 --- a/src/biotite/sequence/search.py +++ b/src/biotite/sequence/search.py @@ -4,8 +4,7 @@ __name__ = "biotite.sequence" __author__ = "Patrick Kunzmann" -__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", - "find_symbol_last"] +__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", "find_symbol_last"] import numpy as np @@ -13,7 +12,7 @@ def find_subsequence(sequence, query): """ Find a subsequence in a sequence. - + Parameters ---------- sequence : Sequence @@ -21,26 +20,26 @@ def find_subsequence(sequence, query): query : Sequence The potential subsequence. Its alphabet must extend the `sequence` alphabet. - + Returns ------- match_indices : ndarray The starting indices in `sequence`, where `query` has been found. The array is empty if no match has been found. - + Raises ------ ValueError If the `query` alphabet does not extend the `sequence` alphabet. 
- + Examples -------- - + >>> main_seq = NucleotideSequence("ACTGAATGA") >>> sub_seq = NucleotideSequence("TGA") >>> print(find_subsequence(main_seq, sub_seq)) [2 6] - + """ if not sequence.get_alphabet().extends(query.get_alphabet()): raise ValueError("The sequences alphabets are not equal") @@ -52,17 +51,18 @@ def find_subsequence(sequence, query): match_indices.append(i) return np.array(match_indices) + def find_symbol(sequence, symbol): """ Find a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. - + Returns ------- match_indices : ndarray @@ -71,17 +71,18 @@ def find_symbol(sequence, symbol): code = sequence.get_alphabet().encode(symbol) return np.where(sequence.code == code)[0] + def find_symbol_first(sequence, symbol): """ Find first occurence of a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. - + Returns ------- first_index : int @@ -92,18 +93,19 @@ def find_symbol_first(sequence, symbol): if len(match_i) == 0: return -1 return np.min(match_i) - + + def find_symbol_last(sequence, symbol): """ Find last occurence of a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. 
- + Returns ------- flast_index : int diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py index 76254e13f..2df25aaa4 100644 --- a/src/biotite/sequence/seqtypes.py +++ b/src/biotite/sequence/seqtypes.py @@ -6,17 +6,16 @@ __author__ = "Patrick Kunzmann", "Thomas Nevolianis" __all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"] -from .sequence import Sequence -from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper import numpy as np -import copy +from .alphabet import AlphabetError, AlphabetMapper, LetterAlphabet +from .sequence import Sequence class GeneralSequence(Sequence): """ This class allows the creation of a sequence with custom :class:`Alphabet` without the need to subclass :class:`Sequence`. - + Parameters ---------- alphabet : Alphabet @@ -27,22 +26,24 @@ class GeneralSequence(Sequence): may also be a :class:`str` object. By default the sequence is empty. """ - + def __init__(self, alphabet, sequence=()): self._alphabet = alphabet super().__init__(sequence) def __repr__(self): """Represent GeneralSequence as a string for debugging.""" - return f"GeneralSequence(Alphabet({self._alphabet}), " \ - f"[{', '.join([repr(symbol) for symbol in self.symbols])}])" + return ( + f"GeneralSequence(Alphabet({self._alphabet}), " + f"[{', '.join([repr(symbol) for symbol in self.symbols])}])" + ) def __copy_create__(self): return GeneralSequence(self._alphabet) - + def get_alphabet(self): return self._alphabet - + def as_type(self, sequence): """ Convert the :class:`GeneralSequence` into a sequence of another @@ -58,12 +59,12 @@ def as_type(self, sequence): of this object. The alphabet must equal or extend the alphabet of this object. - + Returns ------- sequence : Sequence The input `sequence` with replaced sequence code. 
- + Raises ------ AlphabetError @@ -78,16 +79,17 @@ def as_type(self, sequence): sequence.code = self.code return sequence + class NucleotideSequence(Sequence): """ Representation of a nucleotide sequence (DNA or RNA). - + This class may have one of two different alphabets: :attr:`unambiguous_alphabet()` contains only the unambiguous DNA letters 'A', 'C', 'G' and 'T'. - :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous + :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous letters. - + Parameters ---------- sequence : iterable object, optional @@ -100,35 +102,36 @@ class NucleotideSequence(Sequence): ambiguous letters in the sequence, the ambiguous alphabet is used. """ - - alphabet_unamb = LetterAlphabet(["A","C","G","T"]) - alphabet_amb = LetterAlphabet( - ["A","C","G","T","R","Y","W","S", - "M","K","H","B","V","D","N"] + + alphabet_unamb = LetterAlphabet(["A", "C", "G", "T"]) + alphabet_amb = LetterAlphabet( + ["A", "C", "G", "T", "R", "Y", "W", "S", "M", "K", "H", "B", "V", "D", "N"] ) - - compl_symbol_dict = {"A" : "T", - "C" : "G", - "G" : "C", - "T" : "A", - "M" : "K", - "R" : "Y", - "W" : "W", - "S" : "S", - "Y" : "R", - "K" : "M", - "V" : "B", - "H" : "D", - "D" : "H", - "B" : "V", - "N" : "N"} + + compl_symbol_dict = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "N": "N", + } # List comprehension does not work in this scope _compl_symbols = [] for _symbol in alphabet_amb.get_symbols(): _compl_symbols.append(compl_symbol_dict[_symbol]) _compl_alphabet_unamb = LetterAlphabet(_compl_symbols) _compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb) - + def __init__(self, sequence=[], ambiguous=None): if isinstance(sequence, str): sequence = sequence.upper() @@ -164,28 +167,28 @@ def __copy_create__(self): else: seq_copy = NucleotideSequence(ambiguous=False) return seq_copy - + def 
get_alphabet(self): return self._alphabet - + def complement(self): """ Get the complement nucleotide sequence. - + Returns ------- complement : NucleotideSequence The complement sequence. - + Examples -------- - + >>> dna_seq = NucleotideSequence("ACGCTT") >>> print(dna_seq.complement()) TGCGAA >>> print(dna_seq.reverse().complement()) AAGCGT - + """ # Interpreting the sequence code of this object in the # complementary alphabet gives the complementary symbols @@ -194,18 +197,18 @@ def complement(self): # alphabet into the original alphabet compl_code = NucleotideSequence._compl_mapper[self.code] return self.copy(compl_code) - + def translate(self, complete=False, codon_table=None, met_start=False): """ Translate the nucleotide sequence into a protein sequence. - + If `complete` is true, the entire sequence is translated, beginning with the first codon and ending with the last codon, even if stop codons occur during the translation. - + Otherwise this method returns possible ORFs in the sequence, even if not stop codon occurs in an ORF. - + Parameters ---------- complete : bool, optional @@ -222,7 +225,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): Otherwise the translation starts with the amino acid the codon codes for. Only applies, if `complete` is false. (Default: False) - + Returns ------- protein : ProteinSequence or list of ProteinSequence @@ -233,15 +236,15 @@ def translate(self, complete=False, codon_table=None, met_start=False): pos : list of tuple (int, int) Is only returned if `complete` is false. The list contains a tuple for each ORF. - The first element of the tuple is the index of the + The first element of the tuple is the index of the :class:`NucleotideSequence`, where the translation starts. The second element is the exclusive stop index, it represents the first nucleotide in the :class:`NucleotideSequence` after a stop codon. 
- + Examples -------- - + >>> dna_seq = NucleotideSequence("AATGATGCTATAGAT") >>> prot_seq = dna_seq.translate(complete=True) >>> print(prot_seq) @@ -251,7 +254,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): ... print(seq) MML* ML* - + """ if self._alphabet != NucleotideSequence.alphabet_unamb: raise AlphabetError("Translation requires unambiguous alphabet") @@ -259,21 +262,24 @@ def translate(self, complete=False, codon_table=None, met_start=False): if codon_table is None: # Import at this position to avoid circular import from .codon import CodonTable + codon_table = CodonTable.default_table() - + if complete: if len(self) % 3 != 0: - raise ValueError("Sequence length needs to be a multiple of 3 " - "for complete translation") + raise ValueError( + "Sequence length needs to be a multiple of 3 " + "for complete translation" + ) # Reshape code into (n,3), with n being the amount of codons codons = self.code.reshape(-1, 3) protein_seq = ProteinSequence() protein_seq.code = codon_table.map_codon_codes(codons) return protein_seq - + else: stop_code = ProteinSequence.alphabet.encode("*") - met_code = ProteinSequence.alphabet.encode("M") + met_code = ProteinSequence.alphabet.encode("M") protein_seqs = [] pos = [] code = self.code @@ -282,7 +288,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): # The frame length is always a multiple of 3 # If there is a trailing partial codon, remove it frame_length = ((len(code) - shift) // 3) * 3 - frame = code[shift : shift+frame_length] + frame = code[shift : shift + frame_length] # Reshape frame into (n,3), with n being the amount of codons frame_codons = frame.reshape(-1, 3) # At first, translate frame completely @@ -297,8 +303,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): stops = np.where(code_from_start == stop_code)[0] # Find first stop codon after start codon # Include stop -> stops[0] + 1 - stop_i = stops[0] + 1 if len(stops) > 0 \ - else 
len(code_from_start) + stop_i = stops[0] + 1 if len(stops) > 0 else len(code_from_start) code_from_start_to_stop = code_from_start[:stop_i] prot_seq = ProteinSequence() if met_start: @@ -310,13 +315,13 @@ def translate(self, complete=False, codon_table=None, met_start=False): protein_seqs.append(prot_seq) # Codon indices are transformed # to nucleotide sequence indices - pos.append((shift + start_i*3, shift + (start_i+stop_i)*3)) + pos.append((shift + start_i * 3, shift + (start_i + stop_i) * 3)) # Sort by start position order = np.argsort([start for start, stop in pos]) pos = [pos[i] for i in order] protein_seqs = [protein_seqs[i] for i in order] return protein_seqs, pos - + @staticmethod def unambiguous_alphabet(): """ @@ -329,7 +334,7 @@ def unambiguous_alphabet(): The unambiguous nucleotide alphabet. """ return NucleotideSequence.alphabet_unamb - + @staticmethod def ambiguous_alphabet(): """ @@ -348,10 +353,10 @@ def ambiguous_alphabet(): class ProteinSequence(Sequence): """ Representation of a protein sequence. - + Furthermore this class offers a conversion of amino acids from 3-letter code into 1-letter code and vice versa. - + Parameters ---------- sequence : iterable object, optional @@ -359,7 +364,7 @@ class ProteinSequence(Sequence): string. May take upper or lower case letters. If a list is given, the list elements can be 1-letter or 3-letter amino acid representations. By default the sequence is empty. 
- + Notes ----- The :class:`Alphabet` of this :class:`Sequence` class does not @@ -370,106 +375,139 @@ class ProteinSequence(Sequence): """ _codon_table = None - - alphabet = LetterAlphabet(["A","C","D","E","F","G","H","I","K","L", - "M","N","P","Q","R","S","T","V","W","Y", - "B","Z","X","*"]) + + alphabet = LetterAlphabet( + [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + "B", + "Z", + "X", + "*", + ] + ) # Masses are taken from # https://web.expasy.org/findmod/findmod_masses.html#AA - _mol_weight_average = np.array([ - 71.0788, # A - 103.1388, # C - 115.0886, # D - 129.1155, # E - 147.1766, # F - 57.0519, # G - 137.1411, # H - 113.1594, # I - 128.1741, # K - 113.1594, # L - 131.1926, # M - 114.1038, # N - 97.1167, # P - 128.1307, # Q - 156.1875, # R - 87.0782, # S - 101.1051, # T - 99.1326, # V - 186.2132, # W - 163.1760, # Y - np.nan, # B - np.nan, # Z - np.nan, # X - np.nan, # * - ]) - - _mol_weight_monoisotopic = np.array([ - 71.03711, # A - 103.00919, # C - 115.02694, # D - 129.04259, # E - 147.06841, # F - 57.02146, # G - 137.05891, # H - 113.08406, # I - 128.09496, # K - 113.08406, # L - 131.04049, # M - 114.04293, # N - 97.05276, # P - 128.05858, # Q - 156.10111, # R - 87.03203, # S - 101.04768, # T - 99.06841, # V - 186.07931, # W - 163.06333, # Y - np.nan, # B - np.nan, # Z - np.nan, # X - np.nan, # * - ]) - - _dict_1to3 = {"A" : "ALA", - "C" : "CYS", - "D" : "ASP", - "E" : "GLU", - "F" : "PHE", - "G" : "GLY", - "H" : "HIS", - "I" : "ILE", - "K" : "LYS", - "L" : "LEU", - "M" : "MET", - "N" : "ASN", - "P" : "PRO", - "Q" : "GLN", - "R" : "ARG", - "S" : "SER", - "T" : "THR", - "V" : "VAL", - "W" : "TRP", - "Y" : "TYR", - "B" : "ASX", - "Z" : "GLX", - "X" : "UNK", - "*" : " * "} - + _mol_weight_average = np.array( + [ + 71.0788, # A + 103.1388, # C + 115.0886, # D + 129.1155, # E + 147.1766, # F + 57.0519, # G + 137.1411, # H + 113.1594, # I + 128.1741, # K + 
113.1594, # L + 131.1926, # M + 114.1038, # N + 97.1167, # P + 128.1307, # Q + 156.1875, # R + 87.0782, # S + 101.1051, # T + 99.1326, # V + 186.2132, # W + 163.1760, # Y + np.nan, # B + np.nan, # Z + np.nan, # X + np.nan, # * + ] + ) + + _mol_weight_monoisotopic = np.array( + [ + 71.03711, # A + 103.00919, # C + 115.02694, # D + 129.04259, # E + 147.06841, # F + 57.02146, # G + 137.05891, # H + 113.08406, # I + 128.09496, # K + 113.08406, # L + 131.04049, # M + 114.04293, # N + 97.05276, # P + 128.05858, # Q + 156.10111, # R + 87.03203, # S + 101.04768, # T + 99.06841, # V + 186.07931, # W + 163.06333, # Y + np.nan, # B + np.nan, # Z + np.nan, # X + np.nan, # * + ] + ) + + _dict_1to3 = { + "A": "ALA", + "C": "CYS", + "D": "ASP", + "E": "GLU", + "F": "PHE", + "G": "GLY", + "H": "HIS", + "I": "ILE", + "K": "LYS", + "L": "LEU", + "M": "MET", + "N": "ASN", + "P": "PRO", + "Q": "GLN", + "R": "ARG", + "S": "SER", + "T": "THR", + "V": "VAL", + "W": "TRP", + "Y": "TYR", + "B": "ASX", + "Z": "GLX", + "X": "UNK", + "*": " * ", + } + _dict_3to1 = {} for _key, _value in _dict_1to3.items(): _dict_3to1[_value] = _key _dict_3to1["SEC"] = "C" _dict_3to1["MSE"] = "M" - + def __init__(self, sequence=()): dict_3to1 = ProteinSequence._dict_3to1 alph = ProteinSequence.alphabet # Convert 3-letter codes to single letter codes, # if list contains 3-letter codes - sequence = [dict_3to1[symbol.upper()] if len(symbol) == 3 - else symbol.upper() for symbol in sequence] + sequence = [ + dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper() + for symbol in sequence + ] super().__init__(sequence) def __repr__(self): @@ -478,11 +516,11 @@ def __repr__(self): def get_alphabet(self): return ProteinSequence.alphabet - + def remove_stops(self): """ Remove *stop signals* from the sequence. 
- + Returns ------- no_stop : ProteinSequence @@ -493,34 +531,34 @@ def remove_stops(self): seq_code = no_stop.code no_stop.code = seq_code[seq_code != stop_code] return no_stop - + @staticmethod def convert_letter_3to1(symbol): """ Convert a 3-letter to a 1-letter amino acid representation. - + Parameters ---------- symbol : string 3-letter amino acid representation. - + Returns ------- convert : string 1-letter amino acid representation. """ return ProteinSequence._dict_3to1[symbol.upper()] - + @staticmethod def convert_letter_1to3(symbol): """ Convert a 1-letter to a 3-letter amino acid representation. - + Parameters ---------- symbol : string 1-letter amino acid representation. - + Returns ------- convert : string @@ -531,7 +569,7 @@ def convert_letter_1to3(symbol): def get_molecular_weight(self, monoisotopic=False): """ Calculate the molecular weight of this protein. - + Average protein molecular weight is calculated by the addition of average isotopic masses of the amino acids in the protein and the average isotopic mass of one water @@ -550,7 +588,6 @@ def get_molecular_weight(self, monoisotopic=False): if np.isnan(weight): raise ValueError( - "Sequence contains ambiguous amino acids, " - "cannot calculate weight" + "Sequence contains ambiguous amino acids, " "cannot calculate weight" ) return weight diff --git a/src/biotite/sequence/sequence.py b/src/biotite/sequence/sequence.py index f9a69dfb0..6acdcfb8a 100644 --- a/src/biotite/sequence/sequence.py +++ b/src/biotite/sequence/sequence.py @@ -10,16 +10,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Sequence"] -import numbers import abc +import numbers import numpy as np -from .alphabet import Alphabet, LetterAlphabet from ..copyable import Copyable +from .alphabet import LetterAlphabet - -_size_uint8 = np.iinfo(np.uint8 ).max +1 -_size_uint16 = np.iinfo(np.uint16).max +1 -_size_uint32 = np.iinfo(np.uint32).max +1 +_size_uint8 = np.iinfo(np.uint8).max + 1 +_size_uint16 = np.iinfo(np.uint16).max + 1 
+_size_uint32 = np.iinfo(np.uint32).max + 1 class Sequence(Copyable, metaclass=abc.ABCMeta): @@ -277,12 +276,10 @@ def get_symbol_frequency(self): corresponding number of occurences in the sequence as values. """ - counts = np.bincount( - self._seq_code, minlength=len(self.get_alphabet()) - ) + counts = np.bincount(self._seq_code, minlength=len(self.get_alphabet())) return { - symbol: count for symbol, count - in zip(self.get_alphabet().get_symbols(), counts) + symbol: count + for symbol, count in zip(self.get_alphabet().get_symbols(), counts) } def __getitem__(self, index): @@ -329,12 +326,13 @@ def __eq__(self, item): def __str__(self): alph = self.get_alphabet() if isinstance(alph, LetterAlphabet): - return alph.decode_multiple(self._seq_code, as_bytes=True)\ - .tobytes().decode("ASCII") - else: - return ", ".join( - [str(e) for e in alph.decode_multiple(self._seq_code)] + return ( + alph.decode_multiple(self._seq_code, as_bytes=True) + .tobytes() + .decode("ASCII") ) + else: + return ", ".join([str(e) for e in alph.decode_multiple(self._seq_code)]) def __add__(self, sequence): if self.get_alphabet().extends(sequence.get_alphabet()): diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py index 0685b0e61..6349a3d85 100644 --- a/src/biotite/structure/__init__.py +++ b/src/biotite/structure/__init__.py @@ -104,9 +104,11 @@ __author__ = "Patrick Kunzmann" from .atoms import * +from .basepairs import * from .bonds import * from .box import * from .celllist import * +from .chains import * from .charges import * from .compare import * from .density import * @@ -122,11 +124,9 @@ from .rdf import * from .repair import * from .residues import * -from .chains import * from .sasa import * from .sequence import * from .sse import * from .superimpose import * from .transform import * -from .basepairs import * # util and resutil are used internally diff --git a/src/biotite/structure/atoms.py b/src/biotite/structure/atoms.py index 
47f97de7d..3c344063e 100644 --- a/src/biotite/structure/atoms.py +++ b/src/biotite/structure/atoms.py @@ -4,19 +4,27 @@ """ This module contains the main types of the ``structure`` subpackage: -:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`. +:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`. """ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["Atom", "AtomArray", "AtomArrayStack", - "array", "stack", "repeat", "from_template", "coord"] +__all__ = [ + "Atom", + "AtomArray", + "AtomArrayStack", + "array", + "stack", + "repeat", + "from_template", + "coord", +] -import numbers import abc +import numbers import numpy as np -from .bonds import BondList from ..copyable import Copyable +from .bonds import BondList class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta): @@ -26,7 +34,7 @@ class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta): It implements functionality for annotation arrays and also rudimentarily for coordinates. """ - + def __init__(self, length): """ Create the annotation arrays @@ -43,14 +51,14 @@ def __init__(self, length): self.add_annotation("hetero", dtype=bool) self.add_annotation("atom_name", dtype="U6") self.add_annotation("element", dtype="U2") - + def array_length(self): """ Get the length of the atom array. - + This value is equivalent to the length of each annotation array. For :class:`AtomArray` it is the same as ``len(array)``. - + Returns ------- length : int @@ -71,15 +79,15 @@ def shape(self): shape : tuple of int Shape of the object. """ - return - + return + def add_annotation(self, category, dtype): """ Add an annotation category, if not already existing. - + Initially the new annotation is filled with the *zero* representation of the given type. - + Parameters ---------- category : str @@ -87,19 +95,18 @@ def add_annotation(self, category, dtype): dtype : type or str A type instance or a valid *NumPy* *dtype* string. 
Defines the type of the annotation - + See Also -------- set_annotation """ if category not in self._annot: - self._annot[str(category)] = np.zeros(self._array_length, - dtype=dtype) - + self._annot[str(category)] = np.zeros(self._array_length, dtype=dtype) + def del_annotation(self, category): """ Removes an annotation category. - + Parameters ---------- category : str @@ -107,32 +114,30 @@ def del_annotation(self, category): """ if category in self._annot: del self._annot[str(category)] - + def get_annotation(self, category): """ Return an annotation array. - + Parameters ---------- category : str The annotation category to be returned. - + Returns ------- array : ndarray The annotation array. """ if category not in self._annot: - raise ValueError( - f"Annotation category '{category}' is not existing" - ) + raise ValueError(f"Annotation category '{category}' is not existing") return self._annot[category] - + def set_annotation(self, category, array): """ Set an annotation array. If the annotation category does not exist yet, the category is created. - + Parameters ---------- category : str @@ -143,28 +148,25 @@ def set_annotation(self, category, array): """ if len(array) != self._array_length: raise IndexError( - f"Expected array length {self._array_length}, " - f"but got {len(array)}" + f"Expected array length {self._array_length}, " f"but got {len(array)}" ) if category in self._annot: # Keep the dtype if the annotation already exists - self._annot[category] = np.asarray( - array, dtype=self._annot[category].dtype - ) + self._annot[category] = np.asarray(array, dtype=self._annot[category].dtype) else: self._annot[category] = np.asarray(array) - + def get_annotation_categories(self): """ Return a list containing all annotation array categories. - + Returns ------- categories : list The list containing the names of each annotation array. 
""" return list(self._annot.keys()) - + def _subarray(self, index): # Index is one dimensional (boolean mask, index array) new_coord = self._coord[..., index, :] @@ -180,10 +182,9 @@ def _subarray(self, index): if self._box is not None: new_object._box = self._box for annotation in self._annot: - new_object._annot[annotation] = (self._annot[annotation] - .__getitem__(index)) + new_object._annot[annotation] = self._annot[annotation].__getitem__(index) return new_object - + def _set_element(self, index, atom): try: if isinstance(index, (numbers.Integral, np.ndarray)): @@ -191,12 +192,10 @@ def _set_element(self, index, atom): self._annot[name][index] = atom._annot[name] self._coord[..., index, :] = atom.coord else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") except KeyError: raise KeyError("The annotations of the 'Atom' are incompatible") - + def _del_element(self, index): if isinstance(index, numbers.Integral): for name in self._annot: @@ -208,20 +207,18 @@ def _del_element(self, index): mask[index] = False self._bonds = self._bonds[mask] else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def equal_annotations(self, item): """ Check, if this object shares equal annotation arrays with the given :class:`AtomArray` or :class:`AtomArrayStack`. - + Parameters ---------- item : AtomArray or AtomArrayStack The object to compare the annotation arrays with. - + Returns ------- equality : bool @@ -235,24 +232,24 @@ def equal_annotations(self, item): if not np.array_equal(self._annot[name], item._annot[name]): return False return True - + def equal_annotation_categories(self, item): """ Check, if this object shares equal annotation array categories with the given :class:`AtomArray` or :class:`AtomArrayStack`. 
- + Parameters ---------- item : AtomArray or AtomArrayStack The object to compare the annotation arrays with. - + Returns ------- equality : bool True, if the annotation array names are equal. """ return sorted(self._annot.keys()) == sorted(item._annot.keys()) - + def __getattr__(self, attr): """ If the attribute is an annotation, the annotation is returned @@ -273,7 +270,7 @@ def __getattr__(self, attr): raise AttributeError( f"'{type(self).__name__}' object has no attribute '{attr}'" ) - + def __setattr__(self, attr, value): """ If the attribute is an annotation, the :attr:`value` is saved @@ -287,15 +284,13 @@ def __setattr__(self, attr, value): if isinstance(self, AtomArray): if value.ndim != 2: raise ValueError( - "A 2-dimensional ndarray is expected " - "for an AtomArray" - ) + "A 2-dimensional ndarray is expected " "for an AtomArray" + ) elif isinstance(self, AtomArrayStack): if value.ndim != 3: raise ValueError( - "A 3-dimensional ndarray is expected " - "for an AtomArrayStack" - ) + "A 3-dimensional ndarray is expected " "for an AtomArrayStack" + ) if value.shape[-2] != self._array_length: raise ValueError( f"Expected array length {self._array_length}, " @@ -304,7 +299,7 @@ def __setattr__(self, attr, value): if value.shape[-1] != 3: raise TypeError("Expected 3 coordinates for each atom") super().__setattr__("_coord", value.astype(np.float32, copy=False)) - + elif attr == "bonds": if isinstance(value, BondList): if value.get_atom_count() != self._array_length: @@ -318,22 +313,21 @@ def __setattr__(self, attr, value): super().__setattr__("_bonds", None) else: raise TypeError("Value must be 'BondList'") - + elif attr == "box": if isinstance(value, np.ndarray): if isinstance(self, AtomArray): if value.ndim != 2: raise ValueError( - "A 2-dimensional ndarray is expected " - "for an AtomArray" - ) - else: # AtomArrayStack + "A 2-dimensional ndarray is expected " "for an AtomArray" + ) + else: # AtomArrayStack if value.ndim != 3: raise ValueError( "A 
3-dimensional ndarray is expected " "for an AtomArrayStack" - ) - if value.shape[-2:] != (3,3): + ) + if value.shape[-2:] != (3, 3): raise TypeError("Box must be a 3x3 matrix (three vectors)") box = value.astype(np.float32, copy=False) super().__setattr__("_box", box) @@ -342,14 +336,14 @@ def __setattr__(self, attr, value): super().__setattr__("_box", None) else: raise TypeError("Box must be ndarray of floats or None") - + elif attr == "_annot": super().__setattr__(attr, value) elif attr in self._annot: self.set_annotation(attr, value) else: super().__setattr__(attr, value) - + def __dir__(self): attr = super().__dir__() attr.append("coord") @@ -358,7 +352,7 @@ def __dir__(self): for name in self._annot.keys(): attr.append(name) return attr - + def __eq__(self, item): """ See Also @@ -376,30 +370,31 @@ def __eq__(self, item): if not np.array_equal(self._box, item._box): return False return np.array_equal(self._coord, item._coord) - + def __len__(self): """ The length of the annotation arrays. - + Returns ------- length : int Length of the annotation arrays. 
""" return self._array_length - + def __add__(self, array): if type(self) != type(array): raise TypeError("Can only concatenate two arrays or two stacks") # Create either new array or stack, depending of the own type if isinstance(self, AtomArray): - concat = AtomArray(length = self._array_length+array._array_length) + concat = AtomArray(length=self._array_length + array._array_length) if isinstance(self, AtomArrayStack): - concat = AtomArrayStack(self.stack_depth(), - self._array_length + array._array_length) - + concat = AtomArrayStack( + self.stack_depth(), self._array_length + array._array_length + ) + concat._coord = np.concatenate((self._coord, array.coord), axis=-2) - + # Transfer only annotations, # which are existent in both operands arr_categories = list(array._annot.keys()) @@ -407,29 +402,29 @@ def __add__(self, array): if category in arr_categories: annot = self._annot[category] arr_annot = array._annot[category] - concat._annot[category] = np.concatenate((annot,arr_annot)) - + concat._annot[category] = np.concatenate((annot, arr_annot)) + # Concatenate bonds lists, # if at least one of them contains bond information if self._bonds is not None or array._bonds is not None: bonds1 = self._bonds bonds2 = array._bonds if bonds1 is None: - bonds1 = BondList(self._array_length) + bonds1 = BondList(self._array_length) if bonds2 is None: bonds2 = BondList(array._array_length) concat._bonds = bonds1 + bonds2 - + # Copy box if self._box is not None: concat._box = np.copy(self._box) return concat - + def __copy_fill__(self, clone): super().__copy_fill__(clone) self._copy_annotations(clone) clone._coord = np.copy(self._coord) - + def _copy_annotations(self, clone): for name in self._annot: clone._annot[name] = np.copy(self._annot[name]) @@ -437,23 +432,23 @@ def _copy_annotations(self, clone): clone._box = np.copy(self._box) if self._bonds is not None: clone._bonds = self._bonds.copy() - + class Atom(Copyable): """ A representation of a single atom. 
- + The coordinates an annotations can be accessed directly. A detailed description of each annotation category can be viewed :doc:`here `. - + Parameters ---------- coord: list or ndarray The x, y and z coordinates. kwargs Atom annotations as key value pair. - + Attributes ---------- {annot} : scalar @@ -463,19 +458,19 @@ class Atom(Copyable): shape : tuple of int Shape of the object. In case of an :class:`Atom`, the tuple is empty. - + Examples -------- - + >>> atom = Atom([1,2,3], chain_id="A") >>> atom.atom_name = "CA" >>> print(atom.atom_name) CA >>> print(atom.coord) [1. 2. 3.] - + """ - + def __init__(self, coord, **kwargs): self._annot = {} self._annot["chain_id"] = "" @@ -500,17 +495,17 @@ def __repr__(self): """Represent Atom as a string for debugging.""" # print out key-value pairs and format strings in quotation marks annot_parts = [ - f'{key}="{value}"' if isinstance(value, str) else f'{key}={value}' + f'{key}="{value}"' if isinstance(value, str) else f"{key}={value}" for key, value in self._annot.items() ] - annot = ', '.join(annot_parts) - return f'Atom(np.{np.array_repr(self.coord)}, {annot})' + annot = ", ".join(annot_parts) + return f"Atom(np.{np.array_repr(self.coord)}, {annot})" @property def shape(self): return () - + def __getattr__(self, attr): if attr in super().__getattribute__("_annot"): return self._annot[attr] @@ -518,7 +513,7 @@ def __getattr__(self, attr): raise AttributeError( f"'{type(self).__name__}' object has no attribute '{attr}'" ) - + def __setattr__(self, attr, value): if attr == "_annot": super().__setattr__(attr, value) @@ -526,16 +521,18 @@ def __setattr__(self, attr, value): super().__setattr__(attr, value) else: self._annot[attr] = value - + def __str__(self): hetero = "HET" if self.hetero else "" - return f"{hetero:3} {self.chain_id:3} " \ - f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} " \ - f"{self.atom_name:6} {self.element:2} " \ - f"{self.coord[0]:8.3f} " \ - f"{self.coord[1]:8.3f} " \ - 
f"{self.coord[2]:8.3f}" - + return ( + f"{hetero:3} {self.chain_id:3} " + f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} " + f"{self.atom_name:6} {self.element:2} " + f"{self.coord[0]:8.3f} " + f"{self.coord[1]:8.3f} " + f"{self.coord[2]:8.3f}" + ) + def __eq__(self, item): if not isinstance(item, Atom): return False @@ -547,18 +544,18 @@ def __eq__(self, item): if self._annot[name] != item._annot[name]: return False return True - + def __ne__(self, item): return not self == item - + def __copy_create__(self): return Atom(self.coord, **self._annot) - + class AtomArray(_AtomArrayBase): """ An array representation of a model consisting of multiple atoms. - + An :class:`AtomArray` can be seen as a list of :class:`Atom` instances. Instead of using directly a list, this class uses an *NumPy* @@ -573,14 +570,14 @@ class AtomArray(_AtomArrayBase): or :func:`set_annotation()`. A detailed description of each annotation category can be viewed :doc:`here `. - + In order to get an an subarray of an :class:`AtomArray`, *NumPy* style indexing is used. This includes slices, boolean arrays, index arrays and even *Ellipsis* notation. Using a single integer as index returns a single :class:`Atom` instance. - + Inserting or appending an :class:`AtomArray` to another :class:`AtomArray` is done with the '+' operator. Only the annotation categories, which are existing in both arrays, @@ -611,7 +608,7 @@ class AtomArray(_AtomArrayBase): ---------- length : int The fixed amount of atoms in the array. - + Attributes ---------- {annot} : ndarray @@ -629,44 +626,44 @@ class AtomArray(_AtomArrayBase): Shape of the atom array. The single value in the tuple is the length of the atom array. 
- + Examples -------- Creating an atom array from atoms: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") >>> atom_array = array([atom1, atom2, atom3]) >>> print(atom_array.array_length()) 3 - + Accessing an annotation array: - + >>> print(atom_array.chain_id) ['A' 'A' 'B'] - + Accessing the coordinates: - + >>> print(atom_array.coord) [[1. 2. 3.] [2. 3. 4.] [3. 4. 5.]] - + *NumPy* style filtering: - + >>> atom_array = atom_array[atom_array.chain_id == "A"] >>> print(atom_array.array_length()) 2 - + Inserting an atom array: - + >>> insert = array([Atom([7,8,9], chain_id="C")]) >>> atom_array = atom_array[0:1] + insert + atom_array[1:2] >>> print(atom_array.chain_id) ['A' 'C' 'A'] """ - + def __init__(self, length): super().__init__(length) if length is None: @@ -676,13 +673,13 @@ def __init__(self, length): def __repr__(self): """Represent AtomArray as a string for debugging.""" - atoms = '' + atoms = "" for i in range(0, self.array_length()): if len(atoms) == 0: - atoms = '\n\t' + self.get_atom(i).__repr__() + atoms = "\n\t" + self.get_atom(i).__repr__() else: - atoms = atoms + ',\n\t' + self.get_atom(i).__repr__() - return f'array([{atoms}\n])' + atoms = atoms + ",\n\t" + self.get_atom(i).__repr__() + return f"array([{atoms}\n])" @property def shape(self): @@ -703,33 +700,33 @@ def shape(self): -------- array_length """ - return self.array_length(), + return (self.array_length(),) def get_atom(self, index): """ Obtain the atom instance of the array at the specified index. - + The same as ``array[index]``, if `index` is an integer. - + Parameters ---------- index : int Index of the atom. - + Returns ------- atom : Atom - Atom at position `index`. + Atom at position `index`. 
""" kwargs = {} for name, annotation in self._annot.items(): kwargs[name] = annotation[index] - return Atom(coord = self._coord[index], kwargs=kwargs) - + return Atom(coord=self._coord[index], kwargs=kwargs) + def __iter__(self): """ Iterate through the array. - + Yields ------ atom : Atom @@ -738,16 +735,16 @@ def __iter__(self): while i < len(self): yield self.get_atom(i) i += 1 - + def __getitem__(self, index): """ Obtain a subarray or the atom instance at the specified index. - + Parameters ---------- index : object All index types *NumPy* accepts, are valid. - + Returns ------- sub_array : Atom or AtomArray @@ -763,16 +760,14 @@ def __getitem__(self, index): # If first index is "...", just ignore the first index return self.__getitem__(index[1]) else: - raise IndexError( - "'AtomArray' does not accept multidimensional indices" - ) + raise IndexError("'AtomArray' does not accept multidimensional indices") else: return self._subarray(index) - + def __setitem__(self, index, atom): """ Set the atom at the specified array position. - + Parameters ---------- index : int @@ -781,38 +776,38 @@ def __setitem__(self, index, atom): The atom to be set. """ self._set_element(index, atom) - + def __delitem__(self, index): """ Deletes the atom at the specified array position. - + Parameters ---------- index : int The position where the atom should be deleted. """ self._del_element(index) - + def __len__(self): """ The length of the array. - + Returns ------- length : int Length of the array. """ return self.array_length() - + def __eq__(self, item): """ Check if the array equals another :class:`AtomArray`. - + Parameters ---------- item : object Object to campare the array with. - + Returns ------- equal : bool @@ -824,15 +819,15 @@ def __eq__(self, item): if not isinstance(item, AtomArray): return False return True - + def __str__(self): """ Get a string representation of the array. - + Each line contains the attributes of one atom. 
""" return "\n".join([str(atom) for atom in self]) - + def __copy_create__(self): return AtomArray(self.array_length()) @@ -841,7 +836,7 @@ class AtomArrayStack(_AtomArrayBase): """ A collection of multiple :class:`AtomArray` instances, where each atom array has equal annotation arrays. - + Effectively, this means that each atom is occuring in every array in the stack at differing coordinates. This situation arises e.g. in NMR-elucidated or simulated structures. Since the annotations are @@ -849,7 +844,7 @@ class AtomArrayStack(_AtomArrayBase): coordinate array is 3-D (m x n x 3). A detailed description of each annotation category can be viewed :doc:`here `. - + Indexing works similar to :class:`AtomArray`, with the difference, that two index dimensions are possible: The first index dimension specifies the array(s), the second index @@ -857,24 +852,24 @@ class AtomArrayStack(_AtomArrayBase): in :class:`AtomArray`). Using a single integer as first dimension index returns a single :class:`AtomArray` instance. - + Concatenation of atoms for each array in the stack is done using the '+' operator. For addition of atom arrays onto the stack use the :func:`stack()` method. The :attr:`box` attribute has the shape *m x 3 x 3*, as the cell might be different for each frame in the atom array stack. - + Parameters ---------- depth : int The fixed amount of arrays in the stack. When indexing, this is the length of the first dimension. - + length : int The fixed amount of atoms in each array in the stack. When indexing, this is the length of the second dimension. - + Attributes ---------- {annot} : ndarray, shape=(n,) @@ -892,15 +887,15 @@ class AtomArrayStack(_AtomArrayBase): Shape of the stack. The numbers correspond to the stack depth and array length, respectively. 
- + See also -------- AtomArray - + Examples -------- Creating an atom array stack from two arrays: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -925,7 +920,7 @@ class AtomArrayStack(_AtomArrayBase): [5. 6. 7.] [6. 7. 8.]]] """ - + def __init__(self, depth, length): super().__init__(length) if depth == None or length == None: @@ -935,30 +930,30 @@ def __init__(self, depth, length): def __repr__(self): """Represent AtomArrayStack as a string for debugging.""" - arrays = '' + arrays = "" for i in range(0, self.stack_depth()): if len(arrays) == 0: - arrays = '\n\t' + self.get_array(i).__repr__() + arrays = "\n\t" + self.get_array(i).__repr__() else: - arrays = arrays + ',\n\t' + self.get_array(i).__repr__() - return f'stack([{arrays}\n])' + arrays = arrays + ",\n\t" + self.get_array(i).__repr__() + return f"stack([{arrays}\n])" def get_array(self, index): """ Obtain the atom array instance of the stack at the specified index. - + The same as ``stack[index]``, if `index` is an integer. - + Parameters ---------- index : int Index of the atom array. - + Returns ------- array : AtomArray - AtomArray at position `index`. + AtomArray at position `index`. """ array = AtomArray(self.array_length()) for name in self._annot: @@ -970,14 +965,14 @@ def get_array(self, index): array._box = self._box[index] return array - + def stack_depth(self): """ Get the depth of the stack. - + This value represents the amount of atom arrays in the stack. It is the same as ``len(array)``. - + Returns ------- length : int @@ -1005,7 +1000,7 @@ def shape(self): def __iter__(self): """ Iterate through the array. - + Yields ------ array : AtomArray @@ -1014,17 +1009,17 @@ def __iter__(self): while i < len(self): yield self.get_array(i) i += 1 - + def __getitem__(self, index): """ Obtain the atom array instance or an substack at the specified index. 
- + Parameters ---------- index : object All index types *NumPy* accepts are valid. - + Returns ------- sub_array : AtomArray or AtomArrayStack @@ -1033,7 +1028,7 @@ def __getitem__(self, index): Otherwise an :class:`AtomArrayStack` with reduced depth and length is returned. In case the index is a tuple(int, int) an :class:`Atom` - instance is returned. + instance is returned. """ if isinstance(index, numbers.Integral): return self.get_array(index) @@ -1050,7 +1045,7 @@ def __getitem__(self, index): if isinstance(index[1], numbers.Integral): # Prevent reduction in dimensionality # in second dimension - new_stack = self._subarray(slice(index[1], index[1]+1)) + new_stack = self._subarray(slice(index[1], index[1] + 1)) else: new_stack = self._subarray(index[1]) if index[0] is not Ellipsis: @@ -1065,14 +1060,13 @@ def __getitem__(self, index): if self._box is not None: new_stack._box = self._box[index] return new_stack - - + def __setitem__(self, index, array): """ Set the atom array at the specified stack position. - + The array and the stack must have equal annotation arrays. - + Parameters ---------- index : int @@ -1081,26 +1075,20 @@ def __setitem__(self, index, array): The atom array to be set. """ if not self.equal_annotations(array): - raise ValueError( - "The stack and the array have unequal annotations" - ) + raise ValueError("The stack and the array have unequal annotations") if self.bonds != array.bonds: - raise ValueError( - "The stack and the array have unequal bonds" - ) + raise ValueError("The stack and the array have unequal bonds") if isinstance(index, numbers.Integral): self.coord[index] = array.coord if self.box is not None: self.box[index] = array.box else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def __delitem__(self, index): """ Deletes the atom array at the specified stack position. 
- + Parameters ---------- index : int @@ -1109,14 +1097,12 @@ def __delitem__(self, index): if isinstance(index, numbers.Integral): self._coord = np.delete(self._coord, index, axis=0) else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def __len__(self): """ The depth of the stack, i.e. the amount of models. - + Returns ------- depth : int @@ -1124,16 +1110,16 @@ def __len__(self): """ # length is determined by length of coord attribute return self._coord.shape[0] - + def __eq__(self, item): """ Check if the array equals another :class:`AtomArray` - + Parameters ---------- item : object Object to campare the array with. - + Returns ------- equal : bool @@ -1145,20 +1131,20 @@ def __eq__(self, item): if not isinstance(item, AtomArrayStack): return False return True - + def __str__(self): """ Get a string representation of the stack. - + :class:`AtomArray` strings eparated by blank lines and a line indicating the index. """ string = "" for i, array in enumerate(self): - string += "Model " + str(i+1) + "\n" + string += "Model " + str(i + 1) + "\n" string += str(array) + "\n" + "\n" return string - + def __copy_create__(self): return AtomArrayStack(self.stack_depth(), self.array_length()) @@ -1166,23 +1152,23 @@ def __copy_create__(self): def array(atoms): """ Create an :class:`AtomArray` from a list of :class:`Atom`. - + Parameters ---------- atoms : iterable object of Atom The atoms to be combined in an array. All atoms must share the same annotation categories. - + Returns ------- array : AtomArray The listed atoms as array. 
- + Examples -------- - + Creating an atom array from atoms: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -1204,7 +1190,7 @@ def array(atoms): array = AtomArray(len(atoms)) # Add all (also optional) annotation categories for name in names: - array.add_annotation(name, dtype=type(atoms[0]._annot[name])) + array.add_annotation(name, dtype=type(atoms[0]._annot[name])) # Add all atoms to AtomArray for i in range(len(atoms)): for name in names: @@ -1216,23 +1202,23 @@ def array(atoms): def stack(arrays): """ Create an :class:`AtomArrayStack` from a list of :class:`AtomArray`. - + Parameters ---------- arrays : iterable object of AtomArray The atom arrays to be combined in a stack. All atom arrays must have an equal number of atoms and equal annotation arrays. - + Returns ------- stack : AtomArrayStack The stacked atom arrays. - + Examples -------- Creating an atom array stack from two arrays: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -1272,7 +1258,7 @@ def stack(arrays): array_stack = AtomArrayStack(array_count, ref_array.array_length()) for name, annotation in ref_array._annot.items(): array_stack._annot[name] = annotation - coord_list = [array._coord for array in arrays] + coord_list = [array._coord for array in arrays] array_stack._coord = np.stack(coord_list, axis=0) # Take bond list from first array array_stack._bonds = ref_array._bonds @@ -1296,14 +1282,14 @@ def repeat(atoms, coord): The length of first dimension determines the number of repeats. If `atoms` is an :class:`AtomArray` 3 dimensions, otherwise 4 dimensions are required. - + Returns ------- repeated: AtomArray, shape=(n*k,) or AtomArrayStack, shape=(m,n*k) The repeated atoms. Whether an :class:`AtomArray` or an :class:`AtomArrayStack` is returned depends on the input `atoms`. 
- + Examples -------- @@ -1336,7 +1322,7 @@ def repeat(atoms, coord): raise ValueError( f"Expected 4 dimensions for the coordinate array, got {coord.ndim}" ) - + repetitions = len(coord) orig_length = atoms.array_length() new_length = orig_length * repetitions @@ -1358,24 +1344,24 @@ def repeat(atoms, coord): ) repeated = AtomArrayStack(atoms.stack_depth(), new_length) repeated.coord = coord.reshape((atoms.stack_depth(), new_length, 3)) - + else: raise TypeError( f"Expected 'AtomArray' or 'AtomArrayStack', " f"but got {type(atoms).__name__}" ) - + for category in atoms.get_annotation_categories(): annot = np.tile(atoms.get_annotation(category), repetitions) repeated.set_annotation(category, annot) if atoms.bonds is not None: repeated_bonds = atoms.bonds.copy() - for _ in range(repetitions-1): + for _ in range(repetitions - 1): repeated_bonds += atoms.bonds repeated.bonds = repeated_bonds if atoms.box is not None: repeated.box = atoms.box.copy() - + return repeated @@ -1383,7 +1369,7 @@ def from_template(template, coord, box=None): """ Create an :class:`AtomArrayStack` using template atoms and given coordinates. - + Parameters ---------- template : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) @@ -1393,7 +1379,7 @@ def from_template(template, coord, box=None): The coordinates for each model of the returned stack. box : ndarray, optional, dtype=float, shape=(l,3,3) The box for each model of the returned stack. 
- + Returns ------- array_stack : AtomArrayStack @@ -1409,7 +1395,7 @@ def from_template(template, coord, box=None): # Create empty stack with no models new_stack = AtomArrayStack(0, template.array_length()) - + for category in template.get_annotation_categories(): annot = template.get_annotation(category) new_stack.set_annotation(category, annot) @@ -1417,30 +1403,30 @@ def from_template(template, coord, box=None): new_stack.bonds = template.bonds.copy() if box is not None: new_stack.box = box.copy() - + # After setting the coordinates the number of models is the number # of models in the new coordinates new_stack.coord = coord - + return new_stack def coord(item): """ Get the atom coordinates of the given array. - + This may be directly and :class:`Atom`, :class:`AtomArray` or :class:`AtomArrayStack` or alternatively an (n x 3) or (m x n x 3) :class:`ndarray` containing the coordinates. - + Parameters ---------- item : Atom or AtomArray or AtomArrayStack or ndarray Returns the :attr:`coord` attribute, if `item` is an :class:`Atom`, :class:`AtomArray` or :class:`AtomArrayStack`. Directly returns the input, if `item` is a :class:`ndarray`. 
- + Returns ------- coord : ndarray diff --git a/src/biotite/structure/basepairs.py b/src/biotite/structure/basepairs.py index 371477cd0..02b7a4a0e 100644 --- a/src/biotite/structure/basepairs.py +++ b/src/biotite/structure/basepairs.py @@ -8,23 +8,33 @@ __name__ = "biotite.structure" __author__ = "Tom David Müller" -__all__ = ["base_pairs", "map_nucleotide", "base_stacking", "base_pairs_edge", - "Edge", "base_pairs_glycosidic_bond", "GlycosidicBond"] +__all__ = [ + "base_pairs", + "map_nucleotide", + "base_stacking", + "base_pairs_edge", + "Edge", + "base_pairs_glycosidic_bond", + "GlycosidicBond", +] -import numpy as np import warnings from enum import IntEnum +import numpy as np from .atoms import Atom, array -from .superimpose import superimpose -from .filter import filter_nucleotides from .celllist import CellList +from .compare import rmsd +from .error import ( + BadStructureError, + IncompleteStructureWarning, + UnexpectedStructureWarning, +) +from .filter import filter_nucleotides from .hbond import hbond -from .error import IncompleteStructureWarning, UnexpectedStructureWarning, \ - BadStructureError -from .util import distance, norm_vector -from .residues import get_residue_starts_for, get_residue_masks from .info.standardize import standardize_order -from .compare import rmsd +from .residues import get_residue_masks, get_residue_starts_for +from .superimpose import superimpose +from .util import distance, norm_vector def _get_std_adenine(): @@ -43,31 +53,29 @@ def _get_std_adenine(): ring center, :class:`ndarray` containing the coordinates of the imidazole ring center """ - atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A") - atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A") - atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A") - atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A") - atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A") - atom6 = Atom([1.611, 0.909, 0.000], 
atom_name="N6", res_name="A") - atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A") - atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A") - atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A") - atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A") + atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A") + atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A") + atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A") + atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A") + atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A") + atom6 = Atom([1.611, 0.909, 0.000], atom_name="N6", res_name="A") + atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A") + atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A") + atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A") + atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A") adenine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, - atom9, atom10] + [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10] ) # Get the midpoint between the N1 and C4 atoms midpoint = np.mean([atom7.coord, atom10.coord], axis=-2) # Calculate the coordinates of the aromatic ring centers pyrimidine_center = np.mean( - [atom4.coord, atom5.coord, atom7.coord, - atom8.coord, atom9.coord, atom10.coord], axis=-2 + [atom4.coord, atom5.coord, atom7.coord, atom8.coord, atom9.coord, atom10.coord], + axis=-2, ) imidazole_center = np.mean( - [atom1.coord, atom2.coord, atom3.coord, - atom4.coord, atom10.coord], axis=-2 + [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom10.coord], axis=-2 ) return adenine, (midpoint, pyrimidine_center, imidazole_center) @@ -75,37 +83,35 @@ def _get_std_adenine(): def _get_std_cytosine(): """ - Get standard base variables for cytosine. + Get standard base variables for cytosine. 
- Returns - ------- - standard_base : AtomArray - Standard coordinates nomenclature of the cytosine base as - :class:`AtomArray` with nomenclature of PDB File Format V3 - coordinates : tuple (ndarray, ndarray, dtype=float) - :class:`ndarray` containing the center according to the SCHNaP- - paper referenced in the function ``base_pairs``, - :class:`ndarray` containing the coordinates of the pyrimidine - ring center + Returns + ------- + standard_base : AtomArray + Standard coordinates nomenclature of the cytosine base as + :class:`AtomArray` with nomenclature of PDB File Format V3 + coordinates : tuple (ndarray, ndarray, dtype=float) + :class:`ndarray` containing the center according to the SCHNaP- + paper referenced in the function ``base_pairs``, + :class:`ndarray` containing the coordinates of the pyrimidine + ring center """ - atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C") - atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C") - atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C") - atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C") - atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C") - atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C") - atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C") - atom8 = Atom([-0.023, 5.068, 0.000], atom_name="C6", res_name="C") - cytosine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8] - ) + atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C") + atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C") + atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C") + atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C") + atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C") + atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C") + atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C") + atom8 = Atom([-0.023, 5.068, 0.000], 
atom_name="C6", res_name="C") + cytosine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom8.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom8.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord], + axis=-2, ) return cytosine, (midpoint, pyrimidine_center) @@ -127,32 +133,37 @@ def _get_std_guanine(): ring center, :class:`ndarray` containing the coordinates of the imidazole ring center """ - atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G") - atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G") - atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G") - atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G") - atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G") - atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G") - atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G") - atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G") - atom9 = Atom([-2.949, 0.139, -0.001], atom_name="N2", res_name="G") - atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G") - atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G") + atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G") + atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G") + atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G") + atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G") + atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G") + atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G") + atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G") + atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G") + atom9 = Atom([-2.949, 0.139, 
-0.001], atom_name="N2", res_name="G") + atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G") + atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G") guanine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, - atom9, atom10, atom11] + [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10, atom11] ) # Get the midpoint between the N1 and C4 atoms midpoint = np.mean([atom7.coord, atom11.coord], axis=-2) # Calculate the coordinates of the aromatic ring centers pyrimidine_center = np.mean( - [atom4.coord, atom5.coord, atom7.coord, - atom8.coord, atom10.coord, atom11.coord], axis=-2 + [ + atom4.coord, + atom5.coord, + atom7.coord, + atom8.coord, + atom10.coord, + atom11.coord, + ], + axis=-2, ) imidazole_center = np.mean( - [atom1.coord, atom2.coord, atom3.coord, - atom4.coord, atom11.coord], axis=-2 + [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom11.coord], axis=-2 ) return guanine, (midpoint, pyrimidine_center, imidazole_center) @@ -173,25 +184,23 @@ def _get_std_thymine(): :class:`ndarray` containing the coordinates of the pyrimidine ring center """ - atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T") - atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T") - atom3 = Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T") - atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T") - atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T") - atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T") - atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T") - atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T") - atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T") - thymine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9] - ) + atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T") + atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T") + atom3 = 
Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T") + atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T") + atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T") + atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T") + atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T") + atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T") + atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T") + thymine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom9.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom9.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom9.coord], + axis=-2, ) return thymine, (midpoint, pyrimidine_center) @@ -212,30 +221,28 @@ def _get_std_uracil(): :class:`ndarray` containing the coordinates of the pyrimidine ring center """ - atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U") - atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U") - atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U") - atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", res_name="U") - atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U") - atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U") - atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U") - atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U") - uracil = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8] - ) + atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U") + atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U") + atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U") + atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", 
res_name="U") + atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U") + atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U") + atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U") + atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U") + uracil = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom8.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom8.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord], + axis=-2, ) return uracil, (midpoint, pyrimidine_center) -_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine() +_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine() _STD_CYTOSINE, _STD_CYTOSINE_RING_CENTERS = _get_std_cytosine() _STD_GUANINE, _STD_GUANINE_RING_CENTERS = _get_std_guanine() _STD_THYMINE, _STD_THYMINE_RING_CENTERS = _get_std_thymine() @@ -247,35 +254,35 @@ def _get_std_uracil(): _GUANINE_CONTAINING_NUCLEOTIDES = ["G", "DG"] _URACIL_CONTAINING_NUCLEOTIDES = ["U", "DU"] _REFERENCE_NUCLEOTIDE_NAMES = ( - _ADENINE_CONTAINING_NUCLEOTIDES + - _THYMINE_CONTAINING_NUCLEOTIDES + - _CYTOSINE_CONTAINING_NUCLEOTIDES + - _GUANINE_CONTAINING_NUCLEOTIDES + - _URACIL_CONTAINING_NUCLEOTIDES + _ADENINE_CONTAINING_NUCLEOTIDES + + _THYMINE_CONTAINING_NUCLEOTIDES + + _CYTOSINE_CONTAINING_NUCLEOTIDES + + _GUANINE_CONTAINING_NUCLEOTIDES + + _URACIL_CONTAINING_NUCLEOTIDES ) # Atoms that are part of respective base edges according to the # Leontis-Westhof nomenclature _WATSON_CRICK_EDGE = { - "A" : ["N6", "N1"], - "G" : ["O6", "N1", "N2"], - "U" : ["O4", "N3", "O2"], - "T" : ["O4", "N3", "O2"], - "C" : ["N4", "N3", "O2"] + "A": ["N6", "N1"], + "G": ["O6", "N1", "N2"], + "U": ["O4", "N3", "O2"], + "T": ["O4", "N3", "O2"], + "C": ["N4", "N3", "O2"], 
} _HOOGSTEEN_EDGE = { - "A" : ["N6", "N7"], - "G" : ["O6", "N7"], - "U" : ["O4"], - "T" : ["O4"], - "C" : ["N4"] + "A": ["N6", "N7"], + "G": ["O6", "N7"], + "U": ["O4"], + "T": ["O4"], + "C": ["N4"], } _SUGAR_EDGE = { - "A" : ["N3", "O2'"], - "G" : ["N2", "N3", "O2'"], - "U" : ["O2", "O2'"], - "T" : ["O2", "O2'"], - "C" : ["O2", "O2'"] + "A": ["N3", "O2'"], + "G": ["N2", "N3", "O2'"], + "U": ["O2", "O2'"], + "T": ["O2", "O2'"], + "C": ["O2", "O2'"], } _EDGES = [_WATSON_CRICK_EDGE, _HOOGSTEEN_EDGE, _SUGAR_EDGE] @@ -284,9 +291,10 @@ class Edge(IntEnum): """ This enum type represents the interacting edge for a given base. """ - INVALID = 0, - WATSON_CRICK = 1, - HOOGSTEEN = 2, + + INVALID = (0,) + WATSON_CRICK = (1,) + HOOGSTEEN = (2,) SUGAR = 3 @@ -295,9 +303,10 @@ class GlycosidicBond(IntEnum): This enum type represents the relative glycosidic bond orientation for a given base pair. """ + INVALID = 0 - CIS = 1, - TRANS = 2, + CIS = (1,) + TRANS = (2,) def base_pairs_edge(atom_array, base_pairs): @@ -390,7 +399,7 @@ def base_pairs_edge(atom_array, base_pairs): .. 
footbibliography:: """ # Result-``ndarray`` matches the dimensions of the input array - results = np.zeros_like(base_pairs, dtype='uint8') + results = np.zeros_like(base_pairs, dtype="uint8") # Get the residue masks for each residue base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten()) @@ -441,16 +450,15 @@ def _get_edge_matrix(atom_array, base_masks): ) # filter out donor/acceptor heteroatoms and flatten for easy # iteration - hbonds = hbonds[:, (0,2)].flatten() + hbonds = hbonds[:, (0, 2)].flatten() # ``ndarray`` with one row for each base and the number of # bonded edge heteroatoms as in ``_edge`` as columns - matrix = np.zeros((2, 3), dtype='int32') + matrix = np.zeros((2, 3), dtype="int32") # Iterate through the atoms and corresponding atoms indices # that are part of the hydrogen bonds for atom, atom_index in zip(atom_array[hbonds], hbonds): - if atom.res_name not in _REFERENCE_NUCLEOTIDE_NAMES: continue @@ -460,8 +468,10 @@ def _get_edge_matrix(atom_array, base_masks): for base_index, base_mask in enumerate(base_masks): # If a donor/acceptor atom name matches a name in # the corresponding edge list increase the tally - if (base_mask[atom_index] and - atom.atom_name in edge_type[atom.res_name[-1]]): + if ( + base_mask[atom_index] + and atom.atom_name in edge_type[atom.res_name[-1]] + ): matrix[base_index, edge_type_index] += 1 return matrix @@ -540,7 +550,7 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): .. 
footbibliography:: """ - results = np.zeros(len(base_pairs), dtype='uint8') + results = np.zeros(len(base_pairs), dtype="uint8") # Get the residue masks for each residue base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten()) @@ -552,7 +562,6 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): ) for i, pair_masks in enumerate(base_pairs_masks): - # position vectors of each bases geometric center geometric_centers = np.zeros((2, 3)) # direction vectors of the glycosidic bonds @@ -565,23 +574,22 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): # For Purines the glycosidic bond is between the C1' and the # N9 atoms, for pyrimidines it is between the C1' atom and # the N1 atom - if (base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES): - - geometric_centers[base_index] = ( - (ring_center[0] + ring_center[1]) / 2 - ) + if ( + base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES + ): + geometric_centers[base_index] = (ring_center[0] + ring_center[1]) / 2 base_atom = base[base.atom_name == "N9"][0] - elif (base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES): - + elif ( + base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES + ): geometric_centers[base_index] = ring_center[0] base_atom = base[base.atom_name == "N1"][0] else: - results[i] = GlycosidicBond.INVALID break @@ -596,15 +604,16 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): geometric_centers_dir = geometric_centers[1] - geometric_centers[0] # Check the orientation of the glycosidic bonds - if np.dot( - np.cross(geometric_centers_dir, glycosidic_bonds[0]), - np.cross(geometric_centers_dir, glycosidic_bonds[1]) - ) < 0: - + if ( + np.dot( 
+ np.cross(geometric_centers_dir, glycosidic_bonds[0]), + np.cross(geometric_centers_dir, glycosidic_bonds[1]), + ) + < 0 + ): results[i] = GlycosidicBond.TRANS else: - results[i] = GlycosidicBond.CIS return results @@ -723,15 +732,18 @@ def base_stacking(atom_array, min_atoms_per_base=3): for i in range(2): base_tuple = _match_base(bases[i], min_atoms_per_base) - if(base_tuple is None): + if base_tuple is None: break transformed_std_vectors[i] = base_tuple - normal_vectors = np.vstack((transformed_std_vectors[0][1], - transformed_std_vectors[1][1])) - aromatic_ring_centers = [transformed_std_vectors[0][3:], - transformed_std_vectors[1][3:]] + normal_vectors = np.vstack( + (transformed_std_vectors[0][1], transformed_std_vectors[1][1]) + ) + aromatic_ring_centers = [ + transformed_std_vectors[0][3:], + transformed_std_vectors[1][3:], + ] # Check if the base pairs are stacked. stacked = _check_base_stacking(aromatic_ring_centers, normal_vectors) @@ -744,7 +756,7 @@ def base_stacking(atom_array, min_atoms_per_base=3): return np.array(stacked_bases) -def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): +def base_pairs(atom_array, min_atoms_per_base=3, unique=True): """ Use DSSR criteria to find the base pairs in an :class:`AtomArray`. 
@@ -854,11 +866,8 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): nucleotides_boolean = filter_nucleotides(atom_array) # Disregard the phosphate-backbone - non_phosphate_boolean = ( - ~ np.isin( - atom_array.atom_name, - ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"] - ) + non_phosphate_boolean = ~np.isin( + atom_array.atom_name, ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"] ) # Combine the two boolean masks @@ -867,7 +876,6 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): # Get only nucleosides nucleosides = atom_array[boolean_mask] - # Get the base pair candidates according to a N/O cutoff distance, # where each base is identified as the first index of its respective # residue @@ -896,9 +904,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): base1 = nucleosides[base1_mask] base2 = nucleosides[base2_mask] - hbonds = _check_dssr_criteria( - (base1, base2), min_atoms_per_base, unique - ) + hbonds = _check_dssr_criteria((base1, base2), min_atoms_per_base, unique) # If no hydrogens are present use the number N/O pairs to # decide between multiple pairing possibilities. @@ -906,7 +912,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): if hbonds is None: # Each N/O-pair is detected twice. Thus, the number of # matches must be divided by two. 
- hbonds = n_o_pairs/2 + hbonds = n_o_pairs / 2 if hbonds != -1: basepairs.append((base1_index, base2_index)) if unique: @@ -922,20 +928,16 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): # Get all bases that have non-unique pairing interactions base_indices, occurrences = np.unique(basepairs, return_counts=True) for base_index, occurrence in zip(base_indices, occurrences): - if(occurrence > 1): + if occurrence > 1: # Write the non-unique base pairs to a dictionary as # 'index: number of hydrogen bonds' remove_candidates = {} - for i, row in enumerate( - np.asarray(basepair_array == base_index) - ): - if(np.any(row)): + for i, row in enumerate(np.asarray(basepair_array == base_index)): + if np.any(row): remove_candidates[i] = basepairs_hbonds[i] # Flag all non-unique base pairs for removal except the # one that has the most hydrogen bonds - del remove_candidates[ - max(remove_candidates, key=remove_candidates.get) - ] + del remove_candidates[max(remove_candidates, key=remove_candidates.get)] to_remove += list(remove_candidates.keys()) # Remove all flagged base pairs from the output `ndarray` basepair_array = np.delete(basepair_array, to_remove, axis=0) @@ -984,21 +986,22 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Generate the data necessary for analysis of each base. 
for i in range(2): - transformed_std_vectors[i] = _match_base( - basepair[i], min_atoms_per_base - ) + transformed_std_vectors[i] = _match_base(basepair[i], min_atoms_per_base) - if(transformed_std_vectors[i] is None): + if transformed_std_vectors[i] is None: return -1 - origins = np.vstack((transformed_std_vectors[0][0], - transformed_std_vectors[1][0])) - normal_vectors = np.vstack((transformed_std_vectors[0][1], - transformed_std_vectors[1][1])) - schnaap_origins = np.vstack((transformed_std_vectors[0][2], - transformed_std_vectors[1][2])) - aromatic_ring_centers = [transformed_std_vectors[0][3:], - transformed_std_vectors[1][3:]] + origins = np.vstack((transformed_std_vectors[0][0], transformed_std_vectors[1][0])) + normal_vectors = np.vstack( + (transformed_std_vectors[0][1], transformed_std_vectors[1][1]) + ) + schnaap_origins = np.vstack( + (transformed_std_vectors[0][2], transformed_std_vectors[1][2]) + ) + aromatic_ring_centers = [ + transformed_std_vectors[0][3:], + transformed_std_vectors[1][3:], + ] # Criterion 1: Distance between orgins <=15 Å if not (distance(origins[0], origins[1]) <= 15): @@ -1009,9 +1012,8 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Average the base normal vectors. 
If the angle between the vectors # is >=90°, flip one vector before averaging mean_normal_vector = ( - normal_vectors[0] + (normal_vectors[1] * np.sign(np.dot( - normal_vectors[0], normal_vectors[1] - ))) + normal_vectors[0] + + (normal_vectors[1] * np.sign(np.dot(normal_vectors[0], normal_vectors[1]))) ) / 2 norm_vector(mean_normal_vector) # Calculate the distance vector between the two SCHNAaP origins @@ -1024,8 +1026,9 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): return -1 # Criterion 3: Angle between normal vectors <=65° - if not (np.arccos(np.dot(normal_vectors[0], normal_vectors[1])) - >= ((115*np.pi)/180)): + if not ( + np.arccos(np.dot(normal_vectors[0], normal_vectors[1])) >= ((115 * np.pi) / 180) + ): return -1 # Criterion 4: Absence of stacking @@ -1035,8 +1038,7 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Criterion 5: Presence of at least one hydrogen bond # # Check if both bases came with hydrogens. - if (("H" in basepair[0].element) - and ("H" in basepair[1].element)): + if ("H" in basepair[0].element) and ("H" in basepair[1].element): # For Structures that contain hydrogens, check for their # presence directly. 
# @@ -1044,11 +1046,13 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): potential_basepair = basepair[0] + basepair[1] # Get the number of hydrogen bonds - bonds = len(hbond( - potential_basepair, - np.ones_like(potential_basepair, dtype=bool), - np.ones_like(potential_basepair, dtype=bool) - )) + bonds = len( + hbond( + potential_basepair, + np.ones_like(potential_basepair, dtype=bool), + np.ones_like(potential_basepair, dtype=bool), + ) + ) if bonds > 0: return bonds @@ -1085,7 +1089,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors): wrong_distance = True for ring_center1 in aromatic_ring_centers[0]: for ring_center2 in aromatic_ring_centers[1]: - if (distance(ring_center1, ring_center2) <= 4.5): + if distance(ring_center1, ring_center2) <= 4.5: wrong_distance = False normalized_distance_vectors.append(ring_center2 - ring_center1) norm_vector(normalized_distance_vectors[-1]) @@ -1106,8 +1110,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors): dist_normal_vector_angle = np.rad2deg( np.arccos(np.dot(normal_vector, normalized_dist_vector)) ) - if ((dist_normal_vector_angle >= 40) and - (dist_normal_vector_angle <= 140)): + if (dist_normal_vector_angle >= 40) and (dist_normal_vector_angle <= 140): return False return True @@ -1142,19 +1145,19 @@ def _match_base(nucleotide, min_atoms_per_base): if one_letter_code is None: return None - if (one_letter_code == 'A'): + if one_letter_code == "A": std_base = _STD_ADENINE std_ring_centers = _STD_ADENINE_RING_CENTERS - elif (one_letter_code == 'T'): + elif one_letter_code == "T": std_base = _STD_THYMINE std_ring_centers = _STD_THYMINE_RING_CENTERS - elif (one_letter_code == 'C'): + elif one_letter_code == "C": std_base = _STD_CYTOSINE std_ring_centers = _STD_CYTOSINE_RING_CENTERS - elif (one_letter_code == 'G'): + elif one_letter_code == "G": std_base = _STD_GUANINE std_ring_centers = _STD_GUANINE_RING_CENTERS - elif (one_letter_code == 'U'): + elif one_letter_code == 
"U": std_base = _STD_URACIL std_ring_centers = _STD_URACIL_RING_CENTERS @@ -1162,16 +1165,10 @@ def _match_base(nucleotide, min_atoms_per_base): vectors = np.vstack((vectors, std_ring_centers)) # Select the matching atoms of the nucleotide and the standard base - nucleotide_matched = nucleotide[ - np.isin(nucleotide.atom_name, std_base.atom_name) - ] - std_base_matched = std_base[ - np.isin(std_base.atom_name, nucleotide.atom_name) - ] + nucleotide_matched = nucleotide[np.isin(nucleotide.atom_name, std_base.atom_name)] + std_base_matched = std_base[np.isin(std_base.atom_name, nucleotide.atom_name)] # Ensure the nucleotide does not contain duplicate atom names - _, unique_indices = np.unique( - nucleotide_matched.atom_name, return_index=True - ) + _, unique_indices = np.unique(nucleotide_matched.atom_name, return_index=True) nucleotide_matched = nucleotide_matched[unique_indices] # Only continue if minimum number of matching atoms is reached if len(nucleotide_matched) < min_atoms_per_base: @@ -1179,21 +1176,19 @@ def _match_base(nucleotide, min_atoms_per_base): f"Nucleotide with res_id {nucleotide.res_id[0]} and " f"chain_id {nucleotide.chain_id[0]} has less than 3 base " f"atoms, unable to check for base pair.", - IncompleteStructureWarning + IncompleteStructureWarning, ) return None # Reorder the atoms of the nucleotide to obtain the standard RCSB # PDB atom order. - nucleotide_matched = nucleotide_matched[ - standardize_order(nucleotide_matched) - ] + nucleotide_matched = nucleotide_matched[standardize_order(nucleotide_matched)] # Match the selected std_base to the base. 
_, transformation = superimpose(nucleotide_matched, std_base_matched) vectors = transformation.apply(vectors) # Normalize the base-normal-vector - vectors[1,:] = vectors[1,:]-vectors[0,:] - norm_vector(vectors[1,:]) + vectors[1, :] = vectors[1, :] - vectors[0, :] + norm_vector(vectors[1, :]) return vectors @@ -1259,8 +1254,11 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # List of the standard bases for easy iteration std_base_list = [ - _STD_ADENINE, _STD_THYMINE, _STD_CYTOSINE, _STD_GUANINE, - _STD_URACIL + _STD_ADENINE, + _STD_THYMINE, + _STD_CYTOSINE, + _STD_GUANINE, + _STD_URACIL, ] # The number of matched atoms for each 'standard' base @@ -1275,7 +1273,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): f"{residue.chain_id[0]} has an overlap with the reference " f"bases which is less than {min_atoms_per_base} atoms. " f"Unable to map nucleotide.", - IncompleteStructureWarning + IncompleteStructureWarning, ) return None, False @@ -1284,7 +1282,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # Iterate through the reference bases with the maximum number of # matching atoms - for ref_base in np.array(std_base_list, dtype='object')[ + for ref_base in np.array(std_base_list, dtype="object")[ np.array(matched_atom_no) == np.max(matched_atom_no) ]: # Copy the residue as the res_name property of the ``AtomArray`` @@ -1293,12 +1291,8 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # Select the matching atoms of the nucleotide and the reference # base - nuc = nuc[ - np.isin(nuc.atom_name, ref_base.atom_name) - ] - ref_base_matched = ref_base[ - np.isin(ref_base.atom_name, nuc.atom_name) - ] + nuc = nuc[np.isin(nuc.atom_name, ref_base.atom_name)] + ref_base_matched = ref_base[np.isin(ref_base.atom_name, nuc.atom_name)] # Set the res_name property to the same as the reference base. 
# This is a requirement for ``standardize_order`` @@ -1319,14 +1313,14 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # If the RMSD is lower than the specified cutoff or better than # a previous found reference, the current reference is selected # as best base - if(rmsd(fitted, ref_base_matched) < rmsd_cutoff): + if rmsd(fitted, ref_base_matched) < rmsd_cutoff: rmsd_cutoff = rmsd(fitted, ref_base_matched) best_base = ref_base_matched.res_name[0][-1] if best_base is None: warnings.warn( f"Base Type {residue.res_name[0]} not supported. ", - UnexpectedStructureWarning + UnexpectedStructureWarning, ) return None @@ -1360,9 +1354,9 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff): # Get the indices of the atoms that are within the maximum cutoff # of each other - indices = CellList( - atom_array, cutoff, selection=boolean_mask - ).get_atoms(atom_array.coord[boolean_mask], cutoff) + indices = CellList(atom_array, cutoff, selection=boolean_mask).get_atoms( + atom_array.coord[boolean_mask], cutoff + ) # Loop through the indices of potential partners pairs = [] @@ -1375,16 +1369,12 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff): # indices. pairs = np.array(pairs) basepair_candidates_shape = pairs.shape - pairs = get_residue_starts_for( - atom_array, pairs.flatten() - ).reshape(basepair_candidates_shape) + pairs = get_residue_starts_for(atom_array, pairs.flatten()).reshape( + basepair_candidates_shape + ) # Remove candidates where the pairs are from the same residue - pairs = np.delete( - pairs, np.where( - pairs[:,0] == pairs[:,1] - ), axis=0 - ) + pairs = np.delete(pairs, np.where(pairs[:, 0] == pairs[:, 1]), axis=0) # Sort the residue starts for each pair for i, candidate in enumerate(pairs): pairs[i] = sorted(candidate) @@ -1411,5 +1401,4 @@ def _filter_atom_type(atom_array, atom_names): This array is ``True`` for all indices in the :class:`AtomArray` , where the atom has the desired atom names. 
""" - return (np.isin(atom_array.atom_name, atom_names) - & (atom_array.res_id != -1)) + return np.isin(atom_array.atom_name, atom_names) & (atom_array.res_id != -1) diff --git a/src/biotite/structure/box.py b/src/biotite/structure/box.py index ae4918add..a04400b84 100644 --- a/src/biotite/structure/box.py +++ b/src/biotite/structure/box.py @@ -4,25 +4,33 @@ """ Functions related to working with the simulation box or unit cell -of a structure +of a structure """ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["vectors_from_unitcell", "unitcell_from_vectors", "box_volume", - "repeat_box", "repeat_box_coord", "move_inside_box", - "remove_pbc", "remove_pbc_from_coord", - "coord_to_fraction", "fraction_to_coord", "is_orthogonal"] +__all__ = [ + "vectors_from_unitcell", + "unitcell_from_vectors", + "box_volume", + "repeat_box", + "repeat_box_coord", + "move_inside_box", + "remove_pbc", + "remove_pbc_from_coord", + "coord_to_fraction", + "fraction_to_coord", + "is_orthogonal", +] -from collections.abc import Iterable from numbers import Integral import numpy as np import numpy.linalg as linalg -from .util import vector_dot from .atoms import repeat -from .molecules import get_molecule_masks from .chains import get_chain_masks, get_chain_starts from .error import BadStructureError +from .molecules import get_molecule_masks +from .util import vector_dot def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): @@ -41,7 +49,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): The angles between the box vectors in radians. *alpha* is the angle between *b* and *c*, *beta* between *a* and *c*, *gamma* between *a* and *b* - + Returns ------- box : ndarray, dtype=float, shape=(3,3) @@ -49,7 +57,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): The vector components are in the last dimension. The value can be directly used as :attr:`box` attribute in an atom array. 
- + See also -------- unitcell_from_vectors @@ -58,19 +66,15 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): b_x = len_b * np.cos(gamma) b_y = len_b * np.sin(gamma) c_x = len_c * np.cos(beta) - c_y = len_c * (np.cos(alpha) - np.cos(beta)*np.cos(gamma)) / np.sin(gamma) - c_z = np.sqrt(len_c*len_c - c_x*c_x - c_y*c_y) - box = np.array([ - [a_x, 0, 0], - [b_x, b_y, 0], - [c_x, c_y, c_z] - ], dtype=np.float32) - + c_y = len_c * (np.cos(alpha) - np.cos(beta) * np.cos(gamma)) / np.sin(gamma) + c_z = np.sqrt(len_c * len_c - c_x * c_x - c_y * c_y) + box = np.array([[a_x, 0, 0], [b_x, b_y, 0], [c_x, c_y, c_z]], dtype=np.float32) + # Fix numerical errors, as values, that are actually 0, # might not be calculated as such tol = 1e-4 * (len_a + len_b + len_c) box[np.abs(box) < tol] = 0 - + return box @@ -84,7 +88,7 @@ def unitcell_from_vectors(box): ---------- box : ndarray, shape=(3,3) The box vectors - + Returns ------- len_a, len_b, len_c : float @@ -103,7 +107,7 @@ def unitcell_from_vectors(box): len_b = linalg.norm(b) len_c = linalg.norm(c) alpha = np.arccos(np.dot(b, c) / (len_b * len_c)) - beta = np.arccos(np.dot(a, c) / (len_a * len_c)) + beta = np.arccos(np.dot(a, c) / (len_a * len_c)) gamma = np.arccos(np.dot(a, b) / (len_a * len_b)) return len_a, len_b, len_c, alpha, beta, gamma @@ -116,7 +120,7 @@ def box_volume(box): ---------- box : ndarray, shape=(3,3) or shape=(m,3,3) One or multiple boxes to get the volume for. - + Returns ------- volume : float or ndarray, shape=(m,) @@ -159,7 +163,7 @@ def repeat_box(atoms, amount=1): Indices to the atoms in the original atom array (stack). Equal to ``numpy.tile(np.arange(atoms.array_length()), (1 + 2 * amount) ** 3)``. 
- + See also -------- repeat_box_coord @@ -232,12 +236,12 @@ def repeat_box(atoms, amount=1): """ if atoms.box is None: raise BadStructureError("Structure has no box") - + repeat_coord, indices = repeat_box_coord(atoms.coord, atoms.box) # Unroll repeated coordinates for input to 'repeat()' if repeat_coord.ndim == 2: repeat_coord = repeat_coord.reshape(-1, atoms.array_length(), 3) - else: # ndim == 3 + else: # ndim == 3 repeat_coord = repeat_coord.reshape( atoms.stack_depth(), -1, atoms.array_length(), 3 ) @@ -283,16 +287,15 @@ def repeat_box_coord(coord, box, amount=1): raise TypeError("The amount must be an integer") # List of numpy arrays for each box repeat coords_for_boxes = [coord] - for i in range(-amount, amount+1): - for j in range(-amount, amount+1): - for k in range(-amount, amount+1): + for i in range(-amount, amount + 1): + for j in range(-amount, amount + 1): + for k in range(-amount, amount + 1): # Omit the central box if i != 0 or j != 0 or k != 0: temp_coord = coord.copy() # Shift coordinates to adjacent box/unit cell translation_vec = np.sum( - box * np.array([i,j,k])[:, np.newaxis], - axis=-2 + box * np.array([i, j, k])[:, np.newaxis], axis=-2 ) # 'newaxis' to perform same translation on all # atoms for each model @@ -300,7 +303,7 @@ def repeat_box_coord(coord, box, amount=1): coords_for_boxes.append(temp_coord) return ( np.concatenate(coords_for_boxes, axis=-2), - np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3) + np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3), ) @@ -323,16 +326,16 @@ def move_inside_box(coord, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- moved_coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The moved coordinates. Has the same shape is the input `coord`. 
- + Examples -------- - + >>> box = np.array([[10,0,0], [0,10,0], [0,0,10]], dtype=float) >>> inside_coord = [ 1, 2, 3] >>> outside_coord = [ 1, 22, 54] @@ -363,7 +366,7 @@ def remove_pbc(atoms, selection=None): To determine the molecules the structure is required to have an associated `BondList`. Otherwise segmentation removal is performed on a per-chain basis. - + Parameters ---------- atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) @@ -373,13 +376,13 @@ def remove_pbc(atoms, selection=None): selection : ndarray, dtype=bool, shape=(n,) Specifies which parts of `atoms` are sanitized, i.e the segmentation is removed. - + Returns ------- sanitized_atoms : AtomArray or AtomArrayStack The input structure with removed segmentation over periodic boundaries. - + See also -------- remove_pbc_from_coord @@ -393,11 +396,9 @@ def remove_pbc(atoms, selection=None): """ # Avoid circular import from .geometry import centroid - + if atoms.box is None: - raise BadStructureError( - "The 'box' attribute must be set in the structure" - ) + raise BadStructureError("The 'box' attribute must be set in the structure") new_atoms = atoms.copy() if atoms.bonds is not None: @@ -414,10 +415,8 @@ def remove_pbc(atoms, selection=None): ) # Put center of molecule into box center = centroid(new_atoms.coord[..., mask, :])[..., np.newaxis, :] - center_in_box = move_inside_box( - center, new_atoms.box - ) - new_atoms.coord[..., mask, :] += (center_in_box - center) + center_in_box = move_inside_box(center, new_atoms.box) + new_atoms.coord[..., mask, :] += center_in_box - center return new_atoms @@ -433,11 +432,11 @@ def remove_pbc_from_coord(coord, box): the displacement coordinates in adjacent array positions. Basically, this function performs the reverse action of :func:`move_inside_box()`. - + Parameters ---------- coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3) - The coordinates of the potentially segmented structure. 
+ The coordinates of the potentially segmented structure. box : ndarray, dtype=float, shape=(m,3,3) or shape=(3,3) The simulation box or unit cell that is used as periodic boundary. @@ -447,7 +446,7 @@ def remove_pbc_from_coord(coord, box): ------- sanitized_coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3) The reassembled coordinates. - + See also -------- remove_pbc_from_coord @@ -464,18 +463,13 @@ def remove_pbc_from_coord(coord, box): # Import in function to avoid circular import from .geometry import index_displacement + # Get the PBC-sanitized displacements of all coordinates # to the respective next coordinate index_pairs = np.stack( - [ - np.arange(0, coord.shape[-2] - 1), - np.arange(1, coord.shape[-2] ) - ], - axis=1 - ) - neighbour_disp = index_displacement( - coord, index_pairs, box=box, periodic=True + [np.arange(0, coord.shape[-2] - 1), np.arange(1, coord.shape[-2])], axis=1 ) + neighbour_disp = index_displacement(coord, index_pairs, box=box, periodic=True) # Get the PBC-sanitized displacements of all but the first # coordinates to (0,0,0) absolute_disp = np.cumsum(neighbour_disp, axis=-2) @@ -501,19 +495,19 @@ def coord_to_fraction(coord, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- fraction : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The fractions of the box vectors. - + See also -------- fraction_to_coord Examples -------- - + >>> box = np.array([[5,0,0], [0,5,0], [0,5,5]], dtype=float) >>> coord = np.array( ... [[1,1,1], [10,0,0], [0,0,10], [-5,2,1]], @@ -548,12 +542,12 @@ def fraction_to_coord(fraction, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The coordinates. 
- + See also -------- coord_to_fraction @@ -572,12 +566,12 @@ def is_orthogonal(box): ---------- box : ndarray, dtype=float, shape=(3,3) or shape=(m,3,3) A single box or multiple boxes. - + Returns ------- is_orthgonal : bool or ndarray, shape=(m,), dtype=bool True, if the box vectors are orthogonal, false otherwise - + Notes ----- Due to possible numerical errors, this function also evaluates two @@ -587,6 +581,8 @@ def is_orthogonal(box): # Fix numerical errors, as values, that are actually 0, # might not be calculated as such tol = 1e-6 - return (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol) & \ - (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol) & \ - (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol) \ No newline at end of file + return ( + (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol) + & (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol) + & (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol) + ) diff --git a/src/biotite/structure/chains.py b/src/biotite/structure/chains.py index df3134267..2778855c2 100644 --- a/src/biotite/structure/chains.py +++ b/src/biotite/structure/chains.py @@ -9,9 +9,18 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["get_chain_starts", "apply_chain_wise", "spread_chain_wise", - "get_chain_masks", "get_chain_starts_for", "get_chain_positions", - "chain_iter", "get_chains", "get_chain_count", "chain_iter"] +__all__ = [ + "get_chain_starts", + "apply_chain_wise", + "spread_chain_wise", + "get_chain_masks", + "get_chain_starts_for", + "get_chain_positions", + "chain_iter", + "get_chains", + "get_chain_count", + "chain_iter", +] import numpy as np from .resutil import * @@ -21,10 +30,10 @@ def get_chain_starts(array, add_exclusive_stop=False): """ Get the indices in an atom array, which indicates the beginning of a new chain. - + A new chain starts, when the chain ID changes or when the residue ID decreases. 
- + Parameters ---------- array : AtomArray or AtomArrayStack @@ -33,17 +42,17 @@ def get_chain_starts(array, add_exclusive_stop=False): If true, the exclusive stop of the input atom array, i.e. ``array.array_length()``, is added to the returned array of start indices as last element. - + Returns ------- starts : ndarray, dtype=int The start indices of new chains in `array`. - + Notes ----- This method is internally used by all other chain-related functions. - + See also -------- get_residue_starts @@ -51,13 +60,13 @@ def get_chain_starts(array, add_exclusive_stop=False): diff = np.diff(array.res_id) res_id_decrement = diff < 0 # This mask is 'true' at indices where the value changes - chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1]) - + chain_id_changes = array.chain_id[1:] != array.chain_id[:-1] + # Convert mask to indices # Add 1, to shift the indices from the end of a chain # to the start of a new chain chain_starts = np.where(res_id_decrement | chain_id_changes)[0] + 1 - + # The first chain is not included yet -> Insert '[0]' if add_exclusive_stop: return np.concatenate(([0], chain_starts, [array.array_length()])) @@ -69,7 +78,7 @@ def apply_chain_wise(array, data, function, axis=None): """ Apply a function to intervals of data, where each interval corresponds to one chain. - + The function takes an atom array (stack) and an data array (`ndarray`) of the same length. The function iterates through the chain IDs of the atom array (stack) and identifies intervals of @@ -77,8 +86,8 @@ def apply_chain_wise(array, data, function, axis=None): partitioned into the same intervals, and each interval (also an :class:`ndarray`) is put as parameter into `function`. Each return value is stored as element in the resulting :class:`ndarray`, therefore each element - corresponds to one chain. - + corresponds to one chain. 
+ Parameters ---------- array : AtomArray or AtomArrayStack @@ -92,14 +101,14 @@ def apply_chain_wise(array, data, function, axis=None): must return a value with the same shape and data type. axis : int, optional This value is given to the `axis` parameter of `function`. - + Returns ------- processed_data : ndarray Chain-wise evaluation of `data` by `function`. The size of the first dimension of this array is equal to the amount of chains. - + See also -------- apply_residue_wise @@ -114,11 +123,11 @@ def spread_chain_wise(array, input_data): Each value in the chain-wise input is assigned to all atoms of this chain: - + ``output_data[i] = input_data[j]``, *i* is incremented from atom to atom, *j* is incremented every chain change. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -126,13 +135,13 @@ def spread_chain_wise(array, input_data): input_data : ndarray The data to be spread. The length of axis=0 must be equal to the amount of different chain IDs in `array`. - + Returns ------- output_data : ndarray Chain-wise spread `input_data`. Length is the same as `array_length()` of `array`. - + See also -------- spread_residue_wise @@ -154,14 +163,14 @@ def get_chain_masks(array, indices): These indices indicate the atoms to get the corresponding chains for. Negative indices are not allowed. - + Returns ------- chains_masks : ndarray, dtype=bool, shape=(k,n) Multiple boolean masks, one for each given index in `indices`. Each array masks the atoms that belong to the same chain as the atom at the given index. - + See also -------- get_residue_masks @@ -183,13 +192,13 @@ def get_chain_starts_for(array, indices): These indices point to the atoms to get the corresponding chain starts for. Negative indices are not allowed. - + Returns ------- start_indices : ndarray, dtype=int, shape=(k,) The indices that point to the chain starts for the input `indices`. 
- + See also -------- get_residue_starts_for @@ -214,12 +223,12 @@ def get_chain_positions(array, indices): These indices point to the atoms to get the corresponding chain positions for. Negative indices are not allowed. - + Returns ------- start_indices : ndarray, dtype=int, shape=(k,) The indices that point to the position of the chains. - + See also -------- get_residue_positions @@ -231,20 +240,20 @@ def get_chain_positions(array, indices): def get_chains(array): """ Get the chain IDs of an atom array (stack). - + The chains are listed in the same order they occur in the array (stack). - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack), where the chains are determined. - + Returns ------- ids : ndarray, dtype=str List of chain IDs. - + See also -------- get_residues @@ -255,20 +264,20 @@ def get_chains(array): def get_chain_count(array): """ Get the amount of chains in an atom array (stack). - + The count is determined from the `chain_id` annotation. Each time the chain ID changes, the count is incremented. - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack), where the chains are counted. - + Returns ------- count : int Amount of chains. - + See also -------- get_residue_count @@ -279,20 +288,20 @@ def get_chain_count(array): def chain_iter(array): """ Iterate over all chains in an atom array (stack). - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack) to iterate over. - + Yields ------ chain : AtomArray or AtomArrayStack A single chain of the input `array`. 
- + See also -------- residue_iter """ starts = get_chain_starts(array, add_exclusive_stop=True) - return segment_iter(array, starts) \ No newline at end of file + return segment_iter(array, starts) diff --git a/src/biotite/structure/compare.py b/src/biotite/structure/compare.py index abb6b7e9f..0a07ea383 100644 --- a/src/biotite/structure/compare.py +++ b/src/biotite/structure/compare.py @@ -12,7 +12,7 @@ __all__ = ["rmsd", "rmspd", "rmsf", "average"] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord +from .atoms import AtomArrayStack, coord from .geometry import index_distance from .util import vector_dot @@ -20,13 +20,13 @@ def rmsd(reference, subject): r""" Calculate the RMSD between two structures. - + The *root-mean-square-deviation* (RMSD) indicates the overall deviation of each model of a structure to a reference structure. It is defined as: - + .. math:: RMSD = \sqrt{ \frac{1}{n} \sum\limits_{i=1}^n (x_i - x_{ref,i})^2} - + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -37,7 +37,7 @@ def rmsd(reference, subject): Structure(s) to be compared with `reference`. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- rmsd : float or ndarray, dtype=float, shape=(m,) @@ -45,7 +45,7 @@ def rmsd(reference, subject): If subject is an :class:`AtomArray` a float is returned. If subject is an :class:`AtomArrayStack` a :class:`ndarray` containing the RMSD for each model is returned. - + See Also -------- rmsf @@ -71,16 +71,17 @@ def rmsd(reference, subject): """ return np.sqrt(np.mean(_sq_euclidian(reference, subject), axis=-1)) + def rmspd(reference, subject, periodic=False, box=None): r""" - Calculate the RMSD of atom pair distances for given structures + Calculate the RMSD of atom pair distances for given structures relative to those found in a reference structure. 
- Unlike the standard RMSD, the *root-mean-square-pairwise-deviation* - (RMSPD) is a fit-free method to determine deviations between + Unlike the standard RMSD, the *root-mean-square-pairwise-deviation* + (RMSPD) is a fit-free method to determine deviations between a structure and a preset reference. - .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2} + .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2} Parameters ---------- @@ -102,7 +103,7 @@ def rmspd(reference, subject, periodic=False, box=None): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- rmspd : float or ndarray, dtype=float, shape=(m,) @@ -110,7 +111,7 @@ def rmspd(reference, subject, periodic=False, box=None): If subject is an :class:`AtomArray` a float is returned. If subject is an :class:`AtomArrayStack` a :class:`ndarray` containing the RMSD for each model is returned. - + Warnings -------- Internally, this function uses :func:`index_distance()`. @@ -119,7 +120,7 @@ def rmspd(reference, subject, periodic=False, box=None): prior to the computation of RMSPDs with `periodic` set to false to ensure correct results. (e.g. with :func:`remove_pbc()`). - + See also -------- index_distance @@ -134,9 +135,10 @@ def rmspd(reference, subject, periodic=False, box=None): refdist = index_distance(reference, pairs, periodic=periodic, box=box) subjdist = index_distance(subject, pairs, periodic=periodic, box=box) - rmspd = np.sqrt(np.sum((subjdist - refdist)**2, axis = -1))/reflen + rmspd = np.sqrt(np.sum((subjdist - refdist) ** 2, axis=-1)) / reflen return rmspd + def rmsf(reference, subject): r""" Calculate the RMSF between two structures. @@ -146,9 +148,9 @@ def rmsf(reference, subject): models. Usually the reference structure, is the average over all models. 
The RMSF is defined as: - + .. math:: RMSF(i) = \sqrt{ \frac{1}{T} \sum\limits_{t=1}^T (x_i(t) - x_{ref,i}(t))^2} - + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -161,14 +163,14 @@ def rmsf(reference, subject): :class:`AtomArrayStack`. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- rmsf : ndarray, dtype=float, shape=(n,) RMSF between subject and reference structure. Each element gives the RMSF for the atom at the respective index. - + See Also -------- rmsd @@ -198,41 +200,39 @@ def rmsf(reference, subject): def average(atoms): """ Calculate an average structure. - + The average structure has the average coordinates of the input models. - + Parameters ---------- atoms : AtomArrayStack or ndarray, dtype=float, shape=(m,n,3) The structure models to be averaged. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- average : AtomArray or ndarray, dtype=float, shape=(n,3) Structure with averaged atom coordinates. If `atoms` is a :class:`ndarray` and :class:`ndarray` is also returned. - + See Also -------- rmsd, rmsf - + Notes ----- The calculated average structure is not suitable for visualization or geometric calculations, since bond lengths and angles will deviate from meaningful values. This method is rather useful to provide a reference structure for - calculation of e.g. the RMSD or RMSF. + calculation of e.g. the RMSD or RMSF. """ coords = coord(atoms) if coords.ndim != 3: - raise TypeError( - "Expected an AtomArrayStack or an ndarray with shape (m,n,3)" - ) + raise TypeError("Expected an AtomArrayStack or an ndarray with shape (m,n,3)") mean_coords = np.mean(coords, axis=0) if isinstance(atoms, AtomArrayStack): mean_array = atoms[0].copy() @@ -246,7 +246,7 @@ def _sq_euclidian(reference, subject): """ Calculate squared euclidian distance between atoms in two structures. 
- + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -254,7 +254,7 @@ def _sq_euclidian(reference, subject): subject : AtomArray or AtomArrayStack or ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) Structure(s) whose atoms squared euclidian distance to `reference` is measured. - + Returns ------- ndarray, dtype=float, shape=(n,) or shape=(m,n) @@ -271,4 +271,4 @@ def _sq_euclidian(reference, subject): "Expected an AtomArray or an ndarray with shape (n,3) as reference" ) dif = subject_coord - reference_coord - return vector_dot(dif, dif) \ No newline at end of file + return vector_dot(dif, dif) diff --git a/src/biotite/structure/density.py b/src/biotite/structure/density.py index 5f6043412..9065672dd 100644 --- a/src/biotite/structure/density.py +++ b/src/biotite/structure/density.py @@ -14,8 +14,7 @@ from .atoms import coord -def density(atoms, selection=None, delta=1.0, bins=None, - density=False, weights=None): +def density(atoms, selection=None, delta=1.0, bins=None, density=False, weights=None): r""" Compute the density of the selected atoms. @@ -51,13 +50,13 @@ def density(atoms, selection=None, delta=1.0, bins=None, Otherwise, returns the probability density function of each bin. See :func:`numpy.histogramdd()` for further details. weights: ndarray, shape=(n,) or shape=(m,n), optional - An array of values to weight the contribution of *n* atoms in + An array of values to weight the contribution of *n* atoms in *m* models. If the shape is *(n,)*, the weights will be interpreted as *per atom*. A shape of *(m,n)* allows to additionally weight atoms on a *per model* basis. - + Returns ------- H : ndarray, dtype=float @@ -69,12 +68,12 @@ def density(atoms, selection=None, delta=1.0, bins=None, A list containing the 3 arrays describing the bin edges. 
""" coords = coord(atoms) - + is_stack = coords.ndim == 3 # Define the grid for coordinate binning based on coordinates of # supplied atoms - # This makes the binning independent of a supplied box vector and + # This makes the binning independent of a supplied box vector and # fluctuating box dimensions are not a problem # However, this means that the user has to make sure the region of # interest is in the center of the box, i.e. by centering the @@ -84,19 +83,17 @@ def density(atoms, selection=None, delta=1.0, bins=None, axis = (0, 1) else: axis = 0 - grid_min, grid_max = np.min( - coords, axis=axis), np.max(coords, axis=axis - ) + grid_min, grid_max = np.min(coords, axis=axis), np.max(coords, axis=axis) bins = [ - np.arange(grid_min[0], grid_max[0]+delta, delta), - np.arange(grid_min[1], grid_max[1]+delta, delta), - np.arange(grid_min[2], grid_max[2]+delta, delta), + np.arange(grid_min[0], grid_max[0] + delta, delta), + np.arange(grid_min[1], grid_max[1] + delta, delta), + np.arange(grid_min[2], grid_max[2] + delta, delta), ] if selection is None: selected_coords = coords else: - selected_coords = coords[...,selection, :] + selected_coords = coords[..., selection, :] # Reshape the coords into Nx3 coords = selected_coords.reshape((np.prod(selected_coords.shape[:-1]), 3)) @@ -106,9 +103,7 @@ def density(atoms, selection=None, delta=1.0, bins=None, if is_stack and len(weights.shape) < 2: weights = np.tile(weights, len(selected_coords)) weights = weights.reshape(coords.shape[0]) - + # Calculate the histogram - hist = np.histogramdd( - coords, bins=bins, density=density, weights=weights - ) + hist = np.histogramdd(coords, bins=bins, density=density, weights=weights) return hist diff --git a/src/biotite/structure/dotbracket.py b/src/biotite/structure/dotbracket.py index ebfc3cf7f..0d208cc19 100644 --- a/src/biotite/structure/dotbracket.py +++ b/src/biotite/structure/dotbracket.py @@ -9,8 +9,7 @@ __name__ = "biotite.structure" __author__ = "Tom David Müller" -__all__ = 
["dot_bracket_from_structure", "dot_bracket", - "base_pairs_from_dot_bracket"] +__all__ = ["dot_bracket_from_structure", "dot_bracket", "base_pairs_from_dot_bracket"] import numpy as np from .basepairs import base_pairs @@ -24,7 +23,8 @@ def dot_bracket_from_structure( - nucleic_acid_strand, scores=None, max_pseudoknot_order=None): + nucleic_acid_strand, scores=None, max_pseudoknot_order=None +): """ Represent a nucleic-acid-strand in dot-bracket-letter-notation (DBL-notation). :footcite:`Antczak2018` @@ -53,16 +53,18 @@ def dot_bracket_from_structure( References ---------- - + .. footbibliography:: """ basepairs = base_pairs(nucleic_acid_strand) if len(basepairs) == 0: - return [''] + return [""] basepairs = get_residue_positions(nucleic_acid_strand, basepairs) length = get_residue_count(nucleic_acid_strand) - return dot_bracket(basepairs, length, scores=scores, - max_pseudoknot_order=max_pseudoknot_order) + return dot_bracket( + basepairs, length, scores=scores, max_pseudoknot_order=max_pseudoknot_order + ) + def dot_bracket(basepairs, length, scores=None, max_pseudoknot_order=None): """ @@ -115,21 +117,20 @@ def dot_bracket(basepairs, length, scores=None, max_pseudoknot_order=None): References ---------- - + .. footbibliography:: """ # Make sure the lower residue is on the left for each row basepairs = np.sort(basepairs, axis=1) # Get pseudoknot order - pseudoknot_order = pseudoknots(basepairs, scores=scores, - max_pseudoknot_order=max_pseudoknot_order) + pseudoknot_order = pseudoknots( + basepairs, scores=scores, max_pseudoknot_order=max_pseudoknot_order + ) # Each optimal pseudoknot order solution is represented in # dot-bracket-notation - notations = [ - bytearray((b"."*length)) for _ in range(len(pseudoknot_order)) - ] + notations = [bytearray((b"." 
* length)) for _ in range(len(pseudoknot_order))] for s, solution in enumerate(pseudoknot_order): for basepair, order in zip(basepairs, solution): if order == -1: @@ -138,6 +139,7 @@ def dot_bracket(basepairs, length, scores=None, max_pseudoknot_order=None): notations[s][basepair[1]] = _CLOSING_BRACKETS_BYTES[order] return [notation.decode() for notation in notations] + def base_pairs_from_dot_bracket(dot_bracket_notation): """ Extract the base pairs from a nucleic-acid-strand in @@ -172,7 +174,7 @@ def base_pairs_from_dot_bracket(dot_bracket_notation): References ---------- - + .. footbibliography:: """ basepairs = [] @@ -180,7 +182,6 @@ def base_pairs_from_dot_bracket(dot_bracket_notation): # Iterate through input string and extract base pairs for pos, symbol in enumerate(dot_bracket_notation): - if symbol in _OPENING_BRACKETS: # Add opening residues to list (separate list for each # bracket type) @@ -197,9 +198,7 @@ def base_pairs_from_dot_bracket(dot_bracket_notation): else: if symbol != ".": - raise ValueError( - f"'{symbol}' is an invalid character for DBL-notation" - ) + raise ValueError(f"'{symbol}' is an invalid character for DBL-notation") for not_closed in opened_brackets: if not_closed != []: @@ -208,7 +207,6 @@ def base_pairs_from_dot_bracket(dot_bracket_notation): "closing bracket" ) - # Sort the base pair indices in ascending order basepairs = np.array(basepairs) if len(basepairs) > 0: diff --git a/src/biotite/structure/error.py b/src/biotite/structure/error.py index 269ee2276..1fe632e97 100644 --- a/src/biotite/structure/error.py +++ b/src/biotite/structure/error.py @@ -8,24 +8,32 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["BadStructureError", "IncompleteStructureWarning", - "UnexpectedStructureWarning"] +__all__ = [ + "BadStructureError", + "IncompleteStructureWarning", + "UnexpectedStructureWarning", +] class BadStructureError(Exception): """ Indicates that a structure is not suitable for a certain operation. 
""" + pass + class IncompleteStructureWarning(Warning): """ Indicates that a structure is not complete. """ + pass + class UnexpectedStructureWarning(Warning): """ Indicates that a structure was not expected. """ + pass diff --git a/src/biotite/structure/filter.py b/src/biotite/structure/filter.py index 13ccd486a..9eaf9cb20 100644 --- a/src/biotite/structure/filter.py +++ b/src/biotite/structure/filter.py @@ -9,32 +9,60 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Tom David Müller" -__all__ = ["filter_solvent", "filter_monoatomic_ions", "filter_nucleotides", - "filter_canonical_nucleotides", "filter_amino_acids", - "filter_canonical_amino_acids", "filter_carbohydrates", - "filter_intersection", "filter_first_altloc", - "filter_highest_occupancy_altloc", "filter_peptide_backbone", - "filter_phosphate_backbone", "filter_linear_bond_continuity", - "filter_polymer"] +__all__ = [ + "filter_solvent", + "filter_monoatomic_ions", + "filter_nucleotides", + "filter_canonical_nucleotides", + "filter_amino_acids", + "filter_canonical_amino_acids", + "filter_carbohydrates", + "filter_intersection", + "filter_first_altloc", + "filter_highest_occupancy_altloc", + "filter_peptide_backbone", + "filter_phosphate_backbone", + "filter_linear_bond_continuity", + "filter_polymer", +] -import warnings -import numpy as np from functools import partial +import numpy as np from .atoms import array as atom_array -from .residues import get_residue_starts, get_residue_count from .info.groups import amino_acid_names, carbohydrate_names, nucleotide_names - - -_canonical_aa_list = ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS", - "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR", - "TRP","TYR","VAL", "SEC"] +from .residues import get_residue_count, get_residue_starts + +_canonical_aa_list = [ + "ALA", + "ARG", + "ASN", + "ASP", + "CYS", + "GLN", + "GLU", + "GLY", + "HIS", + "ILE", + "LEU", + "LYS", + "MET", + "PHE", + "PRO", + "PYL", + "SER", + "THR", + "TRP", + 
"TYR", + "VAL", + "SEC", +] _canonical_nucleotide_list = ["A", "DA", "G", "DG", "C", "DC", "U", "DT"] -_solvent_list = ["HOH","SOL"] +_solvent_list = ["HOH", "SOL"] -_peptide_backbone_atoms = ['N', 'CA', 'C'] -_phosphate_backbone_atoms = ['P', 'O5\'', 'C5\'', 'C4\'', 'C3\'', 'O3\''] +_peptide_backbone_atoms = ["N", "CA", "C"] +_phosphate_backbone_atoms = ["P", "O5'", "C5'", "C4'", "C3'", "O3'"] def filter_monoatomic_ions(array): @@ -55,7 +83,7 @@ def filter_monoatomic_ions(array): """ # Exclusively in monoatomic ions, # the element name is equal to the residue name - return (array.res_name == array.element) + return array.res_name == array.element def filter_solvent(array): @@ -228,8 +256,9 @@ def filter_peptide_backbone(array): is a part of the peptide backbone. """ - return (_filter_atom_names(array, _peptide_backbone_atoms) & - filter_amino_acids(array)) + return _filter_atom_names(array, _peptide_backbone_atoms) & filter_amino_acids( + array + ) def filter_phosphate_backbone(array): @@ -250,8 +279,9 @@ def filter_phosphate_backbone(array): is a part of the phosphate backbone. 
""" - return (_filter_atom_names(array, _phosphate_backbone_atoms) & - filter_nucleotides(array)) + return _filter_atom_names(array, _phosphate_backbone_atoms) & filter_nucleotides( + array + ) def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8): @@ -297,21 +327,20 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8): def _is_polymer(array, min_size, pol_type): - - if pol_type.startswith('p'): + if pol_type.startswith("p"): filt_fn = filter_amino_acids - elif pol_type.startswith('n'): + elif pol_type.startswith("n"): filt_fn = filter_nucleotides - elif pol_type.startswith('c'): + elif pol_type.startswith("c"): filt_fn = filter_carbohydrates else: - raise ValueError(f'Unsupported polymer type {pol_type}') + raise ValueError(f"Unsupported polymer type {pol_type}") mask = filt_fn(array) return get_residue_count(array[mask]) >= min_size -def filter_polymer(array, min_size=2, pol_type='peptide'): +def filter_polymer(array, min_size=2, pol_type="peptide"): """ Filter for atoms that are a part of a consecutive standard macromolecular polymer entity. 
@@ -335,12 +364,13 @@ def filter_polymer(array, min_size=2, pol_type='peptide'): """ # Import `check_res_id_continuity` here to avoid circular imports from .integrity import check_res_id_continuity + split_idx = check_res_id_continuity(array) check_pol = partial(_is_polymer, min_size=min_size, pol_type=pol_type) bool_idx = map( lambda a: np.full(len(a), check_pol(atom_array(a)), dtype=bool), - np.split(array, split_idx) + np.split(array, split_idx), ) return np.concatenate(list(bool_idx)) @@ -384,13 +414,17 @@ def filter_intersection(array, intersect): intersect_categories = intersect.get_annotation_categories() # Check atom equality only for categories, # which exist in both arrays - categories = [category for category in array.get_annotation_categories() - if category in intersect_categories] + categories = [ + category + for category in array.get_annotation_categories() + if category in intersect_categories + ] for i in range(array.array_length()): subfilter = np.full(intersect.array_length(), True, dtype=bool) for category in categories: - subfilter &= (intersect.get_annotation(category) - == array.get_annotation(category)[i]) + subfilter &= ( + intersect.get_annotation(category) == array.get_annotation(category)[i] + ) filter[i] = subfilter.any() return filter @@ -456,7 +490,7 @@ def filter_first_altloc(atoms, altloc_ids): letter_altloc_ids = [l for l in altloc_ids[start:stop] if l.isalpha()] if len(letter_altloc_ids) > 0: first_id = letter_altloc_ids[0] - altloc_filter[start:stop] |= (altloc_ids[start:stop] == first_id) + altloc_filter[start:stop] |= altloc_ids[start:stop] == first_id else: # No altloc ID in this residue -> Nothing to do pass @@ -540,13 +574,11 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies): highest = -1.0 highest_id = None for id in set(letter_altloc_ids): - occupancy_sum = np.sum( - occupancies_in_res[altloc_ids_in_res == id] - ) + occupancy_sum = np.sum(occupancies_in_res[altloc_ids_in_res == id]) if occupancy_sum > 
highest: highest = occupancy_sum highest_id = id - altloc_filter[start:stop] |= (altloc_ids[start:stop] == highest_id) + altloc_filter[start:stop] |= altloc_ids[start:stop] == highest_id else: # No altloc ID in this residue -> Nothing to do pass diff --git a/src/biotite/structure/geometry.py b/src/biotite/structure/geometry.py index e27ba233a..2048c3451 100644 --- a/src/biotite/structure/geometry.py +++ b/src/biotite/structure/geometry.py @@ -9,18 +9,26 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["displacement", "index_displacement", "distance", "index_distance", - "angle", "index_angle", "dihedral", "index_dihedral", - "dihedral_backbone", "centroid"] +__all__ = [ + "displacement", + "index_displacement", + "distance", + "index_distance", + "angle", + "index_angle", + "dihedral", + "index_dihedral", + "dihedral_backbone", + "centroid", +] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import vector_dot, norm_vector -from .filter import filter_peptide_backbone +from .atoms import AtomArray, AtomArrayStack, coord +from .box import coord_to_fraction, fraction_to_coord, is_orthogonal from .chains import chain_iter -from .box import (coord_to_fraction, fraction_to_coord, - move_inside_box, is_orthogonal) from .error import BadStructureError +from .filter import filter_peptide_backbone +from .util import norm_vector, vector_dot def displacement(atoms1, atoms2, box=None): @@ -81,28 +89,24 @@ def displacement(atoms1, atoms2, box=None): fractions = fractions[np.newaxis, :] disp = disp[np.newaxis, :] if orthogonality: - _displacement_orthogonal_box( - fractions, box, disp - ) + _displacement_orthogonal_box(fractions, box, disp) else: _displacement_triclinic_box( fractions.astype(diff.dtype, copy=False), box.astype(diff.dtype, copy=False), - disp + disp, ) # Transform back disp = disp[0] if fractions.ndim == 2: # Single model if orthogonality: - _displacement_orthogonal_box( - fractions, box, disp - 
) + _displacement_orthogonal_box(fractions, box, disp) else: _displacement_triclinic_box( fractions.astype(diff.dtype, copy=False), box.astype(diff.dtype, copy=False), - disp + disp, ) elif fractions.ndim == 3: # Multiple models @@ -117,19 +121,15 @@ def displacement(atoms1, atoms2, box=None): else: raise ValueError(f"{box.ndim} are to many box dimensions") if orthogonality_for_model: - _displacement_orthogonal_box( - fractions[i], box_for_model, disp[i] - ) + _displacement_orthogonal_box(fractions[i], box_for_model, disp[i]) else: _displacement_triclinic_box( fractions[i].astype(diff.dtype, copy=False), box_for_model.astype(diff.dtype, copy=False), - disp[i] + disp[i], ) else: - raise ValueError( - f"{diff.shape} is an invalid shape for atom coordinates" - ) + raise ValueError(f"{diff.shape} is an invalid shape for atom coordinates") return disp else: @@ -318,7 +318,7 @@ def angle(atoms1, atoms2, atoms3, box=None): v2 = displacement(atoms3, atoms2, box) norm_vector(v1) norm_vector(v2) - return np.arccos(vector_dot(v1,v2)) + return np.arccos(vector_dot(v1, v2)) def index_angle(*args, **kwargs): @@ -416,9 +416,9 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): n2 = np.cross(v2, v3) # Calculation using atan2, to ensure the correct sign of the angle - x = vector_dot(n1,n2) - y = vector_dot(np.cross(n1,n2), v2) - return np.arctan2(y,x) + x = vector_dot(n1, n2) + y = vector_dot(np.cross(n1, n2), v2) + return np.arctan2(y, x) def index_dihedral(*args, **kwargs): @@ -542,14 +542,16 @@ def dihedral_backbone(atom_array): bb_filter = filter_peptide_backbone(atom_array) backbone = atom_array[..., bb_filter] - if backbone.array_length() % 3 != 0 \ - or (backbone.atom_name[0::3] != "N" ).any() \ - or (backbone.atom_name[1::3] != "CA").any() \ - or (backbone.atom_name[2::3] != "C" ).any(): - raise BadStructureError( - "The backbone is invalid, must be repeats of (N, CA, C), " - "maybe a backbone atom is missing" - ) + if ( + backbone.array_length() % 3 != 0 + or 
(backbone.atom_name[0::3] != "N").any() + or (backbone.atom_name[1::3] != "CA").any() + or (backbone.atom_name[2::3] != "C").any() + ): + raise BadStructureError( + "The backbone is invalid, must be repeats of (N, CA, C), " + "maybe a backbone atom is missing" + ) phis = [] psis = [] omegas = [] @@ -558,9 +560,11 @@ def dihedral_backbone(atom_array): phis.append(phi) psis.append(psi) omegas.append(omega) - return np.concatenate(phis, axis=-1), np.concatenate(psis, axis=-1), \ - np.concatenate(omegas, axis=-1) - + return ( + np.concatenate(phis, axis=-1), + np.concatenate(psis, axis=-1), + np.concatenate(omegas, axis=-1), + ) def _dihedral_backbone(chain_bb): @@ -571,15 +575,15 @@ def _dihedral_backbone(chain_bb): # Dim 2: X, Y, Z coordinates # Dim 3: Atoms involved in dihedral angle if isinstance(chain_bb, AtomArray): - angle_coord_shape = (len(bb_coord)//3, 3, 4) + angle_coord_shape = (len(bb_coord) // 3, 3, 4) elif isinstance(chain_bb, AtomArrayStack): - angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1]//3, 3, 4) - phi_coord = np.full(angle_coord_shape, np.nan) - psi_coord = np.full(angle_coord_shape, np.nan) + angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1] // 3, 3, 4) + phi_coord = np.full(angle_coord_shape, np.nan) + psi_coord = np.full(angle_coord_shape, np.nan) omega_coord = np.full(angle_coord_shape, np.nan) # Indices for coordinates of CA atoms - ca_i = np.arange(bb_coord.shape[-2]//3) * 3 + 1 + ca_i = np.arange(bb_coord.shape[-2] // 3) * 3 + 1 # fmt: off phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :] phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :] @@ -595,12 +599,18 @@ def _dihedral_backbone(chain_bb): omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :] # fmt: on - phi = dihedral(phi_coord[...,0], phi_coord[...,1], - phi_coord[...,2], phi_coord[...,3]) - psi = dihedral(psi_coord[...,0], psi_coord[...,1], - psi_coord[...,2], psi_coord[...,3]) - omega = dihedral(omega_coord[...,0], omega_coord[...,1], - 
omega_coord[...,2], omega_coord[...,3]) + phi = dihedral( + phi_coord[..., 0], phi_coord[..., 1], phi_coord[..., 2], phi_coord[..., 3] + ) + psi = dihedral( + psi_coord[..., 0], psi_coord[..., 1], psi_coord[..., 2], psi_coord[..., 3] + ) + omega = dihedral( + omega_coord[..., 0], + omega_coord[..., 1], + omega_coord[..., 2], + omega_coord[..., 3], + ) return phi, psi, omega @@ -625,8 +635,9 @@ def centroid(atoms): return np.mean(coord(atoms), axis=-2) -def _call_non_index_function(function, expected_amount, - atoms, indices, periodic=False, box=None): +def _call_non_index_function( + function, expected_amount, atoms, indices, periodic=False, box=None +): """ Call an `xxx()` function based on the parameters given to a `index_xxx()` function. @@ -638,15 +649,14 @@ def _call_non_index_function(function, expected_amount, ) coord_list = [] for i in range(expected_amount): - coord_list.append(coord(atoms)[..., indices[:,i], :]) + coord_list.append(coord(atoms)[..., indices[:, i], :]) if periodic: if box is None: if isinstance(atoms, (AtomArray, AtomArrayStack)): box = atoms.box else: raise ValueError( - "If `atoms` are coordinates, " - "the box must be set explicitly" + "If `atoms` are coordinates, " "the box must be set explicitly" ) else: box = None @@ -680,10 +690,10 @@ def _displacement_triclinic_box(fractions, box, disp): for i in range(-1, 1): for j in range(-1, 1): for k in range(-1, 1): - x = i*box[0,0] + j*box[1,0] + k*box[2,0] - y = i*box[0,1] + j*box[1,1] + k*box[2,1] - z = i*box[0,2] + j*box[1,2] + k*box[2,2] - periodic_shift.append([x,y,z]) + x = i * box[0, 0] + j * box[1, 0] + k * box[2, 0] + y = i * box[0, 1] + j * box[1, 1] + k * box[2, 1] + z = i * box[0, 2] + j * box[1, 2] + k * box[2, 2] + periodic_shift.append([x, y, z]) periodic_shift = np.array(periodic_shift, dtype=disp.dtype) # Create 8 periodically shifted variants for each atom shifted_diffs = diffs[:, np.newaxis, :] + periodic_shift[np.newaxis, :, :] @@ -694,6 +704,5 @@ def 
_displacement_triclinic_box(fractions, box, disp): # for each given non-PBC-aware displacement find the PBC-aware # displacement with the lowest distance disp[:] = shifted_diffs[ - np.arange(len(shifted_diffs)), - np.argmin(sq_distance, axis=1) + np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1) ] diff --git a/src/biotite/structure/graphics/atoms.py b/src/biotite/structure/graphics/atoms.py index bc91492d9..dec54f1fa 100644 --- a/src/biotite/structure/graphics/atoms.py +++ b/src/biotite/structure/graphics/atoms.py @@ -7,18 +7,25 @@ __all__ = ["plot_atoms", "plot_ball_and_stick_model"] import numpy as np -import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d.art3d import Line3DCollection -def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, - center=None, size=None, zoom=1.0): +def plot_atoms( + axes, + atoms, + colors, + line_width=1.0, + background_color=None, + center=None, + size=None, + zoom=1.0, +): """ Plot an :class:`AtomArray` as lines between bonded atoms. The z-axis points into the screen plane. - + Parameters ---------- axes : Axes3D @@ -49,7 +56,7 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, - ``> 1.0``: Zoom in. - ``< 1.0``: Zoom out. 
- + Notes ----- This is a very simple visualization tools for quick visual analysis @@ -61,38 +68,37 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, raise ValueError("The given axes mut be an 'Axes3D'") if atoms.bonds is None: raise ValueError("The atom array must have an associated bond list") - + # Calculating connections between atoms line_coord = [] line_colors = [] - for index1, index2 in atoms.bonds.as_array()[:,:2]: + for index1, index2 in atoms.bonds.as_array()[:, :2]: # Every connection consist of two lines: # One from the first atom to the center # and from from the second atom to the center line_start = atoms.coord[index1] line_end = atoms.coord[index2] line_center = (line_start + line_end) / 2 - + # Add line from first atom - line_coord.append(( - line_start, line_center - )) + line_coord.append((line_start, line_center)) line_colors.append(colors[index1]) - + # Add line from second atom - line_coord.append(( - line_end, line_center - )) + line_coord.append((line_end, line_center)) line_colors.append(colors[index2]) # Plot computed line coordinates and colors # Use 'Line3DCollection' for higher efficiency lines = Line3DCollection( - line_coord, color=line_colors, linewidths=line_width, - capstyle="round", joinstyle="round" + line_coord, + color=line_colors, + linewidths=line_width, + capstyle="round", + joinstyle="round", ) axes.add_collection(lines) - + # Set viewing angle axes.azim = -90 axes.elev = 90 @@ -105,17 +111,25 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, _set_box(axes, atoms.coord, center, size, zoom) -def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, - line_color="black", line_width=1.0, - background_color=None, center=None, - size=None, zoom=1.0): +def plot_ball_and_stick_model( + axes, + atoms, + colors, + ball_size=200, + line_color="black", + line_width=1.0, + background_color=None, + center=None, + size=None, + zoom=1.0, +): """ Plot an :class:`AtomArray` 
as *ball-and-stick* model. The z-axis points into the screen plane. UNSTABLE: This function is probably subject to future changes. - + Parameters ---------- axes : Axes3D @@ -154,7 +168,7 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, - ``> 1.0``: Zoom in. - ``< 1.0``: Zoom out. - + Notes ----- This is a very simple visualization tools for quick visual analysis @@ -166,26 +180,27 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, raise ValueError("The given axes mut be an 'Axes3D'") if atoms.bonds is None: raise ValueError("The atom array must have an associated bond list") - + # Calculating connections between atoms line_coord = [ (atoms.coord[index1], atoms.coord[index2]) - for index1, index2 in atoms.bonds.as_array()[:,:2] + for index1, index2 in atoms.bonds.as_array()[:, :2] ] # Plot sticks # Use 'Line3DCollection' for higher efficiency sticks = Line3DCollection( - line_coord, color=line_color, linewidths=line_width, - capstyle="round", joinstyle="round" + line_coord, + color=line_color, + linewidths=line_width, + capstyle="round", + joinstyle="round", ) axes.add_collection(sticks) # Plot balls - axes.scatter( - *atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1 - ) - + axes.scatter(*atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1) + # Set viewing angle axes.azim = -90 axes.elev = 90 @@ -211,16 +226,18 @@ def _set_box(axes, coord, center, size, zoom): ) if size is None: - size = np.array([ - coord[:, 0].max() - coord[:, 0].min(), - coord[:, 1].max() - coord[:, 1].min(), - coord[:, 2].max() - coord[:, 2].min() - ]).max() - - axes.set_xlim(center[0] - size/(2*zoom), center[0] + size/(2*zoom)) - axes.set_ylim(center[1] - size/(2*zoom), center[1] + size/(2*zoom)) - axes.set_zlim(center[2] - size/(2*zoom), center[2] + size/(2*zoom)) - + size = np.array( + [ + coord[:, 0].max() - coord[:, 0].min(), + coord[:, 1].max() - coord[:, 1].min(), + coord[:, 2].max() - coord[:, 2].min(), + ] + ).max() + + 
axes.set_xlim(center[0] - size / (2 * zoom), center[0] + size / (2 * zoom)) + axes.set_ylim(center[1] - size / (2 * zoom), center[1] + size / (2 * zoom)) + axes.set_zlim(center[2] - size / (2 * zoom), center[2] + size / (2 * zoom)) + # Make the axis lengths of the 'plot box' equal # The 'plot box' is not visible due to 'axes.axis("off")' - axes.set_box_aspect([1,1,1]) \ No newline at end of file + axes.set_box_aspect([1, 1, 1]) diff --git a/src/biotite/structure/graphics/rna.py b/src/biotite/structure/graphics/rna.py index b2cf6d198..5e8f322eb 100644 --- a/src/biotite/structure/graphics/rna.py +++ b/src/biotite/structure/graphics/rna.py @@ -7,29 +7,43 @@ __all__ = ["plot_nucleotide_secondary_structure"] import shutil -import numpy as np from itertools import repeat -from .. import pseudoknots +import numpy as np from ...application.viennarna import RNAplotApp +from .. import pseudoknots def plot_nucleotide_secondary_structure( - axes, base_labels, base_pairs, length, - layout_type=RNAplotApp.Layout.NAVIEW, draw_pseudoknots=True, - pseudoknot_order=None, angle=0, bond_linewidth=1, bond_linestyle=None, - bond_color='black', backbone_linewidth=1, backbone_linestyle='solid', - backbone_color='grey', base_text=None, base_box=None, - annotation_positions=None, annotation_offset=8.5, annotation_text=None, - border=0.03, bin_path="RNAplot" - ): + axes, + base_labels, + base_pairs, + length, + layout_type=RNAplotApp.Layout.NAVIEW, + draw_pseudoknots=True, + pseudoknot_order=None, + angle=0, + bond_linewidth=1, + bond_linestyle=None, + bond_color="black", + backbone_linewidth=1, + backbone_linestyle="solid", + backbone_color="grey", + base_text=None, + base_box=None, + annotation_positions=None, + annotation_offset=8.5, + annotation_text=None, + border=0.03, + bin_path="RNAplot", +): """ Generate 2D plots of nucleic acid secondary structures using the interface to *RNAplot*, which is part of the *ViennaRNA* software package. 
- Internally a :class:`biotite.application.viennarna.RNAplotApp` - instance is created to generate coordinates for each individual base - on a 2D plane. *ViennaRNA* must be installed in order to use this + Internally a :class:`biotite.application.viennarna.RNAplotApp` + instance is created to generate coordinates for each individual base + on a 2D plane. *ViennaRNA* must be installed in order to use this function. Parameters @@ -49,7 +63,7 @@ def plot_nucleotide_secondary_structure( Whether pseudoknotted bonds should be drawn. pseudoknot_order : iterable, optional (default: None) The pseudoknot order of each pair in the input `base_pairs`. - If no pseudoknot order is given, a solution determined by + If no pseudoknot order is given, a solution determined by :func:`biotite.structure.pseudoknots` is picked at random. angle : int or float, optional (default: 0) The angle the plot should be rotated. @@ -74,9 +88,9 @@ def plot_nucleotide_secondary_structure( backbone_color : str or ndarray, shape=(3,) or shape=(4,), dtype=float, optional (default: 'grey') The *Matplotlib* compatible color of the backbone. base_text : dict or iterable, optional (default: {'size': 'small'}) - The keyword parameters for the *Matplotlib* ``Text`` objects - denoting the type of each base. Provide a single value to set - the parameters for all labels or an iterable to set the + The keyword parameters for the *Matplotlib* ``Text`` objects + denoting the type of each base. Provide a single value to set + the parameters for all labels or an iterable to set the parameters for each individual label. base_box : dict or iterable, optional (default: {'pad'=0, 'color'='white'}) The *Matplotlib* compatible properties of the ``FancyBboxPatch`` @@ -91,9 +105,9 @@ def plot_nucleotide_secondary_structure( annotation_offset : int or float, optional (default: 8.5) The offset of the annotations from the base labels. 
annotation_text : dict or iterable, optional (default: {'size': 'small'}) - The keyword parameters for the *Matplotlib* ``Text`` objects - annotating the sequence. Provide a single value to set the - parameters for all annotations or an iterable to set the + The keyword parameters for the *Matplotlib* ``Text`` objects + annotating the sequence. Provide a single value to set the + parameters for all annotations or an iterable to set the parameters for each individual annotation. border : float, optional (default: 0.03) The percentage of the coordinate range to be left as whitespace @@ -105,8 +119,8 @@ def plot_nucleotide_secondary_structure( # Check if RNAplot is installed if shutil.which(bin_path) is None: raise FileNotFoundError( - 'RNAplot is not installed at the specified location, unable to ' - 'plot secondary structure.' + "RNAplot is not installed at the specified location, unable to " + "plot secondary structure." ) # Get the unknotted base pairs @@ -127,7 +141,7 @@ def plot_nucleotide_secondary_structure( # Set the default properties of the Matplotlib `bbox` surrounding # the base labels if base_box is None: - base_box=np.full(length, {'pad': 0, 'color': 'white'}) + base_box = np.full(length, {"pad": 0, "color": "white"}) # if `base_box` is a dictionary, extrapolate elif isinstance(base_box, dict): base_box = np.full(length, base_box) @@ -135,25 +149,23 @@ def plot_nucleotide_secondary_structure( # By default pseudoknotted bonds are denoted as dashed lines, while # unknotted bonds are denoted as solid lines if bond_linestyle is None: - bond_linestyle = np.full(base_pairs.shape[0], 'solid', dtype='object') - bond_linestyle[pseudoknot_order != 0] = 'dashed' + bond_linestyle = np.full(base_pairs.shape[0], "solid", dtype="object") + bond_linestyle[pseudoknot_order != 0] = "dashed" # If `bond_linestyle` is a string, extrapolate elif isinstance(bond_linestyle, str): - bond_linestyle = np.full( - base_pairs.shape[0], bond_linestyle, dtype='object' - ) + 
bond_linestyle = np.full(base_pairs.shape[0], bond_linestyle, dtype="object") # If pseudoknots are not to be drawn, remove pseudoknotted bonds, # regardless of the given linestyles if not draw_pseudoknots: # Ensure that the array can hold the 'None' value # (not possible with 'U1' dtype for example) - bond_linestyle = np.asarray(bond_linestyle, dtype='object') - bond_linestyle[pseudoknot_order != 0] = 'None' + bond_linestyle = np.asarray(bond_linestyle, dtype="object") + bond_linestyle[pseudoknot_order != 0] = "None" # Set the default properties of the base labels if base_text is None: - base_text = np.full(length, {'size': 'small'}) + base_text = np.full(length, {"size": "small"}) elif isinstance(base_text, dict): base_text = np.full(length, base_text) @@ -164,7 +176,7 @@ def plot_nucleotide_secondary_structure( # Set the default font properties of the base annotations if annotation_text is None: - annotation_text = repeat({'size': 'small'}) + annotation_text = repeat({"size": "small"}) elif isinstance(annotation_text, dict): annotation_text = repeat(annotation_text) @@ -173,15 +185,14 @@ def plot_nucleotide_secondary_structure( base_pairs=unknotted_base_pairs, length=length, bin_path=bin_path, - layout_type=layout_type + layout_type=layout_type, ) # Rotate Coordinates if angle != 0: angle = np.deg2rad(angle) rot_matrix = np.array( - [[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]] + [[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]] ) for i, coord in enumerate(coordinates): coordinates[i] = np.dot(rot_matrix, coord) @@ -197,31 +208,32 @@ def plot_nucleotide_secondary_structure( ) axes.set_frame_on(False) - # Define buffer area (Border) coord_range = abs(np.max(coordinates)) + abs(np.min(coordinates)) - buffer = border*coord_range + buffer = border * coord_range # Adjust display axes.set_xlim( - np.min(coordinates[:,0])-buffer, np.max(coordinates[:,0])+buffer + np.min(coordinates[:, 0]) - buffer, np.max(coordinates[:, 0]) + buffer 
) axes.set_ylim( - np.min(coordinates[:,1])-buffer, np.max(coordinates[:,1])+buffer + np.min(coordinates[:, 1]) - buffer, np.max(coordinates[:, 1]) + buffer ) - axes.set_aspect(aspect='equal') + axes.set_aspect(aspect="equal") # Draw backbone - axes.plot(coordinates[:,0], coordinates[:,1], color=backbone_color, - linestyle=backbone_linestyle, linewidth=backbone_linewidth) + axes.plot( + coordinates[:, 0], + coordinates[:, 1], + color=backbone_color, + linestyle=backbone_linestyle, + linewidth=backbone_linewidth, + ) # Draw base labels - for coords, label, box, text in zip( - coordinates, base_labels, base_box, base_text - ): + for coords, label, box, text in zip(coordinates, base_labels, base_box, base_text): t = axes.text( - x=coords[0], y=coords[1], s=label, - ha='center', va='center', **text + x=coords[0], y=coords[1], s=label, ha="center", va="center", **text ) t.set_bbox(box) @@ -237,37 +249,41 @@ def plot_nucleotide_secondary_structure( # Draw annotations for i, text in zip(annotation_positions, annotation_text): - if (i > 0) and ((i+1) < length): + if (i > 0) and ((i + 1) < length): # Get the average of the direction vectors to the next and # previous base vector_to_previous = np.array( - [coordinates[i-1][0] - coordinates[i][0], - coordinates[i-1][1] - coordinates[i][1]] - ) - vector_to_previous = vector_to_previous / np.linalg.norm( - vector_to_previous + [ + coordinates[i - 1][0] - coordinates[i][0], + coordinates[i - 1][1] - coordinates[i][1], + ] ) + vector_to_previous = vector_to_previous / np.linalg.norm(vector_to_previous) vector_to_next = np.array( - [coordinates[i][0] - coordinates[i+1][0], - coordinates[i][1] - coordinates[i+1][1]] - ) - vector_to_next = vector_to_next / np.linalg.norm( - vector_to_next + [ + coordinates[i][0] - coordinates[i + 1][0], + coordinates[i][1] - coordinates[i + 1][1], + ] ) + vector_to_next = vector_to_next / np.linalg.norm(vector_to_next) vector = (vector_to_next + vector_to_previous) / 2 elif i > 0: # For the last 
base get the direction vector to the previous # base vector = np.array( - [coordinates[i-1][0] - coordinates[i][0], - coordinates[i-1][1] - coordinates[i][1]] + [ + coordinates[i - 1][0] - coordinates[i][0], + coordinates[i - 1][1] - coordinates[i][1], + ] ) else: # For the first base get the direction vector to the next # base vector = np.array( - [coordinates[i][0] - coordinates[i+1][0], - coordinates[i][1] - coordinates[i+1][1]] + [ + coordinates[i][0] - coordinates[i + 1][0], + coordinates[i][1] - coordinates[i + 1][1], + ] ) # Normalize the vector vector = vector / np.linalg.norm(vector) @@ -275,8 +291,5 @@ def plot_nucleotide_secondary_structure( vector = np.array([vector[1], -vector[0]]) # The annotations are offset in the direction of the # perpendicular vector - x, y = coordinates[i] + (annotation_offset*vector) - axes.text( - x=x, y=y, s=i+1, - ha='center', va='center', **text - ) \ No newline at end of file + x, y = coordinates[i] + (annotation_offset * vector) + axes.text(x=x, y=y, s=i + 1, ha="center", va="center", **text) diff --git a/src/biotite/structure/hbond.py b/src/biotite/structure/hbond.py index a23c5cdcd..7dc80c52d 100644 --- a/src/biotite/structure/hbond.py +++ b/src/biotite/structure/hbond.py @@ -11,16 +11,23 @@ __all__ = ["hbond", "hbond_frequency"] import warnings -from .geometry import distance, angle import numpy as np from .atoms import AtomArrayStack, stack from .celllist import CellList - - -def hbond(atoms, selection1=None, selection2=None, selection1_type='both', - cutoff_dist=2.5, cutoff_angle=120, - donor_elements=('O', 'N', 'S'), acceptor_elements=('O', 'N', 'S'), - periodic=False): +from .geometry import angle, distance + + +def hbond( + atoms, + selection1=None, + selection2=None, + selection1_type="both", + cutoff_dist=2.5, + cutoff_angle=120, + donor_elements=("O", "N", "S"), + acceptor_elements=("O", "N", "S"), + periodic=False, +): r""" Find hydrogen bonds in a structure using the Baker-Hubbard algorithm. 
:footcite:`Baker1984` @@ -31,7 +38,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', and :math:`d_{H,A} \le 2.5 \mathring{A}`. Consequently, the given structure must contain hydrogen atoms. Otherwise, no hydrogen bonds will be found. - + Parameters ---------- atoms : AtomArray or AtomArrayStack @@ -60,7 +67,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', boundary conditions. The `box` attribute of `atoms` is required in this case. (Default: False). - + Returns ------- triplets : ndarray, dtype=int, shape=(n,3) @@ -74,7 +81,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', *m x n* matrix that shows if an interaction with index *n* in `triplets` is present in the model *m* of the input `atoms`. Only returned if `atoms` is an :class:`AtomArrayStack`. - + Notes ----- The result of this function may include false positives: @@ -84,19 +91,19 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', For example, a nitrogen atom with positive charge could be considered as acceptor atom by this method, although this does make sense from a chemical perspective. 
- + Examples -------- Calculate the total number of hydrogen bonds found in each model: - + >>> triplets, mask = hbond(atom_array_stack) >>> hbonds_per_model = np.count_nonzero(mask, axis=1) >>> print(hbonds_per_model) [14 14 14 12 11 12 9 13 9 14 13 13 14 11 11 12 11 14 14 13 14 13 15 17 14 12 15 12 12 13 13 13 12 12 11 14 10 11] - + Get hydrogen bond donors of third model: - + >>> # Third model -> index 2 >>> triplets = triplets[mask[2,:]] >>> # First column contains donors @@ -137,12 +144,12 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', single_model = True else: single_model = False - + if periodic: box = atoms.box else: box = None - + # Mask for donor/acceptor elements donor_element_mask = np.isin(atoms.element, donor_elements) acceptor_element_mask = np.isin(atoms.element, acceptor_elements) @@ -152,69 +159,81 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', if selection2 is None: selection2 = np.ones(atoms.array_length(), dtype=bool) - if selection1_type == 'both': + if selection1_type == "both": # The two selections are separated into three selections: # the original ones without the overlaping part # and one containing the overlap - # This prevents redundant triplets and unnecessary computation + # This prevents redundant triplets and unnecessary computation overlap_selection = selection1 & selection2 # Original selections without overlaping part exclusive_selection1 = selection1 & (~overlap_selection) exclusive_selection2 = selection2 & (~overlap_selection) - + # Put selections to list for cleaner iteration - selections = [ - exclusive_selection1, exclusive_selection2, overlap_selection - ] + selections = [exclusive_selection1, exclusive_selection2, overlap_selection] selection_combinations = [ - #(0,0), is not included, would be same selection + # (0,0), is not included, would be same selection # as donor and acceptor simultaneously - (0,1), - (0,2), - (1,0), - #(1,1), # same reason above - 
(1,2), - (2,0), - (2,1), - (2,2) # overlaping part, combination is necessary + (0, 1), + (0, 2), + (1, 0), + # (1,1), # same reason above + (1, 2), + (2, 0), + (2, 1), + (2, 2), # overlaping part, combination is necessary ] - + all_comb_triplets = [] all_comb_mask = [] for selection_index1, selection_index2 in selection_combinations: donor_mask = selections[selection_index1] acceptor_mask = selections[selection_index2] - if np.count_nonzero(donor_mask) != 0 and \ - np.count_nonzero(acceptor_mask) != 0: - # Calculate triplets and mask - triplets, mask = _hbond( - atoms, donor_mask, acceptor_mask, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box - ) - all_comb_triplets.append(triplets) - all_comb_mask.append(mask) + if ( + np.count_nonzero(donor_mask) != 0 + and np.count_nonzero(acceptor_mask) != 0 + ): + # Calculate triplets and mask + triplets, mask = _hbond( + atoms, + donor_mask, + acceptor_mask, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, + ) + all_comb_triplets.append(triplets) + all_comb_mask.append(mask) # Merge results from all combinations triplets = np.concatenate(all_comb_triplets, axis=0) mask = np.concatenate(all_comb_mask, axis=1) - elif selection1_type == 'donor': + elif selection1_type == "donor": triplets, mask = _hbond( - atoms, selection1, selection2, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box + atoms, + selection1, + selection2, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, ) - - elif selection1_type == 'acceptor': + + elif selection1_type == "acceptor": triplets, mask = _hbond( - atoms, selection2, selection1, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box + atoms, + selection2, + selection1, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, ) - + else: raise ValueError(f"Unkown selection type '{selection1_type}'") @@ -228,12 +247,18 
@@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', return triplets, mask -def _hbond(atoms, donor_mask, acceptor_mask, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, box): - +def _hbond( + atoms, + donor_mask, + acceptor_mask, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, +): # Filter donor/acceptor elements - donor_mask &= donor_element_mask + donor_mask &= donor_element_mask acceptor_mask &= acceptor_element_mask first_model_box = box[0] if box is not None else None @@ -254,47 +279,43 @@ def _hbond(atoms, donor_mask, acceptor_mask, if len(donor_h_i) == 0 or len(acceptor_i) == 0: # Return empty triplets and mask return ( - np.zeros((0,3), dtype=int), - np.zeros((atoms.stack_depth(),0), dtype=bool) + np.zeros((0, 3), dtype=int), + np.zeros((atoms.stack_depth(), 0), dtype=bool), ) - + # Narrow the amount of possible acceptor to donor-H connections # down via the distance cutoff parameter using a cell list # Save in acceptor-to-hydrogen matrix # (true when distance smaller than cutoff) coord = atoms.coord - possible_bonds = np.zeros( - (len(acceptor_i), len(donor_h_i)), - dtype=bool - ) + possible_bonds = np.zeros((len(acceptor_i), len(donor_h_i)), dtype=bool) periodic = False if box is None else True for model_i in range(atoms.stack_depth()): donor_h_coord = coord[model_i, donor_h_mask] acceptor_coord = coord[model_i, acceptor_mask] box_for_model = box[model_i] if box is not None else None cell_list = CellList( - donor_h_coord, cell_size=cutoff_dist, - periodic=periodic, box=box_for_model - ) - possible_bonds |= cell_list.get_atoms_in_cells( - acceptor_coord, as_mask=True + donor_h_coord, cell_size=cutoff_dist, periodic=periodic, box=box_for_model ) + possible_bonds |= cell_list.get_atoms_in_cells(acceptor_coord, as_mask=True) possible_bonds_i = np.where(possible_bonds) # Narrow down acceptor_i = acceptor_i[possible_bonds_i[0]] donor_h_i = donor_h_i[possible_bonds_i[1]] - + 
# Build D-H..A triplets donor_i = associated_donor_indices[donor_h_i] triplets = np.stack((donor_i, donor_h_i, acceptor_i), axis=1) # Remove entries where donor and acceptor are the same triplets = triplets[donor_i != acceptor_i] - + hbond_mask = _is_hbond( - coord[:, triplets[:,0]], # donors - coord[:, triplets[:,1]], # donor hydrogens - coord[:, triplets[:,2]], # acceptors - box, cutoff_dist=cutoff_dist, cutoff_angle=cutoff_angle + coord[:, triplets[:, 0]], # donors + coord[:, triplets[:, 1]], # donor hydrogens + coord[:, triplets[:, 2]], # acceptors + box, + cutoff_dist=cutoff_dist, + cutoff_angle=cutoff_angle, ) # Reduce output to contain only triplets counted at least once @@ -311,14 +332,14 @@ def _get_bonded_h(array, donor_mask, bonds): all donors in atoms[donor_mask]. A `BondsList` is used for detecting bonded hydrogen atoms. """ - hydrogen_mask = (array.element == "H") - + hydrogen_mask = array.element == "H" + donor_hydrogen_mask = np.zeros(len(array), dtype=bool) associated_donor_indices = np.full(len(array), -1, dtype=int) all_bond_indices, _ = bonds.get_all_bonds() donor_indices = np.where(donor_mask)[0] - + for donor_i in donor_indices: bonded_indices = all_bond_indices[donor_i] # Remove padding values @@ -327,7 +348,7 @@ def _get_bonded_h(array, donor_mask, bonds): bonded_indices = bonded_indices[hydrogen_mask[bonded_indices]] donor_hydrogen_mask[bonded_indices] = True associated_donor_indices[bonded_indices] = donor_i - + return donor_hydrogen_mask, associated_donor_indices @@ -342,22 +363,20 @@ def _get_bonded_h_via_distance(array, donor_mask, box): coord = array.coord res_id = array.res_id - hydrogen_mask = (array.element == "H") - + hydrogen_mask = array.element == "H" + donor_hydrogen_mask = np.zeros(len(array), dtype=bool) associated_donor_indices = np.full(len(array), -1, dtype=int) donor_indices = np.where(donor_mask)[0] for donor_i in donor_indices: candidate_mask = hydrogen_mask & (res_id == res_id[donor_i]) - distances = distance( - 
coord[donor_i], coord[candidate_mask], box=box - ) + distances = distance(coord[donor_i], coord[candidate_mask], box=box) donor_h_indices = np.where(candidate_mask)[0][distances <= CUTOFF] for i in donor_h_indices: associated_donor_indices[i] = donor_i donor_hydrogen_mask[i] = True - + return donor_hydrogen_mask, associated_donor_indices @@ -378,12 +397,12 @@ def hbond_frequency(mask): The frequency is the amount of models, where the respective bond exists divided by the total amount of models. - + Parameters ---------- mask: ndarray, dtype=bool, shape=(m,n) Input mask obtained from `hbond` function. - + Returns ------- ndarray, dtype=Float @@ -406,4 +425,4 @@ def hbond_frequency(mask): 0.132 0.053 0.026 0.158 0.026 0.868 0.211 0.026 0.921 0.316 0.079 0.237 0.105 0.421 0.079 0.026 1.000 0.053 0.132 0.026 0.184] """ - return mask.sum(axis=0)/len(mask) + return mask.sum(axis=0) / len(mask) diff --git a/src/biotite/structure/info/__init__.py b/src/biotite/structure/info/__init__.py index 4d754a9b8..3c7078ff7 100644 --- a/src/biotite/structure/info/__init__.py +++ b/src/biotite/structure/info/__init__.py @@ -14,8 +14,6 @@ __name__ = "biotite.structure.info" __author__ = "Patrick Kunzmann, Tom David Müller" -from .groups import * - from .atoms import * from .bonds import * from .groups import * diff --git a/src/biotite/structure/info/atoms.py b/src/biotite/structure/info/atoms.py index 9b8f29113..bf9971995 100644 --- a/src/biotite/structure/info/atoms.py +++ b/src/biotite/structure/info/atoms.py @@ -8,7 +8,6 @@ from .ccd import get_ccd - # fmt: off NON_HETERO_RESIDUES = set([ "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", @@ -77,8 +76,6 @@ def residue(res_name): try: component = get_component(get_ccd(), res_name=res_name) except KeyError: - raise KeyError( - f"No atom information found for residue '{res_name}' in CCD" - ) + raise KeyError(f"No atom information found for residue '{res_name}' in CCD") component.hetero[:] = res_name not in 
NON_HETERO_RESIDUES return component diff --git a/src/biotite/structure/info/bonds.py b/src/biotite/structure/info/bonds.py index 421058162..5d6cc90b1 100644 --- a/src/biotite/structure/info/bonds.py +++ b/src/biotite/structure/info/bonds.py @@ -9,15 +9,14 @@ from ..bonds import BondType from .ccd import get_from_ccd - BOND_TYPES = { - ("SING", "N") : BondType.SINGLE, - ("DOUB", "N") : BondType.DOUBLE, - ("TRIP", "N") : BondType.TRIPLE, - ("QUAD", "N") : BondType.QUADRUPLE, - ("SING", "Y") : BondType.AROMATIC_SINGLE, - ("DOUB", "Y") : BondType.AROMATIC_DOUBLE, - ("TRIP", "Y") : BondType.AROMATIC_TRIPLE, + ("SING", "N"): BondType.SINGLE, + ("DOUB", "N"): BondType.DOUBLE, + ("TRIP", "N"): BondType.TRIPLE, + ("QUAD", "N"): BondType.QUADRUPLE, + ("SING", "Y"): BondType.AROMATIC_SINGLE, + ("DOUB", "Y"): BondType.AROMATIC_DOUBLE, + ("TRIP", "Y"): BondType.AROMATIC_TRIPLE, } _intra_bonds = {} @@ -62,8 +61,7 @@ def bond_type(res_name, atom_name1, atom_name2): return None # Try both atom orders bond_type_int = bonds_for_residue.get( - (atom_name1, atom_name2), - bonds_for_residue.get((atom_name2, atom_name1)) + (atom_name1, atom_name2), bonds_for_residue.get((atom_name2, atom_name1)) ) if bond_type_int is not None: return BondType(bond_type_int) @@ -137,7 +135,7 @@ def bonds_in_residue(res_name): chem_comp_bond_dict["atom_id_1"], chem_comp_bond_dict["atom_id_2"], chem_comp_bond_dict["value_order"], - chem_comp_bond_dict["pdbx_aromatic_flag"] + chem_comp_bond_dict["pdbx_aromatic_flag"], ): bond_type = BOND_TYPES[order, aromatic_flag] bonds_for_residue[atom1.item(), atom2.item()] = bond_type diff --git a/src/biotite/structure/info/ccd.py b/src/biotite/structure/info/ccd.py index 8942f59ba..16ec824f5 100644 --- a/src/biotite/structure/info/ccd.py +++ b/src/biotite/structure/info/ccd.py @@ -9,7 +9,6 @@ from pathlib import Path import numpy as np - CCD_DIR = Path(__file__).parent / "ccd" INDEX_COLUMN_NAME = { "chem_comp": "id", @@ -104,7 +103,7 @@ def _index_residues(id_column): 
# The final start is the exclusive stop of last residue residue_starts = np.concatenate(([0], residue_starts, [len(id_column)])) index = {} - for i in range(len(residue_starts)-1): + for i in range(len(residue_starts) - 1): comp_id = id_column[residue_starts[i]].item() - index[comp_id] = (residue_starts[i], residue_starts[i+1]) - return index \ No newline at end of file + index[comp_id] = (residue_starts[i], residue_starts[i + 1]) + return index diff --git a/src/biotite/structure/info/groups.py b/src/biotite/structure/info/groups.py index c719acd3f..781f9c587 100644 --- a/src/biotite/structure/info/groups.py +++ b/src/biotite/structure/info/groups.py @@ -7,8 +7,6 @@ __all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"] from pathlib import Path -import copy - CCD_DIR = Path(__file__).parent / "ccd" @@ -84,4 +82,4 @@ def _get_group_members(group_name): if group_name not in group_lists: with open(CCD_DIR / f"{group_name}.txt", "r") as file: group_lists[group_name] = tuple(file.read().split()) - return group_lists[group_name] \ No newline at end of file + return group_lists[group_name] diff --git a/src/biotite/structure/info/masses.py b/src/biotite/structure/info/masses.py index 73c0b6828..8dc639480 100644 --- a/src/biotite/structure/info/masses.py +++ b/src/biotite/structure/info/masses.py @@ -11,7 +11,6 @@ from ..atoms import Atom, AtomArray, AtomArrayStack from .ccd import get_from_ccd - # Masses are taken from http://www.sbcs.qmul.ac.uk/iupac/AtWt/ (2018/03/01) ATOM_MASSES_FILE = Path(__file__).parent / "atom_masses.json" _atom_masses = None @@ -109,15 +108,11 @@ def mass(item, is_residue=None): elif isinstance(item, Atom): result_mass = mass(item.element, is_residue=False) elif isinstance(item, AtomArray) or isinstance(item, AtomArrayStack): - result_mass = sum( - (mass(element, is_residue=False) for element in item.element) - ) + result_mass = sum((mass(element, is_residue=False) for element in item.element)) else: - raise TypeError( - f"Cannot 
calculate mass for {type(item).__name__} objects" - ) + raise TypeError(f"Cannot calculate mass for {type(item).__name__} objects") if result_mass is None: raise KeyError(f"{item} is not known") - return result_mass \ No newline at end of file + return result_mass diff --git a/src/biotite/structure/info/radii.py b/src/biotite/structure/info/radii.py index e1c2651f7..e3ed202e2 100644 --- a/src/biotite/structure/info/radii.py +++ b/src/biotite/structure/info/radii.py @@ -8,7 +8,6 @@ from .bonds import bonds_in_residue - # fmt: off # Contains tuples for the different ProtOr groups: # Tuple contains: element, valency, H count @@ -115,8 +114,7 @@ def vdw_radius_protor(res_name, atom_name): # Use cached radii for the residue, if already calculated if atom_name not in _protor_radii[res_name]: raise KeyError( - f"Residue '{res_name}' does not contain an atom named " - f"'{atom_name}'" + f"Residue '{res_name}' does not contain an atom named " f"'{atom_name}'" ) return _protor_radii[res_name].get(atom_name) else: @@ -126,6 +124,7 @@ def vdw_radius_protor(res_name, atom_name): # are cached return vdw_radius_protor(res_name, atom_name) + def _calculate_protor_radii(res_name): """ Calculate the ProtOr VdW radii for all atoms (atom names) in @@ -161,8 +160,7 @@ def _calculate_protor_radii(res_name): group[2] += 1 groups[main_atom] = group # Get radii based on ProtOr groups - radii = {atom : _PROTOR_RADII.get(tuple(group)) - for atom, group in groups.items()} + radii = {atom: _PROTOR_RADII.get(tuple(group)) for atom, group in groups.items()} return radii @@ -196,4 +194,4 @@ def vdw_radius_single(element): >>> print(vdw_radius_single("C")) 1.7 """ - return _SINGLE_RADII.get(element.upper()) \ No newline at end of file + return _SINGLE_RADII.get(element.upper()) diff --git a/src/biotite/structure/info/standardize.py b/src/biotite/structure/info/standardize.py index 2b1000265..94a7b3b7b 100644 --- a/src/biotite/structure/info/standardize.py +++ 
b/src/biotite/structure/info/standardize.py @@ -8,9 +8,9 @@ import warnings import numpy as np -from .ccd import get_from_ccd -from ..residues import get_residue_starts from ..error import BadStructureError +from ..residues import get_residue_starts +from .ccd import get_from_ccd def standardize_order(atoms): @@ -116,26 +116,24 @@ def standardize_order(atoms): reordered_indices = np.zeros(atoms.array_length(), dtype=int) starts = get_residue_starts(atoms, add_exclusive_stop=True) - for i in range(len(starts)-1): + for i in range(len(starts) - 1): start = starts[i] - stop = starts[i+1] + stop = starts[i + 1] res_name = atoms.res_name[start] - standard_atom_names = get_from_ccd( - "chem_comp_atom", res_name, "atom_id" - ) + standard_atom_names = get_from_ccd("chem_comp_atom", res_name, "atom_id") if standard_atom_names is None: # If the residue is not in the CCD, keep the current order warnings.warn( f"Residue '{res_name}' is not in the CCD, " f"keeping current atom order" ) - reordered_indices[start : stop] = np.arange(start, stop) + reordered_indices[start:stop] = np.arange(start, stop) continue - reordered_indices[start : stop] = _reorder( - atoms.atom_name[start : stop], standard_atom_names - ) + start + reordered_indices[start:stop] = ( + _reorder(atoms.atom_name[start:stop], standard_atom_names) + start + ) return reordered_indices @@ -164,17 +162,13 @@ def _reorder(origin, target): Indices for `origin` that that changes the order of `origin` to the order of `target`. 
""" - target_hits, origin_hits = np.where( - target[:, np.newaxis] == origin[np.newaxis, :] - ) + target_hits, origin_hits = np.where(target[:, np.newaxis] == origin[np.newaxis, :]) counts = np.bincount(target_hits, minlength=len(target)) if (counts > 1).any(): counts = np.bincount(target_hits, minlength=len(target)) # Identify which atom is duplicate - duplicate_i = np.where( - counts > 1 - )[0][0] + duplicate_i = np.where(counts > 1)[0][0] duplicate_name = target[duplicate_i] raise BadStructureError( f"Input structure has duplicate atom '{duplicate_name}'" @@ -185,12 +179,7 @@ def _reorder(origin, target): # to the target structure # -> Identify which atoms are missing in the target structure # and append these to the end of the residue - missing_atom_mask = np.bincount( - origin_hits, minlength=len(origin) - ).astype(bool) - return np.concatenate([ - origin_hits, - np.where(~missing_atom_mask)[0] - ]) + missing_atom_mask = np.bincount(origin_hits, minlength=len(origin)).astype(bool) + return np.concatenate([origin_hits, np.where(~missing_atom_mask)[0]]) else: - return origin_hits \ No newline at end of file + return origin_hits diff --git a/src/biotite/structure/integrity.py b/src/biotite/structure/integrity.py index 567908fad..0c18560f0 100644 --- a/src/biotite/structure/integrity.py +++ b/src/biotite/structure/integrity.py @@ -9,21 +9,26 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Daniel Bauer" -__all__ = ["check_atom_id_continuity", - "check_res_id_continuity", "check_backbone_continuity", - "check_duplicate_atoms", - "check_linear_continuity"] +__all__ = [ + "check_atom_id_continuity", + "check_res_id_continuity", + "check_backbone_continuity", + "check_duplicate_atoms", + "check_linear_continuity", +] import numpy as np -import warnings -from .filter import ( - filter_peptide_backbone, filter_phosphate_backbone, filter_linear_bond_continuity) from .box import coord_to_fraction +from .filter import ( + filter_linear_bond_continuity, + 
filter_peptide_backbone, + filter_phosphate_backbone, +) def _check_continuity(array): diff = np.diff(array) - discontinuity = np.where( ((diff != 0) & (diff != 1)) ) + discontinuity = np.where(((diff != 0) & (diff != 1))) return discontinuity[0] + 1 @@ -164,8 +169,9 @@ def check_duplicate_atoms(array): The first occurence of an atom is not counted as duplicate. """ duplicates = [] - annots = [array.get_annotation(category) for category - in array.get_annotation_categories()] + annots = [ + array.get_annotation(category) for category in array.get_annotation_categories() + ] for i in range(1, array.array_length()): # Start with assumption that all atoms in the array # until index i are duplicates of the atom at index i @@ -174,7 +180,7 @@ def check_duplicate_atoms(array): # For each annotation array filter out the atoms until # index i that have an unequal annotation # to the atom at index i - is_duplicate &= (annot[:i] == annot[i]) + is_duplicate &= annot[:i] == annot[i] # After checking all annotation arrays, # if there still is any duplicate to the atom at index i, # add i the the list of duplicate atom indices diff --git a/src/biotite/structure/io/__init__.py b/src/biotite/structure/io/__init__.py index 3c3678c0d..510a65cf4 100644 --- a/src/biotite/structure/io/__init__.py +++ b/src/biotite/structure/io/__init__.py @@ -26,4 +26,4 @@ __author__ = "Patrick Kunzmann" from .general import * -from .trajfile import * \ No newline at end of file +from .trajfile import * diff --git a/src/biotite/structure/io/dcd/__init__.py b/src/biotite/structure/io/dcd/__init__.py index aa5e79366..1145f2376 100644 --- a/src/biotite/structure/io/dcd/__init__.py +++ b/src/biotite/structure/io/dcd/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.dcd" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/dcd/file.py b/src/biotite/structure/io/dcd/file.py index 5aa1071f4..81c0ef800 100644 
--- a/src/biotite/structure/io/dcd/file.py +++ b/src/biotite/structure/io/dcd/file.py @@ -7,20 +7,21 @@ __all__ = ["DCDFile"] import numpy as np +from ...box import unitcell_from_vectors, vectors_from_unitcell from ..trajfile import TrajectoryFile -from ...box import vectors_from_unitcell, unitcell_from_vectors class DCDFile(TrajectoryFile): """ This file class represents a DCD trajectory file. """ - + @classmethod def traj_type(cls): import mdtraj.formats as traj + return traj.DCDTrajectoryFile - + @classmethod def process_read_values(cls, read_values): # .netcdf files use Angstrom @@ -28,38 +29,40 @@ def process_read_values(cls, read_values): cell_lengths = read_values[1] cell_angles = read_values[2] if cell_lengths is None or cell_angles is None: - box = None + box = None else: box = np.stack( - [vectors_from_unitcell(a, b, c, alpha, beta, gamma) - for (a, b, c), (alpha, beta, gamma) - in zip(cell_lengths, np.deg2rad(cell_angles))], - axis=0 + [ + vectors_from_unitcell(a, b, c, alpha, beta, gamma) + for (a, b, c), (alpha, beta, gamma) in zip( + cell_lengths, np.deg2rad(cell_angles) + ) + ], + axis=0, ) return coord, box, None - + @classmethod def prepare_write_values(cls, coord, box, time): - xyz = coord.astype(np.float32, copy=False) \ - if coord is not None else None + xyz = coord.astype(np.float32, copy=False) if coord is not None else None if box is None: cell_lengths = None - cell_angles = None + cell_angles = None else: cell_lengths = np.zeros((len(box), 3), dtype=np.float32) - cell_angles = np.zeros((len(box), 3), dtype=np.float32) + cell_angles = np.zeros((len(box), 3), dtype=np.float32) for i, model_box in enumerate(box): a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box) cell_lengths[i] = np.array((a, b, c)) cell_angles[i] = np.rad2deg((alpha, beta, gamma)) return { - "xyz" : xyz, - "cell_lengths" : cell_lengths, - "cell_angles" : cell_angles, + "xyz": xyz, + "cell_lengths": cell_lengths, + "cell_angles": cell_angles, } def set_time(self, 
time): if time is not None: raise NotImplementedError( "This trajectory file does not support writing simulation time" - ) \ No newline at end of file + ) diff --git a/src/biotite/structure/io/general.py b/src/biotite/structure/io/general.py index ba0e0828b..a58c8fb02 100644 --- a/src/biotite/structure/io/general.py +++ b/src/biotite/structure/io/general.py @@ -12,8 +12,8 @@ __all__ = ["load_structure", "save_structure"] import datetime -import os.path import io +import os.path from ..atoms import AtomArrayStack @@ -65,56 +65,59 @@ def load_structure(file_path, template=None, **kwargs): match suffix: case ".pdb": from .pdb import PDBFile + file = PDBFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) case ".pdbqt": from .pdbqt import PDBQTFile + file = PDBQTFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) case ".cif" | ".pdbx": from .pdbx import CIFFile, get_structure + file = CIFFile.read(file_path) array = get_structure(file, **kwargs) return _as_single_model_if_possible(array) case ".bcif": from .pdbx import BinaryCIFFile, get_structure + file = BinaryCIFFile.read(file_path) array = get_structure(file, **kwargs) return _as_single_model_if_possible(array) case ".gro": from .gro import GROFile + file = GROFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) case ".mol": from .mol import MOLFile + file = MOLFile.read(file_path) array = file.get_structure(**kwargs) # MOL and SDF files only contain a single model return array case ".sdf" | ".sd": from .mol import SDFile, get_structure + file = SDFile.read(file_path) array = get_structure(file, **kwargs) return array case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf": if template is None: - raise TypeError( - "Template must be specified for trajectory files" - ) + raise TypeError("Template must be specified for trajectory files") # Filter template for atom ids, if an 
unfiltered template - if ( - "atom_i" in kwargs - and template.shape[-1] != len(kwargs["atom_i"]) - ): + if "atom_i" in kwargs and template.shape[-1] != len(kwargs["atom_i"]): template = template[..., kwargs["atom_i"]] - from .trr import TRRFile - from .xtc import XTCFile - from .tng import TNGFile from .dcd import DCDFile from .netcdf import NetCDFFile + from .tng import TNGFile + from .trr import TRRFile + from .xtc import XTCFile + if suffix == ".trr": traj_file_cls = TRRFile if suffix == ".xtc": @@ -160,48 +163,56 @@ def save_structure(file_path, array, **kwargs): match suffix: case ".pdb": from .pdb import PDBFile + file = PDBFile() file.set_structure(array, **kwargs) file.write(file_path) case ".pdbqt": from .pdbqt import PDBQTFile + file = PDBQTFile() file.set_structure(array, **kwargs) file.write(file_path) case ".cif" | ".pdbx": from .pdbx import CIFFile, set_structure + file = CIFFile() set_structure(file, array, **kwargs) file.write(file_path) case ".bcif": from .pdbx import BinaryCIFFile, set_structure + file = BinaryCIFFile() set_structure(file, array, **kwargs) file.write(file_path) case ".gro": from .gro import GROFile + file = GROFile() file.set_structure(array, **kwargs) file.write(file_path) case ".mol": from .mol import MOLFile + file = MOLFile() file.set_structure(array, **kwargs) file.header = _mol_header() file.write(file_path) case ".sdf" | ".sd": from .mol import SDFile, SDRecord, set_structure + record = SDRecord() record.set_structure(array, **kwargs) record.header = _mol_header() file = SDFile({"Molecule": record}) file.write(file_path) case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf": - from .trr import TRRFile - from .xtc import XTCFile - from .tng import TNGFile from .dcd import DCDFile from .netcdf import NetCDFFile + from .tng import TNGFile + from .trr import TRRFile + from .xtc import XTCFile + if suffix == ".trr": traj_file_cls = TRRFile if suffix == ".xtc": @@ -229,9 +240,10 @@ def _as_single_model_if_possible(atoms): def 
_mol_header(): from .mol import Header + return Header( mol_name="Molecule", program="Biotite", time=datetime.datetime.now(), dimensions="3D", - ) \ No newline at end of file + ) diff --git a/src/biotite/structure/io/gro/__init__.py b/src/biotite/structure/io/gro/__init__.py index 8d10671b5..e58ccff55 100644 --- a/src/biotite/structure/io/gro/__init__.py +++ b/src/biotite/structure/io/gro/__init__.py @@ -11,4 +11,4 @@ __name__ = "biotite.structure.io.gro" __author__ = "Daniel Bauer" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/gro/file.py b/src/biotite/structure/io/gro/file.py index 188338e50..47040120e 100644 --- a/src/biotite/structure/io/gro/file.py +++ b/src/biotite/structure/io/gro/file.py @@ -6,25 +6,27 @@ __author__ = "Daniel Bauer, Patrick Kunzmann" __all__ = ["GROFile"] +import copy +from datetime import datetime import numpy as np +from ....file import InvalidFileError, TextFile from ...atoms import AtomArray, AtomArrayStack from ...box import is_orthogonal -from ....file import TextFile, InvalidFileError -from ...repair import infer_elements from ...error import BadStructureError -import copy -from datetime import datetime +from ...repair import infer_elements -_atom_records = {"res_id" : (0, 5), - "res_name" : (5,10), - "atom_name" : (10,15), - "atom_id" : (15,20), - "coord_x" : (20, 28), - "coord_y" : (28, 36), - "coord_z" : (36, 44), - "v_x" : (44, 52), - "v_y" : (52, 60), - "v_z" : (60, 68)} +_atom_records = { + "res_id": (0, 5), + "res_name": (5, 10), + "atom_name": (10, 15), + "atom_id": (15, 20), + "coord_x": (20, 28), + "coord_y": (28, 36), + "coord_z": (36, 44), + "v_x": (44, 52), + "v_y": (52, 60), + "v_z": (60, 68), +} class GROFile(TextFile): @@ -48,6 +50,7 @@ class GROFile(TextFile): >>> file.write(os.path.join(path_to_directory, "1l2y_mod.gro")) """ + def get_model_count(self): """ Get the number of models contained in this GRO file. 
@@ -63,7 +66,6 @@ def get_model_count(self): model_count += 1 return model_count - def get_structure(self, model=None): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the @@ -91,9 +93,7 @@ def get_atom_line_i(model_start_i, model_atom_counts): """ Helper function to get the indices of all atoms for a model """ - return np.arange( - model_start_i+1, model_start_i+1+model_atom_counts - ) + return np.arange(model_start_i + 1, model_start_i + 1 + model_atom_counts) def set_box_dimen(box_param): """ @@ -114,33 +114,31 @@ def set_box_dimen(box_param): return None if len(box_param) == 3: x, y, z = box_param - return np.array([[x,0,0], [0,y,0], [0,0,z]], dtype=float) + return np.array([[x, 0, 0], [0, y, 0], [0, 0, z]], dtype=float) elif len(box_param) == 9: x1, y2, z3, x2, x3, y1, y3, z1, z2 = box_param - return np.array( - [[x1,x2,x3], [y1,y2,y3], [z1,z2,z3]], dtype=float - ) + return np.array([[x1, x2, x3], [y1, y2, y3], [z1, z2, z3]], dtype=float) else: raise InvalidFileError( f"Invalid amount of box parameters: {len(box_param)}" ) # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if _is_int(self.lines[i])], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if _is_int(self.lines[i])], dtype=int + ) # Number of atoms in each model - model_atom_counts = np.array( - [int(self.lines[i]) for i in model_start_i] - ) + model_atom_counts = np.array([int(self.lines[i]) for i in model_start_i]) if model is None: # Check if all models have the same length if np.all(model_atom_counts != model_atom_counts[0]): - raise BadStructureError("The models in the file have unequal " - "amount of atoms, give an explicit " - "model instead") + raise BadStructureError( + "The models in the file have unequal " + "amount of atoms, give an explicit " + "model instead" + ) depth = len(model_start_i) length = model_atom_counts[0] array = AtomArrayStack(depth, length) @@ -159,10 +157,10 @@ def 
set_box_dimen(box_param): f"the given model {model} does not exist" ) - length = model_atom_counts[model-1] + length = model_atom_counts[model - 1] array = AtomArray(length) - annot_i = get_atom_line_i(model_start_i[model-1], length) + annot_i = get_atom_line_i(model_start_i[model - 1], length) # Replace empty strings for elements with guessed types # i is index in array, line_i is line index @@ -179,27 +177,25 @@ def set_box_dimen(box_param): for i, line_i in enumerate(atom_i): line = self.lines[line_i] # gro files use nm instead of A - array.coord[i,0] = float(line[20:28])*10 - array.coord[i,1] = float(line[28:36])*10 - array.coord[i,2] = float(line[36:44])*10 + array.coord[i, 0] = float(line[20:28]) * 10 + array.coord[i, 1] = float(line[28:36]) * 10 + array.coord[i, 2] = float(line[36:44]) * 10 # Box is stored in last line (after coordinates) box_i = atom_i[-1] + 1 - box_param = [float(e)*10 for e in self.lines[box_i].split()] + box_param = [float(e) * 10 for e in self.lines[box_i].split()] array.box = set_box_dimen(box_param) elif isinstance(array, AtomArrayStack): for m in range(len(model_start_i)): - atom_i = get_atom_line_i( - model_start_i[m], model_atom_counts[m] - ) + atom_i = get_atom_line_i(model_start_i[m], model_atom_counts[m]) for i, line_i in enumerate(atom_i): line = self.lines[line_i] - array.coord[m,i,0] = float(line[20:28])*10 - array.coord[m,i,1] = float(line[28:36])*10 - array.coord[m,i,2] = float(line[36:44])*10 + array.coord[m, i, 0] = float(line[20:28]) * 10 + array.coord[m, i, 1] = float(line[28:36]) * 10 + array.coord[m, i, 2] = float(line[36:44]) * 10 # Box is stored in last line (after coordinates) box_i = atom_i[-1] + 1 - box_param = [float(e)*10 for e in self.lines[box_i].split()] + box_param = [float(e) * 10 for e in self.lines[box_i].split()] box = set_box_dimen(box_param) # Create a box in the stack if not already existing # and the box is not a dummy @@ -210,7 +206,6 @@ def set_box_dimen(box_param): return array - def 
set_structure(self, array): """ Set the :class:`AtomArray` or :class:`AtomArrayStack` for the @@ -223,6 +218,7 @@ def set_structure(self, array): is given, each array in the stack is saved as separate model. """ + def get_box_dimen(array): """ GRO files have the box dimensions as last line for each @@ -253,10 +249,15 @@ def get_box_dimen(array): else: box = box / 10 box_elements = ( - box[0,0], box[1,1], box[2,2], - box[0,1], box[0,2], - box[1,0], box[1,2], - box[2,0], box[2,1], + box[0, 0], + box[1, 1], + box[2, 2], + box[0, 1], + box[0, 2], + box[1, 0], + box[1, 2], + box[2, 0], + box[2, 1], ) return " ".join([f"{e:>9.5f}" for e in box_elements]) @@ -266,17 +267,11 @@ def get_box_dimen(array): atom_id = np.arange(1, array.array_length() + 1) # Atom IDs are supported up to 99999, # but negative IDs are also possible - gro_atom_id = np.where( - atom_id > 0, - ((atom_id - 1) % 99999) + 1, - atom_id - ) + gro_atom_id = np.where(atom_id > 0, ((atom_id - 1) % 99999) + 1, atom_id) # Residue IDs are supported up to 9999, # but negative IDs are also possible gro_res_id = np.where( - array.res_id > 0, - ((array.res_id - 1) % 99999) + 1, - array.res_id + array.res_id > 0, ((array.res_id - 1) % 99999) + 1, array.res_id ) if isinstance(array, AtomArray): @@ -290,10 +285,14 @@ def get_box_dimen(array): fmt = "{:>5d}{:5s}{:>5s}{:>5d}{:>8.3f}{:>8.3f}{:>8.3f}" for i in range(array.array_length()): # gro format is in nm -> multiply coords by 10 - self.lines[i+2] = fmt.format( - gro_res_id[i], array.res_name[i], array.atom_name[i], - gro_atom_id[i], array.coord[i,0]/10, array.coord[i,1]/10, - array.coord[i,2]/10 + self.lines[i + 2] = fmt.format( + gro_res_id[i], + array.res_name[i], + array.atom_name[i], + gro_atom_id[i], + array.coord[i, 0] / 10, + array.coord[i, 1] / 10, + array.coord[i, 2] / 10, ) # Write box lines self.lines[-1] = get_box_dimen(array) @@ -304,10 +303,11 @@ def get_box_dimen(array): # Therefore template lines are created # which are afterwards applied for each 
model templines = [None] * array.array_length() - fmt = '{:>5d}{:5s}{:>5s}{:5d}' + fmt = "{:>5d}{:5s}{:>5s}{:5d}" for i in range(array.array_length()): - templines[i] = fmt.format(gro_res_id[i], array.res_name[i], - array.atom_name[i], gro_atom_id[i]) + templines[i] = fmt.format( + gro_res_id[i], array.res_name[i], array.atom_name[i], gro_atom_id[i] + ) for i in range(array.stack_depth()): self.lines.append( @@ -319,10 +319,11 @@ def get_box_dimen(array): modellines = copy.copy(templines) for j, line in enumerate(modellines): # Insert coordinates - line = (line + "{:>8.3f}{:>8.3f}{:>8.3f}".format( - array.coord[i,j,0]/10, - array.coord[i,j,1]/10, - array.coord[i,j,2]/10)) + line = line + "{:>8.3f}{:>8.3f}{:>8.3f}".format( + array.coord[i, j, 0] / 10, + array.coord[i, j, 1] / 10, + array.coord[i, j, 2] / 10, + ) modellines[j] = line self.lines.extend(modellines) self.lines.append(get_box_dimen(array[i])) @@ -340,4 +341,4 @@ def _is_int(string): int(string) return True except ValueError: - return False \ No newline at end of file + return False diff --git a/src/biotite/structure/io/mol/__init__.py b/src/biotite/structure/io/mol/__init__.py index 9e8ee2097..ba71d85a2 100644 --- a/src/biotite/structure/io/mol/__init__.py +++ b/src/biotite/structure/io/mol/__init__.py @@ -17,4 +17,4 @@ from .convert import * from .header import * from .mol import * -from .sdf import * \ No newline at end of file +from .sdf import * diff --git a/src/biotite/structure/io/mol/convert.py b/src/biotite/structure/io/mol/convert.py index 2961c79c9..e85e04773 100644 --- a/src/biotite/structure/io/mol/convert.py +++ b/src/biotite/structure/io/mol/convert.py @@ -6,9 +6,9 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_structure", "set_structure"] +from ...bonds import BondType from .mol import MOLFile from .sdf import SDFile, SDRecord -from ...bonds import BondType def get_structure(mol_file, record_name=None): @@ -39,8 +39,9 @@ def get_structure(mol_file, record_name=None): return 
record.get_structure() -def set_structure(mol_file, atoms, default_bond_type=BondType.ANY, - version=None, record_name=None): +def set_structure( + mol_file, atoms, default_bond_type=BondType.ANY, version=None, record_name=None +): """ Set the :class:`AtomArray` for the MOL file. @@ -88,9 +89,7 @@ def _get_record(file, record_name): else: return file[record_name] else: - raise TypeError( - f"Unsupported file type '{type(file).__name__}'" - ) + raise TypeError(f"Unsupported file type '{type(file).__name__}'") def _get_or_create_record(file, record_name): @@ -110,6 +109,4 @@ def _get_or_create_record(file, record_name): file[record_name] = record return file[record_name] else: - raise TypeError( - f"Unsupported file type '{type(file).__name__}'" - ) \ No newline at end of file + raise TypeError(f"Unsupported file type '{type(file).__name__}'") diff --git a/src/biotite/structure/io/mol/ctab.py b/src/biotite/structure/io/mol/ctab.py index e8fff5d10..25d045443 100644 --- a/src/biotite/structure/io/mol/ctab.py +++ b/src/biotite/structure/io/mol/ctab.py @@ -12,13 +12,13 @@ __all__ = ["read_structure_from_ctab", "write_structure_to_ctab"] import itertools -import warnings import shlex +import warnings import numpy as np from ....file import InvalidFileError -from ...error import BadStructureError from ...atoms import AtomArray, AtomArrayStack from ...bonds import BondList, BondType +from ...error import BadStructureError BOND_TYPE_MAPPING = { 1: BondType.SINGLE, @@ -84,8 +84,7 @@ def read_structure_from_ctab(ctab_lines): raise InvalidFileError(f"Unknown CTAB version '{unkown_version}'") -def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, - version=None): +def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, version=None): """ Convert an :class:`AtomArray` into a *MDL* connection table (Ctab). 
@@ -124,8 +123,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, """ if isinstance(atoms, AtomArrayStack): raise TypeError( - "An 'AtomArrayStack' was given, " - "but only a single model can be written" + "An 'AtomArrayStack' was given, " "but only a single model can be written" ) if atoms.bonds is None: raise BadStructureError("Input AtomArray has no associated BondList") @@ -134,9 +132,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, match version: case None: - if _is_v2000_compatible( - atoms.array_length(), atoms.bonds.get_bond_count() - ): + if _is_v2000_compatible(atoms.array_length(), atoms.bonds.get_bond_count()): return _write_structure_to_ctab_v2000(atoms, default_bond_type) else: return _write_structure_to_ctab_v3000(atoms, default_bond_type) @@ -160,7 +156,8 @@ def _read_structure_from_ctab_v2000(ctab_lines): atom_lines = ctab_lines[1 : 1 + n_atoms] bond_lines = ctab_lines[1 + n_atoms : 1 + n_atoms + n_bonds] charge_lines = [ - line for line in ctab_lines[1 + n_atoms + n_bonds:] + line + for line in ctab_lines[1 + n_atoms + n_bonds :] if line.startswith("M CHG") ] @@ -208,10 +205,9 @@ def _read_structure_from_ctab_v2000(ctab_lines): return atoms + def _read_structure_from_ctab_v3000(ctab_lines): - v30_lines = [ - line[6:].strip() for line in ctab_lines if line.startswith("M V30") - ] + v30_lines = [line[6:].strip() for line in ctab_lines if line.startswith("M V30")] atom_lines = _get_block_v3000(v30_lines, "ATOM") if len(atom_lines) == 0: @@ -262,16 +258,20 @@ def _read_structure_from_ctab_v3000(ctab_lines): return atoms + def _get_version(counts_line): return counts_line[33:39].strip() + def _is_v2000_compatible(n_atoms, n_bonds): # The format uses a maximum of 3 digits for the atom and bond count return n_atoms < 1000 and n_bonds < 1000 + def _get_counts_v2000(counts_line): return int(counts_line[0:3]), int(counts_line[3:6]) + def _get_block_v3000(v30_lines, block_name): block_lines = [] in_block = False @@ 
-282,13 +282,12 @@ def _get_block_v3000(v30_lines, block_name): if in_block: return block_lines else: - raise InvalidFileError( - f"Block '{block_name}' ended before it began" - ) + raise InvalidFileError(f"Block '{block_name}' ended before it began") elif in_block: block_lines.append(line) return block_lines + def create_property_dict_v3000(property_strings): properties = {} for prop in property_strings: @@ -315,7 +314,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): f" {atoms.element[i].capitalize():3}" f"{0:>2}" # Mass difference -> unused f"{CHARGE_MAPPING_REV.get(charge[i], 0):>3d}" - + f"{0:>3d}" * 10 # More unused fields + + f"{0:>3d}" + * 10 # More unused fields for i in range(atoms.array_length()) ] @@ -323,7 +323,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): bond_lines = [ f"{i+1:>3d}{j+1:>3d}" f"{BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value):>3d}" - + f"{0:>3d}" * 4 + + f"{0:>3d}" + * 4 for i, j, bond_type in atoms.bonds.as_array() ] @@ -332,8 +333,7 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): charge_lines = [] # Each `M CHG` line can contain up to 8 charges for batch in _batched( - [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0], - N_CHARGES_PER_LINE + [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0], N_CHARGES_PER_LINE ): charge_lines.append( f"M CHG{len(batch):>3d}" @@ -349,9 +349,7 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type): except AttributeError: charges = np.zeros(atoms.array_length(), dtype=int) - counts_line = ( - f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0" - ) + counts_line = f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0" atom_lines = [ f"{i + 1}" @@ -375,32 +373,35 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type): ] lines = ( - ["BEGIN CTAB"] + - [counts_line] + - ["BEGIN ATOM"] + - atom_lines + - ["END ATOM"] + - ["BEGIN BOND"] + - bond_lines + - ["END BOND"] + - ["END 
CTAB"] + ["BEGIN CTAB"] + + [counts_line] + + ["BEGIN ATOM"] + + atom_lines + + ["END ATOM"] + + ["BEGIN BOND"] + + bond_lines + + ["END BOND"] + + ["END CTAB"] ) # Mark lines as V3000 CTAB lines = ["M V30 " + line for line in lines] return [V2000_COMPATIBILITY_LINE] + lines + ["M END"] + def _to_property(charge): if charge == 0: return "" else: return f"CHG={charge}" + def _quote(string): if " " in string or len(string) == 0: return f'"{string}"' else: return string + def _batched(iterable, n): """ Equivalent to :func:`itertools.batched()`. @@ -411,4 +412,4 @@ def _batched(iterable, n): """ iterator = iter(iterable) while batch := tuple(itertools.islice(iterator, n)): - yield batch \ No newline at end of file + yield batch diff --git a/src/biotite/structure/io/mol/header.py b/src/biotite/structure/io/mol/header.py index 3b4f1b48d..0c459acac 100644 --- a/src/biotite/structure/io/mol/header.py +++ b/src/biotite/structure/io/mol/header.py @@ -6,16 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Header"] -import warnings import datetime +import warnings from dataclasses import dataclass - _DATE_FORMAT = "%m%d%y%H%M" @dataclass -class Header(): +class Header: """ The header for connection tables. 
@@ -70,20 +69,25 @@ def deserialize(text): try: time = datetime.datetime.strptime(time_string, _DATE_FORMAT) except ValueError: - warnings.warn( - f"Invalid time format '{time_string}' in file header" - ) + warnings.warn(f"Invalid time format '{time_string}' in file header") time = None dimensions = lines[1][20:22].strip() scaling_factors = lines[1][22:34].strip() - energy = lines[1][34:46].strip() + energy = lines[1][34:46].strip() registry_number = lines[1][46:52].strip() comments = lines[2].strip() return Header( - mol_name, initials, program, time, dimensions, - scaling_factors, energy, registry_number, comments + mol_name, + initials, + program, + time, + dimensions, + scaling_factors, + energy, + registry_number, + comments, ) def serialize(self): @@ -113,4 +117,4 @@ def serialize(self): return text def __str__(self): - return self.serialize() \ No newline at end of file + return self.serialize() diff --git a/src/biotite/structure/io/mol/mol.py b/src/biotite/structure/io/mol/mol.py index de58cdfb7..682a06d1f 100644 --- a/src/biotite/structure/io/mol/mol.py +++ b/src/biotite/structure/io/mol/mol.py @@ -6,11 +6,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["MOLFile"] -from ....file import TextFile, InvalidFileError +from ....file import InvalidFileError, TextFile +from ...bonds import BondType from .ctab import read_structure_from_ctab, write_structure_to_ctab from .header import Header -from ...bonds import BondType - # Number of header lines N_HEADER = 3 @@ -80,27 +79,23 @@ def __init__(self): self.lines = [""] * N_HEADER self._header = None - @classmethod def read(cls, file): mol_file = super().read(file) mol_file._header = None return mol_file - @property def header(self): if self._header is None: self._header = Header.deserialize("\n".join(self.lines[0:3]) + "\n") return self._header - @header.setter def header(self, header): self._header = header self.lines[0:3] = self._header.serialize().splitlines() - def get_structure(self): """ Get an 
:class:`AtomArray` from the MOL file. @@ -118,9 +113,7 @@ def get_structure(self): raise InvalidFileError("File does not contain structure data") return read_structure_from_ctab(ctab_lines) - - def set_structure(self, atoms, default_bond_type=BondType.ANY, - version=None): + def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None): """ Set the :class:`AtomArray` for the file. @@ -146,9 +139,8 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY, ) - def _get_ctab_lines(lines): for i, line in enumerate(lines): if line.startswith("M END"): - return lines[N_HEADER:i+1] + return lines[N_HEADER : i + 1] return lines[N_HEADER:] diff --git a/src/biotite/structure/io/mol/sdf.py b/src/biotite/structure/io/mol/sdf.py index a2b35096b..aa3aeb8ed 100644 --- a/src/biotite/structure/io/mol/sdf.py +++ b/src/biotite/structure/io/mol/sdf.py @@ -8,16 +8,21 @@ import re import warnings +from collections.abc import Mapping, MutableMapping from dataclasses import dataclass -from collections.abc import MutableMapping, Mapping import numpy as np -from ....file import File, InvalidFileError, is_open_compatible, is_text, \ - DeserializationError, SerializationError -from .ctab import read_structure_from_ctab, write_structure_to_ctab -from .header import Header +from ....file import ( + DeserializationError, + File, + InvalidFileError, + SerializationError, + is_open_compatible, + is_text, +) from ...atoms import AtomArray from ...bonds import BondList, BondType - +from .ctab import read_structure_from_ctab, write_structure_to_ctab +from .header import Header _N_HEADER = 3 # Number of header lines @@ -96,6 +101,7 @@ class Key: number, name, registry_internal, registry_external The same as the parameters. 
""" + # The characters that can be given as input to `name` # First character must be alphanumeric, # following characters may include underscores and periods @@ -103,7 +109,7 @@ class Key: # they are still used in practice and therefore allowed here _NAME_INPUT_REGEX = re.compile(r"^[a-zA-Z0-9][\w.]*$") # These regexes are used to parse the key from a line - _COMPONENT_REGEX = { + _COMPONENT_REGEX = { "number": re.compile(r"^DT(\d+)$"), "name": re.compile(r"^<([a-zA-Z0-9][\w.]*)>$"), "registry_internal": re.compile(r"^(\d+)$"), @@ -162,9 +168,7 @@ def deserialize(text): break else: # There is no matching pattern - raise DeserializationError( - f"Invalid key component '{component}'" - ) + raise DeserializationError(f"Invalid key component '{component}'") return Metadata.Key(**parsed_component_dict) def serialize(self): @@ -190,7 +194,6 @@ def serialize(self): def __str__(self): return self.serialize() - def __init__(self, metadata=None): if metadata is None: metadata = {} @@ -222,9 +225,7 @@ def deserialize(text): current_value = None else: if current_key is None: - raise DeserializationError( - "Value found before metadata key" - ) + raise DeserializationError("Value found before metadata key") if current_value is None: current_value = line else: @@ -483,8 +484,7 @@ def get_structure(self): raise InvalidFileError("File does not contain structure data") return read_structure_from_ctab(ctab_lines) - def set_structure(self, atoms, default_bond_type=BondType.ANY, - version=None): + def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None): """ Set the structural data in the SD record. @@ -505,9 +505,9 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY, By default, ``"V2000"`` is used, unless the number of atoms or bonds exceeds 999, in which case ``"V3000"`` is used. 
""" - self._ctab = _join_with_terminal_newline(write_structure_to_ctab( - atoms, default_bond_type, version - )) + self._ctab = _join_with_terminal_newline( + write_structure_to_ctab(atoms, default_bond_type, version) + ) def __eq__(self, other): if not isinstance(other, type(self)): @@ -736,28 +736,29 @@ def deserialize(text): The content to be deserialized. """ lines = text.splitlines() - record_ends = np.array([ - i for i, line in enumerate(lines) - if line.startswith(_RECORD_DELIMITER) - ], dtype=int) + record_ends = np.array( + [i for i, line in enumerate(lines) if line.startswith(_RECORD_DELIMITER)], + dtype=int, + ) if len(record_ends) == 0: warnings.warn( "Final record delimiter missing, " "maybe this is a MOL file instead of a SD file" ) - record_ends = np.array([len(lines)-1], dtype=int) + record_ends = np.array([len(lines) - 1], dtype=int) # The first record starts at the first line and the last # delimiter is at the end of the file # Records in the middle start directly after the delimiter record_starts = np.concatenate(([0], record_ends[:-1] + 1), dtype=int) record_names = [lines[start].strip() for start in record_starts] - return SDFile({ - # Do not include the delimiter - # -> stop at end (instead of end + 1) - name: _join_with_terminal_newline(lines[start : end]) - for name, start, end - in zip(record_names, record_starts, record_ends) - }) + return SDFile( + { + # Do not include the delimiter + # -> stop at end (instead of end + 1) + name: _join_with_terminal_newline(lines[start:end]) + for name, start, end in zip(record_names, record_starts, record_ends) + } + ) def serialize(self): """ @@ -836,18 +837,14 @@ def __getitem__(self, key): try: record = SDRecord.deserialize(record) except: - raise DeserializationError( - f"Failed to deserialize record '{key}'" - ) + raise DeserializationError(f"Failed to deserialize record '{key}'") # Update with deserialized object self._records[key] = record return record def __setitem__(self, key, record): if not 
isinstance(record, SDRecord): - raise TypeError( - f"Expected 'SDRecord', but got '{type(record).__name__}'" - ) + raise TypeError(f"Expected 'SDRecord', but got '{type(record).__name__}'") # The molecule name in the header is unique across the file record.header.mol_name = key self._records[key] = record @@ -895,22 +892,19 @@ def _to_metadata_key(key): return Metadata.Key(name=key) else: raise TypeError( - "Expected 'Metadata.Key' or str, " - f"but got '{type(key).__name__}'" + "Expected 'Metadata.Key' or str, " f"but got '{type(key).__name__}'" ) def _add_key_value_pair(metadata, key, value): if key is not None: if value is None: - raise DeserializationError( - f"No value found for metadata key {key}" - ) + raise DeserializationError(f"No value found for metadata key {key}") metadata[key] = value def _get_ctab_stop(lines): for i in range(_N_HEADER, len(lines)): if lines[i].startswith("M END"): - return i+1 - return len(lines) \ No newline at end of file + return i + 1 + return len(lines) diff --git a/src/biotite/structure/io/netcdf/__init__.py b/src/biotite/structure/io/netcdf/__init__.py index 9926d405c..085e0c080 100644 --- a/src/biotite/structure/io/netcdf/__init__.py +++ b/src/biotite/structure/io/netcdf/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.netcdf" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/netcdf/file.py b/src/biotite/structure/io/netcdf/file.py index c651657e1..bc46333a5 100644 --- a/src/biotite/structure/io/netcdf/file.py +++ b/src/biotite/structure/io/netcdf/file.py @@ -7,20 +7,21 @@ __all__ = ["NetCDFFile"] import numpy as np +from ...box import unitcell_from_vectors, vectors_from_unitcell from ..trajfile import TrajectoryFile -from ...box import vectors_from_unitcell, unitcell_from_vectors class NetCDFFile(TrajectoryFile): """ This file class represents a NetCDF trajectory file. 
""" - + @classmethod def traj_type(cls): import mdtraj.formats as traj + return traj.NetCDFTrajectoryFile - + @classmethod def process_read_values(cls, read_values): # .dcd files use Angstrom @@ -29,35 +30,36 @@ def process_read_values(cls, read_values): cell_lengths = read_values[2] cell_angles = read_values[3] if cell_lengths is None or cell_angles is None: - box = None + box = None else: box = np.stack( - [vectors_from_unitcell(a, b, c, alpha, beta, gamma) - for (a, b, c), (alpha, beta, gamma) - in zip(cell_lengths, np.deg2rad(cell_angles))], - axis=0 + [ + vectors_from_unitcell(a, b, c, alpha, beta, gamma) + for (a, b, c), (alpha, beta, gamma) in zip( + cell_lengths, np.deg2rad(cell_angles) + ) + ], + axis=0, ) return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): - coord = coord.astype(np.float32, copy=False) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None + coord = coord.astype(np.float32, copy=False) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None if box is None: cell_lengths = None - cell_angles = None + cell_angles = None else: cell_lengths = np.zeros((len(box), 3), dtype=np.float32) - cell_angles = np.zeros((len(box), 3), dtype=np.float32) + cell_angles = np.zeros((len(box), 3), dtype=np.float32) for i, model_box in enumerate(box): a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box) cell_lengths[i] = np.array((a, b, c)) cell_angles[i] = np.rad2deg((alpha, beta, gamma)) return { - "coordinates" : coord, - "time" : time, - "cell_lengths" : cell_lengths, - "cell_angles" : cell_angles, - } \ No newline at end of file + "coordinates": coord, + "time": time, + "cell_lengths": cell_lengths, + "cell_angles": cell_angles, + } diff --git a/src/biotite/structure/io/pdb/__init__.py b/src/biotite/structure/io/pdb/__init__.py index 1dc97904b..687527d69 100644 --- 
a/src/biotite/structure/io/pdb/__init__.py +++ b/src/biotite/structure/io/pdb/__init__.py @@ -16,5 +16,5 @@ __name__ = "biotite.structure.io.pdb" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/structure/io/pdb/convert.py b/src/biotite/structure/io/pdb/convert.py index 7d4bc19dd..127e49fbb 100644 --- a/src/biotite/structure/io/pdb/convert.py +++ b/src/biotite/structure/io/pdb/convert.py @@ -9,8 +9,14 @@ __name__ = "biotite.structure.io.pdb" __author__ = "Patrick Kunzmann" -__all__ = ["get_model_count", "get_structure", "set_structure", - "list_assemblies", "get_assembly", "get_symmetry_mates"] +__all__ = [ + "get_model_count", + "get_structure", + "set_structure", + "list_assemblies", + "get_assembly", + "get_symmetry_mates", +] def get_model_count(pdb_file): @@ -30,8 +36,9 @@ def get_model_count(pdb_file): return pdb_file.get_model_count() -def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], - include_bonds=False): +def get_structure( + pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False +): """ Create an :class:`AtomArray` or :class:`AtomArrayStack` from a :class:`PDBFile`. @@ -39,7 +46,7 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], This function is a thin wrapper around the :class:`PDBFile` method :func:`get_structure()` for the sake of consistency with other ``structure.io`` subpackages. - + Parameters ---------- pdb_file : PDBFile @@ -77,12 +84,12 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], (e.g. especially inter-residue bonds), have :attr:`BondType.ANY`, since the PDB format itself does not support bond orders. - + Returns ------- array : AtomArray or AtomArrayStack The return type depends on the `model` parameter. 
- + """ return pdb_file.get_structure(model, altloc, extra_fields, include_bonds) @@ -95,11 +102,11 @@ def set_structure(pdb_file, array, hybrid36=False): This function is a thin wrapper around the :class:`PDBFile` method :func:`set_structure()` for the sake of consistency with other ``structure.io`` subpackages. - + This will save the coordinates, the mandatory annotation categories and the optional annotation categories 'atom_id', 'b_factor', 'occupancy' and 'charge'. - + Parameters ---------- pdb_file : PDBFile @@ -137,7 +144,7 @@ def list_assemblies(pdb_file): ------- assemblies : list of str A list that contains the available assembly IDs. - + Examples -------- >>> import os.path @@ -148,8 +155,14 @@ def list_assemblies(pdb_file): return pdb_file.list_assemblies() -def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", - extra_fields=[], include_bonds=False): +def get_assembly( + pdb_file, + assembly_id=None, + model=None, + altloc="first", + extra_fields=[], + include_bonds=False, +): """ Build the given biological assembly. @@ -205,7 +218,7 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", assembly : AtomArray or AtomArrayStack The assembly. The return type depends on the `model` parameter. - + Examples -------- @@ -218,8 +231,9 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", ) -def get_symmetry_mates(pdb_file, model=None, altloc="first", - extra_fields=[], include_bonds=False): +def get_symmetry_mates( + pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False +): """ Build a structure model containing all symmetric copies of the structure within a single unit cell, given by the space @@ -274,13 +288,13 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first", symmetry_mates : AtomArray or AtomArrayStack All atoms within a single unit cell. The return type depends on the `model` parameter. 
- + Notes ----- To expand the structure beyond a single unit cell, use :func:`repeat_box()` with the return value as its input. - + Examples -------- @@ -288,6 +302,4 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first", >>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb")) >>> atoms_in_unit_cell = get_symmetry_mates(file, model=1) """ - return pdb_file.get_symmetry_mates( - model, altloc, extra_fields, include_bonds - ) \ No newline at end of file + return pdb_file.get_symmetry_mates(model, altloc, extra_fields, include_bonds) diff --git a/src/biotite/structure/io/pdb/file.py b/src/biotite/structure/io/pdb/file.py index 208f6acfb..8b0d97fd2 100644 --- a/src/biotite/structure/io/pdb/file.py +++ b/src/biotite/structure/io/pdb/file.py @@ -8,20 +8,19 @@ import warnings import numpy as np +from ....file import InvalidFileError, TextFile from ...atoms import AtomArray, AtomArrayStack, repeat from ...bonds import BondList, connect_via_residue_names -from ...box import vectors_from_unitcell, unitcell_from_vectors -from ....file import TextFile, InvalidFileError -from ...repair import infer_elements +from ...box import unitcell_from_vectors, vectors_from_unitcell from ...error import BadStructureError from ...filter import ( filter_first_altloc, filter_highest_occupancy_altloc, filter_solvent, ) +from ...repair import infer_elements from ...util import matrix_rotate -from .hybrid36 import encode_hybrid36, decode_hybrid36, max_hybrid36_number - +from .hybrid36 import decode_hybrid36, encode_hybrid36, max_hybrid36_number _PDB_MAX_ATOMS = 99999 _PDB_MAX_RESIDUES = 9999 @@ -82,6 +81,7 @@ class PDBFile(TextFile): >>> file.set_structure(array_stack_mod) >>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb")) """ + @classmethod def read(cls, file): file = super().read(file) @@ -91,7 +91,6 @@ def read(cls, file): file._index_models_and_atoms() return file - def get_remark(self, number): r""" Get the lines containing the *REMARK* records with the 
given @@ -140,7 +139,8 @@ def get_remark(self, number): remark_string = f"REMARK {number:>3d}" # Find lines and omit ``REMARK XXX `` part remark_lines = [ - line[CONTENT_START_COLUMN:] for line in self.lines + line[CONTENT_START_COLUMN:] + for line in self.lines if line.startswith(remark_string) ] if len(remark_lines) == 0: @@ -149,7 +149,6 @@ def get_remark(self, number): remark_lines = remark_lines[1:] return remark_lines - def get_model_count(self): """ Get the number of models contained in the PDB file. @@ -161,7 +160,6 @@ def get_model_count(self): """ return len(self._model_start_i) - def get_coord(self, model=None): """ Get only the coordinates from the PDB file. @@ -239,21 +237,21 @@ def get_coord(self, model=None): if model is None: coord = np.zeros( (len(self._model_start_i), self._get_model_length(), 3), - dtype=np.float32 + dtype=np.float32, ) m = 0 i = 0 for line_i in self._atom_line_i: if ( - m < len(self._model_start_i)-1 - and line_i > self._model_start_i[m+1] + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] ): m += 1 i = 0 line = self.lines[line_i] - coord[m,i,0] = float(line[_coord_x]) - coord[m,i,1] = float(line[_coord_y]) - coord[m,i,2] = float(line[_coord_z]) + coord[m, i, 0] = float(line[_coord_x]) + coord[m, i, 1] = float(line[_coord_y]) + coord[m, i, 2] = float(line[_coord_z]) i += 1 return coord @@ -262,12 +260,11 @@ def get_coord(self, model=None): coord = np.zeros((len(coord_i), 3), dtype=np.float32) for i, line_i in enumerate(coord_i): line = self.lines[line_i] - coord[i,0] = float(line[_coord_x]) - coord[i,1] = float(line[_coord_y]) - coord[i,2] = float(line[_coord_z]) + coord[i, 0] = float(line[_coord_x]) + coord[i, 1] = float(line[_coord_y]) + coord[i, 2] = float(line[_coord_z]) return coord - def get_b_factor(self, model=None): """ Get only the B-factors from the PDB file. 
@@ -300,20 +297,19 @@ def get_b_factor(self, model=None): """ if model is None: b_factor = np.zeros( - (len(self._model_start_i), self._get_model_length()), - dtype=np.float32 + (len(self._model_start_i), self._get_model_length()), dtype=np.float32 ) m = 0 i = 0 for line_i in self._atom_line_i: if ( - m < len(self._model_start_i)-1 - and line_i > self._model_start_i[m+1] + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] ): m += 1 i = 0 line = self.lines[line_i] - b_factor[m,i] = float(line[_temp_f]) + b_factor[m, i] = float(line[_temp_f]) i += 1 return b_factor @@ -325,9 +321,9 @@ def get_b_factor(self, model=None): b_factor[i] = float(line[_temp_f]) return b_factor - - def get_structure(self, model=None, altloc="first", extra_fields=[], - include_bonds=False): + def get_structure( + self, model=None, altloc="first", extra_fields=[], include_bonds=False + ): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the PDB file. @@ -391,17 +387,17 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], array = AtomArray(len(coord_i)) # Create mandatory and optional annotation arrays - chain_id = np.zeros(array.array_length(), array.chain_id.dtype) - res_id = np.zeros(array.array_length(), array.res_id.dtype) - ins_code = np.zeros(array.array_length(), array.ins_code.dtype) - res_name = np.zeros(array.array_length(), array.res_name.dtype) - hetero = np.zeros(array.array_length(), array.hetero.dtype) + chain_id = np.zeros(array.array_length(), array.chain_id.dtype) + res_id = np.zeros(array.array_length(), array.res_id.dtype) + ins_code = np.zeros(array.array_length(), array.ins_code.dtype) + res_name = np.zeros(array.array_length(), array.res_name.dtype) + hetero = np.zeros(array.array_length(), array.hetero.dtype) atom_name = np.zeros(array.array_length(), array.atom_name.dtype) - element = np.zeros(array.array_length(), array.element.dtype) + element = np.zeros(array.array_length(), array.element.dtype) atom_id_raw = 
np.zeros(array.array_length(), "U5") - charge_raw = np.zeros(array.array_length(), "U2") + charge_raw = np.zeros(array.array_length(), "U2") occupancy = np.zeros(array.array_length(), float) - b_factor = np.zeros(array.array_length(), float) + b_factor = np.zeros(array.array_length(), float) altloc_id = np.zeros(array.array_length(), dtype="U1") # Fill annotation array @@ -425,13 +421,11 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], occupancy[i] = float(line[_occupancy].strip()) b_factor[i] = float(line[_temp_f].strip()) - if include_bonds or \ - (extra_fields is not None and "atom_id" in extra_fields): - # The atom IDs are only required in these two cases - atom_id = np.array( - [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], - dtype=int - ) + if include_bonds or (extra_fields is not None and "atom_id" in extra_fields): + # The atom IDs are only required in these two cases + atom_id = np.array( + [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], dtype=int + ) else: atom_id = None @@ -444,16 +438,16 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], array.atom_name = atom_name array.element = element - for field in (extra_fields if extra_fields is not None else []): + for field in extra_fields if extra_fields is not None else []: if field == "atom_id": # Copy is necessary to avoid double masking in # later altloc ID filtering array.set_annotation("atom_id", atom_id.copy()) elif field == "charge": charge = np.array(charge_raw) - array.set_annotation("charge", np.where( - charge == " ", "0", charge - ).astype(int)) + array.set_annotation( + "charge", np.where(charge == " ", "0", charge).astype(int) + ) elif field == "occupancy": array.set_annotation("occupancy", occupancy) elif field == "b_factor": @@ -485,7 +479,10 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], m = 0 i = 0 for line_i in self._atom_line_i: - if m < len(self._model_start_i)-1 and line_i > 
self._model_start_i[m+1]: + if ( + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] + ): m += 1 i = 0 line = self.lines[line_i] @@ -506,9 +503,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], alpha = np.deg2rad(float(line[_alpha])) beta = np.deg2rad(float(line[_beta])) gamma = np.deg2rad(float(line[_gamma])) - box = vectors_from_unitcell( - len_a, len_b, len_c, alpha, beta, gamma - ) + box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma) except ValueError: # File contains invalid 'CRYST1' record warnings.warn( @@ -526,9 +521,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], # Filter altloc IDs if altloc == "occupancy": - filter = filter_highest_occupancy_altloc( - array, altloc_id, occupancy - ) + filter = filter_highest_occupancy_altloc(array, altloc_id, occupancy) array = array[..., filter] atom_id = atom_id[filter] if atom_id is not None else None elif altloc == "first": @@ -548,7 +541,6 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], return array - def set_structure(self, array, hybrid36=False): """ Set the :class:`AtomArray` or :class:`AtomArrayStack` for the @@ -596,39 +588,42 @@ def set_structure(self, array, hybrid36=False): occupancy = np.char.array(np.full(natoms, " 1.00", dtype="U6")) if "charge" in annot_categories: charge = np.char.array( - [str(np.abs(charge)) + "+" if charge > 0 else - (str(np.abs(charge)) + "-" if charge < 0 else "") - for charge in array.get_annotation("charge")] + [ + str(np.abs(charge)) + "+" + if charge > 0 + else (str(np.abs(charge)) + "-" if charge < 0 else "") + for charge in array.get_annotation("charge") + ] ) else: charge = np.char.array(np.full(natoms, " ", dtype="U2")) if hybrid36: - pdb_atom_id = np.char.array( - [encode_hybrid36(i, 5) for i in atom_id] - ) - pdb_res_id = np.char.array( - [encode_hybrid36(i, 4) for i in array.res_id] - ) + pdb_atom_id = np.char.array([encode_hybrid36(i, 5) for i in 
atom_id]) + pdb_res_id = np.char.array([encode_hybrid36(i, 4) for i in array.res_id]) else: # Atom IDs are supported up to 99999, # but negative IDs are also possible - pdb_atom_id = np.char.array(np.where( - atom_id > 0, - ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, - atom_id - ).astype(str)) + pdb_atom_id = np.char.array( + np.where( + atom_id > 0, ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, atom_id + ).astype(str) + ) # Residue IDs are supported up to 9999, # but negative IDs are also possible - pdb_res_id = np.char.array(np.where( - array.res_id > 0, - ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1, - array.res_id - ).astype(str)) + pdb_res_id = np.char.array( + np.where( + array.res_id > 0, + ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1, + array.res_id, + ).astype(str) + ) names = np.char.array( - [f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm - for atm, elem in zip(array.atom_name, array.element)] + [ + f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm + for atm, elem in zip(array.atom_name, array.element) + ] ) res_names = np.char.array(array.res_name) chain_ids = np.char.array(array.chain_id) @@ -637,17 +632,20 @@ def set_structure(self, array, hybrid36=False): elements = np.char.array(array.element) first_half = ( - record.ljust(6) + - pdb_atom_id.rjust(5) + - spaces + - names.ljust(4) + - spaces + res_names.rjust(3) + spaces + chain_ids + - pdb_res_id.rjust(4) + ins_codes.rjust(1) + record.ljust(6) + + pdb_atom_id.rjust(5) + + spaces + + names.ljust(4) + + spaces + + res_names.rjust(3) + + spaces + + chain_ids + + pdb_res_id.rjust(4) + + ins_codes.rjust(1) ) second_half = ( - occupancy + b_factor + 10 * spaces + - elements.rjust(2) + charge.rjust(2) + occupancy + b_factor + 10 * spaces + elements.rjust(2) + charge.rjust(2) ) coords = array.coord @@ -674,9 +672,10 @@ def set_structure(self, array, hybrid36=False): self.lines.append(f"MODEL {model_num:4}") # Bundle non-coordinate data to simplify iteration self.lines.extend( - [f"{start:27} 
{x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}" - for start, (x, y, z), end in - zip(first_half, coord_i, second_half)] + [ + f"{start:27} {x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}" + for start, (x, y, z), end in zip(first_half, coord_i, second_half) + ] ) if is_stack: self.lines.append("ENDMDL") @@ -688,18 +687,15 @@ def set_structure(self, array, hybrid36=False): hetero_indices = np.where(array.hetero & ~filter_solvent(array))[0] bond_array = array.bonds.as_array() bond_array = bond_array[ - np.isin(bond_array[:,0], hetero_indices) | - np.isin(bond_array[:,1], hetero_indices) | - (array.res_id [bond_array[:,0]] != array.res_id [bond_array[:,1]]) | - (array.chain_id[bond_array[:,0]] != array.chain_id[bond_array[:,1]]) + np.isin(bond_array[:, 0], hetero_indices) + | np.isin(bond_array[:, 1], hetero_indices) + | (array.res_id[bond_array[:, 0]] != array.res_id[bond_array[:, 1]]) + | (array.chain_id[bond_array[:, 0]] != array.chain_id[bond_array[:, 1]]) ] - self._set_bonds( - BondList(array.array_length(), bond_array), pdb_atom_id - ) + self._set_bonds(BondList(array.array_length(), bond_array), pdb_atom_id) self._index_models_and_atoms() - def list_assemblies(self): """ List the biological assemblies that are available for the @@ -727,14 +723,16 @@ def list_assemblies(self): raise InvalidFileError( "File does not contain assembly information (REMARK 300)" ) - return [ - assembly_id.strip() - for assembly_id in remark_lines[0][12:].split(",") - ] - - - def get_assembly(self, assembly_id=None, model=None, altloc="first", - extra_fields=[], include_bonds=False): + return [assembly_id.strip() for assembly_id in remark_lines[0][12:].split(",")] + + def get_assembly( + self, + assembly_id=None, + model=None, + altloc="first", + extra_fields=[], + include_bonds=False, + ): """ Build the given biological assembly. 
@@ -829,18 +827,16 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", if assembly_start_i is None: if assembly_id is None: raise InvalidFileError( - "File does not contain transformation " - "expressions for assemblies" + "File does not contain transformation " "expressions for assemblies" ) else: - raise KeyError( - f"The assembly ID '{assembly_id}' is not found" - ) - assembly_lines = remark_lines[assembly_start_i : assembly_stop_i] + raise KeyError(f"The assembly ID '{assembly_id}' is not found") + assembly_lines = remark_lines[assembly_start_i:assembly_stop_i] # Get transformations for a set of chains chain_set_start_indices = [ - i for i, line in enumerate(assembly_lines) + i + for i, line in enumerate(assembly_lines) if line.startswith("APPLY THE FOLLOWING TO CHAINS") ] # Add exclusive stop at end of records @@ -848,17 +844,17 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", assembly = None for i in range(len(chain_set_start_indices) - 1): start = chain_set_start_indices[i] - stop = chain_set_start_indices[i+1] + stop = chain_set_start_indices[i + 1] # Read affected chain IDs from the following line(s) affected_chain_ids = [] transform_start = None - for j, line in enumerate(assembly_lines[start : stop]): - if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or \ - line.startswith(" AND CHAINS:"): - affected_chain_ids += [ - chain_id.strip() - for chain_id in line[30:].split(",") - ] + for j, line in enumerate(assembly_lines[start:stop]): + if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or line.startswith( + " AND CHAINS:" + ): + affected_chain_ids += [ + chain_id.strip() for chain_id in line[30:].split(",") + ] else: # Chain specification has finished # BIOMT lines start directly after chain specification @@ -866,11 +862,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", break # Parse transformations from BIOMT lines if transform_start is None: - raise InvalidFileError( - "No 'BIOMT' 
records found for chosen assembly" - ) + raise InvalidFileError("No 'BIOMT' records found for chosen assembly") rotations, translations = _parse_transformations( - assembly_lines[transform_start : stop] + assembly_lines[transform_start:stop] ) # Filter affected chains sub_structure = structure[ @@ -888,9 +882,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", return assembly - - def get_symmetry_mates(self, model=None, altloc="first", - extra_fields=[], include_bonds=False): + def get_symmetry_mates( + self, model=None, altloc="first", extra_fields=[], include_bonds=False + ): """ Build a structure model containing all symmetric copies of the structure within a single unit cell, given by the space @@ -971,27 +965,15 @@ def get_symmetry_mates(self, model=None, altloc="first", "File does not contain crystallographic symmetry " "information (REMARK 350)" ) - transform_lines = [ - line for line in remark_lines if line.startswith(" SMTRY") - ] - rotations, translations = _parse_transformations( - transform_lines - ) - return _apply_transformations( - structure, rotations, translations - ) - - - + transform_lines = [line for line in remark_lines if line.startswith(" SMTRY")] + rotations, translations = _parse_transformations(transform_lines) + return _apply_transformations(structure, rotations, translations) def _index_models_and_atoms(self): # Line indices where a new model starts self._model_start_i = np.array( - [ - i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL")) - ], - dtype=int + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, ) if len(self._model_start_i) == 0: # It could be an empty file or a file with a single model, @@ -1005,13 +987,13 @@ def _index_models_and_atoms(self): # Line indices with ATOM or HETATM records self._atom_line_i = np.array( [ - i for i in range(len(self.lines)) + i + for i in range(len(self.lines)) if self.lines[i].startswith(("ATOM", "HETATM")) ], - 
dtype=int + dtype=int, ) - def _get_atom_record_indices_for_model(self, model): last_model = len(self._model_start_i) if model == 0: @@ -1020,12 +1002,11 @@ def _get_atom_record_indices_for_model(self, model): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( - (self._atom_line_i >= self._model_start_i[model-1]) & - (self._atom_line_i < self._model_start_i[model ]) + line_filter = (self._atom_line_i >= self._model_start_i[model - 1]) & ( + self._atom_line_i < self._model_start_i[model] ) elif model == last_model: - line_filter = (self._atom_line_i >= self._model_start_i[model-1]) + line_filter = self._atom_line_i >= self._model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -1033,7 +1014,6 @@ def _get_atom_record_indices_for_model(self, model): ) return self._atom_line_i[line_filter] - def _get_model_length(self): """ Determine length of models and check that all models @@ -1043,11 +1023,13 @@ def _get_model_length(self): length = None for model_i in range(len(self._model_start_i)): model_start = self._model_start_i[model_i] - model_stop = self._model_start_i[model_i+1] \ - if model_i+1 < n_models else len(self.lines) + model_stop = ( + self._model_start_i[model_i + 1] + if model_i + 1 < n_models + else len(self.lines) + ) model_length = np.count_nonzero( - (self._atom_line_i >= model_start) & - (self._atom_line_i < model_stop) + (self._atom_line_i >= model_start) & (self._atom_line_i < model_stop) ) if length is None: length = model_length @@ -1058,26 +1040,22 @@ def _get_model_length(self): ) return length - def _get_bonds(self, atom_ids): - conect_lines = [line for line in self.lines - if line.startswith("CONECT")] + conect_lines = [line for line in self.lines if line.startswith("CONECT")] # Mapping from atom ids to indices in an AtomArray - atom_id_to_index = np.zeros(atom_ids[-1]+1, dtype=int) + atom_id_to_index = np.zeros(atom_ids[-1] + 1, dtype=int) try: for i, id in 
enumerate(atom_ids): atom_id_to_index[id] = i except IndexError as e: - raise InvalidFileError( - "Atom IDs are not strictly increasing" - ) from e + raise InvalidFileError("Atom IDs are not strictly increasing") from e bonds = [] for line in conect_lines: - center_id = atom_id_to_index[decode_hybrid36(line[6 : 11])] + center_id = atom_id_to_index[decode_hybrid36(line[6:11])] for i in range(11, 31, 5): - id_string = line[i : i+5] + id_string = line[i : i + 5] try: id = atom_id_to_index[decode_hybrid36(id_string)] except ValueError: @@ -1089,7 +1067,6 @@ def _get_bonds(self, atom_ids): # is equal to the length of the AtomArray return BondList(len(atom_ids), np.array(bonds, dtype=np.uint32)) - def _set_bonds(self, bond_list, atom_ids): # Bond type is unused since PDB does not support bond orders bonds, _ = bond_list.get_all_bonds() @@ -1136,9 +1113,7 @@ def _parse_transformations(lines): # transformation index) are not used transformations = [float(e) for e in line.split()[2:]] if len(transformations) != 4: - raise InvalidFileError( - "Invalid number of transformation vector elements" - ) + raise InvalidFileError("Invalid number of transformation vector elements") rotations[transformation_i, component_i, :] = transformations[:3] translations[transformation_i, component_i] = transformations[3] @@ -1237,4 +1212,4 @@ def _number_of_integer_digits(values): n_digits = 0 n_digits = max(n_digits, len(str(np.min(values)))) n_digits = max(n_digits, len(str(np.max(values)))) - return n_digits \ No newline at end of file + return n_digits diff --git a/src/biotite/structure/io/pdbqt/__init__.py b/src/biotite/structure/io/pdbqt/__init__.py index 6c406636a..ea81ca4fc 100644 --- a/src/biotite/structure/io/pdbqt/__init__.py +++ b/src/biotite/structure/io/pdbqt/__init__.py @@ -11,5 +11,5 @@ __name__ = "biotite.structure.io.pdbqt" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git 
a/src/biotite/structure/io/pdbqt/convert.py b/src/biotite/structure/io/pdbqt/convert.py index ee335ccc6..051339c4f 100644 --- a/src/biotite/structure/io/pdbqt/convert.py +++ b/src/biotite/structure/io/pdbqt/convert.py @@ -18,7 +18,7 @@ def get_structure(pdbqt_file, model=None): PDBQT file. EXPERIMENTAL: Future API changes are probable. - + Parameters ---------- pdbqt_file : PDBQTFile @@ -32,7 +32,7 @@ def get_structure(pdbqt_file, model=None): If this parameter is omitted, an :class:`AtomArrayStack` containing all models will be returned, even if the structure contains only one model. - + Returns ------- array : AtomArray or AtomArrayStack @@ -41,13 +41,20 @@ def get_structure(pdbqt_file, model=None): return pdbqt_file.get_structure(model) -def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, - rotatable_bonds=None, root=None, include_torsdof=True): +def set_structure( + pdbqt_file, + atoms, + charges=None, + atom_types=None, + rotatable_bonds=None, + root=None, + include_torsdof=True, +): """ Write an :class:`AtomArray` into a PDBQT file. EXPERIMENTAL: Future API changes are probable. - + Parameters ---------- pdbqt_file : PDBQTFile @@ -71,7 +78,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, be written. - ``'rigid'`` - The molecule is handled as rigid ligand: Only a ``ROOT`` line will be written. - - ``'all'`` - The molecule is handled as flexible + - ``'all'`` - The molecule is handled as flexible ligand: A ``ROOT`` line will be written and all rotatable bonds are included using ``BRANCH`` and ``ENDBRANCH`` @@ -81,7 +88,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, A ``ROOT`` line will be written and all bonds in the given :class:`BondList` are considered flexible via ``BRANCH`` and ``ENDBRANCH`` lines. - + root : int, optional Specifies the index of the atom following the ``ROOT`` line. 
Setting the root atom is useful for specifying the *anchor* @@ -93,7 +100,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, By default, a ``TORSDOF`` (torsional degrees of freedom) record is written at the end of the file. By setting this parameter to false, the record is omitted. - + Returns ------- mask : ndarray, shape=(n,), dtype=bool @@ -102,6 +109,5 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, hydrogen. """ return pdbqt_file.set_structure( - atoms, charges, atom_types, rotatable_bonds, root, - include_torsdof - ) \ No newline at end of file + atoms, charges, atom_types, rotatable_bonds, root, include_torsdof + ) diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py index 271d4bc69..d4541429c 100644 --- a/src/biotite/structure/io/pdbqt/file.py +++ b/src/biotite/structure/io/pdbqt/file.py @@ -8,17 +8,28 @@ import warnings import numpy as np -from ....file import TextFile, InvalidFileError -from ...error import BadStructureError +from ....file import InvalidFileError, TextFile from ...atoms import AtomArray, AtomArrayStack -from ...charges import partial_charges from ...bonds import BondList, BondType, find_connected, find_rotatable_bonds - +from ...charges import partial_charges +from ...error import BadStructureError PARAMETRIZED_ELEMENTS = [ - "H", "C", "N", "O", "P", "S", - "F", "Cl", "Br", "I", - "Mg", "Ca", "Mn", "Fe", "Zn" + "H", + "C", + "N", + "O", + "P", + "S", + "F", + "Cl", + "Br", + "I", + "Mg", + "Ca", + "Mn", + "Fe", + "Zn", ] @@ -116,13 +127,15 @@ def get_remarks(self, model=None): ``'REMARKS'``. 
""" # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL"))], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, + ) # Line indices with ATOM or HETATM records - remark_line_i = np.array([i for i in range(len(self.lines)) if - self.lines[i].startswith("REMARK")], - dtype=int) + remark_line_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith("REMARK")], + dtype=int, + ) # Structures containing only one model may omit MODEL record # In these cases model starting index is set to 0 if len(model_start_i) == 0: @@ -135,7 +148,7 @@ def get_remarks(self, model=None): remarks = [] for i in range(len(model_start_i) - 1): start = model_start_i[i] - stop = model_start_i[i+1] + stop = model_start_i[i + 1] model_remark_line_i = remark_line_i[ (remark_line_i >= start) & (remark_line_i < stop) ] @@ -152,10 +165,11 @@ def get_remarks(self, model=None): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( ( remark_line_i >= model_start_i[model-1] ) & - ( remark_line_i < model_start_i[model ] ) ) + line_filter = (remark_line_i >= model_start_i[model - 1]) & ( + remark_line_i < model_start_i[model] + ) elif model == last_model: - line_filter = (remark_line_i >= model_start_i[model-1]) + line_filter = remark_line_i >= model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -166,7 +180,6 @@ def get_remarks(self, model=None): # Do not include 'REMARK ' itself -> begin from pos 8 return "\n".join([self.lines[i][7:] for i in remark_line_i]) - def get_structure(self, model=None): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the @@ -190,13 +203,19 @@ def get_structure(self, model=None): The return type depends on the `model` parameter. 
""" # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL"))], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, + ) # Line indices with ATOM or HETATM records - atom_line_i = np.array([i for i in range(len(self.lines)) if - self.lines[i].startswith(("ATOM", "HETATM"))], - dtype=int) + atom_line_i = np.array( + [ + i + for i in range(len(self.lines)) + if self.lines[i].startswith(("ATOM", "HETATM")) + ], + dtype=int, + ) # Structures containing only one model may omit MODEL record # In these cases model starting index is set to 0 if len(model_start_i) == 0: @@ -224,10 +243,11 @@ def get_structure(self, model=None): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( ( atom_line_i >= model_start_i[model-1] ) & - ( atom_line_i < model_start_i[model ] ) ) + line_filter = (atom_line_i >= model_start_i[model - 1]) & ( + atom_line_i < model_start_i[model] + ) elif model == last_model: - line_filter = (atom_line_i >= model_start_i[model-1]) + line_filter = atom_line_i >= model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -237,16 +257,16 @@ def get_structure(self, model=None): array = AtomArray(len(coord_i)) # Save atom IDs for later sorting into the original atom order - atom_id = np.zeros(array.array_length(), int) + atom_id = np.zeros(array.array_length(), int) # Create annotation arrays - chain_id = np.zeros(array.array_length(), array.chain_id.dtype) - res_id = np.zeros(array.array_length(), array.res_id.dtype) - ins_code = np.zeros(array.array_length(), array.ins_code.dtype) - res_name = np.zeros(array.array_length(), array.res_name.dtype) - hetero = np.zeros(array.array_length(), array.hetero.dtype) + chain_id = np.zeros(array.array_length(), array.chain_id.dtype) + res_id = np.zeros(array.array_length(), 
array.res_id.dtype) + ins_code = np.zeros(array.array_length(), array.ins_code.dtype) + res_name = np.zeros(array.array_length(), array.res_name.dtype) + hetero = np.zeros(array.array_length(), array.hetero.dtype) atom_name = np.zeros(array.array_length(), array.atom_name.dtype) - element = np.zeros(array.array_length(), array.element.dtype) + element = np.zeros(array.array_length(), array.element.dtype) # Fill annotation array # i is index in array, line_i is line index @@ -258,7 +278,7 @@ def get_structure(self, model=None): res_id[i] = int(line[22:26]) ins_code[i] = line[26].strip() res_name[i] = line[17:20].strip() - hetero[i] = (False if line[0:4] == "ATOM" else True) + hetero[i] = False if line[0:4] == "ATOM" else True atom_name[i] = line[12:16].strip() element[i] = line[76:78].strip() @@ -275,21 +295,21 @@ def get_structure(self, model=None): if isinstance(array, AtomArray): for i, line_i in enumerate(coord_i): line = self.lines[line_i] - array.coord[i,0] = float(line[30:38]) - array.coord[i,1] = float(line[38:46]) - array.coord[i,2] = float(line[46:54]) + array.coord[i, 0] = float(line[30:38]) + array.coord[i, 1] = float(line[38:46]) + array.coord[i, 2] = float(line[46:54]) elif isinstance(array, AtomArrayStack): m = 0 i = 0 for line_i in atom_line_i: - if m < len(model_start_i)-1 and line_i > model_start_i[m+1]: + if m < len(model_start_i) - 1 and line_i > model_start_i[m + 1]: m += 1 i = 0 line = self.lines[line_i] - array.coord[m,i,0] = float(line[30:38]) - array.coord[m,i,1] = float(line[38:46]) - array.coord[m,i,2] = float(line[46:54]) + array.coord[m, i, 0] = float(line[30:38]) + array.coord[m, i, 1] = float(line[38:46]) + array.coord[m, i, 2] = float(line[46:54]) i += 1 # Sort into the original atom order @@ -297,9 +317,15 @@ def get_structure(self, model=None): return array - - def set_structure(self, atoms, charges=None, atom_types=None, - rotatable_bonds=None, root=None, include_torsdof=True): + def set_structure( + self, + atoms, + charges=None, 
+ atom_types=None, + rotatable_bonds=None, + root=None, + include_torsdof=True, + ): """ Write an :class:`AtomArray` into the PDBQT file. @@ -394,12 +420,8 @@ def set_structure(self, atoms, charges=None, atom_types=None, use_root = True else: if rotatable_bonds.ndim != 2 or rotatable_bonds.shape[1] != 2: - raise ValueError( - "An (nx2) array is expected for rotatable bonds" - ) - rotatable_bonds = BondList( - len(mask), np.asarray(rotatable_bonds) - )[mask] + raise ValueError("An (nx2) array is expected for rotatable bonds") + rotatable_bonds = BondList(len(mask), np.asarray(rotatable_bonds))[mask] use_root = True if root is None: @@ -430,31 +452,47 @@ def set_structure(self, atoms, charges=None, atom_types=None, if "atom_id" in atoms.get_annotation_categories(): atom_id = atoms.atom_id else: - atom_id = np.arange(1, atoms.array_length()+1) + atom_id = np.arange(1, atoms.array_length() + 1) occupancy = np.ones(atoms.array_length()) b_factor = np.zeros(atoms.array_length()) # Convert rotatable bonds into array for easier handling # The bond type is irrelevant from this point on - rotatable_bonds = rotatable_bonds.as_array()[:,:2] + rotatable_bonds = rotatable_bonds.as_array()[:, :2] self.lines = [] self._write_atoms( - atoms, charges, types, - atom_id, hetero, occupancy, b_factor, - root_index, rotatable_bonds, - np.zeros(len(rotatable_bonds), dtype=bool), use_root + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, + root_index, + rotatable_bonds, + np.zeros(len(rotatable_bonds), dtype=bool), + use_root, ) if include_torsdof: self.lines.append(f"TORSDOF {len(rotatable_bonds)}") return mask - - def _write_atoms(self, atoms, charges, types, - atom_id, hetero, occupancy, b_factor, - root_atom, rotatable_bonds, visited_rotatable_bonds, - is_root): + def _write_atoms( + self, + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, + root_atom, + rotatable_bonds, + visited_rotatable_bonds, + is_root, + ): if len(rotatable_bonds) 
!= 0: # Get the indices to atoms of this branch, i.e. a group of # atoms that are connected by non-rotatable bonds @@ -465,9 +503,7 @@ def _write_atoms(self, atoms, charges, types, # the rotatable bond should always be listed first # -> Remove root atom and insert it at the beginning this_branch_indices = np.insert( - this_branch_indices[this_branch_indices != root_atom], - 0, - root_atom + this_branch_indices[this_branch_indices != root_atom], 0, root_atom ) else: # No rotatable bonds @@ -525,18 +561,24 @@ def _write_atoms(self, atoms, charges, types, f"BRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}" ) self._write_atoms( - atoms, charges, types, - atom_id, hetero, occupancy, b_factor, + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, # The root atom of the branch - #is the other atom of the rotatable bond - new_br_i, rotatable_bonds, visited_rotatable_bonds, - False + # is the other atom of the rotatable bond + new_br_i, + rotatable_bonds, + visited_rotatable_bonds, + False, ) self.lines.append( f"ENDBRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}" ) - def _get_model_length(self, model_start_i, atom_line_i): """ Determine length of models and check that all models @@ -546,8 +588,11 @@ def _get_model_length(self, model_start_i, atom_line_i): length = None for model_i in range(len(model_start_i)): model_start = model_start_i[model_i] - model_stop = model_start_i[model_i+1] if model_i+1 < n_models \ - else len(self.lines) + model_stop = ( + model_start_i[model_i + 1] + if model_i + 1 < n_models + else len(self.lines) + ) model_length = np.count_nonzero( (atom_line_i >= model_start) & (atom_line_i < model_stop) ) @@ -613,8 +658,7 @@ def convert_atoms(atoms, charges): ) elif element == "C": if np.isin( - all_bond_types[i], - [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE] + all_bond_types[i], [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE] ).any(): # Aromatic carbon atom_types[i] = "A" @@ -637,4 +681,4 @@ def 
convert_atoms(atoms, charges): atom_types[i] = "H" mask = ~hydrogen_removal_mask - return atoms[mask], charges[mask], atom_types[mask], mask \ No newline at end of file + return atoms[mask], charges[mask], atom_types[mask], mask diff --git a/src/biotite/structure/io/pdbx/__init__.py b/src/biotite/structure/io/pdbx/__init__.py index ccad4ca21..0b3714b48 100644 --- a/src/biotite/structure/io/pdbx/__init__.py +++ b/src/biotite/structure/io/pdbx/__init__.py @@ -15,8 +15,8 @@ __name__ = "biotite.structure.io.pdbx" __author__ = "Patrick Kunzmann" -from .convert import * from .bcif import * from .cif import * from .component import * -from .encoding import * \ No newline at end of file +from .convert import * +from .encoding import * diff --git a/src/biotite/structure/io/pdbx/bcif.py b/src/biotite/structure/io/pdbx/bcif.py index 4b9331ff6..f18364a57 100644 --- a/src/biotite/structure/io/pdbx/bcif.py +++ b/src/biotite/structure/io/pdbx/bcif.py @@ -4,16 +4,25 @@ __name__ = "biotite.structure.io.pdbx" __author__ = "Patrick Kunzmann" -__all__ = ["BinaryCIFFile", "BinaryCIFBlock", "BinaryCIFCategory", - "BinaryCIFColumn", "BinaryCIFData"] +__all__ = [ + "BinaryCIFFile", + "BinaryCIFBlock", + "BinaryCIFCategory", + "BinaryCIFColumn", + "BinaryCIFData", +] from collections.abc import Sequence -import numpy as np import msgpack -from .component import _Component, _HierarchicalContainer, MaskValue -from .encoding import decode_stepwise, encode_stepwise, deserialize_encoding, \ - create_uncompressed_encoding -from ....file import File, is_binary, is_open_compatible, SerializationError +import numpy as np +from ....file import File, SerializationError, is_binary, is_open_compatible +from .component import MaskValue, _Component, _HierarchicalContainer +from .encoding import ( + create_uncompressed_encoding, + decode_stepwise, + deserialize_encoding, + encode_stepwise, +) class BinaryCIFData(_Component): @@ -74,10 +83,7 @@ class BinaryCIFData(_Component): """ def __init__(self, array, 
encoding=None): - if ( - not isinstance(array, (Sequence, np.ndarray)) - or isinstance(array, str) - ): + if not isinstance(array, (Sequence, np.ndarray)) or isinstance(array, str): array = [array] array = np.asarray(array) if np.issubdtype(array.dtype, np.object_): @@ -107,19 +113,13 @@ def supercomponent_class(): @staticmethod def deserialize(content): - encoding = [ - deserialize_encoding(enc) for enc in content["encoding"] - ] - return BinaryCIFData( - decode_stepwise(content["data"], encoding), encoding - ) + encoding = [deserialize_encoding(enc) for enc in content["encoding"]] + return BinaryCIFData(decode_stepwise(content["data"], encoding), encoding) def serialize(self): serialized_data = encode_stepwise(self._array, self._encoding) if not isinstance(serialized_data, bytes): - raise SerializationError( - "Final encoding must return 'bytes'" - ) + raise SerializationError("Final encoding must return 'bytes'") serialized_encoding = [enc.serialize() for enc in self._encoding] return {"data": serialized_data, "encoding": serialized_encoding} @@ -190,8 +190,7 @@ def __init__(self, data, mask=None): mask = BinaryCIFData(mask) if len(data) != len(mask): raise IndexError( - f"Data has length {len(data)}, " - f"but mask has length {len(mask)}" + f"Data has length {len(data)}, " f"but mask has length {len(mask)}" ) self._data = data self._mask = mask @@ -290,9 +289,7 @@ def as_array(self, dtype=None, masked_value=None): array = np.full(len(self._data), masked_value, dtype=dtype) present_mask = self._mask.array == MaskValue.PRESENT - array[present_mask] = ( - self._data.array[present_mask].astype(dtype) - ) + array[present_mask] = self._data.array[present_mask].astype(dtype) return array @staticmethod @@ -300,13 +297,14 @@ def deserialize(content): return BinaryCIFColumn( BinaryCIFData.deserialize(content["data"]), BinaryCIFData.deserialize(content["mask"]) - if content["mask"] is not None else None + if content["mask"] is not None + else None, ) def serialize(self): 
return { "data": self._data.serialize(), - "mask": self._mask.serialize() if self._mask is not None else None + "mask": self._mask.serialize() if self._mask is not None else None, } def __len__(self): @@ -392,10 +390,8 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFCategory( - BinaryCIFCategory._deserialize_elements( - content["columns"], "name" - ), - content["rowCount"] + BinaryCIFCategory._deserialize_elements(content["columns"], "name"), + content["rowCount"], ) def serialize(self): @@ -470,9 +466,7 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFBlock( - BinaryCIFBlock._deserialize_elements( - content["categories"], "name" - ) + BinaryCIFBlock._deserialize_elements(content["categories"], "name") ) def serialize(self): @@ -559,9 +553,7 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFFile( - BinaryCIFFile._deserialize_elements( - content["dataBlocks"], "header" - ) + BinaryCIFFile._deserialize_elements(content["dataBlocks"], "header") ) def serialize(self): @@ -587,18 +579,14 @@ def read(self, file): if is_open_compatible(file): with open(file, "rb") as f: return BinaryCIFFile.deserialize( - msgpack.unpackb( - f.read(), use_list=True, raw=False - ) + msgpack.unpackb(f.read(), use_list=True, raw=False) ) # File object else: if not is_binary(file): raise TypeError("A file opened in 'binary' mode is required") return BinaryCIFFile.deserialize( - msgpack.unpackb( - file.read(), use_list=True, raw=False - ) + msgpack.unpackb(file.read(), use_list=True, raw=False) ) def write(self, file): diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index 6ca9bfe66..c39704fa7 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -10,10 +10,14 @@ import shlex from collections.abc import MutableMapping, Sequence import numpy as np -from .component import _Component, MaskValue -from 
....file import File, is_open_compatible, is_text, DeserializationError, \ - SerializationError - +from ....file import ( + DeserializationError, + File, + SerializationError, + is_open_compatible, + is_text, +) +from .component import MaskValue, _Component UNICODE_CHAR_SIZE = 4 @@ -133,9 +137,7 @@ def __init__(self, data, mask=None): if not isinstance(data, CIFData): data = CIFData(data, str) if mask is None: - mask = np.full( - len(data), MaskValue.PRESENT, dtype=np.uint8 - ) + mask = np.full(len(data), MaskValue.PRESENT, dtype=np.uint8) mask[data.array == "."] = MaskValue.INAPPLICABLE mask[data.array == "?"] = MaskValue.MISSING if np.all(mask == MaskValue.PRESENT): @@ -148,8 +150,7 @@ def __init__(self, data, mask=None): mask = CIFData(mask, np.uint8) if len(mask) != len(data): raise IndexError( - f"Data has length {len(data)}, " - f"but mask has length {len(mask)}" + f"Data has length {len(data)}, " f"but mask has length {len(mask)}" ) self._data = data self._mask = mask @@ -222,9 +223,7 @@ def as_array(self, dtype=str, masked_value=None): elif np.issubdtype(dtype, np.str_): # Limit float precision to 3 decimals if np.issubdtype(self._data.array.dtype, np.floating): - array = np.array( - [f"{e:.3f}" for e in self._data.array], type=dtype - ) + array = np.array([f"{e:.3f}" for e in self._data.array], type=dtype) else: # Copy, as otherwise original data would be overwritten # with mask values @@ -247,9 +246,7 @@ def as_array(self, dtype=str, masked_value=None): array = np.full(len(self._data), masked_value, dtype=dtype) present_mask = self._mask.array == MaskValue.PRESENT - array[present_mask] = ( - self._data.array[present_mask].astype(dtype) - ) + array[present_mask] = self._data.array[present_mask].astype(dtype) return array def __len__(self): @@ -361,9 +358,7 @@ def supercomponent_class(): @staticmethod def deserialize(text, expect_whitespace=True): - lines = [ - line.strip() for line in text.splitlines() if not _is_empty(line) - ] + lines = [line.strip() for 
line in text.splitlines() if not _is_empty(line)] if _is_loop_start(lines[0]): is_looped = True @@ -373,15 +368,11 @@ def deserialize(text, expect_whitespace=True): category_name = _parse_category_name(lines[0]) if category_name is None: - raise DeserializationError( - "Failed to parse category name" - ) + raise DeserializationError("Failed to parse category name") lines = _to_single(lines, is_looped) if is_looped: - category_dict = CIFCategory._deserialize_looped( - lines, expect_whitespace - ) + category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace) else: category_dict = CIFCategory._deserialize_single(lines) return CIFCategory(category_dict, category_name) @@ -511,27 +502,21 @@ def _serialize_single(self): ] def _serialize_looped(self): - key_lines = [ - "_" + self._name + "." + key + " " - for key in self.keys() - ] + key_lines = ["_" + self._name + "." + key + " " for key in self.keys()] column_arrays = [] for column in self.values(): array = column.as_array(str) # Quote before measuring the number of chars, # as the quote characters modify the length - array = np.array( - [_multiline(_quote(element)) for element in array] - ) + array = np.array([_multiline(_quote(element)) for element in array]) column_arrays.append(array) # Number of characters the longest string in the column needs # This can be deduced from the dtype # The "+1" is for the small whitespace column column_n_chars = [ - array.dtype.itemsize // UNICODE_CHAR_SIZE + 1 - for array in column_arrays + array.dtype.itemsize // UNICODE_CHAR_SIZE + 1 for array in column_arrays ] value_lines = [""] * self._row_count for i in range(self._row_count): @@ -615,15 +600,11 @@ def deserialize(text): if is_loop_in_line: # In case of lines with "loop_" the category is # in the next line - category_name_in_line = _parse_category_name( - lines[i + 1] - ) + category_name_in_line = _parse_category_name(lines[i + 1]) current_category_name = category_name_in_line category_starts.append(i) 
category_names.append(current_category_name) - return CIFBlock(_create_element_dict( - lines, category_names, category_starts - )) + return CIFBlock(_create_element_dict(lines, category_names, category_starts)) def serialize(self): text_blocks = [] @@ -659,9 +640,7 @@ def __getitem__(self, key): expect_whitespace = True category = CIFCategory.deserialize(category, expect_whitespace) except: - raise DeserializationError( - f"Failed to deserialize category '{key}'" - ) + raise DeserializationError(f"Failed to deserialize category '{key}'") # Update with deserialized object self._categories[key] = category return category @@ -870,18 +849,14 @@ def __getitem__(self, key): try: block = CIFBlock.deserialize(block) except: - raise DeserializationError( - f"Failed to deserialize block '{key}'" - ) + raise DeserializationError(f"Failed to deserialize block '{key}'") # Update with deserialized object self._blocks[key] = block return block def __setitem__(self, key, block): if not isinstance(block, CIFBlock): - raise TypeError( - f"Expected 'CIFBlock', but got '{type(block).__name__}'" - ) + raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'") self._blocks[key] = block def __delitem__(self, key): @@ -919,7 +894,7 @@ def _create_element_dict(lines, element_names, element_starts): # Lazy deserialization # -> keep as text for now and deserialize later if needed return { - element_name: "\n".join(lines[element_starts[i] : element_starts[i+1]]) + element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) for i, element_name in enumerate(element_names) } diff --git a/src/biotite/structure/io/pdbx/component.py b/src/biotite/structure/io/pdbx/component.py index 76eb0c8da..28e233c64 100644 --- a/src/biotite/structure/io/pdbx/component.py +++ b/src/biotite/structure/io/pdbx/component.py @@ -11,10 +11,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["MaskValue"] -from enum import IntEnum from abc import ABCMeta, abstractmethod from collections.abc 
import MutableMapping -from ....file import SerializationError, DeserializationError +from enum import IntEnum +from ....file import DeserializationError, SerializationError class MaskValue(IntEnum): @@ -29,6 +29,7 @@ class MaskValue(IntEnum): - `MISSING` : For this row the value is missing or unknown (``?`` in *CIF*). """ + PRESENT = 0 INAPPLICABLE = 1 MISSING = 2 @@ -109,8 +110,7 @@ def __str__(self): return str(self.serialize()) -class _HierarchicalContainer(_Component, MutableMapping, - metaclass=ABCMeta): +class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta): """ A container for hierarchical data in BinaryCIF files. For example, the file contains multiple blocks, each block contains @@ -182,9 +182,7 @@ def _serialize_elements(self, store_key_in=None): try: serialized_element = element.serialize() except: - raise SerializationError( - f"Failed to serialize element '{key}'" - ) + raise SerializationError(f"Failed to serialize element '{key}'") else: # Element is already stored in serialized form serialized_element = element @@ -201,9 +199,7 @@ def __getitem__(self, key): try: element = self.subcomponent_class().deserialize(element) except: - raise DeserializationError( - f"Failed to deserialize element '{key}'" - ) + raise DeserializationError(f"Failed to deserialize element '{key}'") # Update container with deserialized object self._elements[key] = element return element @@ -221,9 +217,7 @@ def __setitem__(self, key, element): try: element = self.subcomponent_class().deserialize(element) except: - raise DeserializationError( - f"Failed to deserialize given value" - ) + raise DeserializationError("Failed to deserialize given value") self._elements[key] = element def __delitem__(self, key): diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 712dec0b9..4bd37adf4 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -23,24 +23,29 @@ from 
...atoms import AtomArray, AtomArrayStack, repeat from ...bonds import BondList, BondType, connect_via_residue_names from ...box import unitcell_from_vectors, vectors_from_unitcell +from ...error import BadStructureError from ...filter import filter_first_altloc, filter_highest_occupancy_altloc from ...residues import get_residue_count, get_residue_starts_for -from ...error import BadStructureError from ...util import matrix_rotate +from .bcif import BinaryCIFBlock, BinaryCIFColumn, BinaryCIFFile +from .cif import CIFBlock, CIFFile from .component import MaskValue -from .cif import CIFFile, CIFBlock -from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn from .encoding import StringArrayEncoding - # Cond types in `struct_conn` category that refer to covalent bonds PDBX_COVALENT_TYPES = [ - "covale", "covale_base", "covale_phosphate", "covale_sugar", - "disulf", "modres", "modres_link", "metalc" + "covale", + "covale_base", + "covale_phosphate", + "covale_sugar", + "disulf", + "modres", + "modres_link", + "metalc", ] # Map 'struct_conn' bond orders to 'BondType'... PDBX_BOND_ORDER_TO_TYPE = { - "": BondType.ANY, + "": BondType.ANY, "sing": BondType.SINGLE, "doub": BondType.DOUBLE, "trip": BondType.TRIPLE, @@ -60,13 +65,13 @@ } # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'... 
COMP_BOND_ORDER_TO_TYPE = { - ("SING", "N") : BondType.SINGLE, - ("DOUB", "N") : BondType.DOUBLE, - ("TRIP", "N") : BondType.TRIPLE, - ("QUAD", "N") : BondType.QUADRUPLE, - ("SING", "Y") : BondType.AROMATIC_SINGLE, - ("DOUB", "Y") : BondType.AROMATIC_DOUBLE, - ("TRIP", "Y") : BondType.AROMATIC_TRIPLE, + ("SING", "N"): BondType.SINGLE, + ("DOUB", "N"): BondType.DOUBLE, + ("TRIP", "N"): BondType.TRIPLE, + ("QUAD", "N"): BondType.QUADRUPLE, + ("SING", "Y"): BondType.AROMATIC_SINGLE, + ("DOUB", "Y"): BondType.AROMATIC_DOUBLE, + ("TRIP", "Y"): BondType.AROMATIC_TRIPLE, } # ...and vice versa COMP_BOND_TYPE_TO_ORDER = { @@ -97,16 +102,15 @@ def _filter(category, index): Column = Category.subcomponent_class() Data = Column.subcomponent_class() - return Category({ - key: Column( - Data(column.data.array[index]), - ( - Data(column.mask.array[index]) - if column.mask is not None else None + return Category( + { + key: Column( + Data(column.data.array[index]), + (Data(column.mask.array[index]) if column.mask is not None else None), ) - ) - for key, column in category.items() - }) + for key, column in category.items() + } + ) def get_sequence(pdbx_file, data_block=None): @@ -148,7 +152,7 @@ def get_sequence(pdbx_file, data_block=None): """ block = _get_block(pdbx_file, data_block) - poly_category= block["entity_poly"] + poly_category = block["entity_poly"] seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str) seq_type = poly_category["type"].as_array(str) @@ -158,7 +162,7 @@ def get_sequence(pdbx_file, data_block=None): for string, stype in zip(seq_string, seq_type) ] - strand_ids = poly_category['pdbx_strand_id'].as_array(str) + strand_ids = poly_category["pdbx_strand_id"].as_array(str) strand_ids = [strand_id.split(",") for strand_id in strand_ids] sequence_dict = { @@ -192,14 +196,20 @@ def get_model_count(pdbx_file, data_block=None): The number of models. 
""" block = _get_block(pdbx_file, data_block) - return len(_get_model_starts( - block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32) - )) + return len( + _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)) + ) -def get_structure(pdbx_file, model=None, data_block=None, altloc="first", - extra_fields=None, use_author_fields=True, - include_bonds=False): +def get_structure( + pdbx_file, + model=None, + data_block=None, + altloc="first", + extra_fields=None, + use_author_fields=True, + include_bonds=False, +): """ Create an :class:`AtomArray` or :class:`AtomArrayStack` from the ``atom_site`` category in a file. @@ -310,12 +320,21 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", "instead" ) - atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \ - .reshape((model_count, model_length)) - atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \ - .reshape((model_count, model_length)) - atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \ - .reshape((model_count, model_length)) + atoms.coord[:, :, 0] = ( + atom_site["Cartn_x"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) + atoms.coord[:, :, 1] = ( + atom_site["Cartn_y"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) + atoms.coord[:, :, 2] = ( + atom_site["Cartn_z"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) box = _get_box(block) if box is not None: @@ -345,31 +364,25 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", atoms.box = _get_box(block) # The below part is the same for both, AtomArray and AtomArrayStack - _fill_annotations( - atoms, model_atom_site, extra_fields, use_author_fields - ) + _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields) if include_bonds: if "chem_comp_bond" in block: try: - custom_bond_dict = _parse_intra_residue_bonds( - block["chem_comp_bond"] - ) + custom_bond_dict = 
_parse_intra_residue_bonds(block["chem_comp_bond"]) except KeyError: warnings.warn( "The 'chem_comp_bond' category has missing columns, " "falling back to using Chemical Component Dictionary", - UserWarning + UserWarning, ) custom_bond_dict = None - bonds = connect_via_residue_names( - atoms, custom_bond_dict=custom_bond_dict - ) + bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict) else: bonds = connect_via_residue_names(atoms) if "struct_conn" in block: - bonds = bonds.merge(_parse_inter_residue_bonds( - model_atom_site, block["struct_conn"] - )) + bonds = bonds.merge( + _parse_inter_residue_bonds(model_atom_site, block["struct_conn"]) + ) atoms.bonds = bonds atoms = _filter_altloc(atoms, model_atom_site, altloc) @@ -388,24 +401,24 @@ def _get_block(pdbx_component, block_name): def _get_or_fallback(category, key, fallback_key): - """ - Return column related to key in category if it exists, - otherwise try to get the column related to fallback key. - """ - if key not in category: - warnings.warn( - f"Attribute '{key}' not found within 'atom_site' category. " - f"The fallback attribute '{fallback_key}' will be used instead", - UserWarning - ) - try: - return category[fallback_key] - except KeyError as key_exc: - raise InvalidFileError( - f"Fallback attribute '{fallback_key}' not found within " - "'atom_site' category" - ) from key_exc - return category[key] + """ + Return column related to key in category if it exists, + otherwise try to get the column related to fallback key. + """ + if key not in category: + warnings.warn( + f"Attribute '{key}' not found within 'atom_site' category. 
" + f"The fallback attribute '{fallback_key}' will be used instead", + UserWarning, + ) + try: + return category[fallback_key] + except KeyError as key_exc: + raise InvalidFileError( + f"Fallback attribute '{fallback_key}' not found within " + "'atom_site' category" + ) from key_exc + return category[key] def _fill_annotations(array, atom_site, extra_fields, use_author_fields): @@ -424,78 +437,52 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields): instead of ``label_``. """ - prefix, alt_prefix = ( - ("auth", "label") if use_author_fields else ("label", "auth") - ) + prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth") array.set_annotation( "chain_id", _get_or_fallback( atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id" - ).as_array("U4") + ).as_array("U4"), ) array.set_annotation( "res_id", _get_or_fallback( atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id" - ).as_array(int, -1) - ) - array.set_annotation( - "ins_code", - atom_site["pdbx_PDB_ins_code"].as_array("U1", "") + ).as_array(int, -1), ) + array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array("U1", "")) array.set_annotation( "res_name", _get_or_fallback( atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id" - ).as_array("U5") - ) - array.set_annotation( - "hetero", - atom_site["group_PDB"].as_array(str) == "HETATM" + ).as_array("U5"), ) + array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM") array.set_annotation( "atom_name", _get_or_fallback( atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id" - ).as_array("U6") - ) - array.set_annotation( - "element", - atom_site["type_symbol"].as_array("U2") + ).as_array("U6"), ) + array.set_annotation("element", atom_site["type_symbol"].as_array("U2")) if "atom_id" in extra_fields: - array.set_annotation( - "atom_id", - atom_site["id"].as_array(int) - ) + array.set_annotation("atom_id", atom_site["id"].as_array(int)) extra_fields.remove("atom_id") if 
"b_factor" in extra_fields: - array.set_annotation( - "b_factor", - atom_site["B_iso_or_equiv"].as_array(float) - ) + array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float)) extra_fields.remove("b_factor") if "occupancy" in extra_fields: - array.set_annotation( - "occupancy", - atom_site["occupancy"].as_array(float) - ) + array.set_annotation("occupancy", atom_site["occupancy"].as_array(float)) extra_fields.remove("occupancy") if "charge" in extra_fields: - array.set_annotation( - "charge", - atom_site["pdbx_formal_charge"].as_array(int, 0) - ) + array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0)) extra_fields.remove("charge") # Handle all remaining custom fields for field in extra_fields: - array.set_annotation( - field, - atom_site[field].as_array(str) - ) + array.set_annotation(field, atom_site[field].as_array(str)) def _parse_intra_residue_bonds(chem_comp_bond): @@ -509,7 +496,7 @@ def _parse_intra_residue_bonds(chem_comp_bond): chem_comp_bond["atom_id_1"].as_array(str), chem_comp_bond["atom_id_2"].as_array(str), chem_comp_bond["value_order"].as_array(str), - chem_comp_bond["pdbx_aromatic_flag"].as_array(str) + chem_comp_bond["pdbx_aromatic_flag"].as_array(str), ): if res_name not in custom_bond_dict: custom_bond_dict[res_name] = {} @@ -530,33 +517,32 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): IDENTITY = "1_555" # Columns in 'atom_site' that should be matched by 'struct_conn' COLUMNS = [ - "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id", - "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id", - "pdbx_PDB_ins_code" + "label_asym_id", + "label_comp_id", + "label_seq_id", + "label_atom_id", + "label_alt_id", + "auth_asym_id", + "auth_comp_id", + "auth_seq_id", + "pdbx_PDB_ins_code", ] covale_mask = np.isin( struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES ) if "ptnr1_symmetry" in struct_conn: - covale_mask &= ( - struct_conn["ptnr1_symmetry"].as_array(str, 
IDENTITY) == IDENTITY - ) + covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY if "ptnr2_symmetry" in struct_conn: - covale_mask &= ( - struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY - ) + covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY atom_indices = [None] * 2 for i in range(2): reference_arrays = [] query_arrays = [] for col_name in COLUMNS: - struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1) - if ( - col_name not in atom_site - or struct_conn_col_name not in struct_conn - ): + struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1) + if col_name not in atom_site or struct_conn_col_name not in struct_conn: continue # Ensure both arrays have the same dtype to allow comparison reference = atom_site[col_name].as_array() @@ -593,7 +579,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): return BondList( atom_site.row_count, - np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1) + np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1), ) @@ -603,10 +589,13 @@ def _find_matches(query_arrays, reference_arrays): `reference_arrays` where all query values the reference counterpart. If no match is found for a query, the corresponding index is -1. 
""" - match_masks_for_all_columns = np.stack([ - query[:, np.newaxis] == reference[np.newaxis, :] - for query, reference in zip(query_arrays, reference_arrays) - ], axis=-1) + match_masks_for_all_columns = np.stack( + [ + query[:, np.newaxis] == reference[np.newaxis, :] + for query, reference in zip(query_arrays, reference_arrays) + ], + axis=-1, + ) match_masks = np.all(match_masks_for_all_columns, axis=-1) query_matches, reference_matches = np.where(match_masks) @@ -685,9 +674,7 @@ def _filter_model(atom_site, model_starts, model): Data = Column.subcomponent_class() # Append exclusive stop - model_starts = np.append( - model_starts, [atom_site.row_count] - ) + model_starts = np.append(model_starts, [atom_site.row_count]) # Indexing starts at 0, but model number starts at 1 model_index = model - 1 index = slice(model_starts[model_index], model_starts[model_index + 1]) @@ -773,9 +760,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): # Fill PDBx columns from information # in structures' attribute arrays as good as possible atom_site = Category() - atom_site["group_PDB"] = np.where( - array.hetero, "HETATM", "ATOM" - ) + atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM") atom_site["type_symbol"] = np.copy(array.element) atom_site["label_atom_id"] = np.copy(array.atom_name) atom_site["label_alt_id"] = Column( @@ -789,7 +774,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): atom_site["label_seq_id"] = np.copy(array.res_id) atom_site["pdbx_PDB_ins_code"] = Column( np.copy(array.ins_code), - np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT) + np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT), ) atom_site["auth_seq_id"] = atom_site["label_seq_id"] atom_site["auth_comp_id"] = atom_site["label_comp_id"] @@ -806,11 +791,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): if "charge" in annot_categories: 
atom_site["pdbx_formal_charge"] = Column( np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]), - np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT) + np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT), ) if array.bonds is not None: - struct_conn = _set_inter_residue_bonds(array, atom_site) + struct_conn = _set_inter_residue_bonds(array, atom_site) if struct_conn is not None: block["struct_conn"] = struct_conn if include_bonds: @@ -828,16 +813,12 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0])) atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1])) atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2])) - atom_site["pdbx_PDB_model_num"] = np.ones( - array.array_length(), dtype=np.int32 - ) + atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32) # In case of multiple models repeat annotations # and use model specific coordinates else: atom_site = _repeat(atom_site, array.stack_depth()) - coord = np.reshape( - array.coord, (array.stack_depth() * array.array_length(), 3) - ) + coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3)) atom_site["Cartn_x"] = np.copy(coord[:, 0]) atom_site["Cartn_y"] = np.copy(coord[:, 1]) atom_site["Cartn_z"] = np.copy(coord[:, 2]) @@ -845,11 +826,9 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): np.arange(1, array.stack_depth() + 1, dtype=np.int32), repeats=array.array_length(), ) - if not "atom_id" in annot_categories: + if "atom_id" not in annot_categories: # Count from 1 - atom_site["id"] = np.arange( - 1, len(atom_site["group_PDB"]) + 1 - ) + atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1) block["atom_site"] = atom_site # Write box into file @@ -938,8 +917,11 @@ def _repeat(category, repetitions): data = Data(np.tile(column.data.array, repetitions), data_encoding) else: data = 
Data(np.tile(column.data.array, repetitions)) - mask = Data(np.tile(column.mask.array, repetitions)) \ - if column.mask is not None else None + mask = ( + Data(np.tile(column.mask.array, repetitions)) + if column.mask is not None + else None + ) category_dict[key] = Column(data, mask) return Category(category_dict) @@ -986,22 +968,18 @@ def _set_intra_residue_bonds(array, atom_site): chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]] chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]] chem_comp_bond["value_order"] = Column( - value_order, - np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + value_order, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) ) chem_comp_bond["pdbx_aromatic_flag"] = Column( - aromatic_flag, - np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + aromatic_flag, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) ) # BondList does not contain stereo information # -> all values are missing chem_comp_bond["pdbx_stereo_config"] = Column( np.zeros(len(bond_array), dtype="U1"), - np.full(len(bond_array), MaskValue.MISSING) - ) - chem_comp_bond["pdbx_ordinal"] = np.arange( - 1, len(bond_array) + 1, dtype=np.int32 + np.full(len(bond_array), MaskValue.MISSING), ) + chem_comp_bond["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1, dtype=np.int32) return chem_comp_bond @@ -1013,8 +991,11 @@ def _set_inter_residue_bonds(array, atom_site): ``atom_site`` category. 
""" COLUMNS = [ - "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id", - "pdbx_PDB_ins_code" + "label_asym_id", + "label_comp_id", + "label_seq_id", + "label_atom_id", + "pdbx_PDB_ins_code", ] Category = type(atom_site) @@ -1027,13 +1008,12 @@ def _set_inter_residue_bonds(array, atom_site): struct_conn["id"] = np.arange(1, len(bond_array) + 1) struct_conn["conn_type_id"] = np.full(len(bond_array), "covale") struct_conn["pdbx_value_order"] = Column( - np.array( - [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]] - ), + np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]), np.where( bond_array[:, 2] == BondType.ANY, - MaskValue.MISSING, MaskValue.PRESENT, - ) + MaskValue.MISSING, + MaskValue.PRESENT, + ), ) # Write the identifying annotation... for col_name in COLUMNS: @@ -1041,8 +1021,9 @@ def _set_inter_residue_bonds(array, atom_site): # ...for each bond partner for i in range(2): atom_indices = bond_array[:, i] - struct_conn[_get_struct_conn_col_name(col_name, i+1)] \ - = annot[atom_indices] + struct_conn[_get_struct_conn_col_name(col_name, i + 1)] = annot[ + atom_indices + ] return struct_conn @@ -1054,9 +1035,9 @@ def _filter_bonds(array, connection): bond_array = array.bonds.as_array() # To save computation time call 'get_residue_starts_for()' only once # with indices of the first and second atom of each bond - residue_starts_1, residue_starts_2 = get_residue_starts_for( - array, bond_array[:, :2].flatten() - ).reshape(-1, 2).T + residue_starts_1, residue_starts_2 = ( + get_residue_starts_for(array, bond_array[:, :2].flatten()).reshape(-1, 2).T + ) if connection == "intra": return bond_array[residue_starts_1 == residue_starts_2] elif connection == "inter": @@ -1065,8 +1046,7 @@ def _filter_bonds(array, connection): raise ValueError("Invalid 'connection' option") -def get_component(pdbx_file, data_block=None, use_ideal_coord=True, - res_name=None): +def get_component(pdbx_file, data_block=None, use_ideal_coord=True, 
res_name=None): """ Create an :class:`AtomArray` for a chemical component from the ``chem_comp_atom`` and, if available, the ``chem_comp_bond`` @@ -1166,16 +1146,16 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, coord_fields, alt_coord_fields = alt_coord_fields, coord_fields try: for i, field in enumerate(coord_fields): - array.coord[:,i] = atom_category[field].as_array(np.float32) + array.coord[:, i] = atom_category[field].as_array(np.float32) except KeyError as err: key = err.args[0] warnings.warn( f"Attribute '{key}' not found within 'chem_comp_atom' category. " f"The fallback coordinates will be used instead", - UserWarning + UserWarning, ) for i, field in enumerate(alt_coord_fields): - array.coord[:,i] = atom_category[field].as_array(np.float32) + array.coord[:, i] = atom_category[field].as_array(np.float32) try: bond_category = block["chem_comp_bond"] @@ -1185,9 +1165,8 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, ) except KeyError: warnings.warn( - f"Category 'chem_comp_bond' not found. " - f"No bonds will be parsed", - UserWarning + "Category 'chem_comp_bond' not found. 
" "No bonds will be parsed", + UserWarning, ) else: bonds = BondList(array.array_length()) @@ -1195,7 +1174,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, bond_category["atom_id_1"].as_array(str), bond_category["atom_id_2"].as_array(str), bond_category["value_order"].as_array(str), - bond_category["pdbx_aromatic_flag"].as_array(str) + bond_category["pdbx_aromatic_flag"].as_array(str), ): atom_i = np.where(array.atom_name == atom1)[0][0] atom_j = np.where(array.atom_name == atom2)[0][0] @@ -1237,9 +1216,7 @@ def set_component(pdbx_file, array, data_block=None): Category = block.subcomponent_class() if get_residue_count(array) > 1: - raise BadStructureError( - "The input atom array must comprise only one residue" - ) + raise BadStructureError("The input atom array must comprise only one residue") res_name = array.res_name[0] annot_categories = array.get_annotation_categories() @@ -1262,31 +1239,28 @@ def set_component(pdbx_file, array, data_block=None): atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"] atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"] atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"] - atom_cat["pdbx_ordinal"] = np.arange( - 1, array.array_length() + 1 - ).astype(str) + atom_cat["pdbx_ordinal"] = np.arange(1, array.array_length() + 1).astype(str) block["chem_comp_atom"] = atom_cat if array.bonds is not None and array.bonds.get_bond_count() > 0: bond_array = array.bonds.as_array() order_flags = [] aromatic_flags = [] - for bond_type in bond_array[:,2]: + for bond_type in bond_array[:, 2]: order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type] order_flags.append(order_flag) aromatic_flags.append(aromatic_flag) bond_cat = Category() bond_cat["comp_id"] = np.full(len(bond_array), res_name) - bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]] - bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]] + bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]] + bond_cat["atom_id_2"] = 
array.atom_name[bond_array[:, 1]] bond_cat["value_order"] = np.array(order_flags) bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags) - bond_cat["pdbx_ordinal"] = np.arange( - 1, len(bond_array) + 1 - ).astype(str) + bond_cat["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1).astype(str) block["chem_comp_bond"] = bond_cat + def list_assemblies(pdbx_file, data_block=None): """ List the biological assemblies that are available for the structure @@ -1337,14 +1311,21 @@ def list_assemblies(pdbx_file, data_block=None): id: details for id, details in zip( assembly_category["id"].as_array(str), - assembly_category["details"].as_array(str) + assembly_category["details"].as_array(str), ) } -def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, - altloc="first", extra_fields=None, use_author_fields=True, - include_bonds=False): +def get_assembly( + pdbx_file, + assembly_id=None, + model=None, + data_block=None, + altloc="first", + extra_fields=None, + use_author_fields=True, + include_bonds=False, +): """ Build the given biological assembly. 
@@ -1434,9 +1415,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, try: assembly_gen_category = block["pdbx_struct_assembly_gen"] except KeyError: - raise InvalidFileError( - "File has no 'pdbx_struct_assembly_gen' category" - ) + raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category") try: struct_oper_category = block["pdbx_struct_oper_list"] @@ -1469,7 +1448,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, altloc, extra_fields_and_asym, use_author_fields, - include_bonds + include_bonds, ) ### Get transformations and apply them to the affected asym IDs @@ -1485,9 +1464,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, operations = _parse_operation_expression(op_expr) asym_ids = asym_id_expr.split(",") # Filter affected asym IDs - sub_structure = structure[ - ..., np.isin(structure.label_asym_id, asym_ids) - ] + sub_structure = structure[..., np.isin(structure.label_asym_id, asym_ids)] sub_assembly = _apply_transformations( sub_structure, transformations, operations ) @@ -1546,10 +1523,9 @@ def _get_transformations(struct_oper): for i in (1, 2, 3) ] ) - translation_vector = np.array([ - struct_oper[f"vector[{i}]"].as_array(float)[index] - for i in (1, 2, 3) - ]) + translation_vector = np.array( + [struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)] + ) transformation_dict[id] = (rotation_matrix, translation_vector) return transformation_dict @@ -1604,6 +1580,4 @@ def _convert_string_to_sequence(string, stype): elif stype in _other_type_list: return None else: - raise InvalidFileError( - "mmCIF _entity_poly.type unsupported" " type: " + stype - ) + raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype) diff --git a/src/biotite/structure/io/tng/__init__.py b/src/biotite/structure/io/tng/__init__.py index b344635fd..250b5b3c3 100644 --- a/src/biotite/structure/io/tng/__init__.py +++ 
b/src/biotite/structure/io/tng/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.tng" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/tng/file.py b/src/biotite/structure/io/tng/file.py index 8666ecc39..abf5ed864 100644 --- a/src/biotite/structure/io/tng/file.py +++ b/src/biotite/structure/io/tng/file.py @@ -14,12 +14,13 @@ class TNGFile(TrajectoryFile): """ This file class represents a TNG trajectory file. """ - + @classmethod def traj_type(cls): import mdtraj.formats as traj + return traj.TNGTrajectoryFile - + @classmethod def process_read_values(cls, read_values): # nm to Angstrom @@ -29,18 +30,15 @@ def process_read_values(cls, read_values): box *= 10 time = read_values[1] return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None + xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None + box = np.divide(box, 10, dtype=np.float32) if box is not None else None return { - "xyz" : xyz, - "box" : box, - "time" : time, + "xyz": xyz, + "box": box, + "time": time, } diff --git a/src/biotite/structure/io/trajfile.py b/src/biotite/structure/io/trajfile.py index 23842ea4e..f0ca86404 100644 --- a/src/biotite/structure/io/trajfile.py +++ b/src/biotite/structure/io/trajfile.py @@ -6,18 +6,18 @@ __author__ = "Patrick Kunzmann" __all__ = ["TrajectoryFile"] -import itertools import abc +import itertools import numpy as np -from ..atoms import AtomArray, AtomArrayStack, stack, from_template from ...file import File +from ..atoms import AtomArray, AtomArrayStack, from_template class 
TrajectoryFile(File, metaclass=abc.ABCMeta): """ This file class represents a trajectory file interfacing a trajectory file class from `MDtraj`. - + A trajectory file stores atom coordinates over multiple (time) frames. The file formats are usually binary and involve sometimes heavy compression, so that a large number of frames can be stored @@ -34,27 +34,27 @@ class TrajectoryFile(File, metaclass=abc.ABCMeta): Therefore, it is strongly recommended to make a copy of the respective array, if the array is modified. """ - + def __init__(self): super().__init__() self._coord = None self._time = None self._box = None self._model_count = None - @classmethod - def read(cls, file_name, start=None, stop=None, step=None, - atom_i=None, chunk_size=None): + def read( + cls, file_name, start=None, stop=None, step=None, atom_i=None, chunk_size=None + ): """ Read a trajectory file. - + A trajectory file can be seen as a file representation of an :class:`AtomArrayStack`. Therefore, `start`, `stop` and `step` represent slice parameters of the index of the first dimension and `atom_i` represents an index array for the second dimension. - + Parameters ---------- file_name : str @@ -85,7 +85,7 @@ def read(cls, file_name, start=None, stop=None, step=None, Although lower values can decrease the memory consumption of reading trajectories, they also increase the computation time. 
- + Returns ------- file_object : TrajectoryFile @@ -105,7 +105,6 @@ def read(cls, file_name, start=None, stop=None, step=None, traj_type = cls.traj_type() with traj_type(file_name, "r") as f: - if start is None: start = 0 # Discard atoms before start @@ -116,13 +115,13 @@ def read(cls, file_name, start=None, stop=None, step=None, TrajectoryFile._read_chunk_wise( f, start, None, atom_i, chunk_size, discard=True ) - + # The upcoming frames are saved # Calculate the amount of frames to be read if stop is None: n_frames = None else: - n_frames = stop-start + n_frames = stop - start if step is not None and n_frames is not None: # Divide number of frames by 'step' in order to convert # 'step' into 'stride' @@ -130,7 +129,7 @@ def read(cls, file_name, start=None, stop=None, step=None, # the number of frames is decremented before division # and incremented afterwards again n_frames = ((n_frames - 1) // step) + 1 - + # Read frames if chunk_size is None: result = f.read(n_frames, stride=step, atom_indices=atom_i) @@ -138,7 +137,7 @@ def read(cls, file_name, start=None, stop=None, step=None, result = TrajectoryFile._read_chunk_wise( f, n_frames, step, atom_i, chunk_size, discard=False ) - + # nm to Angstrom coord, box, time = cls.process_read_values(result) file.set_coord(coord) @@ -146,15 +145,15 @@ def read(cls, file_name, start=None, stop=None, step=None, file.set_time(time) return file - @classmethod - def read_iter(cls, file_name, start=None, stop=None, step=None, - atom_i=None, stack_size=None): + def read_iter( + cls, file_name, start=None, stop=None, step=None, atom_i=None, stack_size=None + ): """ Create an iterator over each frame of the given trajectory file in the selected range. - + Parameters ---------- file_name : str @@ -181,7 +180,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, values. If the number of frames is not a multiple of `stack_size`, the final stack is smaller than `stack_size`. 
- + Yields ------ coord : ndarray, dtype=float32, shape=(n,3) or shape=(m,n,3) @@ -190,30 +189,29 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, The box vectors of the current frame or stack. time : float or ndarray, dtype=float32, shape=(n,) or None The simulation time of the current frame or stack in *ps*. - + See also -------- read_iter_structure - + Notes ----- The `step` parameter does currently not work for *DCD* files. """ traj_type = cls.traj_type() with traj_type(file_name, "r") as f: - if start is None: start = 0 # Discard atoms before start if start != 0: f.read(n_frames=start, stride=None, atom_indices=atom_i) - + # The upcoming frames are read # Calculate the amount of frames to be read if stop is None: n_frames = None else: - n_frames = stop-start + n_frames = stop - start if step is not None and n_frames is not None: # Divide number of frames by 'step' in order to convert # 'step' into 'stride' @@ -221,7 +219,6 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, # the number of frames is decremented before division # and incremented afterwards again n_frames = ((n_frames - 1) // step) + 1 - # Read frames if stack_size is None: @@ -242,7 +239,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, yield coord, box, time if remaining_frames is not None: remaining_frames -= 1 - + else: remaining_frames = n_frames while remaining_frames is None or remaining_frames > 0: @@ -260,11 +257,18 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, yield coord, box, time if remaining_frames is not None: remaining_frames -= stack_size - @classmethod - def read_iter_structure(cls, file_name, template, start=None, stop=None, - step=None, atom_i=None, stack_size=None): + def read_iter_structure( + cls, + file_name, + template, + start=None, + stop=None, + step=None, + atom_i=None, + stack_size=None, + ): """ Create an iterator over each frame of the given trajectory file in the selected range. 
@@ -275,8 +279,8 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, information and no topology information, this method requires a template atom array or stack. This template can be acquired for example from a PDB file, which is associated with the - trajectory file. - + trajectory file. + Parameters ---------- file_name : str @@ -306,18 +310,18 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, determined by this parameter. If the number of frames is not a multiple of `stack_size`, the final stack is smaller than `stack_size`. - + Yields ------ structure : AtomArray or AtomArrayStack The structure of the current frame as :class:`AtomArray`. If `stack_size` is set, multiple frames are returned as :class:`AtomArrayStack`. - + See also -------- read_iter - + Notes ----- This iterator creates a new copy of the given template for every @@ -335,7 +339,7 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, f"An 'AtomArray' or 'AtomArrayStack' is expected as template, " f"not '{type(template).__name__}'" ) - + for coord, box, _ in cls.read_iter( file_name, start, stop, step, atom_i, stack_size ): @@ -347,7 +351,6 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, else: yield from_template(template, coord, box) - def write(self, file_name): """ Write the content into a trajectory file. @@ -360,9 +363,8 @@ def write(self, file_name): """ traj_type = self.traj_type() param = self.prepare_write_values(self._coord, self._box, self._time) - with traj_type(file_name, 'w') as f: + with traj_type(file_name, "w") as f: f.write(**param) - @classmethod def write_iter(cls, file_name, coord, box=None, time=None): @@ -376,7 +378,7 @@ def write_iter(cls, file_name, coord, box=None, time=None): Hence, this class method may save a large amount of memory if a large file should be written, if `coord` are provided as generator. 
- + Parameters ---------- file_name : str @@ -399,7 +401,7 @@ def write_iter(cls, file_name, coord, box=None, time=None): time = itertools.repeat(None) traj_type = cls.traj_type() - with traj_type(file_name, 'w') as f: + with traj_type(file_name, "w") as f: for c, b, t in zip(coord, box, time): if c.ndim != 2: raise IndexError( @@ -414,24 +416,22 @@ def write_iter(cls, file_name, coord, box=None, time=None): t = np.expand_dims(t, axis=0) param = cls.prepare_write_values(c, b, t) f.write(**param) - def get_coord(self): """ Extract only the atom coordinates from the trajectory file. - + Returns ------- coord : ndarray, dtype=float, shape=(m,n,3) The coordinates stored in the trajectory file. """ return self._coord - def get_time(self): """ Get the simlation time in *ps* values for each frame. - + Returns ------- time : ndarray, dtype=float, shape=(m,) @@ -439,12 +439,11 @@ def get_time(self): frames, that were read from the file. """ return self._time - def get_box(self): """ Get the box vectors for each frame. - + Returns ------- box : ndarray, dtype=float, shape=(m,3,3) @@ -452,12 +451,11 @@ def get_box(self): frames, that were read from the file. """ return self._box - def set_coord(self, coord): """ Set the atom coordinates in the trajectory file. - + Parameters ---------- coord : ndarray, dtype=float, shape=(m,n,3) @@ -465,12 +463,11 @@ def set_coord(self, coord): """ self._check_model_count(coord) self._coord = coord - def set_time(self, time): """ Set the simulation time of each frame in the trajectory file. - + Parameters ---------- time : ndarray, dtype=float, shape=(m,) @@ -478,13 +475,12 @@ def set_time(self, time): """ self._check_model_count(time) self._time = time - def set_box(self, box): """ Set the periodic box vectors of each frame in the trajectory file. 
- + Parameters ---------- time : ndarray, dtype=float, shape=(m,3,3) @@ -492,25 +488,24 @@ def set_box(self, box): """ self._check_model_count(box) self._box = box - def get_structure(self, template): """ Convert the trajectory file content into an :class:`AtomArrayStack`. - + Since trajectory files usually only contain atom coordinate information and no topology information, this method requires a template atom array or stack. This template can be acquired for example from a PDB file, which is associated with the - trajectory file. - + trajectory file. + Parameters ---------- template : AtomArray or AtomArrayStack The template array or stack, where the atom annotation data is taken from. - + Returns ------- array_stack : AtomArrayStack @@ -519,15 +514,14 @@ def get_structure(self, template): trajectory file. """ return from_template(template, self.get_coord(), self.get_box()) - def set_structure(self, structure, time=None): """ Write an atom array (stack) into the trajectory file object. - + The topology information (chain, residue, etc.) is not saved in the file. - + Parameters ---------- structure : AtomArray or AtomArrayStack @@ -547,34 +541,30 @@ def set_structure(self, structure, time=None): if time is not None: self.set_time(time) - def copy(self): """ This operation is not implemented for trajectory files. - + Raises ------ NotImplementedError """ - raise NotImplementedError("Copying is not implemented " - "for trajectory files") - + raise NotImplementedError("Copying is not implemented " "for trajectory files") @classmethod @abc.abstractmethod def traj_type(cls): """ The `MDtraj` files class to be used. - + PROTECTED: Override when inheriting. - + Returns ------- class An `MDtraj` subclass of :class:`TrajectoryFile`. 
""" pass - @classmethod @abc.abstractmethod @@ -583,15 +573,15 @@ def process_read_values(cls, read_values): Convert the return value of the `read()` method of the respective :class:`mdtraj.TrajectoryFile` into coordinates, simulation box and simulation time. - + PROTECTED: Override when inheriting. - + Parameters ---------- read_values : tuple The return value of the respective :func:`mdtraj.TrajectoryFile.read()` method. - + Returns ------- coord : ndarray, dtype=float, shape=(m,n,3) @@ -602,7 +592,6 @@ def process_read_values(cls, read_values): The simulation time in ps for each frame. """ pass - @classmethod @abc.abstractmethod @@ -622,7 +611,7 @@ def prepare_write_values(cls, coord, box, time): The box vectors in Å for each frame. time : ndarray, dtype=float, shape=(m,) The simulation time in ps for each frame. - + Returns ------- parameters : dict @@ -631,7 +620,6 @@ def prepare_write_values(cls, coord, box, time): """ pass - def _check_model_count(self, array): """ Check if the amount of models in the given array is equal to @@ -650,11 +638,9 @@ def _check_model_count(self, array): f"{len(array)} models were given, " f"but the file contains {self._model_count} models" ) - @staticmethod - def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, - discard=False): + def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, discard=False): """ Similar to :func:`read()`, just for chunk-wise reading of the trajectory. 
@@ -691,7 +677,7 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, chunks.append(chunk) if remaining_frames is not None: remaining_frames -= n - + if not discard: # Assemble the chunks into contiguous arrays # for each value (coord, box, time) @@ -707,4 +693,4 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, result[i] = None return tuple(result) else: - return None \ No newline at end of file + return None diff --git a/src/biotite/structure/io/trr/__init__.py b/src/biotite/structure/io/trr/__init__.py index cf2f0510d..c7ed3f8d9 100644 --- a/src/biotite/structure/io/trr/__init__.py +++ b/src/biotite/structure/io/trr/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.trr" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/trr/file.py b/src/biotite/structure/io/trr/file.py index 435fd6f7a..a9f25f63d 100644 --- a/src/biotite/structure/io/trr/file.py +++ b/src/biotite/structure/io/trr/file.py @@ -14,12 +14,13 @@ class TRRFile(TrajectoryFile): """ This file class represents a TRR trajectory file. 
""" - + @classmethod def traj_type(cls): import mdtraj.formats as traj + return traj.TRRTrajectoryFile - + @classmethod def process_read_values(cls, read_values): # nm to Angstrom @@ -29,18 +30,15 @@ def process_read_values(cls, read_values): box *= 10 time = read_values[1] return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None + xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None + box = np.divide(box, 10, dtype=np.float32) if box is not None else None return { - "xyz" : xyz, - "box" : box, - "time" : time, + "xyz": xyz, + "box": box, + "time": time, } diff --git a/src/biotite/structure/io/xtc/__init__.py b/src/biotite/structure/io/xtc/__init__.py index 5803ef784..5fe71216e 100644 --- a/src/biotite/structure/io/xtc/__init__.py +++ b/src/biotite/structure/io/xtc/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.xtc" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/xtc/file.py b/src/biotite/structure/io/xtc/file.py index 62d9a977f..c7f41e3d4 100644 --- a/src/biotite/structure/io/xtc/file.py +++ b/src/biotite/structure/io/xtc/file.py @@ -14,10 +14,11 @@ class XTCFile(TrajectoryFile): """ This file class represents a XTC trajectory file. 
""" - + @classmethod def traj_type(cls): import mdtraj.formats as traj + return traj.XTCTrajectoryFile @classmethod @@ -29,18 +30,15 @@ def process_read_values(cls, read_values): box *= 10 time = read_values[1] return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None + xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None + box = np.divide(box, 10, dtype=np.float32) if box is not None else None return { - "xyz" : xyz, - "box" : box, - "time" : time, + "xyz": xyz, + "box": box, + "time": time, } diff --git a/src/biotite/structure/mechanics.py b/src/biotite/structure/mechanics.py index d79e23908..cb967d34e 100644 --- a/src/biotite/structure/mechanics.py +++ b/src/biotite/structure/mechanics.py @@ -12,9 +12,6 @@ __all__ = ["mass_center", "gyration_radius"] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import vector_dot, norm_vector -from .error import BadStructureError from .geometry import distance from .info.masses import mass @@ -22,7 +19,7 @@ def gyration_radius(array, masses=None): """ Compute the radius/radii of gyration of an atom array or stack. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -33,7 +30,7 @@ def gyration_radius(array, masses=None): Must have the same length as `array`. By default, the standard atomic mass for each element is taken. 
- + Returns ------- masses : float or ndarray, dtype=float @@ -46,13 +43,14 @@ def gyration_radius(array, masses=None): masses = np.array([mass(element) for element in array.element]) center = mass_center(array, masses) radii = distance(array, center[..., np.newaxis, :]) - inertia_moment = np.sum(masses * radii*radii, axis=-1) + inertia_moment = np.sum(masses * radii * radii, axis=-1) return np.sqrt(inertia_moment / np.sum(masses)) + def mass_center(array, masses=None): """ Calculate the center(s) of mass of an atom array or stack. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -61,7 +59,7 @@ def mass_center(array, masses=None): The masses to use for each atom in the input `array`. Must have the same length as `array`. By default, the standard atomic mass for each element is taken. - + Returns ------- radius : ndarray, ndarray, dtype=float @@ -72,4 +70,4 @@ def mass_center(array, masses=None): """ if masses is None: masses = np.array([mass(element) for element in array.element]) - return np.sum(masses[:,np.newaxis] * array.coord, axis=-2) / np.sum(masses) \ No newline at end of file + return np.sum(masses[:, np.newaxis] * array.coord, axis=-2) / np.sum(masses) diff --git a/src/biotite/structure/molecules.py b/src/biotite/structure/molecules.py index d40920b18..ea94cbd7a 100644 --- a/src/biotite/structure/molecules.py +++ b/src/biotite/structure/molecules.py @@ -244,8 +244,7 @@ def get_molecule_masks(array): molecule_indices = get_molecule_indices(bonds) molecule_masks = np.zeros( - (len(molecule_indices), bonds.get_atom_count()), - dtype=bool + (len(molecule_indices), bonds.get_atom_count()), dtype=bool ) for i in range(len(molecule_indices)): molecule_masks[i, molecule_indices[i]] = True diff --git a/src/biotite/structure/pseudoknots.py b/src/biotite/structure/pseudoknots.py index 36a877a84..2a065f16b 100644 --- a/src/biotite/structure/pseudoknots.py +++ b/src/biotite/structure/pseudoknots.py @@ -10,9 +10,10 @@ __author__ = "Tom David Müller" 
__all__ = ["pseudoknots"] -import numpy as np -import networkx as nx from itertools import chain, product +import networkx as nx +import numpy as np + def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): """ @@ -118,7 +119,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): return np.array([[]], dtype=np.int32) # List containing the results - results = [np.full(len(base_pairs), -1, dtype='int32')] + results = [np.full(len(base_pairs), -1, dtype="int32")] # if no score array is given, each base pairs' score is one if scores is None: @@ -126,9 +127,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): # Make sure `base_pairs` has the same length as the score array if len(base_pairs) != len(scores): - raise ValueError( - "'base_pair' and 'scores' must have the same shape" - ) + raise ValueError("'base_pair' and 'scores' must have the same shape") # Split the base pairs in regions regions = _find_regions(base_pairs, scores) @@ -139,7 +138,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): return np.vstack(results) -class _Region(): +class _Region: """ This class represents a paired region. @@ -159,7 +158,7 @@ class _Region(): The score for each base pair. 
""" - def __init__ (self, base_pairs, region_pairs, scores): + def __init__(self, base_pairs, region_pairs, scores): # The Start and Stop indices for each Region self.start = np.min(base_pairs[region_pairs]) self.stop = np.max(base_pairs[region_pairs]) @@ -245,19 +244,18 @@ def _find_regions(base_pairs, scores): # Check if the current base pair belongs to the region that is # currently being defined - previous_upstream_rank = rank[i-1, 0] + previous_upstream_rank = rank[i - 1, 0] this_upstream_rank = rank[i, 0] - previous_downstream_rank = rank[i-1, 1] + previous_downstream_rank = rank[i - 1, 1] this_downstream_rank = rank[i, 1] # if the current base pair belongs to a new region, save the # current region and start a new region - if ((previous_downstream_rank - this_downstream_rank) != 1 or - (this_upstream_rank - previous_upstream_rank) != 1): - regions.add( - _Region(base_pairs, np.array(region_pairs), scores) - ) - region_pairs = [] + if (previous_downstream_rank - this_downstream_rank) != 1 or ( + this_upstream_rank - previous_upstream_rank + ) != 1: + regions.add(_Region(base_pairs, np.array(region_pairs), scores)) + region_pairs = [] # Append the current base pair to the region region_pairs.append(original_indices[i]) @@ -296,7 +294,7 @@ def _generate_graphical_representation(regions): # Get the region array and a boolean array, where the start of each # region is ``True``. 
region_array, (start_stops,) = _get_region_array_for( - regions, content=[lambda a : [True, False]], dtype=['bool'] + regions, content=[lambda a: [True, False]], dtype=["bool"] ) # Check each region for conflicts with other regions @@ -307,15 +305,15 @@ def _generate_graphical_representation(regions): # Find the index of the stopping of the region in the region # array - stop = _get_first_occurrence_for(region_array[start+1:], region) - stop += (start + 1) + stop = _get_first_occurrence_for(region_array[start + 1 :], region) + stop += start + 1 # Store regions the current region conflicts with conflicts = set() # Iterate over the regions between the starting and stopping # point of the current region - for other_region in region_array[start+1:stop]: + for other_region in region_array[start + 1 : stop]: # If the other region is not already a conflict, add it to # the conflict set if other_region not in conflicts: @@ -389,17 +387,17 @@ def _get_region_array_for(regions, content=[], dtype=[]): The custom output. """ # region_array and index array - region_array = np.empty(len(regions)*2, dtype=_Region) - index_array = np.empty(len(regions)*2, dtype='int32') + region_array = np.empty(len(regions) * 2, dtype=_Region) + index_array = np.empty(len(regions) * 2, dtype="int32") # Content array for custom return arrays - content_list = [None]*len(content) + content_list = [None] * len(content) for i in range(len(content)): - content_list[i] = np.empty(len(regions)*2, dtype=dtype[i]) + content_list[i] = np.empty(len(regions) * 2, dtype=dtype[i]) # Fill the arrays for i, reg in enumerate(regions): - indices = [2*i, 2*i+1] + indices = [2 * i, 2 * i + 1] region_array[indices] = reg for c in range(len(content_list)): content_list[c][indices] = content[c](reg) @@ -443,8 +441,8 @@ def _remove_pseudoknots(regions): represented as ``set`` of unknotted regions. 
""" # Create dynamic programming matrix - dp_matrix_shape = len(regions)*2, len(regions)*2 - dp_matrix = np.empty(dp_matrix_shape, dtype='object') + dp_matrix_shape = len(regions) * 2, len(regions) * 2 + dp_matrix = np.empty(dp_matrix_shape, dtype="object") dp_matrix_solutions_starts = np.zeros_like(dp_matrix) dp_matrix_solutions_stops = np.zeros_like(dp_matrix) @@ -452,9 +450,7 @@ def _remove_pseudoknots(regions): # ``region_array`` contains the region objects and ``start_stops`` # contains the lowest and highest positions of the regions region_array, (start_stops,) = _get_region_array_for( - regions, - [lambda a : (a.start, a.stop)], - ['int32'] + regions, [lambda a: (a.start, a.stop)], ["int32"] ) # Initialise the matrix diagonal with ndarrays of empty frozensets for i in range(len(dp_matrix)): @@ -462,11 +458,11 @@ def _remove_pseudoknots(regions): # Iterate through the top right half of the dynamic programming # matrix - for j in range(len(regions)*2): - for i in range(j-1, -1, -1): + for j in range(len(regions) * 2): + for i in range(j - 1, -1, -1): solution_candidates = set() - left = dp_matrix[i, j-1] - bottom = dp_matrix[i+1, j] + left = dp_matrix[i, j - 1] + bottom = dp_matrix[i + 1, j] # Add all solutions of the cell to the left for solution in left: @@ -474,24 +470,21 @@ def _remove_pseudoknots(regions): # Add all solutions of the cell to the bottom for solution in bottom: - solution_candidates.add(solution) + solution_candidates.add(solution) # Check if i and j are start/end-points of the same region if region_array[i] is region_array[j]: - # Add all solutions from the cell to the bottom left # plus this region - bottom_left = dp_matrix[i+1, j-1] + bottom_left = dp_matrix[i + 1, j - 1] for solution in bottom_left: solution_candidates.add(solution | set([region_array[i]])) # Perform additional tests if solution in the left cell and # bottom cell both differ from an empty solution - if (np.any(left != [frozenset()]) and - np.any(bottom != 
[frozenset()])): - - left_highest = dp_matrix_solutions_stops[i, j-1] - bottom_lowest = dp_matrix_solutions_starts[i+1, j] + if np.any(left != [frozenset()]) and np.any(bottom != [frozenset()]): + left_highest = dp_matrix_solutions_stops[i, j - 1] + bottom_lowest = dp_matrix_solutions_starts[i + 1, j] # For each pair of solutions check if solutions are # disjoint @@ -504,11 +497,11 @@ def _remove_pseudoknots(regions): # Both solutions are not disjoint # Add subsolutions for k in range( - np.where(start_stops==lowest)[0][0]-1, - np.where(start_stops==highest)[0][0]+1 + np.where(start_stops == lowest)[0][0] - 1, + np.where(start_stops == highest)[0][0] + 1, ): cell1 = dp_matrix[i, k] - cell2 = dp_matrix[k+1, j] + cell2 = dp_matrix[k + 1, j] for subsolution1 in cell1: for subsolution2 in cell2: solution_candidates.add( @@ -536,16 +529,12 @@ def _remove_pseudoknots(regions): # Add the solutions to the dynamic programming matrix dp_matrix[i, j] = solution_candidates - solution_starts = np.zeros_like(solution_candidates, dtype='int32') - solution_stops = np.zeros_like(solution_candidates, dtype='int32') + solution_starts = np.zeros_like(solution_candidates, dtype="int32") + solution_stops = np.zeros_like(solution_candidates, dtype="int32") for s, solution in enumerate(solution_candidates): - solution_starts[s] = min( - [reg.start for reg in solution], default=-1 - ) - solution_stops[s] = max( - [reg.stop for reg in solution], default=-1 - ) + solution_starts[s] = min([reg.start for reg in solution], default=-1) + solution_stops[s] = max([reg.stop for reg in solution], default=-1) dp_matrix_solutions_starts[i, j] = solution_starts dp_matrix_solutions_stops[i, j] = solution_stops @@ -586,14 +575,11 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Non-conflicting regions are of the current order: index_list_non_conflicting = list( - chain( - *[region.get_index_array() for region in non_conflicting] - ) - ) + chain(*[region.get_index_array() for region 
in non_conflicting]) + ) for result in results: result[index_list_non_conflicting] = order - # If no conflicts remain, the results are complete if len(regions) == 0: return results @@ -601,9 +587,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Get the optimal solutions for given regions. Evaluate each clique # of mutually conflicting regions seperately cliques = [component for component in nx.connected_components(regions)] - solutions = [set(chain(*e)) for e in product( - *[_remove_pseudoknots(clique) for clique in cliques] - )] + solutions = [ + set(chain(*e)) + for e in product(*[_remove_pseudoknots(clique) for clique in cliques]) + ] # Get a copy of the current results for each optimal solution results_list = [ @@ -612,16 +599,13 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Evaluate each optimal solution for i, solution in enumerate(solutions): - # Get the pseudoknotted regions pseudoknotted_regions = regions.copy() pseudoknotted_regions.remove_nodes_from(solution) # Get an index list of the unknotted base pairs index_list_unknotted = list( - chain( - *[region.get_index_array() for region in solution] - ) + chain(*[region.get_index_array() for region in solution]) ) # Write results for current solution @@ -634,8 +618,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Evaluate the pseudoknotted region results_list[i] = _get_results( - pseudoknotted_regions, results_list[i], - max_pseudoknot_order, order=order+1 + pseudoknotted_regions, + results_list[i], + max_pseudoknot_order, + order=order + 1, ) # Flatten the results diff --git a/src/biotite/structure/rdf.py b/src/biotite/structure/rdf.py index 563cd0ae3..4ba3a4d77 100644 --- a/src/biotite/structure/rdf.py +++ b/src/biotite/structure/rdf.py @@ -12,15 +12,16 @@ from numbers import Integral import numpy as np -from .atoms import Atom, AtomArray, stack, array, coord, AtomArrayStack +from .atoms import AtomArray, coord, stack from .box 
import box_volume +from .celllist import CellList from .geometry import displacement from .util import vector_dot -from .celllist import CellList -def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, - periodic=False): +def rdf( + center, atoms, selection=None, interval=(0, 10), bins=100, box=None, periodic=False +): r""" Compute the radial distribution function *g(r)* (RDF) for one or multiple given central positions based on a given system of @@ -155,7 +156,7 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, Find the radius for the first solvation shell. In this simple case, the density peak is identified by finding the maximum of the function. - + >>> peak_position = np.argmax(g_r) >>> print(f"{bins[peak_position]/10:.2f} nm") 0.29 nm @@ -165,9 +166,9 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, atoms = stack([atoms]) if selection is not None: atoms = atoms[..., selection] - + atom_coord = atoms.coord - + if box is None: if atoms.box is None: raise ValueError("A box must be supplied") @@ -175,17 +176,15 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, box = atoms.box elif box.ndim == 2 and atoms.stack_depth() == 1: box = box[np.newaxis, :, :] - + center = coord(center) if center.ndim == 1: center = center.reshape((1, 1) + center.shape) elif center.ndim == 2: center = center.reshape((1,) + center.shape) - + if box.shape[0] != center.shape[0] or box.shape[0] != atom_coord.shape[0]: - raise ValueError( - "Center, box, and atoms must have the same model count" - ) + raise ValueError("Center, box, and atoms must have the same model count") # Calculate distance histogram edges = _calculate_edges(interval, bins) @@ -209,17 +208,20 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, for j in range(center.shape[1]): dist_box = box[i] if periodic else None # Calculate squared distances - disp.append(displacement( - 
center[i,j], atom_coord[i, near_atom_mask[j]], box=dist_box - )) + disp.append( + displacement( + center[i, j], atom_coord[i, near_atom_mask[j]], box=dist_box + ) + ) # Make one array from multiple arrays with different length disp = np.concatenate(disp) sq_distances = vector_dot(disp, disp) hist, _ = np.histogram(sq_distances, bins=sq_edges) # Normalize with average particle density (N/V) in each bin - bin_volume = (4 / 3 * np.pi * np.power(edges[1: ], 3)) \ - - (4 / 3 * np.pi * np.power(edges[:-1], 3)) + bin_volume = (4 / 3 * np.pi * np.power(edges[1:], 3)) - ( + 4 / 3 * np.pi * np.power(edges[:-1], 3) + ) n_frames = len(atoms) volume = box_volume(box).mean() density = atoms.array_length() / volume @@ -237,7 +239,7 @@ def _calculate_edges(interval, bins): if isinstance(bins, Integral): if bins < 1: raise ValueError("At least one bin is required") - return np.linspace(*interval, bins+1) + return np.linspace(*interval, bins + 1) else: # 'bins' contains edges return np.array(bins, dtype=float) diff --git a/src/biotite/structure/repair.py b/src/biotite/structure/repair.py index abc7a96e5..df2719f55 100644 --- a/src/biotite/structure/repair.py +++ b/src/biotite/structure/repair.py @@ -10,12 +10,12 @@ __author__ = "Patrick Kunzmann, Daniel Bauer" __all__ = ["create_continuous_res_ids", "infer_elements", "create_atom_names"] -from collections import Counter import warnings +from collections import Counter import numpy as np from .atoms import AtomArray, AtomArrayStack -from .residues import get_residue_starts from .chains import get_chain_starts +from .residues import get_residue_starts def create_continuous_res_ids(atoms, restart_each_chain=True): @@ -151,18 +151,131 @@ def create_atom_names(atoms): return atom_names -_elements = [elem.upper() for elem in -["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", -"Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", -"Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", 
"Sr", "Y", -"Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", -"I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", -"Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", -"Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", -"U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", -"Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", -"Og"] +_elements = [ + elem.upper() + for elem in [ + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + "Rf", + "Db", + "Sg", + "Bh", + "Hs", + "Mt", + "Ds", + "Rg", + "Cn", + "Nh", + "Fl", + "Mc", + "Lv", + "Ts", + "Og", + ] ] + + def _guess_element(atom_name): # remove digits (1H -> H) elem = "".join([i for i in atom_name if not i.isdigit()]) @@ -171,9 +284,13 @@ def _guess_element(atom_name): return "" # Some often used elements for biomolecules - if elem.startswith("C") or elem.startswith("N") or \ - elem.startswith("O") or elem.startswith("S") or \ - elem.startswith("H"): + if ( + elem.startswith("C") + or elem.startswith("N") + or elem.startswith("O") + or elem.startswith("S") + or elem.startswith("H") + ): return elem[0] # Exactly match 
element abbreviations @@ -184,4 +301,4 @@ def _guess_element(atom_name): return _elements[_elements.index(elem[0])] except ValueError: warnings.warn(f"Could not infer element for '{atom_name}'") - return "" \ No newline at end of file + return "" diff --git a/src/biotite/structure/residues.py b/src/biotite/structure/residues.py index a32a79f18..e4faf04d1 100644 --- a/src/biotite/structure/residues.py +++ b/src/biotite/structure/residues.py @@ -9,13 +9,19 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["get_residue_starts", "apply_residue_wise", "spread_residue_wise", - "get_residue_masks", "get_residue_starts_for", - "get_residue_positions", "get_residues", "get_residue_count", - "residue_iter"] +__all__ = [ + "get_residue_starts", + "apply_residue_wise", + "spread_residue_wise", + "get_residue_masks", + "get_residue_starts_for", + "get_residue_positions", + "get_residues", + "get_residue_count", + "residue_iter", +] import numpy as np -from .atoms import AtomArray, AtomArrayStack from .resutil import * @@ -57,23 +63,20 @@ def get_residue_starts(array, add_exclusive_stop=False): 278 292 304] """ # These mask are 'true' at indices where the value changes - chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1]) - res_id_changes = (array.res_id[1:] != array.res_id[:-1] ) - ins_code_changes = (array.ins_code[1:] != array.ins_code[:-1]) - res_name_changes = (array.res_name[1:] != array.res_name[:-1]) + chain_id_changes = array.chain_id[1:] != array.chain_id[:-1] + res_id_changes = array.res_id[1:] != array.res_id[:-1] + ins_code_changes = array.ins_code[1:] != array.ins_code[:-1] + res_name_changes = array.res_name[1:] != array.res_name[:-1] # If any of these annotation arrays change, a new residue starts residue_change_mask = ( - chain_id_changes | - res_id_changes | - ins_code_changes | - res_name_changes + chain_id_changes | res_id_changes | ins_code_changes | res_name_changes ) # Convert mask to indices # Add 1, to shift the 
indices from the end of a residue # to the start of a new residue - residue_starts = np.where(residue_change_mask)[0] +1 + residue_starts = np.where(residue_change_mask)[0] + 1 # The first residue is not included yet -> Insert '[0]' if add_exclusive_stop: diff --git a/src/biotite/structure/resutil.py b/src/biotite/structure/resutil.py index 64c5339e1..d35e6fb6f 100644 --- a/src/biotite/structure/resutil.py +++ b/src/biotite/structure/resutil.py @@ -4,8 +4,14 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["apply_segment_wise", "spread_segment_wise", "get_segment_masks", - "get_segment_starts_for", "get_segment_positions", "segment_iter"] +__all__ = [ + "apply_segment_wise", + "spread_segment_wise", + "get_segment_masks", + "get_segment_starts_for", + "get_segment_positions", + "segment_iter", +] import numpy as np @@ -24,8 +30,8 @@ def apply_segment_wise(starts, data, function, axis): """ # The result array processed_data = None - for i in range(len(starts)-1): - segment = data[starts[i]:starts[i+1]] + for i in range(len(starts) - 1): + segment = data[starts[i] : starts[i + 1]] if axis == None: value = function(segment) else: @@ -39,13 +45,11 @@ def apply_segment_wise(starts, data, function, axis): # is length of segment of size 1 -> length of all IDs # (equal to atom array length) processed_data = np.zeros( - (len(starts)-1,) + value.shape, dtype=value.dtype + (len(starts) - 1,) + value.shape, dtype=value.dtype ) else: # Scalar value -> one dimensional result array - processed_data = np.zeros( - len(starts)-1, dtype=type(value) - ) + processed_data = np.zeros(len(starts) - 1, dtype=type(value)) # Write values into result arrays processed_data[i] = value return processed_data @@ -64,7 +68,7 @@ def spread_segment_wise(starts, input_data): atom array. 
""" output_data = np.zeros(starts[-1], dtype=input_data.dtype) - for i in range(len(starts)-1): + for i in range(len(starts) - 1): start = starts[i] stop = starts[i + 1] output_data[start:stop] = input_data[i] @@ -92,14 +96,13 @@ def get_segment_masks(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + insertion_points = np.searchsorted(starts, indices, side="right") - 1 for i, point in enumerate(insertion_points): - masks[i, starts[point] : starts[point+1]] = True - + masks[i, starts[point] : starts[point + 1]] = True + return masks @@ -125,10 +128,9 @@ def get_segment_starts_for(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + insertion_points = np.searchsorted(starts, indices, side="right") - 1 return starts[insertion_points] @@ -155,10 +157,9 @@ def get_segment_positions(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + return np.searchsorted(starts, indices, side="right") - 1 @@ -174,5 +175,5 @@ def segment_iter(array, starts): Includes exclusive stop, i.e. the length of the corresponding atom array. 
""" - for i in range(len(starts)-1): - yield array[..., starts[i] : starts[i+1]] + for i in range(len(starts) - 1): + yield array[..., starts[i] : starts[i + 1]] diff --git a/src/biotite/structure/sequence.py b/src/biotite/structure/sequence.py index 0cad79b73..8c9e85d26 100644 --- a/src/biotite/structure/sequence.py +++ b/src/biotite/structure/sequence.py @@ -11,13 +11,12 @@ __all__ = ["to_sequence"] import numpy as np -from .info.misc import one_letter_code -from .info.groups import amino_acid_names, nucleotide_names -from .residues import get_residues +from ..sequence.seqtypes import NucleotideSequence, ProteinSequence from .chains import get_chain_starts from .error import BadStructureError -from ..sequence.seqtypes import ProteinSequence, NucleotideSequence - +from .info.groups import amino_acid_names, nucleotide_names +from .info.misc import one_letter_code +from .residues import get_residues HETERO_PLACEHOLDER = "." @@ -63,9 +62,9 @@ def to_sequence(atoms, allow_hetero=False): """ sequences = [] chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) - for i in range(len(chain_start_indices)-1): + for i in range(len(chain_start_indices) - 1): start = chain_start_indices[i] - stop = chain_start_indices[i+1] + stop = chain_start_indices[i + 1] chain = atoms[start:stop] _, residues = get_residues(chain) one_letter_symbols = np.array( @@ -73,7 +72,7 @@ def to_sequence(atoms, allow_hetero=False): ) hetero_mask = one_letter_symbols == HETERO_PLACEHOLDER - aa_count = np.count_nonzero(np.isin(residues, amino_acid_names())) + aa_count = np.count_nonzero(np.isin(residues, amino_acid_names())) nuc_count = np.count_nonzero(np.isin(residues, nucleotide_names())) if aa_count == 0 and nuc_count == 0: raise BadStructureError( @@ -109,4 +108,4 @@ def to_sequence(atoms, allow_hetero=False): sequences.append(NucleotideSequence("".join(one_letter_symbols))) # Remove exclusive stop - return sequences, chain_start_indices[:-1] \ No newline at end of file + return 
sequences, chain_start_indices[:-1] diff --git a/src/biotite/structure/sse.py b/src/biotite/structure/sse.py index 0d870071c..32d4f5e47 100644 --- a/src/biotite/structure/sse.py +++ b/src/biotite/structure/sse.py @@ -13,24 +13,22 @@ import numpy as np from .celllist import CellList -from .geometry import distance, angle, dihedral from .filter import filter_amino_acids -from .residues import get_residue_starts +from .geometry import angle, dihedral, distance from .integrity import check_res_id_continuity +from .residues import get_residue_starts +_r_helix = (np.deg2rad(89 - 12), np.deg2rad(89 + 12)) +_a_helix = (np.deg2rad(50 - 20), np.deg2rad(50 + 20)) +_d2_helix = ((5.5 - 0.5), (5.5 + 0.5)) # Not used in the algorithm description +_d3_helix = ((5.3 - 0.5), (5.3 + 0.5)) +_d4_helix = ((6.4 - 0.6), (6.4 + 0.6)) -_r_helix = (np.deg2rad(89-12), np.deg2rad(89+12)) -_a_helix = (np.deg2rad(50-20), np.deg2rad(50+20)) -_d2_helix = ((5.5-0.5), (5.5+0.5)) # Not used in the algorithm description -_d3_helix = ((5.3-0.5), (5.3+0.5)) -_d4_helix = ((6.4-0.6), (6.4+0.6)) - -_r_strand = (np.deg2rad(124-14), np.deg2rad(124+14)) -_a_strand = (np.deg2rad(-180), np.deg2rad(-125), - np.deg2rad(145), np.deg2rad(180)) -_d2_strand = ((6.7-0.6), (6.7+0.6)) -_d3_strand = ((9.9-0.9), (9.9+0.9)) -_d4_strand = ((12.4-1.1), (12.4+1.1)) +_r_strand = (np.deg2rad(124 - 14), np.deg2rad(124 + 14)) +_a_strand = (np.deg2rad(-180), np.deg2rad(-125), np.deg2rad(145), np.deg2rad(180)) +_d2_strand = ((6.7 - 0.6), (6.7 + 0.6)) +_d3_strand = ((9.9 - 0.9), (9.9 + 0.9)) +_d4_strand = ((12.4 - 1.1), (12.4 + 1.1)) def annotate_sse(atom_array): @@ -93,9 +91,9 @@ def annotate_sse(atom_array): ca_indices = np.where( filter_amino_acids(atom_array) & (atom_array.atom_name == "CA") )[0] - ca_coord[ - np.searchsorted(residue_starts, ca_indices, "right") - 1 - ] = atom_array.coord[ca_indices] + ca_coord[np.searchsorted(residue_starts, ca_indices, "right") - 1] = ( + atom_array.coord[ca_indices] + ) if len(ca_coord) <= 5: 
# The number of atoms is too small # @@ -112,12 +110,12 @@ def annotate_sse(atom_array): # purpose of geometric measurements # -> the distances/angles spanning discontinuities are NaN discont_indices = check_res_id_continuity(atom_array) - discont_res_indices = np.searchsorted( - residue_starts, discont_indices, "right" - ) - 1 + discont_res_indices = np.searchsorted(residue_starts, discont_indices, "right") - 1 ca_coord = np.insert( - ca_coord, discont_res_indices, - np.full((len(discont_res_indices),3), np.nan), axis=0 + ca_coord, + discont_res_indices, + np.full((len(discont_res_indices), 3), np.nan), + axis=0, ) # Later the SSE for virtual residues are removed again # via this mask @@ -126,60 +124,62 @@ def annotate_sse(atom_array): length = len(ca_coord) - # The distances and angles are not defined for the entire interval, # therefore the indices do not have the full range # Values that are not defined are NaN d2i = np.full(length, np.nan) d3i = np.full(length, np.nan) d4i = np.full(length, np.nan) - ri = np.full(length, np.nan) - ai = np.full(length, np.nan) - - d2i[1 : length-1] = distance(ca_coord[0 : length-2], ca_coord[2 : length]) - d3i[1 : length-2] = distance(ca_coord[0 : length-3], ca_coord[3 : length]) - d4i[1 : length-3] = distance(ca_coord[0 : length-4], ca_coord[4 : length]) - ri[1 : length-1] = angle( - ca_coord[0 : length-2], - ca_coord[1 : length-1], - ca_coord[2 : length] + ri = np.full(length, np.nan) + ai = np.full(length, np.nan) + + d2i[1 : length - 1] = distance(ca_coord[0 : length - 2], ca_coord[2:length]) + d3i[1 : length - 2] = distance(ca_coord[0 : length - 3], ca_coord[3:length]) + d4i[1 : length - 3] = distance(ca_coord[0 : length - 4], ca_coord[4:length]) + ri[1 : length - 1] = angle( + ca_coord[0 : length - 2], ca_coord[1 : length - 1], ca_coord[2:length] ) - ai[1 : length-2] = dihedral( - ca_coord[0 : length-3], - ca_coord[1 : length-2], - ca_coord[2 : length-1], - ca_coord[3 : length-0] + ai[1 : length - 2] = dihedral( + 
ca_coord[0 : length - 3], + ca_coord[1 : length - 2], + ca_coord[2 : length - 1], + ca_coord[3 : length - 0], ) # Find CA that meet criteria for potential helices and strands - relaxed_helix = ( - (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1]) - ) | ( - (ri >= _r_helix[0] ) & ( ri <= _r_helix[1]) + relaxed_helix = ((d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1])) | ( + (ri >= _r_helix[0]) & (ri <= _r_helix[1]) ) strict_helix = ( - (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1]) & - (d4i >= _d4_helix[0]) & (d4i <= _d4_helix[1]) + (d3i >= _d3_helix[0]) + & (d3i <= _d3_helix[1]) + & (d4i >= _d4_helix[0]) + & (d4i <= _d4_helix[1]) ) | ( - (ri >= _r_helix[0] ) & ( ri <= _r_helix[1]) & - (ai >= _a_helix[0] ) & ( ai <= _a_helix[1]) + (ri >= _r_helix[0]) + & (ri <= _r_helix[1]) + & (ai >= _a_helix[0]) + & (ai <= _a_helix[1]) ) relaxed_strand = (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1]) strict_strand = ( - (d2i >= _d2_strand[0]) & (d2i <= _d2_strand[1]) & - (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1]) & - (d4i >= _d4_strand[0]) & (d4i <= _d4_strand[1]) + (d2i >= _d2_strand[0]) + & (d2i <= _d2_strand[1]) + & (d3i >= _d3_strand[0]) + & (d3i <= _d3_strand[1]) + & (d4i >= _d4_strand[0]) + & (d4i <= _d4_strand[1]) ) | ( - (ri >= _r_strand[0] ) & ( ri <= _r_strand[1]) & - ( + (ri >= _r_strand[0]) + & (ri <= _r_strand[1]) + & ( # Account for periodic boundary of dihedral angle - ((ai >= _a_strand[0] ) & ( ai <= _a_strand[1])) | - ((ai >= _a_strand[2] ) & ( ai <= _a_strand[3])) + ((ai >= _a_strand[0]) & (ai <= _a_strand[1])) + | ((ai >= _a_strand[2]) & (ai <= _a_strand[3])) ) ) - helix_mask = _mask_consecutive(strict_helix, 5) helix_mask = _extend_region(helix_mask, relaxed_helix) @@ -187,12 +187,11 @@ def annotate_sse(atom_array): short_strand_mask = _mask_regions_with_contacts( ca_coord, _mask_consecutive(strict_strand, 3), - min_contacts=5, min_distance=4.2, max_distance=5.2 - ) - strand_mask = _extend_region( - strand_mask | short_strand_mask, relaxed_strand + 
min_contacts=5, + min_distance=4.2, + max_distance=5.2, ) - + strand_mask = _extend_region(strand_mask | short_strand_mask, relaxed_strand) sse = np.full(length, "c", dtype="U1") sse[helix_mask] = "a" @@ -215,10 +214,10 @@ def _mask_consecutive(mask, number): # if it and the following `number-1` elements are True # The elements `mask[-(number-1):]` cannot have the sufficient count # by this definition, as they are at the end of the array - counts = np.zeros(len(mask) - (number-1), dtype=int) + counts = np.zeros(len(mask) - (number - 1), dtype=int) for i in range(number): counts[mask[i : i + len(counts)]] += 1 - consecutive_seed = (counts == number) + consecutive_seed = counts == number # Not only that element, but also the # following `number-1` elements are in a consecutive region @@ -257,8 +256,9 @@ def _extend_region(base_condition_mask, extension_condition_mask): ) -def _mask_regions_with_contacts(coord, candidate_mask, - min_contacts, min_distance, max_distance): +def _mask_regions_with_contacts( + coord, candidate_mask, min_contacts, min_distance, max_distance +): """ Mask regions of `candidate_mask` that have at least `min_contacts` contacts with `coord` in the range `min_distance` to `max_distance`. 
@@ -269,9 +269,7 @@ def _mask_regions_with_contacts(coord, candidate_mask, # -> no residue can satisfy 'min_contacts' return np.zeros(len(candidate_mask), dtype=bool) - cell_list = CellList( - potential_contact_coord, max_distance - ) + cell_list = CellList(potential_contact_coord, max_distance) # For each candidate position, # get all contacts within maximum distance all_within_max_dist_indices = cell_list.get_atoms( @@ -282,33 +280,29 @@ def _mask_regions_with_contacts(coord, candidate_mask, for i, atom_index in enumerate(np.where(candidate_mask)[0]): within_max_dist_indices = all_within_max_dist_indices[i] # Remove padding values - within_max_dist_indices = within_max_dist_indices[ - within_max_dist_indices != -1 - ] + within_max_dist_indices = within_max_dist_indices[within_max_dist_indices != -1] # Now count all contacts within maximum distance # that also satisfy the minimum distance contacts[atom_index] = np.count_nonzero( distance( - coord[atom_index], - potential_contact_coord[within_max_dist_indices] - ) > min_distance + coord[atom_index], potential_contact_coord[within_max_dist_indices] + ) + > min_distance ) # Count the number of contacts per region # These indices mark the start of either a 'True' or 'False' region # Prepend absent region to the start to capture the event, # that the first element is already the start of a region - region_change_indices = np.where( - np.diff(np.append([False], candidate_mask)) - )[0] + region_change_indices = np.where(np.diff(np.append([False], candidate_mask)))[0] # Add exclusive stop region_change_indices = np.append(region_change_indices, [len(coord)]) output_mask = np.zeros(len(candidate_mask), dtype=bool) for i in range(len(region_change_indices) - 1): start = region_change_indices[i] - stop = region_change_indices[i+1] - total_contacts = np.sum(contacts[start : stop]) + stop = region_change_indices[i + 1] + total_contacts = np.sum(contacts[start:stop]) if total_contacts >= min_contacts: - output_mask[start : 
stop] = True + output_mask[start:stop] = True - return output_mask \ No newline at end of file + return output_mask diff --git a/src/biotite/structure/superimpose.py b/src/biotite/structure/superimpose.py index d522471fa..eb5f229a7 100755 --- a/src/biotite/structure/superimpose.py +++ b/src/biotite/structure/superimpose.py @@ -8,19 +8,22 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Claude J. Rogers" -__all__ = ["superimpose", "superimpose_homologs", - "superimpose_without_outliers", - "AffineTransformation"] +__all__ = [ + "superimpose", + "superimpose_homologs", + "superimpose_without_outliers", + "AffineTransformation", +] import numpy as np +from ..sequence.align import SubstitutionMatrix, align_optimal, get_codes +from ..sequence.alphabet import common_alphabet +from ..sequence.seqtypes import ProteinSequence from .atoms import coord -from .geometry import centroid, distance from .filter import filter_amino_acids, filter_nucleotides +from .geometry import centroid, distance from .sequence import to_sequence -from ..sequence.alphabet import common_alphabet -from ..sequence.seqtypes import ProteinSequence -from ..sequence.align import SubstitutionMatrix, align_optimal, get_codes class AffineTransformation: @@ -45,12 +48,12 @@ class AffineTransformation: The dimensions are always expanded to *(m,3)* or *(m,3,3)*, respectively. """ + def __init__(self, center_translation, rotation, target_translation): self.center_translation = _expand_dims(center_translation, 2) self.rotation = _expand_dims(rotation, 3) self.target_translation = _expand_dims(target_translation, 2) - def apply(self, atoms): """ Apply this transformation on the given structure. 
@@ -118,7 +121,6 @@ def apply(self, atoms): superimposed.coord = superimposed_coord return superimposed - def as_matrix(self): """ Get the translations and rotation as a combined 4x4 @@ -316,16 +318,19 @@ def superimpose(fixed, mobile, atom_mask=None): mob_centered_filtered = mob_filtered - mob_centroid[:, np.newaxis, :] fix_centered_filtered = fix_filtered - fix_centroid[:, np.newaxis, :] - rotation = _get_rotation_matrices( - fix_centered_filtered, mob_centered_filtered - ) + rotation = _get_rotation_matrices(fix_centered_filtered, mob_centered_filtered) transform = AffineTransformation(-mob_centroid, rotation, fix_centroid) return transform.apply(mobile), transform -def superimpose_without_outliers(fixed, mobile, min_anchors=3, - max_iterations=10, quantiles=(0.25, 0.75), - outlier_threshold=1.5): +def superimpose_without_outliers( + fixed, + mobile, + min_anchors=3, + max_iterations=10, + quantiles=(0.25, 0.75), + outlier_threshold=1.5, +): r""" Superimpose structures onto a fixed structure, ignoring conformational outliers. @@ -458,8 +463,9 @@ def superimpose_without_outliers(fixed, mobile, min_anchors=3, return transform.apply(mobile), transform, anchor_indices -def superimpose_homologs(fixed, mobile, substitution_matrix=None, - gap_penalty=-10, min_anchors=3, **kwargs): +def superimpose_homologs( + fixed, mobile, substitution_matrix=None, gap_penalty=-10, min_anchors=3, **kwargs +): r""" Superimpose one protein or nucleotide chain onto another one, considering sequence differences and conformational outliers. 
@@ -530,8 +536,8 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None, fixed_anchor_indices = _get_backbone_anchor_indices(fixed) mobile_anchor_indices = _get_backbone_anchor_indices(mobile) if ( - len(fixed_anchor_indices) < min_anchors or - len(mobile_anchor_indices) < min_anchors + len(fixed_anchor_indices) < min_anchors + or len(mobile_anchor_indices) < min_anchors ): raise ValueError( "Structures have too few CA atoms for required number of anchors" @@ -562,7 +568,7 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None, fixed[..., fixed_anchor_indices], mobile[..., mobile_anchor_indices], min_anchors, - **kwargs + **kwargs, ) fixed_anchor_indices = fixed_anchor_indices[selected_anchor_indices] mobile_anchor_indices = mobile_anchor_indices[selected_anchor_indices] @@ -580,17 +586,13 @@ def _reshape_to_3d(coord): Reshape the coordinate array to 3D, if it is 2D. """ if coord.ndim < 2: - raise ValueError( - "Coordinates must be at least two-dimensional" - ) + raise ValueError("Coordinates must be at least two-dimensional") if coord.ndim == 2: return coord[np.newaxis, ...] elif coord.ndim == 3: return coord else: - raise ValueError( - "Coordinates must be at most three-dimensional" - ) + raise ValueError("Coordinates must be at most three-dimensional") def _get_rotation_matrices(fixed, mobile): @@ -602,10 +604,10 @@ def _get_rotation_matrices(fixed, mobile): Both sets of coordinates must already be centered at origin. 
""" # Calculate cross-covariance matrices - cov = np.sum(fixed[:,:,:,np.newaxis] * mobile[:,:,np.newaxis,:], axis=1) + cov = np.sum(fixed[:, :, :, np.newaxis] * mobile[:, :, np.newaxis, :], axis=1) v, s, w = np.linalg.svd(cov) # Remove possibility of reflected atom coordinates - reflected_mask = (np.linalg.det(v) * np.linalg.det(w) < 0) + reflected_mask = np.linalg.det(v) * np.linalg.det(w) < 0 v[reflected_mask, :, -1] *= -1 matrices = np.matmul(v, w) return matrices @@ -617,11 +619,7 @@ def _multi_matmul(matrices, vectors): with m x n vectors. """ return np.transpose( - np.matmul( - matrices, - np.transpose(vectors, axes=(0, 2, 1)) - ), - axes=(0, 2, 1) + np.matmul(matrices, np.transpose(vectors, axes=(0, 2, 1))), axes=(0, 2, 1) ) @@ -631,8 +629,8 @@ def _get_backbone_anchor_indices(atoms): nucleotide and return their indices. """ return np.where( - ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA")) | - ((filter_nucleotides(atoms)) & (atoms.atom_name == "P")) + ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA")) + | ((filter_nucleotides(atoms)) & (atoms.atom_name == "P")) )[0] @@ -685,11 +683,7 @@ def _find_matching_anchors( def _to_sequence(atoms): sequences, _ = to_sequence(atoms, allow_hetero=True) if len(sequences) == 0: - raise ValueError( - "Structure does not contain any amino acids or nucleotides" - ) + raise ValueError("Structure does not contain any amino acids or nucleotides") if len(sequences) > 1: - raise ValueError( - "Structure contains multiple chains, but only one is allowed" - ) - return sequences[0] \ No newline at end of file + raise ValueError("Structure contains multiple chains, but only one is allowed") + return sequences[0] diff --git a/src/biotite/structure/transform.py b/src/biotite/structure/transform.py index 0ab281c8d..2ca994bf6 100644 --- a/src/biotite/structure/transform.py +++ b/src/biotite/structure/transform.py @@ -9,20 +9,25 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann", "Claude J. 
Rogers" -__all__ = ["translate", "rotate", "rotate_centered", "rotate_about_axis", - "orient_principal_components", "align_vectors"] +__all__ = [ + "translate", + "rotate", + "rotate_centered", + "rotate_about_axis", + "orient_principal_components", + "align_vectors", +] import numpy as np -from .geometry import centroid -from .error import BadStructureError from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import norm_vector, vector_dot, matrix_rotate +from .geometry import centroid +from .util import matrix_rotate, norm_vector, vector_dot def translate(atoms, vector): """ Translate the given atoms or coordinates by a given vector. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -30,7 +35,7 @@ def translate(atoms, vector): The coordinates can be directly provided as :class:`ndarray`. vector: array-like, shape=(3,) or shape=(n,3) or shape=(m,n,3) The translation vector :math:`(x, y, z)`. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -39,7 +44,7 @@ def translate(atoms, vector): """ positions = coord(atoms).copy() vector = np.asarray(vector) - + if vector.shape[-1] != 3: raise ValueError("Translation vector must contain 3 coordinates") positions += vector @@ -50,10 +55,10 @@ def rotate(atoms, angles): """ Rotate the given atoms or coordinates about the *x*, *y* and *z* axes by given angles. - + The rotations are centered at the origin and are performed sequentially in the order *x*, *y*, *z*. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -61,13 +66,13 @@ def rotate(atoms, angles): The coordinates can be directly provided as :class:`ndarray`. angles: array-like, length=3 The rotation angles in radians around *x*, *y* and *z*. 
- + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated by the given angles. - + See Also -------- rotate_centered @@ -82,27 +87,39 @@ def rotate(atoms, angles): >>> print(rotated) [1.225e-16 2.000e+00 0.000e+00] """ - from numpy import sin, cos + from numpy import cos, sin # Check if "angles" contains 3 angles for all dimensions if len(angles) != 3: raise ValueError("Translation vector must be container of length 3") # Create rotation matrices for all 3 dimensions - rot_x = np.array([[ 1, 0, 0 ], - [ 0, cos(angles[0]), -sin(angles[0]) ], - [ 0, sin(angles[0]), cos(angles[0]) ]]) - - rot_y = np.array([[ cos(angles[1]), 0, sin(angles[1]) ], - [ 0, 1, 0 ], - [ -sin(angles[1]), 0, cos(angles[1]) ]]) - - rot_z = np.array([[ cos(angles[2]), -sin(angles[2]), 0 ], - [ sin(angles[2]), cos(angles[2]), 0 ], - [ 0, 0, 1 ]]) - + rot_x = np.array( + [ + [1, 0, 0], + [0, cos(angles[0]), -sin(angles[0])], + [0, sin(angles[0]), cos(angles[0])], + ] + ) + + rot_y = np.array( + [ + [cos(angles[1]), 0, sin(angles[1])], + [0, 1, 0], + [-sin(angles[1]), 0, cos(angles[1])], + ] + ) + + rot_z = np.array( + [ + [cos(angles[2]), -sin(angles[2]), 0], + [sin(angles[2]), cos(angles[2]), 0], + [0, 0, 1], + ] + ) + positions = coord(atoms).copy() positions = matrix_rotate(positions, rot_z @ rot_y @ rot_x) - + return _put_back(atoms, positions) @@ -110,10 +127,10 @@ def rotate_centered(atoms, angles): """ Rotate the given atoms or coordinates about the *x*, *y* and *z* axes by given angles. - + The rotations are centered at the centroid of the corresponding structure and are performed sequentially in the order *x*, *y*, *z*. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -121,13 +138,13 @@ def rotate_centered(atoms, angles): The coordinates can be directly provided as :class:`ndarray`. 
angles: array-like, length=3 The rotation angles in radians around axes *x*, *y* and *z*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated by the given angles. - + See Also -------- rotate @@ -136,7 +153,7 @@ def rotate_centered(atoms, angles): if len(coord(atoms).shape) == 1: # Single value -> centered rotation does not change coordinates return atoms.copy() - + # Rotation around centroid requires moving centroid to origin center = coord(centroid(atoms)) # 'centroid()' removes the second last dimesion @@ -152,7 +169,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): """ Rotate the given atoms or coordinates about a given axis by a given angle. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -167,13 +184,13 @@ def rotate_about_axis(atoms, axis, angle, support=None): An optional support vector for the rotation axis, i.e. the center of the rotation. By default, the center of the rotation is at *(0,0,0)*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated about the given axis. 
- + See Also -------- rotate @@ -194,7 +211,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): # Transform coordinates # so that the axis support vector is at (0,0,0) positions -= np.asarray(support) - + # Normalize axis axis = np.asarray(axis, dtype=np.float32).copy() if np.linalg.norm(axis) == 0: @@ -205,16 +222,30 @@ def rotate_about_axis(atoms, axis, angle, support=None): sin_a = np.sin(angle) cos_a = np.cos(angle) icos_a = 1 - cos_a - x = axis[...,0] - y = axis[...,1] - z = axis[...,2] + x = axis[..., 0] + y = axis[..., 1] + z = axis[..., 2] # Rotation matrix is taken from # https://en.wikipedia.org/wiki/Rotation_matrix#Rotation_matrix_from_axis_and_angle - rot_matrix = np.array([ - [ cos_a + icos_a*x**2, icos_a*x*y - z*sin_a, icos_a*x*z + y*sin_a], - [icos_a*x*y + z*sin_a, cos_a + icos_a*y**2, icos_a*y*z - x*sin_a], - [icos_a*x*z - y*sin_a, icos_a*y*z + x*sin_a, cos_a + icos_a*z**2] - ]) + rot_matrix = np.array( + [ + [ + cos_a + icos_a * x**2, + icos_a * x * y - z * sin_a, + icos_a * x * z + y * sin_a, + ], + [ + icos_a * x * y + z * sin_a, + cos_a + icos_a * y**2, + icos_a * y * z - x * sin_a, + ], + [ + icos_a * x * z - y * sin_a, + icos_a * y * z + x * sin_a, + cos_a + icos_a * z**2, + ], + ] + ) # For proper rotation reshape into a maximum of 2 dimensions orig_ndim = positions.ndim @@ -230,7 +261,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): if support is not None: # Transform coordinates back to original support vector position positions += np.asarray(support) - + return _put_back(atoms, positions) @@ -298,9 +329,7 @@ def orient_principal_components(atoms, order=None): else: order = np.asarray(order, dtype=int) if order.shape != (3,): - raise ValueError( - f"Expected order to have shape (3,), not {order.shape}" - ) + raise ValueError(f"Expected order to have shape (3,), not {order.shape}") if not (np.sort(order) == np.arange(3)).all(): raise ValueError("Expected order to contain [0, 1, 2].") @@ -333,8 +362,13 @@ def 
orient_principal_components(atoms, order=None): return _put_back(atoms, centered) -def align_vectors(atoms, origin_direction, target_direction, - origin_position=None, target_position=None): +def align_vectors( + atoms, + origin_direction, + target_direction, + origin_position=None, + target_position=None, +): """ Apply a transformation to atoms or coordinates, that would transfer a origin vector to a target vector. @@ -345,8 +379,8 @@ def align_vectors(atoms, origin_direction, target_direction, This means, that the application of the transformation on the origin vector would give the target vector. Then the same transformation is applied to the given - atoms/coordinates. - + atoms/coordinates. + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -359,13 +393,13 @@ def align_vectors(atoms, origin_direction, target_direction, origin_position, target_position : array-like, length=3, optional Optional support vectors for the origin or target, respectively. By default, origin and target start at *(0,0,0)*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates with the applied transformation. 
- + See Also -------- rotate @@ -428,12 +462,8 @@ def align_vectors(atoms, origin_direction, target_direction, A 2 LEU HD22 H -6.255 7.544 -2.657 A 2 LEU HD23 H -5.592 8.445 -1.281 """ - origin_direction = np.asarray( - origin_direction, dtype=np.float32 - ).squeeze() - target_direction = np.asarray( - target_direction, dtype=np.float32 - ).squeeze() + origin_direction = np.asarray(origin_direction, dtype=np.float32).squeeze() + target_direction = np.asarray(target_direction, dtype=np.float32).squeeze() # check that original and target direction are vectors of shape (3,) if origin_direction.shape != (3,): raise ValueError( @@ -449,9 +479,9 @@ def align_vectors(atoms, origin_direction, target_direction, raise ValueError("Length of the origin vector is 0") if np.linalg.norm(target_direction) == 0: raise ValueError("Length of the target vector is 0") - if origin_position is not None: + if origin_position is not None: origin_position = np.asarray(origin_position, dtype=np.float32) - if target_position is not None: + if target_position is not None: target_position = np.asarray(target_position, dtype=np.float32) positions = coord(atoms).copy() @@ -459,7 +489,7 @@ def align_vectors(atoms, origin_direction, target_direction, # Transform coordinates # so that the position of the origin vector is at (0,0,0) positions -= origin_position - + # Normalize direction vectors origin_direction = origin_direction.copy() norm_vector(origin_direction) @@ -468,11 +498,7 @@ def align_vectors(atoms, origin_direction, target_direction, # Formula is taken from # https://math.stackexchange.com/questions/180418/calculate-rotation-matrix-to-align-vector-a-to-vector-b-in-3d/476311#476311 vx, vy, vz = np.cross(origin_direction, target_direction) - v_c = np.array([ - [ 0, -vz, vy], - [ vz, 0, -vx], - [-vy, vx, 0] - ], dtype=float) + v_c = np.array([[0, -vz, vy], [vz, 0, -vx], [-vy, vx, 0]], dtype=float) cos_a = vector_dot(origin_direction, target_direction) if np.all(cos_a == -1): raise 
ValueError( @@ -480,9 +506,9 @@ def align_vectors(atoms, origin_direction, target_direction, "cannot calculate rotation matrix" ) rot_matrix = np.identity(3) + v_c + (v_c @ v_c) / (1 + cos_a) - + positions = matrix_rotate(positions, rot_matrix) - + if target_position is not None: # Transform coordinates to position of the target vector positions += target_position @@ -501,4 +527,4 @@ def _put_back(input_atoms, transformed): moved_atoms.coord = transformed return moved_atoms else: - return transformed \ No newline at end of file + return transformed diff --git a/src/biotite/structure/util.py b/src/biotite/structure/util.py index 68f13f20d..cabbdc8f5 100644 --- a/src/biotite/structure/util.py +++ b/src/biotite/structure/util.py @@ -11,31 +11,30 @@ __all__ = ["vector_dot", "norm_vector", "distance", "matrix_rotate"] import numpy as np -from .atoms import Atom, array -def vector_dot(v1,v2): +def vector_dot(v1, v2): """ Calculate vector dot product of two vectors. - + Parameters ---------- v1,v2 : ndarray The arrays to calculate the product from. The vectors are represented by the last axis. - + Returns ------- product : float or ndarray Scalar product over the last dimension of the arrays. """ - return (v1*v2).sum(axis=-1) + return (v1 * v2).sum(axis=-1) def norm_vector(v): """ Normalise a vector. - + Parameters ---------- v : ndarray @@ -47,25 +46,25 @@ def norm_vector(v): v /= factor[..., np.newaxis] else: v /= factor - -def distance(v1,v2): + +def distance(v1, v2): """ Calculate the distance between two position vectors. - + Parameters ---------- v1,v2 : ndarray The arrays to calculate the product from. The vectors are represented by the last axis. - + Returns ------- product : float or ndarray Vector distance over the last dimension of the array. """ dif = v1 - v2 - return np.sqrt((dif*dif).sum(axis=-1)) + return np.sqrt((dif * dif).sum(axis=-1)) def matrix_rotate(v, matrix): @@ -78,7 +77,7 @@ def matrix_rotate(v, matrix): The coordinates to rotate. 
matrix : ndarray The rotation matrix. - + Returns ------- rotated : ndarray @@ -95,4 +94,3 @@ def matrix_rotate(v, matrix): if orig_ndim > 2: v = v.reshape(*orig_shape) return v - diff --git a/src/biotite/visualize.py b/src/biotite/visualize.py index a2839c6a6..f45b7815f 100644 --- a/src/biotite/visualize.py +++ b/src/biotite/visualize.py @@ -6,25 +6,25 @@ __author__ = "Patrick Kunzmann" __all__ = ["colors", "set_font_size_in_coord", "AdaptiveFancyArrow"] -import abc from collections import OrderedDict import numpy as np from numpy.linalg import norm - # Biotite themed colors -colors = OrderedDict([ - ("brightorange" , "#ffb569ff"), - ("lightorange" , "#ff982dff"), - ("orange" , "#ff8405ff"), - ("dimorange" , "#dc7000ff"), - ("darkorange" , "#b45c00ff"), - ("brightgreen" , "#98e97fff"), - ("lightgreen" , "#6fe04cff"), - ("green" , "#52da2aff"), - ("dimgreen" , "#45bc20ff"), - ("darkgreen" , "#389a1aff"), -]) +colors = OrderedDict( + [ + ("brightorange", "#ffb569ff"), + ("lightorange", "#ff982dff"), + ("orange", "#ff8405ff"), + ("dimorange", "#dc7000ff"), + ("darkorange", "#b45c00ff"), + ("brightgreen", "#98e97fff"), + ("lightgreen", "#6fe04cff"), + ("green", "#52da2aff"), + ("dimgreen", "#45bc20ff"), + ("darkgreen", "#389a1aff"), + ] +) def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"): @@ -75,8 +75,8 @@ def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"): This behavior is not equal for all initial font sizes (in 'pt'), the boundaries for an initial size of 1 'pt' seem to be most exact. 
""" - from matplotlib.transforms import Bbox, Affine2D from matplotlib.patheffects import AbstractPathEffect + from matplotlib.transforms import Affine2D, Bbox class TextScaler(AbstractPathEffect): def __init__(self, text, width, height, mode): @@ -127,25 +127,21 @@ def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): if mode in ["unlocked", "minimum", "maximum"]: if width is None or height is None: - raise TypeError( - f"Width and height must be set in '{mode}' mode" - ) + raise TypeError(f"Width and height must be set in '{mode}' mode") elif mode == "proportional": - if not (width is None and height is not None) or \ - not (height is None and width is not None): - raise TypeError( - f"Either width or height must be set in '{mode}' mode" - ) + if not (width is None and height is not None) or not ( + height is None and width is not None + ): + raise TypeError(f"Either width or height must be set in '{mode}' mode") else: - raise ValueError( - f"Unknown mode '{mode}'" - ) + raise ValueError(f"Unknown mode '{mode}'") text.set_path_effects([TextScaler(text, width, height, mode)]) + try: # Only create this class when matplotlib is installed - from matplotlib.transforms import Bbox from matplotlib.patches import FancyArrow + from matplotlib.transforms import Bbox class AdaptiveFancyArrow(FancyArrow): """ @@ -177,9 +173,19 @@ class AdaptiveFancyArrow(FancyArrow): `FancyArrow`. 
""" - def __init__(self, x, y, dx, dy, - tail_width, head_width, head_ratio, draw_head=True, - shape="full", **kwargs): + def __init__( + self, + x, + y, + dx, + dy, + tail_width, + head_width, + head_ratio, + draw_head=True, + shape="full", + **kwargs, + ): self._x = x self._y = y self._dx = dx @@ -193,23 +199,25 @@ def __init__(self, x, y, dx, dy, if not draw_head: head_width = tail_width super().__init__( - x, y, dx, dy, - width=tail_width, head_width=head_width, - overhang=0, shape=shape, - length_includes_head=True, **kwargs + x, + y, + dx, + dy, + width=tail_width, + head_width=head_width, + overhang=0, + shape=shape, + length_includes_head=True, + **kwargs, ) def draw(self, renderer): - arrow_box = Bbox([(0,0), (0,self._head_width)]) + arrow_box = Bbox([(0, 0), (0, self._head_width)]) arrow_box_display = self.axes.transData.transform_bbox(arrow_box) - head_length_display = np.abs( - arrow_box_display.height * self._head_ratio - ) + head_length_display = np.abs(arrow_box_display.height * self._head_ratio) arrow_box_display.x1 = arrow_box_display.x0 + head_length_display # Transfrom back to data coordinates for plotting - arrow_box = self.axes.transData.inverted().transform_bbox( - arrow_box_display - ) + arrow_box = self.axes.transData.inverted().transform_bbox(arrow_box_display) head_length = arrow_box.width arrow_length = norm((self._dx, self._dy)) if head_length > arrow_length: @@ -221,11 +229,19 @@ def draw(self, renderer): # Renew the arrow's properties super().__init__( - self._x, self._y, self._dx, self._dy, - width=self._tail_width, head_width=self._head_width, - overhang=0, shape=self._shape, - head_length=head_length, length_includes_head=True, - axes=self.axes, transform=self.get_transform(), **self._kwargs + self._x, + self._y, + self._dx, + self._dy, + width=self._tail_width, + head_width=self._head_width, + overhang=0, + shape=self._shape, + head_length=head_length, + length_includes_head=True, + axes=self.axes, + transform=self.get_transform(), 
+ **self._kwargs, ) self.set_clip_path(self.axes.patch) super().draw(renderer) @@ -234,18 +250,16 @@ def draw(self, renderer): # Removes warning: # unknown document: /tutorials/intermediate/constrainedlayout_guide def get_in_layout(self): - """ - """ + """ """ return super().get_in_layout() + def set_in_layout(self, in_layout): - """ - """ + """ """ return super().set_in_layout(in_layout) except ImportError: - # Dummy class that propagates a meaningful error, # i.e. that Matplotlib is not installed - class AdaptiveFancyArrow(): + class AdaptiveFancyArrow: def __init__(*args, **kwargs): - raise ModuleNotFoundError(f"No module named 'matplotlib'") \ No newline at end of file + raise ModuleNotFoundError("No module named 'matplotlib'") diff --git a/tests/application/test_autodock.py b/tests/application/test_autodock.py index 126f424d2..969d73425 100644 --- a/tests/application/test_autodock.py +++ b/tests/application/test_autodock.py @@ -12,9 +12,7 @@ from ..util import data_dir, is_not_installed -@pytest.mark.skipif( - is_not_installed("vina"), reason="Autodock Vina is not installed" -) +@pytest.mark.skipif(is_not_installed("vina"), reason="Autodock Vina is not installed") @pytest.mark.parametrize("flexible", [False, True]) def test_docking(flexible): """ @@ -24,9 +22,7 @@ def test_docking(flexible): PDB structure. 
""" # A structure of a straptavidin-biotin complex - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("application"), "2rtg.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("application"), "2rtg.bcif")) structure = pdbx.get_structure( pdbx_file, model=1, extra_fields=["charge"], include_bonds=True ) @@ -46,8 +42,11 @@ def test_docking(flexible): flexible_mask = None app = VinaApp( - ligand, receptor, struc.centroid(ref_ligand), [20, 20, 20], - flexible=flexible_mask + ligand, + receptor, + struc.centroid(ref_ligand), + [20, 20, 20], + flexible=flexible_mask, ) app.set_seed(0) app.start() @@ -65,7 +64,7 @@ def test_docking(flexible): # Select best binding pose test_ligand_coord = test_ligand_coord[0] not_nan_mask = ~np.isnan(test_ligand_coord).any(axis=-1) - ref_ligand_coord = ref_ligand_coord[not_nan_mask] + ref_ligand_coord = ref_ligand_coord[not_nan_mask] test_ligand_coord = test_ligand_coord[not_nan_mask] # Check if it least one atom is preserved assert test_ligand_coord.shape[1] > 0 @@ -78,7 +77,7 @@ def test_docking(flexible): # Select best binding pose test_receptor_coord = test_receptor_coord[0] not_nan_mask = ~np.isnan(test_receptor_coord).any(axis=-1) - ref_receptor_coord = receptor[not_nan_mask] + ref_receptor_coord = receptor[not_nan_mask] test_receptor_coord = test_receptor_coord[not_nan_mask] # Check if it least one atom is preserved assert test_receptor_coord.shape[1] > 0 @@ -86,9 +85,7 @@ def test_docking(flexible): # from the original conformation # NOTE: Currently 1.0 Å is sufficient in local testing, # but not in the CI (1.6 Å) - assert np.max( - struc.distance(test_receptor_coord, ref_receptor_coord) - ) < 1.7 + assert np.max(struc.distance(test_receptor_coord, ref_receptor_coord)) < 1.7 else: ref_receptor_coord = receptor.coord for model_coord in test_receptor_coord: diff --git a/tests/application/test_blast.py b/tests/application/test_blast.py index 49bfed2b4..d24bdf52a 100644 --- a/tests/application/test_blast.py +++ 
b/tests/application/test_blast.py @@ -2,15 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +import os.path +import pytest +import biotite.application.blast as blast import biotite.sequence as seq import biotite.sequence.io as seqio -import biotite.application.blast as blast -import numpy as np -from requests.exceptions import ConnectionError -import pytest -import os.path -from ..util import data_dir, cannot_connect_to - +from ..util import cannot_connect_to, data_dir BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" @@ -22,10 +19,7 @@ prot_seq = seq.ProteinSequence("MTMITPSFPGNS") -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastn(): app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -36,10 +30,8 @@ def test_blastn(): assert dna_seq == alignments[0].sequences[0] assert dna_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastx(): app = blast.BlastWebApp("blastx", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -50,10 +42,8 @@ def test_blastx(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_tblastx(): app = blast.BlastWebApp("tblastx", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -61,16 +51,14 @@ def test_tblastx(): app.join(timeout=300) alignments = app.get_alignments() # BLAST should find original sequence as best hit - print (alignments[0].sequences[0]) - print 
(alignments[0].sequences[1]) + print(alignments[0].sequences[0]) + print(alignments[0].sequences[1]) rev_prot_seq = dna_seq.reverse().complement().translate(complete=True) assert rev_prot_seq == alignments[0].sequences[0] assert rev_prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastp(): app = blast.BlastWebApp("blastp", prot_seq, obey_rules=False) app.set_max_expect_value(100) @@ -81,10 +69,8 @@ def test_blastp(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_tblastn(): app = blast.BlastWebApp("tblastn", prot_seq, obey_rules=False) app.set_max_expect_value(200) @@ -95,20 +81,20 @@ def test_tblastn(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] + def test_file_input(): path = os.path.join(data_dir("sequence"), "prot.fasta") app = blast.BlastWebApp("blastp", path, obey_rules=False) + def test_invalid_query(): with pytest.raises(ValueError): app = blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False) with pytest.raises(ValueError): app = blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False) - -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_no_hit(): app = blast.BlastWebApp("blastn", "ACTGTACGAAACTCGGCGTA", obey_rules=False) app.set_word_size(20) @@ -118,10 +104,8 @@ def test_no_hit(): # BLAST should find original sequence as best hit assert len(alignments) == 0 -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - 
reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_invalid_input(): app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False) # Set some invalid parameters @@ -132,18 +116,15 @@ def test_invalid_input(): app.join(timeout=300) -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_hit_with_selenocysteine(): # Sequence is taken from issue #344 query = seqio.load_sequence( os.path.join(data_dir("sequence"), "selenocysteine.fasta") ) - + # Expect hit containing selenocysteine when searching Swiss-Prot blast_app = blast.BlastWebApp("blastp", query, "swissprot") blast_app.start() # No AlphabetError should be raised here - blast_app.join() \ No newline at end of file + blast_app.join() diff --git a/tests/application/test_dssp.py b/tests/application/test_dssp.py index 197790236..fcb7303d0 100644 --- a/tests/application/test_dssp.py +++ b/tests/application/test_dssp.py @@ -16,8 +16,7 @@ @pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed") def test_multiple_chains(): atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), model=1 ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] sse = DsspApp.annotate_sse(atoms) diff --git a/tests/application/test_msa.py b/tests/application/test_msa.py index f4a8c16e6..2971822b5 100644 --- a/tests/application/test_msa.py +++ b/tests/application/test_msa.py @@ -2,36 +2,31 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from distutils.version import Version +import numpy as np +import pytest import biotite.sequence as seq -import biotite.sequence.phylo as phylo import biotite.sequence.align as align +import biotite.sequence.phylo as phylo from biotite.application import VersionError -from biotite.application.muscle import MuscleApp, Muscle5App -from biotite.application.mafft import MafftApp from biotite.application.clustalo import ClustalOmegaApp -import numpy as np -import pytest -import shutil +from biotite.application.mafft import MafftApp +from biotite.application.muscle import Muscle5App, MuscleApp from ..util import is_not_installed - BIN_PATH = { - MuscleApp : "muscle", - Muscle5App : "muscle", - MafftApp : "mafft", - ClustalOmegaApp: "clustalo" + MuscleApp: "muscle", + Muscle5App: "muscle", + MafftApp: "mafft", + ClustalOmegaApp: "clustalo", } @pytest.fixture def sequences(): - return [seq.ProteinSequence(string) for string in [ - "BIQTITE", - "TITANITE", - "BISMITE", - "IQLITE" -]] + return [ + seq.ProteinSequence(string) + for string in ["BIQTITE", "TITANITE", "BISMITE", "IQLITE"] + ] @pytest.mark.parametrize( @@ -70,7 +65,7 @@ def sequences(): [1, 2, 0, 3] ) ] -) # fmt: skip +) # fmt: skip def test_msa(sequences, app_cls, exp_ali, exp_order): """ Test MSA software on short toy sequences with known alignment @@ -83,7 +78,7 @@ def test_msa(sequences, app_cls, exp_ali, exp_order): try: app = app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -115,14 +110,13 @@ def test_large_sequence_number(app_cls): try: app = app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() # Expect completely matching sequences - assert alignment.trace.tolist() == [ - [i]*SEQ_NUMBER for i in range(SEQ_LENGTH) - ] + assert 
alignment.trace.tolist() == [[i] * SEQ_NUMBER for i in range(SEQ_LENGTH)] + def test_additional_options(sequences): bin_path = BIN_PATH[ClustalOmegaApp] @@ -139,7 +133,7 @@ def test_additional_options(sequences): app1.join() app2.join() assert "--full" not in app1.get_command() - assert "--full" in app2.get_command() + assert "--full" in app2.get_command() assert app1.get_alignment() == app2.get_alignment() @@ -158,11 +152,11 @@ def test_custom_substitution_matrix(sequences, app_cls): "TITANITE\n" "BI-SMITE\n" "-I-QLITE" - ) # fmt: skip + ) # fmt: skip try: app = app_cls(sequences, matrix=matrix) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -181,16 +175,16 @@ def test_custom_sequence_type(app_cls): sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] # fmt: skip + ]] # fmt: skip exp_trace = [ - [ 0, 0], - [ 1, -1], - [ 2, 1], - [ 3, 2], - [-1, 3], - [ 4, 4], - [ 5, 5], - [ 6, 6], + [0, 0], + [1, -1], + [2, 1], + [3, 2], + [-1, 3], + [4, 4], + [5, 5], + [6, 6], ] # Strong identity matrix score_matrix = np.identity(len(alph)) @@ -200,7 +194,7 @@ def test_custom_sequence_type(app_cls): try: app = app_cls(sequences, matrix=matrix) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -222,12 +216,12 @@ def test_invalid_sequence_type_no_matrix(app_cls): sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] # fmt: skip + ]] # fmt: skip with pytest.raises(TypeError): try: app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") @pytest.mark.parametrize("app_cls", [MuscleApp, 
MafftApp, ClustalOmegaApp]) @@ -241,15 +235,18 @@ def test_invalid_sequence_type_unsuitable_alphabet(app_cls): pytest.skip(f"'{bin_path}' is not installed") alph = seq.Alphabet(range(50)) - sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ - [1,2,3], - [1,2,3], - ]] + sequences = [ + seq.GeneralSequence(alph, sequence) + for sequence in [ + [1, 2, 3], + [1, 2, 3], + ] + ] with pytest.raises(TypeError): try: app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") def test_invalid_muscle_version(sequences): @@ -262,7 +259,7 @@ def test_invalid_muscle_version(sequences): pytest.skip(f"'{bin_path}' is not installed") if is_not_installed("muscle"): - pytest.skip(f"'muscle' is not installed") + pytest.skip("'muscle' is not installed") with pytest.raises(VersionError): MuscleApp(sequences) @@ -279,7 +276,7 @@ def test_clustalo_matrix(sequences): [1, 0, 1, 2], [2, 1, 0, 1], [3, 2, 1, 0] - ] # fmt: skip + ] # fmt: skip app = ClustalOmegaApp(sequences) app.full_matrix_calculation() app.set_distance_matrix(np.array(ref_matrix)) @@ -332,7 +329,7 @@ def test_muscle_tree(sequences): try: app = MuscleApp(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() tree1 = app.get_guide_tree(iteration="kmer") @@ -349,7 +346,7 @@ def test_muscle5_options(sequences): try: app = Muscle5App(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.use_super5() app.set_iterations(2, 100) app.set_thread_number(2) @@ -366,4 +363,4 @@ def test_muscle5_options(sequences): "TITANITE\n" \ "BI-SMITE\n" \ "-I-QLITE" - ) # fmt: skip \ No newline at end of file + ) # fmt: skip diff --git a/tests/application/test_rnaalifold.py b/tests/application/test_rnaalifold.py index f55b6bdb1..281aefa81 100644 --- a/tests/application/test_rnaalifold.py +++ 
b/tests/application/test_rnaalifold.py @@ -29,7 +29,7 @@ def sample_app(): is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) def test_get_dot_bracket(sample_app): - assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." + assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." @pytest.mark.skipif( @@ -38,19 +38,17 @@ def test_get_dot_bracket(sample_app): def test_get_free_energy(sample_app): assert sample_app.get_free_energy() == -1.3 + @pytest.mark.skipif( is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) def test_get_base_pairs(sample_app): - expected_basepairs = np.array([[ 0, 22], - [ 1, 21], - [ 2, 20], - [ 4, 19], - [ 5, 18], - [ 6, 16], - [ 7, 15]]) + expected_basepairs = np.array( + [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]] + ) assert np.all(sample_app.get_base_pairs() == expected_basepairs) + @pytest.mark.skipif( is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) @@ -63,7 +61,7 @@ def test_constraints(): sequence = seq.NucleotideSequence("A" * 20) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() alignment = align.align_ungapped(sequence, sequence, matrix) - + # An arbitrary secondary structure # The loop in the center must probably comprise at least 5 bases # due to the dynamic programming algorithm @@ -72,15 +70,18 @@ def test_constraints(): app = RNAalifoldApp(alignment) app.set_constraints( - pairs=np.stack([ - np.where(ref_dotbracket_array == "(")[0], - np.where(ref_dotbracket_array == ")")[0][::-1] - ], axis=-1), - unpaired = (ref_dotbracket_array == "x"), - enforce=True + pairs=np.stack( + [ + np.where(ref_dotbracket_array == "(")[0], + np.where(ref_dotbracket_array == ")")[0][::-1], + ], + axis=-1, + ), + unpaired=(ref_dotbracket_array == "x"), + enforce=True, ) app.start() app.join() test_dotbracket = app.get_dot_bracket() - assert test_dotbracket == ref_dotbracket.replace("x", ".") \ No newline at end of file + assert 
test_dotbracket == ref_dotbracket.replace("x", ".") diff --git a/tests/application/test_rnafold.py b/tests/application/test_rnafold.py index f8b0ccfd7..048ff5667 100644 --- a/tests/application/test_rnafold.py +++ b/tests/application/test_rnafold.py @@ -22,36 +22,25 @@ def sample_app(): return app -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_dot_bracket(sample_app): - assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." + assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_free_energy(sample_app): assert sample_app.get_free_energy() == -1.3 -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) + +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_base_pairs(sample_app): - expected_basepairs = np.array([[ 0, 22], - [ 1, 21], - [ 2, 20], - [ 4, 19], - [ 5, 18], - [ 6, 16], - [ 7, 15]]) + expected_basepairs = np.array( + [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]] + ) assert np.all(sample_app.get_base_pairs() == expected_basepairs) -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_constraints(): """ Constrain every position of the input sequence and expect that the @@ -59,7 +48,7 @@ def test_constraints(): """ # Sequence should not matter sequence = seq.NucleotideSequence("A" * 20) - + # An arbitrary secondary structure # The loop in the center must probably comprise at least 5 bases # due to the dynamic programming algorithm @@ -68,15 +57,18 @@ def test_constraints(): app = 
RNAfoldApp(sequence) app.set_constraints( - pairs=np.stack([ - np.where(ref_dotbracket_array == "(")[0], - np.where(ref_dotbracket_array == ")")[0][::-1] - ], axis=-1), - unpaired = (ref_dotbracket_array == "x"), - enforce=True + pairs=np.stack( + [ + np.where(ref_dotbracket_array == "(")[0], + np.where(ref_dotbracket_array == ")")[0][::-1], + ], + axis=-1, + ), + unpaired=(ref_dotbracket_array == "x"), + enforce=True, ) app.start() app.join() test_dotbracket = app.get_dot_bracket() - assert test_dotbracket == ref_dotbracket.replace("x", ".") \ No newline at end of file + assert test_dotbracket == ref_dotbracket.replace("x", ".") diff --git a/tests/application/test_rnaplot.py b/tests/application/test_rnaplot.py index 8810d1131..449d589c1 100644 --- a/tests/application/test_rnaplot.py +++ b/tests/application/test_rnaplot.py @@ -14,23 +14,24 @@ def sample_app(): Provide a `RNAplotApp` object, where *RNAplot* has been executed for a sample structure. """ - app = RNAplotApp('((..))') + app = RNAplotApp("((..))") app.start() app.join() return app -@pytest.mark.skipif( - is_not_installed("RNAplot"), reason="RNAplot is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAplot"), reason="RNAplot is not installed") def test_get_cooordinates(sample_app): - assert ( - np.all( - sample_app.get_coordinates() == np.array([[ -92.5 , 92.5 ], - [ -92.5 , 77.5 ], - [ -90.31, 58.24], - [-109.69, 58.24], - [-107.5 , 77.5 ], - [-107.5 , 92.5 ]]) + assert np.all( + sample_app.get_coordinates() + == np.array( + [ + [-92.5, 92.5], + [-92.5, 77.5], + [-90.31, 58.24], + [-109.69, 58.24], + [-107.5, 77.5], + [-107.5, 92.5], + ] ) ) diff --git a/tests/application/test_sra.py b/tests/application/test_sra.py index 78b471538..7728ae33a 100644 --- a/tests/application/test_sra.py +++ b/tests/application/test_sra.py @@ -6,16 +6,14 @@ from os.path import join from tempfile import gettempdir import pytest -from biotite.application.sra import FastqDumpApp, FastaDumpApp -from 
biotite.sequence.io.fastq import FastqFile +from biotite.application.sra import FastaDumpApp, FastqDumpApp from biotite.sequence.io.fasta import FastaFile +from biotite.sequence.io.fastq import FastqFile @pytest.mark.parametrize( - "app_class, custom_prefix", itertools.product( - [FastqDumpApp, FastaDumpApp], - [False, True] - ) + "app_class, custom_prefix", + itertools.product([FastqDumpApp, FastaDumpApp], [False, True]), ) def test_objects(app_class, custom_prefix): """ @@ -45,10 +43,8 @@ def test_objects(app_class, custom_prefix): @pytest.mark.parametrize( - "app_class, custom_prefix", itertools.product( - [FastqDumpApp, FastaDumpApp], - [False, True] - ) + "app_class, custom_prefix", + itertools.product([FastqDumpApp, FastaDumpApp], [False, True]), ) def test_classmethod(app_class, custom_prefix): """ diff --git a/tests/application/test_tantan.py b/tests/application/test_tantan.py index dd88abd66..de27f8d3c 100644 --- a/tests/application/test_tantan.py +++ b/tests/application/test_tantan.py @@ -9,22 +9,18 @@ from biotite.application.tantan import TantanApp from ..util import is_not_installed + @pytest.fixture def simple_matrix(): alph = seq.NucleotideSequence.alphabet_unamb return align.SubstitutionMatrix( - alph, alph, np.array( - [[ 1, -1, -1, -1], - [-1, 1, -1, -1], - [-1, -1, 1, -1], - [-1, -1, -1, 1]] - ) + alph, + alph, + np.array([[1, -1, -1, -1], [-1, 1, -1, -1], [-1, -1, 1, -1], [-1, -1, -1, 1]]), ) -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed") @pytest.mark.parametrize("use_custom_matrix", [False, True]) def test_nucleotide(simple_matrix, use_custom_matrix): """ @@ -45,9 +41,7 @@ def test_nucleotide(simple_matrix, use_custom_matrix): assert test_mask.tolist() == ref_mask -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not 
installed") @pytest.mark.parametrize("use_custom_matrix", [False, True]) def test_protein(use_custom_matrix): """ @@ -68,16 +62,14 @@ def test_protein(use_custom_matrix): assert test_mask.tolist() == ref_mask -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed") def test_multiple_sequences(): """ Test masking multiple sequences in a single run. """ seq_strings = [ "CANYQVcanacanasacannercancanACAN", - "NEARAnearanearerearanearlyeerieear" + "NEARAnearanearerearanearlyeerieear", ] sequences = [seq.ProteinSequence(seq_string) for seq_string in seq_strings] @@ -91,4 +83,4 @@ def test_multiple_sequences(): assert len(test_masks) == len(ref_masks) for test_mask, ref_mask in zip(test_masks, ref_masks): - assert test_mask.tolist() == ref_mask \ No newline at end of file + assert test_mask.tolist() == ref_mask diff --git a/tests/conftest.py b/tests/conftest.py index af3b9597b..7701902e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,8 +2,6 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import pytest -import numpy as np def pytest_sessionstart(session): """ @@ -13,10 +11,11 @@ def pytest_sessionstart(session): try: import numpy as np import pyximport + pyximport.install( build_in_temp=False, - setup_args={"include_dirs":np.get_include()}, - language_level=3 + setup_args={"include_dirs": np.get_include()}, + language_level=3, ) except ImportError: - pass \ No newline at end of file + pass diff --git a/tests/database/test_entrez.py b/tests/database/test_entrez.py index a0c4dee44..6c1eb3caa 100644 --- a/tests/database/test_entrez.py +++ b/tests/database/test_entrez.py @@ -4,40 +4,29 @@ import itertools import tempfile -import numpy as np -from requests.exceptions import ConnectionError import pytest import biotite.database.entrez as entrez import biotite.sequence.io.fasta as fasta from biotite.database import RequestError from ..util import cannot_connect_to - NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/" -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") @pytest.mark.parametrize( - "common_name, as_file_like", - itertools.product([False, True], [False, True]) + "common_name, as_file_like", itertools.product([False, True], [False, True]) ) def test_fetch(common_name, as_file_like): path = None if as_file_like else tempfile.gettempdir() db_name = "Protein" if common_name else "protein" - file = entrez.fetch( - "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True - ) + file = entrez.fetch("1L2Y_A", path, "fa", db_name, "fasta", overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 20 -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) + +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") @pytest.mark.parametrize("as_file_like", [False, True]) def 
test_fetch_single_file(as_file_like): if as_file_like: @@ -45,7 +34,7 @@ def test_fetch_single_file(as_file_like): else: file = tempfile.NamedTemporaryFile("r", suffix=".fa") file_name = file.name - + downloaded_file_name = entrez.fetch_single_file( ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta" ) @@ -56,17 +45,12 @@ def test_fetch_single_file(as_file_like): if not as_file_like: file.close() -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) + +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") def test_fetch_invalid(): with pytest.raises(RequestError): # Empty ID list - file = entrez.fetch_single_file( - [], None, "protein", "fasta", overwrite=True) + file = entrez.fetch_single_file([], None, "protein", "fasta", overwrite=True) with pytest.raises(RequestError): # Nonexisting ID - file = entrez.fetch( - "xxxx", None, "fa", "protein", "fasta", overwrite=True - ) \ No newline at end of file + file = entrez.fetch("xxxx", None, "fa", "protein", "fasta", overwrite=True) diff --git a/tests/database/test_pubchem.py b/tests/database/test_pubchem.py index 8c26a1ddc..49025c596 100644 --- a/tests/database/test_pubchem.py +++ b/tests/database/test_pubchem.py @@ -2,27 +2,22 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import re import itertools +import re import tempfile -import pytest import numpy as np +import pytest import biotite.database.pubchem as pubchem import biotite.structure.io.mol as mol from biotite.database import RequestError from ..util import cannot_connect_to - PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/" -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="Pubchem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="Pubchem is not available") @pytest.mark.parametrize( - "format, as_file_like", - itertools.product(["sdf", "png"], [False, True]) + "format, as_file_like", itertools.product(["sdf", "png"], [False, True]) ) def test_fetch(format, as_file_like): """ @@ -39,10 +34,7 @@ def test_fetch(format, as_file_like): mol_file.get_structure() -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize("as_structural_formula", [False, True]) def test_fetch_structural_formula(as_structural_formula): """ @@ -52,9 +44,9 @@ def test_fetch_structural_formula(as_structural_formula): """ CID = 2244 - mol_file = mol.MOLFile.read(pubchem.fetch( - 2244, as_structural_formula=as_structural_formula - )) + mol_file = mol.MOLFile.read( + pubchem.fetch(2244, as_structural_formula=as_structural_formula) + ) atoms = mol_file.get_structure() if as_structural_formula: @@ -63,10 +55,7 @@ def test_fetch_structural_formula(as_structural_formula): assert np.any(atoms.coord[:, 2] != 0) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") def test_fetch_invalid(): """ An exception is expected when the CID is not available. 
@@ -77,10 +66,7 @@ def test_fetch_invalid(): pubchem.fetch(1234567890) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( "query, ref_ids", [ @@ -89,7 +75,7 @@ def test_fetch_invalid(): (pubchem.InchiQuery("InChI=1S/C4H10/c1-3-4-2/h3-4H2,1-2H3"), [7843]), (pubchem.InchiKeyQuery("IJDNQMDRQITEOD-UHFFFAOYSA-N"), [7843]), ], - ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"] + ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"], ) def test_search_simple(query, ref_ids): """ @@ -102,10 +88,7 @@ def test_search_simple(query, ref_ids): assert set(ref_ids).issubset(pubchem.search(query)) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") def test_search_formula(): """ Download a structure and search for its molecular formula in @@ -115,23 +98,17 @@ def test_search_formula(): CID = 101608985 atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure() - test_cids = pubchem.search( - pubchem.FormulaQuery.from_atoms(atoms) - ) + test_cids = pubchem.search(pubchem.FormulaQuery.from_atoms(atoms)) assert CID in (test_cids) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( - "cid, from_atoms, query_type", itertools.product( - [2244], - [False, True], - [pubchem.SuperstructureQuery, pubchem.SubstructureQuery] - ) + "cid, from_atoms, query_type", + itertools.product( + [2244], [False, True], [pubchem.SuperstructureQuery, pubchem.SubstructureQuery] + ), ) def test_search_super_and_substructure(cid, from_atoms, query_type): """ @@ -170,16 +147,9 @@ def test_search_super_and_substructure(cid, from_atoms, query_type): 
assert atoms.array_length() >= original_atoms.array_length() -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( - "conformation_based, from_atoms", - itertools.product( - [False, True], - [False, True] - ) + "conformation_based, from_atoms", itertools.product([False, True], [False, True]) ) def test_search_similarity(conformation_based, from_atoms): """ @@ -192,8 +162,7 @@ def test_search_similarity(conformation_based, from_atoms): if from_atoms: original_atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure() query = pubchem.SimilarityQuery.from_atoms( - original_atoms, threshold=1.0, - conformation_based=conformation_based + original_atoms, threshold=1.0, conformation_based=conformation_based ) else: query = pubchem.SimilarityQuery( @@ -204,10 +173,7 @@ def test_search_similarity(conformation_based, from_atoms): assert CID in cids -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize("from_atoms", [False, True]) def test_search_identity(from_atoms): """ @@ -222,4 +188,4 @@ def test_search_identity(from_atoms): query = pubchem.IdentityQuery(cid=CID) cids = pubchem.search(query) - assert cids == [CID] \ No newline at end of file + assert cids == [CID] diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 4d26cb9f6..e0d0194ab 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -2,32 +2,28 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from os.path import join import itertools import tempfile -import pytest +from os.path import join import numpy as np +import pytest import biotite.database.rcsb as rcsb +import biotite.sequence.align as align +import biotite.sequence.io.fasta as fasta import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx -import biotite.sequence.io.fasta as fasta -import biotite.sequence.align as align from biotite.database import RequestError from ..util import cannot_connect_to, data_dir - RCSB_URL = "https://www.rcsb.org/" # Search term that should only find the entry 1L2Y TC5B_TERM = "Miniprotein Construct TC5b" -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize( "format, as_file_like", - itertools.product(["pdb", "cif", "bcif", "fasta"], [False, True]) + itertools.product(["pdb", "cif", "bcif", "fasta"], [False, True]), ) def test_fetch(format, as_file_like): path = None if as_file_like else tempfile.gettempdir() @@ -47,16 +43,11 @@ def test_fetch(format, as_file_like): assert len(fasta.get_sequences(file)) > 0 -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize("format", ["pdb", "cif", "bcif", "fasta"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - rcsb.fetch( - "xxxx", format, tempfile.gettempdir(), overwrite=True - ) + rcsb.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) def test_search_basic(): @@ -72,58 +63,78 @@ def test_search_basic(): "pdbx_serial_crystallography_sample_delivery_injection.preparation", False, {}, - ["6IG7", "6IG6", "7JRI", "7JR5", "7QX4", "7QX5", "7QX6", "7QX7", - "8A2O", "8A2P"] + [ + "6IG7", + "6IG6", + "7JRI", + "7JR5", + "7QX4", + "7QX5", + "7QX6", + "7QX7", + "8A2O", + "8A2P", + ], ), ( 
"audit_author.name", False, {"is_in": ["Neidigh, J.W."]}, - ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"] + ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"], ), ( "rcsb_entity_source_organism.rcsb_gene_name.value", False, {"exact_match": "lacA"}, - ["5JUV", "1KQA", "1KRV", "1KRU", "1KRR", "3U7V", "4IUG", "4LFK", - "4LFL", "4LFM", "4LFN", "5IFP", "5IFT", "5IHR", "4DUW", "5MGD", - "5MGC"] + [ + "5JUV", + "1KQA", + "1KRV", + "1KRU", + "1KRR", + "3U7V", + "4IUG", + "4LFK", + "4LFL", + "4LFM", + "4LFN", + "5IFP", + "5IFT", + "5IHR", + "4DUW", + "5MGD", + "5MGC", + ], ), ( "struct.title", False, {"contains_words": "tc5b"}, - ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI"] + ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI"], ), ( "reflns.d_resolution_high", False, {"less_or_equal": 0.6}, - ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG", - "7R0H"] + ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG", "7R0H"], ), ( "rcsb_entry_info.deposited_model_count", False, {"range_closed": (60, 61)}, - ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"] + ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"], ), ( "rcsb_id", True, {"exact_match": "AIN"}, - ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"] + ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"], ), - ] -) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" + ], ) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_field(field, molecular_definition, params, ref_ids): - query = rcsb.FieldQuery( - field, molecular_definition, **params - ) + query = rcsb.FieldQuery(field, molecular_definition, **params) test_ids = rcsb.search(query) test_count = rcsb.count(query) @@ -131,17 +142,12 @@ def test_search_field(field, molecular_definition, params, ref_ids): assert test_count == len(ref_ids) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) 
+@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_sequence(): IDENTIY_CUTOFF = 0.9 pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) - ref_sequence = pdbx.get_sequence(pdbx_file)['A'] - query = rcsb.SequenceQuery( - ref_sequence, "protein", min_identity=IDENTIY_CUTOFF - ) + ref_sequence = pdbx.get_sequence(pdbx_file)["A"] + query = rcsb.SequenceQuery(ref_sequence, "protein", min_identity=IDENTIY_CUTOFF) test_ids = rcsb.search(query) assert len(test_ids) >= 2 @@ -156,20 +162,14 @@ def test_search_sequence(): assert identity >= IDENTIY_CUTOFF -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_structure(): query = rcsb.StructureQuery("1L2Y", chain="A") test_ids = rcsb.search(query) assert "1L2Y" in test_ids -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_motif(): # motif is taken from official RCSB search API tutorial MOTIF = "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H." 
@@ -178,25 +178,18 @@ def test_search_motif(): assert test_count == pytest.approx(639, rel=0.1) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_composite(): query1 = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" - ) - query2 = rcsb.FieldQuery( - "exptl.method", - exact_match="SOLUTION NMR" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) + query2 = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR") ids_1 = set(rcsb.search(query1)) ids_2 = set(rcsb.search(query2)) ids_or = set(rcsb.search(query1 | query2)) ids_and = set(rcsb.search(query1 & query2)) - assert ids_or == ids_1 | ids_2 + assert ids_or == ids_1 | ids_2 assert ids_and == ids_1 & ids_2 @@ -209,26 +202,19 @@ def test_search_composite(): ("non_polymer_entity", [] ), ("polymer_instance", ["1L2Y.A"]), ] -) # fmt: skip -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +) # fmt: skip +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_return_type(return_type, expected): query = rcsb.BasicQuery(TC5B_TERM) assert rcsb.search(query, return_type) == expected assert rcsb.count(query, return_type) == len(expected) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize("seed", np.arange(5)) def test_search_range(seed): query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) count = rcsb.count(query) ref_entries = rcsb.search(query) @@ -241,15 +227,11 @@ def test_search_range(seed): assert test_entries == ref_entries[range[0] : range[1]] 
-@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize("as_sorting_object", [False, True]) def test_search_sort(as_sorting_object): query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) if as_sorting_object: sort_by = rcsb.Sorting("reflns.d_resolution_high", descending=False) @@ -270,20 +252,18 @@ def test_search_sort(as_sorting_object): assert resolutions == list(reversed(sorted(resolutions))) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_content_types(): # Query to limit the number of returned results # for improved performance query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) - experimental_set = set(rcsb.search(query, content_types=["experimental"])) + experimental_set = set(rcsb.search(query, content_types=["experimental"])) computational_set = set(rcsb.search(query, content_types=["computational"])) - combined_set = set(rcsb.search(query, content_types=["experimental", "computational"])) + combined_set = set( + rcsb.search(query, content_types=["experimental", "computational"]) + ) # If there are no results, the following tests make no sense assert len(combined_set) > 0 @@ -294,7 +274,9 @@ def test_search_content_types(): assert rcsb.count(query, content_types=["experimental"]) == len(experimental_set) assert rcsb.count(query, content_types=["computational"]) == len(computational_set) - assert rcsb.count(query, content_types=["experimental", "computational"]) == len(combined_set) + assert rcsb.count(query, 
content_types=["experimental", "computational"]) == len( + combined_set + ) # Expect an exception if no content_type with pytest.raises(ValueError): @@ -303,10 +285,7 @@ def test_search_content_types(): rcsb.count(query, content_types=[]) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize( "grouping, resolution_threshold, return_type, ref_groups", [ @@ -316,79 +295,65 @@ def test_search_content_types(): ), 0.7, "polymer_entity", - set([ - ("3X2M_1",), - ("6E6O_1",), - ("1YK4_1",), - ("5NW3_1",), - ("1US0_1",), - ("4HP2_1",), - ("2DSX_1",), - ("2VB1_1",), - ("7VOS_1", "5D8V_1", "3A38_1"), - ("1UCS_1",), - ("3NIR_1", "1EJG_1"), - ]) + set( + [ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("7VOS_1", "5D8V_1", "3A38_1"), + ("1UCS_1",), + ("3NIR_1", "1EJG_1"), + ] + ), ), - ( - rcsb.UniprotGrouping( - sort_by="rcsb_accession_info.initial_release_date" - ), + rcsb.UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"), 0.7, "polymer_entity", - set([ - ("3X2M_1",), - ("6E6O_1",), - ("1YK4_1",), - ("5NW3_1",), - ("1US0_1",), - ("4HP2_1",), - ("2DSX_1",), - ("2VB1_1",), - ("7VOS_1", "5D8V_1", "3A38_1"), - ("1UCS_1",), - ("3NIR_1", "1EJG_1"), - ]) + set( + [ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("7VOS_1", "5D8V_1", "3A38_1"), + ("1UCS_1",), + ("3NIR_1", "1EJG_1"), + ] + ), ), - ( - rcsb.DepositGrouping( - sort_by="rcsb_accession_info.initial_release_date" - ), + rcsb.DepositGrouping(sort_by="rcsb_accession_info.initial_release_date"), 0.9, "entry", - set([ - ("5R32",), - ("5RDH", "5RBR"), - ("7G0Z", "7FXV") - ]) - ) - ] + set([("5R32",), ("5RDH", "5RBR"), ("7G0Z", "7FXV")]), + ), + ], ) -def test_search_grouping(grouping, resolution_threshold, 
return_type, - ref_groups): +def test_search_grouping(grouping, resolution_threshold, return_type, ref_groups): """ Check whether the same result as in a known example is achieved. """ - query = ( - rcsb.FieldQuery( - "exptl.method", - exact_match="X-RAY DIFFRACTION" - ) - & rcsb.FieldQuery( - "rcsb_entry_info.resolution_combined", - range_closed=(0.0, resolution_threshold) - ) + query = rcsb.FieldQuery( + "exptl.method", exact_match="X-RAY DIFFRACTION" + ) & rcsb.FieldQuery( + "rcsb_entry_info.resolution_combined", range_closed=(0.0, resolution_threshold) ) - test_groups = list(rcsb.search( - query, return_type, - group_by=grouping, return_groups=True - ).values()) + test_groups = list( + rcsb.search(query, return_type, group_by=grouping, return_groups=True).values() + ) test_representatives = rcsb.search( - query, return_type, - group_by=grouping, return_groups=False + query, return_type, group_by=grouping, return_groups=False ) test_count = rcsb.count(query, return_type, group_by=grouping) @@ -398,10 +363,7 @@ def test_search_grouping(grouping, resolution_threshold, return_type, assert test_count == len(ref_groups) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_empty(): query = rcsb.BasicQuery("This will not match any ID") assert rcsb.search(query) == [] @@ -410,21 +372,9 @@ def test_search_empty(): @pytest.mark.parametrize( "field, params", - [ - ( - "invalid.field", - {"exact_match": "Some Value"} - ), - ( - "exptl.method", - {"less": 5} - ) - ] -) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" + [("invalid.field", {"exact_match": "Some Value"}), ("exptl.method", {"less": 5})], ) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_invalid(field, params): invalid_query = rcsb.FieldQuery(field, **params) with 
pytest.raises(RequestError, match="400"): diff --git a/tests/database/test_uniprot.py b/tests/database/test_uniprot.py index 53c12e60b..8f4b07b4d 100644 --- a/tests/database/test_uniprot.py +++ b/tests/database/test_uniprot.py @@ -10,74 +10,49 @@ from biotite.database import RequestError from ..util import cannot_connect_to - UNIPROT_URL = "https://www.uniprot.org/" -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) -@pytest.mark.parametrize( - "as_file_like", - itertools.product([False, True]) -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") +@pytest.mark.parametrize("as_file_like", itertools.product([False, True])) def test_fetch(as_file_like): path = None if as_file_like else tempfile.gettempdir() # UniProtKB - file = uniprot.fetch( - "P12345", "fasta", path, overwrite=True - ) + file = uniprot.fetch("P12345", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 430 # UniRef - file = uniprot.fetch( - "UniRef90_P99999", "fasta", path, overwrite=True - ) + file = uniprot.fetch("UniRef90_P99999", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 105 # UniParc - file = uniprot.fetch( - "UPI000000001F", "fasta", path, overwrite=True - ) + file = uniprot.fetch("UPI000000001F", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 551 -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") @pytest.mark.parametrize("format", ["fasta", "gff", "txt", "xml", "rdf", "tab"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - file = uniprot.fetch( - "xxxx", format, tempfile.gettempdir(), 
overwrite=True - ) + file = uniprot.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") def test_search_simple(): query = uniprot.SimpleQuery("accession", "P12345") - assert uniprot.search(query) \ - == ['P12345'] + assert uniprot.search(query) == ["P12345"] -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") def test_search_composite(): - query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery("reviewed", "true") - assert uniprot.search(query) \ - == ['P12345'] - + query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery( + "reviewed", "true" + ) + assert uniprot.search(query) == ["P12345"] diff --git a/tests/sequence/align/conftest.py b/tests/sequence/align/conftest.py index 191fbde6f..10264bd87 100644 --- a/tests/sequence/align/conftest.py +++ b/tests/sequence/align/conftest.py @@ -15,4 +15,4 @@ def sequences(): 10 Cas9 sequences. """ fasta_file = fasta.FastaFile.read(join(data_dir("sequence"), "cas9.fasta")) - return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()] \ No newline at end of file + return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()] diff --git a/tests/sequence/align/test_alignment.py b/tests/sequence/align/test_alignment.py index ca1250d3e..971aecb73 100644 --- a/tests/sequence/align/test_alignment.py +++ b/tests/sequence/align/test_alignment.py @@ -8,7 +8,6 @@ import biotite.sequence.align as align - def test_alignment_str(): """ Test reading alignments from string. 
@@ -18,11 +17,12 @@ def test_alignment_str(): ali_str = [ "A-CCTGA----", "----T-ATGCT" - ] # fmt: skip + ] # fmt: skip trace = align.Alignment.trace_from_strings(ali_str) alignment = align.Alignment([seq1, seq2], trace, None) assert str(alignment).split("\n") == ali_str + def test_conversion_to_symbols(): """ Test conversion of alignments to strings. @@ -31,16 +31,20 @@ def test_conversion_to_symbols(): seq_str2 = "HA--PRDDADWKLHH" seq_str3 = "HA----DDADWKLHH" seq_strings = [seq_str1, seq_str2, seq_str3] - sequences = [seq.ProteinSequence(seq_str.replace("-","")) - for seq_str in seq_strings] + sequences = [ + seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings + ] trace = align.Alignment.trace_from_strings(seq_strings) alignment = align.Alignment(sequences, trace, score=None) # Test the conversion bach to strings of symbols symbols = align.get_symbols(alignment) - symbols = ["".join([sym if sym is not None else "-" for sym in sym_list]) - for sym_list in symbols] + symbols = [ + "".join([sym if sym is not None else "-" for sym in sym_list]) + for sym_list in symbols + ] assert symbols == seq_strings + def test_identity(): """ Test correct calculation of `get_sequence_identity()` via a known @@ -49,16 +53,18 @@ def test_identity(): seq_str1 = "--HAKLPRDD--WL--" seq_str2 = "FRHA--QRTDADWLHH" seq_strings = [seq_str1, seq_str2] - sequences = [seq.ProteinSequence(seq_str.replace("-","")) - for seq_str in seq_strings] + sequences = [ + seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings + ] trace = align.Alignment.trace_from_strings(seq_strings) alignment = align.Alignment(sequences, trace, score=None) # Assert correct sequence identity calculation modes = ["all", "not_terminal", "shortest"] - values = [6/16, 6/12, 6/10] + values = [6 / 16, 6 / 12, 6 / 10] for mode, value in zip(modes, values): assert align.get_sequence_identity(alignment, mode=mode) == value + @pytest.mark.parametrize("mode", ["all", "not_terminal", 
"shortest"]) def test_pairwise_identity(sequences, mode): """ @@ -67,15 +73,14 @@ def test_pairwise_identity(sequences, mode): """ sequences = sequences msa, _, _, _ = align.align_multiple( - sequences, - matrix=align.SubstitutionMatrix.std_protein_matrix() + sequences, matrix=align.SubstitutionMatrix.std_protein_matrix() ) ref_identity_matrix = np.zeros((len(sequences), len(sequences))) for i in range(len(sequences)): for j in range(len(sequences)): - ref_identity_matrix[i,j] = align.get_sequence_identity( - msa[:, [i,j]], mode=mode + ref_identity_matrix[i, j] = align.get_sequence_identity( + msa[:, [i, j]], mode=mode ) test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode) @@ -89,4 +94,4 @@ def test_pairwise_identity(sequences, mode): # Identity matrix is symmetric assert (test_identity_matrix == test_identity_matrix.T).all() # Pairwise identity must be equal in the two functions - assert (test_identity_matrix == ref_identity_matrix).all() \ No newline at end of file + assert (test_identity_matrix == ref_identity_matrix).all() diff --git a/tests/sequence/align/test_banded.py b/tests/sequence/align/test_banded.py index f6098f7c7..85e297dcb 100644 --- a/tests/sequence/align/test_banded.py +++ b/tests/sequence/align/test_banded.py @@ -3,18 +3,16 @@ # information. import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align @pytest.mark.parametrize( - "gap_penalty, local, band_width", itertools.product( - [-10, (-10,-1)], - [False, True], - [2, 5, 20, 100] -)) + "gap_penalty, local, band_width", + itertools.product([-10, (-10, -1)], [False, True], [2, 5, 20, 100]), +) def test_simple_alignment(gap_penalty, local, band_width): """ Test `align_banded()` by comparing the output to `align_optimal()`. 
@@ -27,16 +25,19 @@ def test_simple_alignment(gap_penalty, local, band_width): matrix = align.SubstitutionMatrix.std_protein_matrix() ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=local, terminal_penalty=False + seq1, seq2, matrix, gap_penalty=gap_penalty, local=local, terminal_penalty=False ) # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments] test_alignments = align.align_banded( - seq1, seq2, matrix, (-band_width, band_width), - gap_penalty=gap_penalty, local=local + seq1, + seq2, + matrix, + (-band_width, band_width), + gap_penalty=gap_penalty, + local=local, ) assert len(test_alignments) == len(ref_alignments) @@ -45,11 +46,13 @@ def test_simple_alignment(gap_penalty, local, band_width): @pytest.mark.parametrize( - "gap_penalty, local, seq_indices", itertools.product( - [-10, (-10,-1)], - [False, True], - [(i,j) for i in range(10) for j in range(i+1)] -)) + "gap_penalty, local, seq_indices", + itertools.product( + [-10, (-10, -1)], + [False, True], + [(i, j) for i in range(10) for j in range(i + 1)], + ), +) def test_complex_alignment(sequences, gap_penalty, local, seq_indices): """ Test `align_banded()` by comparing the output to `align_optimal()`. 
@@ -65,9 +68,13 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): seq2 = sequences[index2] ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=local, terminal_penalty=False, - max_number=MAX_NUMBER + seq1, + seq2, + matrix, + gap_penalty=gap_penalty, + local=local, + terminal_penalty=False, + max_number=MAX_NUMBER, ) # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() @@ -78,8 +85,13 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): # otherwise use the entire search space band_width = 100 if identity > 0.5 else len(seq1) + len(seq2) test_alignments = align.align_banded( - seq1, seq2, matrix, (-band_width, band_width), - gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER + seq1, + seq2, + matrix, + (-band_width, band_width), + gap_penalty=gap_penalty, + local=local, + max_number=MAX_NUMBER, ) try: @@ -102,11 +114,9 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): @pytest.mark.parametrize( - "length, excerpt_length, seed", itertools.product( - [1_000, 1_000_000], - [50, 500], - range(10) -)) + "length, excerpt_length, seed", + itertools.product([1_000, 1_000_000], [50, 500], range(10)), +) def test_large_sequence_mapping(length, excerpt_length, seed): """ Test whether an excerpt of a very large sequence is aligned to that @@ -121,51 +131,37 @@ def test_large_sequence_mapping(length, excerpt_length, seed): excerpt_pos = np.random.randint(len(sequence) - excerpt_length) excerpt = sequence[excerpt_pos : excerpt_pos + excerpt_length] - diagonal = np.random.randint( - excerpt_pos - BAND_WIDTH, - excerpt_pos + BAND_WIDTH - ) - band = ( - diagonal - BAND_WIDTH, - diagonal + BAND_WIDTH - ) + diagonal = np.random.randint(excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH) + band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH) print(band) print(len(sequence), len(excerpt)) matrix = 
align.SubstitutionMatrix.std_nucleotide_matrix() - test_alignments = align.align_banded( - excerpt, sequence, matrix, band=band - ) + test_alignments = align.align_banded(excerpt, sequence, matrix, band=band) # The excerpt should be uniquely mappable to a single location on # the long sequence assert len(test_alignments) == 1 test_alignment = test_alignments[0] test_trace = test_alignment.trace - ref_trace = np.stack([ - np.arange(len(excerpt)), - np.arange(excerpt_pos, len(excerpt) + excerpt_pos) - ], axis=1) + ref_trace = np.stack( + [np.arange(len(excerpt)), np.arange(excerpt_pos, len(excerpt) + excerpt_pos)], + axis=1, + ) assert np.array_equal(test_trace, ref_trace) - @pytest.mark.parametrize( - "gap_penalty, local, seed", itertools.product( - [-10, (-10, -1)], - [False, True], - range(100) -)) + "gap_penalty, local, seed", + itertools.product([-10, (-10, -1)], [False, True], range(100)), +) def test_swapping(gap_penalty, local, seed): """ Check if `align_banded()` returns a 'swapped' alignment, if the order of input sequences is swapped. 
""" np.random.seed(seed) - band = ( - np.random.randint(-30, -10), - np.random.randint( 10, 30) - ) + band = (np.random.randint(-30, -10), np.random.randint(10, 30)) seq1, seq2 = _create_random_pair(seed) matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -177,7 +173,7 @@ def test_swapping(gap_penalty, local, seed): seq2, seq1, matrix, band=band, local=local, gap_penalty=gap_penalty ) - if len(ref_alignments) != 1 or len(test_alignments) != 1: + if len(ref_alignments) != 1 or len(test_alignments) != 1: # If multiple optimal alignments exist, # it is not easy to assign a swapped one to an original one # therefore, simply return in this case @@ -192,10 +188,14 @@ def test_swapping(gap_penalty, local, seed): assert np.array_equal(test_alignment.trace, ref_alignment.trace[:, ::-1]) - -def _create_random_pair(seed, length=100, max_subsitutions=5, - max_insertions=5, max_deletions=5, - max_truncations=5): +def _create_random_pair( + seed, + length=100, + max_subsitutions=5, + max_insertions=5, + max_deletions=5, + max_truncations=5, +): """ generate a pair of protein sequences. 
Each pair contains @@ -216,9 +216,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, subsitution_indices = np.random.choice( np.arange(len(mutant)), size=n_subsitutions, replace=False ) - subsitution_values = np.random.randint( - len(original.alphabet), size=n_subsitutions - ) + subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions) mutant.code[subsitution_indices] = subsitution_values # Random insertions @@ -226,9 +224,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, insertion_indices = np.random.choice( np.arange(len(mutant)), size=n_insertions, replace=False ) - insertion_values = np.random.randint( - len(original.alphabet), size=n_insertions - ) + insertion_values = np.random.randint(len(original.alphabet), size=n_insertions) mutant.code = np.insert(mutant.code, insertion_indices, insertion_values) # Random deletions @@ -240,12 +236,10 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, # Truncate at both ends of original and mutant original = original[ - np.random.randint(max_truncations) : - -(1 + np.random.randint(max_truncations)) + np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations)) ] mutant = mutant[ - np.random.randint(max_truncations) : - -(1 + np.random.randint(max_truncations)) + np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations)) ] - return original, mutant \ No newline at end of file + return original, mutant diff --git a/tests/sequence/align/test_cigar.py b/tests/sequence/align/test_cigar.py index 2c4767ddc..e4ffe4b04 100644 --- a/tests/sequence/align/test_cigar.py +++ b/tests/sequence/align/test_cigar.py @@ -18,10 +18,12 @@ def _generate_cigar(seed): # Alternatingly insert matches and insertions/deletions cigar += f"{np.random.randint(1, 100)}M" op = align.CigarOp( - np.random.choice([ - align.CigarOp.INSERTION, - align.CigarOp.DELETION, - ]) + np.random.choice( + [ + align.CigarOp.INSERTION, + align.CigarOp.DELETION, + 
] + ) ).to_cigar_symbol() cigar += f"{np.random.randint(1, 100)}{op}" # Alignment must end with a match @@ -34,8 +36,9 @@ def _generate_cigar(seed): return cigar -def _mutate_sequence(original, - max_subsitutions=50, max_insertions=50, max_deletions=50): +def _mutate_sequence( + original, max_subsitutions=50, max_insertions=50, max_deletions=50 +): """ Introduce random deletions, insertions and substitutions into a sequence. @@ -47,9 +50,7 @@ def _mutate_sequence(original, subsitution_indices = np.random.choice( np.arange(len(mutant)), size=n_subsitutions, replace=False ) - subsitution_values = np.random.randint( - len(original.alphabet), size=n_subsitutions - ) + subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions) mutant.code[subsitution_indices] = subsitution_values # Random insertions @@ -57,9 +58,7 @@ def _mutate_sequence(original, insertion_indices = np.random.choice( np.arange(len(mutant)), size=n_insertions, replace=False ) - insertion_values = np.random.randint( - len(original.alphabet), size=n_insertions - ) + insertion_values = np.random.randint(len(original.alphabet), size=n_insertions) mutant.code = np.insert(mutant.code, insertion_indices, insertion_values) # Random deletions @@ -83,8 +82,8 @@ def test_cigar_conversion(cigar): # The sequences are arbitrary, only the alignment trace matters # However, they still need to be long enough for the number of CIGAR # operations - ref = seq.NucleotideSequence(["A"]*LENGTH) - seg = seq.NucleotideSequence(["A"]*LENGTH) + ref = seq.NucleotideSequence(["A"] * LENGTH) + seg = seq.NucleotideSequence(["A"] * LENGTH) alignment = align.read_alignment_from_cigar(cigar, 0, ref, seg) print(alignment) @@ -103,10 +102,9 @@ def test_cigar_conversion(cigar): [False, True], [False, True], [False, True], - ) + ), ) -def test_alignment_conversion(seed, local, distinguish_matches, - include_terminal_gaps): +def test_alignment_conversion(seed, local, distinguish_matches, include_terminal_gaps): """ 
Check whether an :class:`Alignment` converted into a CIGAR string and back again into an :class:`Alignment` gives the same result. @@ -114,20 +112,16 @@ def test_alignment_conversion(seed, local, distinguish_matches, REF_LENGTH = 1000 np.random.seed(seed) ref = seq.NucleotideSequence(ambiguous=False) - ref.code = np.random.randint( - 0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8 - ) + ref.code = np.random.randint(0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8) excerpt_start = np.random.randint(0, 200) - excerpt_stop = np.random.randint(REF_LENGTH-200, REF_LENGTH) - seg = ref[excerpt_start: excerpt_stop] + excerpt_stop = np.random.randint(REF_LENGTH - 200, REF_LENGTH) + seg = ref[excerpt_start:excerpt_stop] seg = _mutate_sequence(seg) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() if local: - ref_ali = align.align_optimal( - ref, seg, matrix, local=True, max_number=1 - )[0] + ref_ali = align.align_optimal(ref, seg, matrix, local=True, max_number=1)[0] else: ref_ali = align.align_optimal( ref, seg, matrix, terminal_penalty=False, max_number=1 @@ -138,17 +132,15 @@ def test_alignment_conversion(seed, local, distinguish_matches, # Remove score as the compared reconstructed alignment does not # contain it either ref_ali.score = None - start_position = ref_ali.trace[0,0] + start_position = ref_ali.trace[0, 0] cigar = align.write_alignment_to_cigar( ref_ali, distinguish_matches=distinguish_matches, - include_terminal_gaps=include_terminal_gaps + include_terminal_gaps=include_terminal_gaps, ) - test_ali = align.read_alignment_from_cigar( - cigar, start_position, ref, seg - ) + test_ali = align.read_alignment_from_cigar(cigar, start_position, ref, seg) print(cigar) print("\n\n") @@ -156,4 +148,4 @@ def test_alignment_conversion(seed, local, distinguish_matches, print("\n\n") print(test_ali) print("\n\n") - assert test_ali == ref_ali \ No newline at end of file + assert test_ali == ref_ali diff --git a/tests/sequence/align/test_kmeralphabet.py 
b/tests/sequence/align/test_kmeralphabet.py index 1ea31a400..67b3f9b03 100644 --- a/tests/sequence/align/test_kmeralphabet.py +++ b/tests/sequence/align/test_kmeralphabet.py @@ -7,7 +7,6 @@ import biotite.sequence as seq import biotite.sequence.align as align - K = 3 @@ -15,21 +14,24 @@ def kmer_alphabet(): return align.KmerAlphabet(seq.ProteinSequence.alphabet, K) + @pytest.fixture def spaced_kmer_alphabet(): - return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0,1,2]) - + return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0, 1, 2]) np.random.seed(0) N = 10 L = 30 + + @pytest.mark.parametrize( "ref_split_kmer_code", # Test for single instances as input - list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))) + + list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))) + + # Test for multiple instances as input - list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K))) + list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K))), ) def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code): """ @@ -38,15 +40,16 @@ def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code): """ fused = kmer_alphabet.fuse(ref_split_kmer_code) test_split_kmer_code = kmer_alphabet.split(fused) - + assert test_split_kmer_code.tolist() == ref_split_kmer_code.tolist() np.random.seed(0) N = 10 + + @pytest.mark.parametrize( - "split_kmer_code", - np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K)) + "split_kmer_code", np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K)) ) def test_encode_and_decode(kmer_alphabet, split_kmer_code): """ @@ -58,7 +61,7 @@ def test_encode_and_decode(kmer_alphabet, split_kmer_code): ref_kmer_symbol = alph.decode_multiple(split_kmer_code) kmer_code = kmer_alphabet.encode(ref_kmer_symbol) test_kmer_symbol = kmer_alphabet.decode(kmer_code) - + assert test_kmer_symbol.tolist() == ref_kmer_symbol.tolist() @@ -86,6 +89,8 @@ def 
test_create_continuous_kmers(kmer_alphabet): N = 50 + + @pytest.mark.parametrize("seed", range(N)) def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed): """ @@ -99,8 +104,7 @@ def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed): np.random.seed(seed) sequence = seq.ProteinSequence() sequence.code = np.random.randint( - len(sequence.alphabet), - size=np.random.randint(MIN_LENGTH, MAX_LENGTH) + len(sequence.alphabet), size=np.random.randint(MIN_LENGTH, MAX_LENGTH) ) ref_kmers = kmer_alphabet.create_kmers(sequence.code) diff --git a/tests/sequence/align/test_kmersimilarity.py b/tests/sequence/align/test_kmersimilarity.py index a72aeca72..5f1fbbf44 100644 --- a/tests/sequence/align/test_kmersimilarity.py +++ b/tests/sequence/align/test_kmersimilarity.py @@ -15,22 +15,24 @@ def kmer_alphabet(): np.random.seed(0) N = 10 -@pytest.mark.parametrize("ref_kmer, threshold", zip( - np.random.randint(10000, size=N), - np.random.randint(-5, 15, size=N) -)) + + +@pytest.mark.parametrize( + "ref_kmer, threshold", + zip(np.random.randint(10000, size=N), np.random.randint(-5, 15, size=N)), +) def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold): """ Test if the similar k-mers given by :class:`ScoreThresholdRule` are equal to k-mers generated by a brute-force approach. 
""" matrix = align.SubstitutionMatrix.std_protein_matrix() - + ref_kmer_sequence = seq.ProteinSequence() ref_kmer_sequence.code = kmer_alphabet.split(ref_kmer) - + ref_sim_kmer_set = set() - # Iterate through all possible k-mers + # Iterate through all possible k-mers for kmer in range(len(kmer_alphabet)): kmer_sequence = seq.ProteinSequence() kmer_sequence.code = kmer_alphabet.split(kmer) @@ -40,7 +42,7 @@ def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold): # Add k-mer to list if the threshold score is reached if score >= threshold: ref_sim_kmer_set.add(kmer) - + test_rule = align.ScoreThresholdRule(matrix, threshold) test_sim_kmer_set = set(test_rule.similar_kmers(kmer_alphabet, ref_kmer)) @@ -68,4 +70,4 @@ def test_invalid_kmer(kmer_alphabet, invalid_kmer): align.SubstitutionMatrix.std_protein_matrix(), 0 ) with pytest.raises(seq.AlphabetError): - test_rule.similar_kmers(kmer_alphabet, invalid_kmer) \ No newline at end of file + test_rule.similar_kmers(kmer_alphabet, invalid_kmer) diff --git a/tests/sequence/align/test_kmertable.py b/tests/sequence/align/test_kmertable.py index 0f471ca34..deb4b1923 100644 --- a/tests/sequence/align/test_kmertable.py +++ b/tests/sequence/align/test_kmertable.py @@ -4,9 +4,8 @@ import functools import itertools -import string import pickle -from typing import Any +import string import numpy as np import pytest import biotite.sequence as seq @@ -27,9 +26,7 @@ def __init__(self, n_buckets): def __getattr__(self, name): attr = getattr(align.BucketKmerTable, name) - if attr.__name__ in [ - "from_sequences", "from_kmers", "from_kmer_selection" - ]: + if attr.__name__ in ["from_sequences", "from_kmers", "from_kmer_selection"]: return functools.partial(attr, n_buckets=self._n_buckets) else: return attr @@ -47,10 +44,12 @@ def idfn(val): def k(): return 8 + @pytest.fixture def alphabet(): return seq.NucleotideSequence.unambiguous_alphabet() + @pytest.fixture def random_sequences(k, alphabet): N_SEQS = 10 @@ -75,10 +74,10 @@ 
def random_sequences(k, alphabet): # with less buckets than number of possible kmers ... FixedBucketKmerTable(1000), # ... and one test case with more buckets (perfect hashing) - FixedBucketKmerTable(1000000) - ] + FixedBucketKmerTable(1000000), + ], ), - ids = idfn + ids=idfn, ) def test_from_sequences(k, random_sequences, spacing, table_class): """ @@ -86,29 +85,23 @@ def test_from_sequences(k, random_sequences, spacing, table_class): sequence position, if the position is in the C-array of the corresponding k-mer. """ - table = table_class.from_sequences( - k, random_sequences, spacing=spacing - ) + table = table_class.from_sequences(k, random_sequences, spacing=spacing) kmer_alph = align.KmerAlphabet(random_sequences[0].alphabet, k, spacing) assert kmer_alph == table.kmer_alphabet for i, sequence in enumerate(random_sequences): for j in range(kmer_alph.kmer_array_length(len(sequence))): if spacing is None: - kmer = kmer_alph.fuse(sequence.code[j : j+k]) + kmer = kmer_alph.fuse(sequence.code[j : j + k]) else: kmer = kmer_alph.fuse(sequence.code[kmer_alph.spacing + j]) - assert np.array([i,j]) in table[kmer] + assert np.array([i, j]) in table[kmer] @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_kmers(k, random_sequences, table_class): """ @@ -128,12 +121,8 @@ def test_from_kmers(k, random_sequences, table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_kmer_selection(k, alphabet, random_sequences, table_class): """ @@ -149,8 +138,7 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): ] np.random.seed(0) filtered_pos_arrays = [ - 
np.random.randint(len(kmers), size=N_POSITIONS) - for kmers in kmer_arrays + np.random.randint(len(kmers), size=N_POSITIONS) for kmers in kmer_arrays ] filtered_kmer_arrays = [ kmers[filtered_pos] @@ -162,8 +150,9 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): # The total number of k-mers in the table # should be the total number of input k-mers - assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) \ - == np.sum([len(kmers) for kmers in filtered_kmer_arrays]) + assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) == np.sum( + [len(kmers) for kmers in filtered_kmer_arrays] + ) # Each k-mer in the table should be found # in the original k-mer sequences for kmer in range(len(kmer_alph)): @@ -173,12 +162,8 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_tables(k, random_sequences, table_class): """ @@ -205,10 +190,8 @@ def test_from_positions(k, random_sequences): """ ref_table = align.KmerTable.from_sequences(k, random_sequences) - kmer_dict = {kmer : ref_table[kmer] for kmer in range(len(ref_table))} - test_table = align.KmerTable.from_positions( - ref_table.kmer_alphabet, kmer_dict - ) + kmer_dict = {kmer: ref_table[kmer] for kmer in range(len(ref_table))} + test_table = align.KmerTable.from_positions(ref_table.kmer_alphabet, kmer_dict) assert test_table == ref_table @@ -216,14 +199,10 @@ def test_from_positions(k, random_sequences): @pytest.mark.parametrize( "table_class, use_similarity_rule", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(10000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(10000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def 
test_match_table(table_class, use_similarity_rule): """ @@ -233,8 +212,7 @@ def test_match_table(table_class, use_similarity_rule): chosen to yield only the same k-mer as similar k-mer. """ alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_") - phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" \ - "chuck_wood" + phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" "chuck_wood" phrase2 = "woodchuck" sequence1 = seq.GeneralSequence(alphabet, phrase1) sequence2 = seq.GeneralSequence(alphabet, phrase2) @@ -244,30 +222,32 @@ def test_match_table(table_class, use_similarity_rule): table1 = table_class.from_sequences(4, [sequence1]) table2 = table_class.from_sequences(4, [sequence2]) - ref_matches = set([ - (0, 9), - (0, 22), - (1, 23), - (2, 24), - (3, 25), - (4, 26), - (5, 27), - (4, 32), - (5, 33), - (0, 43), - (1, 44), - (2, 45), - (3, 46), - (4, 47), - (5, 48), - (4, 59), - (5, 60), - (0, 65), - ]) + ref_matches = set( + [ + (0, 9), + (0, 22), + (1, 23), + (2, 24), + (3, 25), + (4, 26), + (5, 27), + (4, 32), + (5, 33), + (0, 43), + (1, 44), + (2, 45), + (3, 46), + (4, 47), + (5, 48), + (4, 59), + (5, 60), + (0, 65), + ] + ) test_matches = table1.match_table(table2, similarity_rule=rule) # the reference indices are irrelevant for this test - test_matches = test_matches[:, [1,3]] + test_matches = test_matches[:, [1, 3]] test_matches = set([tuple(match) for match in test_matches]) assert test_matches == ref_matches @@ -275,14 +255,10 @@ def test_match_table(table_class, use_similarity_rule): @pytest.mark.parametrize( "table_class, use_similarity_rule", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def test_match(k, random_sequences, table_class, use_similarity_rule): """ @@ -301,12 +277,8 @@ def test_match(k, 
random_sequences, table_class, use_similarity_rule): for i, kmer in enumerate(kmers): matches = table[kmer] matches = np.stack( - [ - np.full(len(matches), i, dtype=np.uint32), - matches[:,0], - matches[:,1] - ], - axis=1 + [np.full(len(matches), i, dtype=np.uint32), matches[:, 0], matches[:, 1]], + axis=1, ) ref_matches.append(matches) ref_matches = np.concatenate(ref_matches) @@ -319,12 +291,8 @@ def test_match(k, random_sequences, table_class, use_similarity_rule): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_match_kmer_selection(k, random_sequences, table_class): """ @@ -344,12 +312,8 @@ def test_match_kmer_selection(k, random_sequences, table_class): kmer = kmers[pos] matches = table[kmer] matches = np.stack( - [ - np.full(len(matches), pos, dtype=np.uint32), - matches[:,0], - matches[:,1] - ], - axis=1 + [np.full(len(matches), pos, dtype=np.uint32), matches[:, 0], matches[:, 1]], + axis=1, ) ref_matches.append(matches) ref_matches = np.concatenate(ref_matches) @@ -362,14 +326,10 @@ def test_match_kmer_selection(k, random_sequences, table_class): @pytest.mark.parametrize( "table_class, use_mask", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def test_match_equivalence(k, random_sequences, table_class, use_mask): """ @@ -391,27 +351,22 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask): query_mask = removal_masks[0] table_masks = removal_masks[1:] - table = table_class.from_sequences( - k, table_sequences, ignore_masks=table_masks - ) + table = table_class.from_sequences(k, table_sequences, ignore_masks=table_masks) # 42 -> Dummy value 
that is distinct from all reference indices ref_table = table_class.from_sequences( k, [query_sequence], [42], ignore_masks=[query_mask] ) ref_matches = table.match_table(ref_table) - assert np.all(ref_matches[:,0] == 42) + assert np.all(ref_matches[:, 0] == 42) # Store matches in set to remove the order dependency # The first column is not present in the matches # returned by 'match_sequence()' -> [:, 1:] ref_matches = set([tuple(match) for match in ref_matches[:, 1:]]) - test_matches = table.match( - query_sequence, ignore_mask=query_mask - ) + test_matches = table.match(query_sequence, ignore_mask=query_mask) test_matches = set([tuple(match) for match in test_matches]) - # Check if any match is found at all assert len(ref_matches) > 0 # The first column is not present in 'test_matches' @@ -433,7 +388,7 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask): ), ], ids = idfn -)# fmt: skip +) # fmt: skip def test_masking(k, input_mask, ref_output_mask): """ Explicitly test the conversion of removal masks to k-mer masks @@ -446,9 +401,7 @@ def test_masking(k, input_mask, ref_output_mask): sequence = seq.NucleotideSequence() sequence.code = np.zeros(len(input_mask)) - table = align.KmerTable.from_sequences( - k, [sequence], ignore_masks=[input_mask] - ) + table = align.KmerTable.from_sequences(k, [sequence], ignore_masks=[input_mask]) # Get the k-mer positions that were masked test_output_mask = np.zeros(len(ref_output_mask), dtype=bool) @@ -467,7 +420,7 @@ def test_masking(k, input_mask, ref_output_mask): (FixedBucketKmerTable(1000), True), (FixedBucketKmerTable(1000000), True), ], - ids = idfn + ids=idfn, ) def test_count(k, random_sequences, table_class, selected_kmers): """ @@ -476,9 +429,7 @@ def test_count(k, random_sequences, table_class, selected_kmers): """ N_KMERS = 100 - table = table_class.from_sequences( - k, random_sequences - ) + table = table_class.from_sequences(k, random_sequences) if selected_kmers: np.random.seed(0) @@ -486,9 
+437,7 @@ def test_count(k, random_sequences, table_class, selected_kmers): ref_counts = [len(table[kmer]) for kmer in kmers] test_counts = table.count(kmers) else: - ref_counts = [ - len(table[kmer]) for kmer in range(len(table.kmer_alphabet)) - ] + ref_counts = [len(table[kmer]) for kmer in range(len(table.kmer_alphabet))] test_counts = table.count() assert test_counts.tolist() == ref_counts @@ -496,12 +445,8 @@ def test_count(k, random_sequences, table_class, selected_kmers): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_get_kmers(table_class): """ @@ -511,10 +456,7 @@ def test_get_kmers(table_class): """ np.random.seed(0) - kmer_alphabet = align.KmerAlphabet( - seq.NucleotideSequence.unambiguous_alphabet(), - 8 - ) + kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.unambiguous_alphabet(), 8) ref_mask = np.random.choice([False, True], size=len(kmer_alphabet)) ref_kmers = np.where(ref_mask)[0] table = table_class.from_kmers(kmer_alphabet, [ref_kmers]) @@ -526,12 +468,8 @@ def test_get_kmers(table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_pickle(k, random_sequences, table_class): """ @@ -548,10 +486,7 @@ def test_pickle(k, random_sequences, table_class): @pytest.mark.parametrize( "n_kmers, load_factor", - itertools.product( - [1_000, 100_000, 10_000_000, 1_000_000_000], - [0.2, 1.0, 2.0] - ) + itertools.product([1_000, 100_000, 10_000_000, 1_000_000_000], [0.2, 1.0, 2.0]), ) def test_bucket_number(n_kmers, load_factor): """ @@ -563,7 +498,6 @@ def test_bucket_number(n_kmers, load_factor): min_n_buckets = int(n_kmers / load_factor) test_n_buckets = 
align.bucket_number(n_kmers, load_factor) - assert test_n_buckets >= min_n_buckets assert test_n_buckets <= min_n_buckets * 1.05 @@ -573,4 +507,4 @@ def _identity_rule(alphabet): np.fill_diagonal(score_matrix, 0) matrix = align.SubstitutionMatrix(alphabet, alphabet, score_matrix) rule = align.ScoreThresholdRule(matrix, 0) - return rule \ No newline at end of file + return rule diff --git a/tests/sequence/align/test_localgapped.py b/tests/sequence/align/test_localgapped.py index 3d7d0854b..714004118 100644 --- a/tests/sequence/align/test_localgapped.py +++ b/tests/sequence/align/test_localgapped.py @@ -3,8 +3,8 @@ # information. import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align @@ -12,15 +12,14 @@ @pytest.mark.parametrize( "gap_penalty, seed, threshold, direction, score_only", itertools.product( - [-10, (-10,-1)], + [-10, (-10, -1)], [(0, 0), (11, 11), (20, 19), (30, 29)], [20, 100, 500], - ["both", "upstream","downstream"], - [False, True] - ) + ["both", "upstream", "downstream"], + [False, True], + ), ) -def test_simple_alignment(gap_penalty, seed, threshold, - direction, score_only): +def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only): """ Test `align_local_gapped()` by comparing the output to `align_optimal()`. 
@@ -33,22 +32,20 @@ def test_simple_alignment(gap_penalty, seed, threshold, matrix = align.SubstitutionMatrix.std_protein_matrix() ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=True + seq1, seq2, matrix, gap_penalty=gap_penalty, local=True ) # Limit reference alignment range to seed # if the alignment does not extend in both directions for alignment in ref_alignments: - seed_index = np.where(alignment.trace[:,0] == seed[0])[0][0] + seed_index = np.where(alignment.trace[:, 0] == seed[0])[0][0] if direction == "upstream": - alignment.trace = alignment.trace[:seed_index + 1] + alignment.trace = alignment.trace[: seed_index + 1] elif direction == "downstream": alignment.trace = alignment.trace[seed_index:] alignment.score = align.score(alignment, matrix, gap_penalty) test_result = align.align_local_gapped( - seq1, seq2, matrix, seed, threshold, gap_penalty, - 1000, direction, score_only + seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction, score_only ) if score_only: @@ -65,13 +62,12 @@ def test_simple_alignment(gap_penalty, seed, threshold, @pytest.mark.parametrize( "gap_penalty, score_only, seq_indices", itertools.product( - [-10, (-10,-1)], + [-10, (-10, -1)], [False, True], - [(i,j) for i in range(10) for j in range(i+1)] - ) + [(i, j) for i in range(10) for j in range(i + 1)], + ), ) -def test_complex_alignment(sequences, gap_penalty, score_only, - seq_indices): +def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices): """ Test `align_local_gapped()` by comparing the output to `align_optimal()`. 
@@ -90,8 +86,7 @@ def test_complex_alignment(sequences, gap_penalty, score_only, seq2 = sequences[index2] ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER + seq1, seq2, matrix, gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER ) # Select the center of the alignment as seed trace = ref_alignments[0].trace @@ -99,8 +94,7 @@ def test_complex_alignment(sequences, gap_penalty, score_only, seed = trace[len(trace) // 2] test_result = align.align_local_gapped( - seq1, seq2, matrix, seed, THRESHOLD, gap_penalty, - MAX_NUMBER, "both", score_only + seq1, seq2, matrix, seed, THRESHOLD, gap_penalty, MAX_NUMBER, "both", score_only ) if score_only: @@ -112,30 +106,29 @@ def test_complex_alignment(sequences, gap_penalty, score_only, test_alignments = test_result assert test_alignments[0].score == ref_alignments[0].score # Test if the score is also correctly calculated - assert align.score(test_alignments[0], matrix, gap_penalty) \ + assert ( + align.score(test_alignments[0], matrix, gap_penalty) == ref_alignments[0].score - if len(ref_alignments) < MAX_NUMBER \ - and len(test_alignments) < MAX_NUMBER: - # Only test if the exact same alignments were created, - # if the number of traces was not limited by MAX_NUMBER - for i, alignment in enumerate(test_alignments): - try: - assert alignment in ref_alignments - except AssertionError: - # Edge case: - # In rare case the local alignment may be - # slightly longer on the upstream side for - # 'align_local_ungapped()', since the - # upstream side is handled in an inverted - # manner - # However this does not effect the score - # Consequently, the exception is ignored - # if the alignment is longer than all - # reference alignments - if len(alignment) <= max( - [len(ali) for ali in ref_alignments] - ): - raise + ) + if len(ref_alignments) < MAX_NUMBER and len(test_alignments) < MAX_NUMBER: + # Only test if the exact same alignments were created, + # if the 
number of traces was not limited by MAX_NUMBER + for i, alignment in enumerate(test_alignments): + try: + assert alignment in ref_alignments + except AssertionError: + # Edge case: + # In rare case the local alignment may be + # slightly longer on the upstream side for + # 'align_local_ungapped()', since the + # upstream side is handled in an inverted + # manner + # However this does not effect the score + # Consequently, the exception is ignored + # if the alignment is longer than all + # reference alignments + if len(alignment) <= max([len(ali) for ali in ref_alignments]): + raise except AssertionError: print(f"Missing test alignment at index {i}:") print() @@ -150,11 +143,11 @@ def test_complex_alignment(sequences, gap_penalty, score_only, @pytest.mark.parametrize( "gap_penalty, direction, score_only, should_raise", itertools.product( - [-10, (-10,-1)], - ["both", "upstream","downstream"], + [-10, (-10, -1)], + ["both", "upstream", "downstream"], [False, True], - [False, True] - ) + [False, True], + ), ) def test_max_table_size(gap_penalty, direction, score_only, should_raise): """ @@ -183,15 +176,31 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise): if should_raise: with pytest.raises(MemoryError): align.align_local_gapped( - seq1, seq1, matrix, seed, threshold, gap_penalty, 1, - direction, score_only, max_table_size + seq1, + seq1, + matrix, + seed, + threshold, + gap_penalty, + 1, + direction, + score_only, + max_table_size, ) else: result = align.align_local_gapped( - seq1, seq1, matrix, seed, threshold, gap_penalty, 1, - direction, score_only, max_table_size + seq1, + seq1, + matrix, + seed, + threshold, + gap_penalty, + 1, + direction, + score_only, + max_table_size, ) if not score_only and direction == "both": alignment = result[0] # Expect that no gaps are introduced - assert len(alignment) == len(seq1) \ No newline at end of file + assert len(alignment) == len(seq1) diff --git a/tests/sequence/align/test_localungapped.py 
b/tests/sequence/align/test_localungapped.py index f2b0ff74e..11105a11a 100644 --- a/tests/sequence/align/test_localungapped.py +++ b/tests/sequence/align/test_localungapped.py @@ -71,10 +71,19 @@ [[False], [True]], # uint8_code )] -) # fmt: skip -def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, - ref_range1, ref_range2, - direction, score_only, uint8_code): +) # fmt: skip +def test_simple_alignments( + seq_type, + seq1, + seq2, + seed, + threshold, + ref_range1, + ref_range2, + direction, + score_only, + uint8_code, +): """ Check if `algin_local_ungapped()` produces correct alignments based on simple known examples. @@ -99,19 +108,16 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) - ref_alignment = align.Alignment( [seq1, seq2], - np.stack([ - np.arange(*ref_range1), - np.arange(*ref_range2) - ], axis=-1) + np.stack([np.arange(*ref_range1), np.arange(*ref_range2)], axis=-1), ) ref_score = align.score(ref_alignment, matrix) ref_alignment.score = ref_score test_result = align.align_local_ungapped( - seq1, seq2, matrix, seed, threshold, direction, score_only) + seq1, seq2, matrix, seed, threshold, direction, score_only + ) if score_only: assert test_result == ref_score @@ -120,10 +126,7 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, @pytest.mark.parametrize( - "seed, uint8_code", itertools.product( - range(100), - [False, True] - ) + "seed, uint8_code", itertools.product(range(100), [False, True]) ) def test_random_alignment(seed, uint8_code): """ @@ -146,24 +149,21 @@ def test_random_alignment(seed, uint8_code): # Create conserved regions conserved1 = ProteinSequence() - conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE+1) + conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1) conserved1.code = np.random.randint( # Do not include stop symbol for aesthetic reasons -> -1 - 
len(conserved1.alphabet)-1, - size=conserved_len + len(conserved1.alphabet) - 1, + size=conserved_len, ) conserved2 = ProteinSequence() # The second conserved regions is equal to the first one, # except a few point mutations conserved2.code = conserved1.code.copy() mutation_mask = np.random.choice( - [False, True], - size=conserved_len, - p = [1 - MUTATION_PROB, MUTATION_PROB] + [False, True], size=conserved_len, p=[1 - MUTATION_PROB, MUTATION_PROB] ) conserved2.code[mutation_mask] = np.random.randint( - len(conserved2.alphabet)-1, - size=np.count_nonzero(mutation_mask) + len(conserved2.alphabet) - 1, size=np.count_nonzero(mutation_mask) ) # Flank the conserved regions with equal termini to ensure # that the alignment extends from start to end of the region @@ -174,36 +174,33 @@ def test_random_alignment(seed, uint8_code): seq1 = ProteinSequence() seq2 = ProteinSequence() offset = [] - for sequence, conserved in zip( - (seq1, seq2), (conserved1, conserved2) - ): + for sequence, conserved in zip((seq1, seq2), (conserved1, conserved2)): sequence.code = np.random.randint( - len(sequence.alphabet)-1, - size=np.random.randint(MIN_SIZE, MAX_SIZE+1) + len(sequence.alphabet) - 1, size=np.random.randint(MIN_SIZE, MAX_SIZE + 1) ) # Place conserved region randomly within the sequence conserved_pos = np.random.randint(0, len(sequence) - len(conserved)) - sequence.code[conserved_pos : conserved_pos + len(conserved)] \ - = conserved.code + sequence.code[conserved_pos : conserved_pos + len(conserved)] = conserved.code offset.append(conserved_pos) # The seed is placed somewhere in the conserved region seed = np.array(offset) + np.random.randint(len(conserved)) - matrix = align.SubstitutionMatrix.std_protein_matrix() if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) ref_score = align.align_optimal( - seq1, seq2, matrix, local=True, max_number=1, + seq1, + seq2, + matrix, + local=True, + max_number=1, # High gap penalty to prevent introduction of 
gaps, # since 'align_local_ungapped()' is also no able to place gaps - gap_penalty=-1000 + gap_penalty=-1000, )[0].score - test_alignment = align.align_local_ungapped( - seq1, seq2, matrix, seed, THRESHOLD - ) + test_alignment = align.align_local_ungapped(seq1, seq2, matrix, seed, THRESHOLD) assert test_alignment.score == ref_score # Test if the score is also correctly calculated @@ -211,23 +208,23 @@ def test_random_alignment(seed, uint8_code): def _convert_to_uint16_code(seq1, seq2, matrix): - """ - Adjust sequences, so that they use 'uint16' as dtype for the - code. - This is a necessary test, since 'uint8' uses a separate - implementation. - """ - new_alph = seq.Alphabet(np.arange(500)) - code = seq1.code - seq1 = seq.GeneralSequence(new_alph) - seq1.code = code - code = seq2.code - seq2 = seq.GeneralSequence(new_alph) - seq2.code = code - # Adjust the substitution matrix as well, - # so that it is compatible with the new alphabet - score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32) - orig_len = len(matrix.score_matrix()) - score_matrix[:orig_len, :orig_len] = matrix.score_matrix() - matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix) - return seq1, seq2, matrix \ No newline at end of file + """ + Adjust sequences, so that they use 'uint16' as dtype for the + code. + This is a necessary test, since 'uint8' uses a separate + implementation. 
+ """ + new_alph = seq.Alphabet(np.arange(500)) + code = seq1.code + seq1 = seq.GeneralSequence(new_alph) + seq1.code = code + code = seq2.code + seq2 = seq.GeneralSequence(new_alph) + seq2.code = code + # Adjust the substitution matrix as well, + # so that it is compatible with the new alphabet + score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32) + orig_len = len(matrix.score_matrix()) + score_matrix[:orig_len, :orig_len] = matrix.score_matrix() + matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix) + return seq1, seq2, matrix diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 582eec2d3..5916b23d9 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -8,9 +8,14 @@ import biotite.sequence.align as align -@pytest.mark.parametrize("db_entry", [entry for entry - in align.SubstitutionMatrix.list_db() - if entry not in ["NUC","GONNET"]]) +@pytest.mark.parametrize( + "db_entry", + [ + entry + for entry in align.SubstitutionMatrix.list_db() + if entry not in ["NUC", "GONNET"] + ], +) def test_matrices(db_entry): """ Test for exceptions when reading matrix files. 
@@ -19,6 +24,7 @@ def test_matrices(db_entry): alph2 = seq.ProteinSequence.alphabet matrix = align.SubstitutionMatrix(alph1, alph2, db_entry) + def test_matrix_str(): """ Test conversion of substitution matrix to string via a small @@ -26,11 +32,11 @@ def test_matrix_str(): """ alph1 = seq.Alphabet("abc") alph2 = seq.Alphabet("def") - score_matrix = np.arange(9).reshape((3,3)) + score_matrix = np.arange(9).reshape((3, 3)) matrix = align.SubstitutionMatrix(alph1, alph2, score_matrix) assert str(matrix) == "\n".join( [" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"] - ) # fmt: skip \ No newline at end of file + ) # fmt: skip diff --git a/tests/sequence/align/test_multiple.py b/tests/sequence/align/test_multiple.py index 2dd4f757a..32a95321f 100644 --- a/tests/sequence/align/test_multiple.py +++ b/tests/sequence/align/test_multiple.py @@ -3,18 +3,14 @@ # information. import pytest -import biotite.sequence.align as align import biotite.application.muscle as muscle +import biotite.sequence.align as align from biotite.application import VersionError from ...util import is_not_installed - -@pytest.mark.skipif( - is_not_installed("muscle"), - reason="MUSCLE is not installed" -) -@pytest.mark.parametrize("gap_penalty", [-10, (-10,-1)]) +@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed") +@pytest.mark.parametrize("gap_penalty", [-10, (-10, -1)]) def test_align_multiple(sequences, gap_penalty): r""" Test `align_multiple()` function using actual long sequences, @@ -29,18 +25,14 @@ def test_align_multiple(sequences, gap_penalty): test_alignment, order, tree, distances = align.align_multiple( sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True ) - test_score = align.score( - test_alignment, matrix, gap_penalty, terminal_penalty=True - ) + test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True) try: ref_alignment = muscle.MuscleApp.align( sequences, matrix=matrix, gap_penalty=gap_penalty ) except VersionError: - 
pytest.skip(f"Invalid Muscle software version") - ref_score = align.score( - ref_alignment, matrix, gap_penalty, terminal_penalty=True - ) + pytest.skip("Invalid Muscle software version") + ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True) - assert test_score >= ref_score * 0.5 \ No newline at end of file + assert test_score >= ref_score * 0.5 diff --git a/tests/sequence/align/test_pairwise.py b/tests/sequence/align/test_pairwise.py index 38c93ffb9..53b3db0c3 100644 --- a/tests/sequence/align/test_pairwise.py +++ b/tests/sequence/align/test_pairwise.py @@ -5,9 +5,9 @@ import itertools import numpy as np import pytest +import biotite.application.muscle as muscle import biotite.sequence as seq import biotite.sequence.align as align -import biotite.application.muscle as muscle from biotite.application import VersionError from ...util import is_not_installed @@ -47,11 +47,13 @@ def test_align_ungapped(): ("TAAAGCGAAAT\nT---GCG---T")), (False,False,-7, "TAAAGCGAAAT","TGCGT", ("TAAAGCGAAAT\n---TGCGT---")) -] # fmt: skip -@pytest.mark.parametrize("local, term, gap_penalty, input1, input2, expect", - align_cases) -def test_align_optimal_simple(local, term, gap_penalty, - input1, input2, expect): +] # fmt: skip + + +@pytest.mark.parametrize( + "local, term, gap_penalty, input1, input2, expect", align_cases +) +def test_align_optimal_simple(local, term, gap_penalty, input1, input2, expect): """ Test `align_optimal()` function using constructed test cases. 
""" @@ -59,29 +61,27 @@ def test_align_optimal_simple(local, term, gap_penalty, seq2 = seq.NucleotideSequence(input2) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() # Test alignment function - alignments = align.align_optimal(seq1, seq2, - matrix, - gap_penalty=gap_penalty, terminal_penalty=term, - local=local) + alignments = align.align_optimal( + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, local=local + ) for ali in alignments: assert str(ali) in expect # Test if separate score function calculates the same score for ali in alignments: - score = align.score(ali, matrix, - gap_penalty=gap_penalty, terminal_penalty=term) + score = align.score(ali, matrix, gap_penalty=gap_penalty, terminal_penalty=term) assert score == ali.score -@pytest.mark.skipif( - is_not_installed("muscle"), - reason="MUSCLE is not installed" -) +@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed") # Ignore warning about MUSCLE writing no second guide tree @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize("gap_penalty, seq_indices", itertools.product( - [-10, (-10,-1)], [(i,j) for i in range(10) for j in range(i+1)] -)) +@pytest.mark.parametrize( + "gap_penalty, seq_indices", + itertools.product( + [-10, (-10, -1)], [(i, j) for i in range(10) for j in range(i + 1)] + ), +) def test_align_optimal_complex(sequences, gap_penalty, seq_indices): """ Test `align_optimal()` function using real world sequences, @@ -92,8 +92,7 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): seq1 = sequences[index1] seq2 = sequences[index2] test_alignment = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, terminal_penalty=True, max_number=1 + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=True, max_number=1 )[0] try: @@ -101,18 +100,14 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty ) except VersionError: - 
pytest.skip(f"Invalid Muscle software version") + pytest.skip("Invalid Muscle software version") # Check whether the score of the optimal alignments is the same # or higher as the MUSCLE alignment # Direct alignment comparison is not feasible, # since the treatment of terminal gaps is different in MUSCLE - test_score = align.score( - test_alignment, matrix, gap_penalty, terminal_penalty=True - ) - ref_score = align.score( - ref_alignment, matrix, gap_penalty, terminal_penalty=True - ) + test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True) + ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True) try: assert test_score >= ref_score except AssertionError: @@ -127,9 +122,8 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): @pytest.mark.parametrize( - "local, term, gap_penalty, seed", itertools.product( - [True, False], [True, False], [-5, -8, -10, -15], range(10) - ) + "local, term, gap_penalty, seed", + itertools.product([True, False], [True, False], [-5, -8, -10, -15], range(10)), ) def test_affine_gap_penalty(local, term, gap_penalty, seed): """ @@ -144,9 +138,7 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed): for _ in range(2): sequence = seq.NucleotideSequence() length = np.random.randint(*LENGTH_RANGE) - sequence.code = np.random.randint( - len(sequence.alphabet), size=length - ) + sequence.code = np.random.randint(len(sequence.alphabet), size=length) sequences.append(sequence) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() @@ -177,13 +169,15 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed): @pytest.mark.parametrize( - "local, term, gap_penalty, seq_indices", itertools.product( - [True, False], [True, False], [-10, (-10,-1)], - [(i,j) for i in range(10) for j in range(i+1)] - ) + "local, term, gap_penalty, seq_indices", + itertools.product( + [True, False], + [True, False], + [-10, (-10, -1)], + [(i, j) for i in range(10) for j in range(i + 
1)], + ), ) -def test_align_optimal_symmetry(sequences, local, term, gap_penalty, - seq_indices): +def test_align_optimal_symmetry(sequences, local, term, gap_penalty, seq_indices): """ Alignments should be indifferent about which sequence comes first. """ @@ -192,15 +186,23 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty, seq1 = sequences[index1] seq2 = sequences[index2] alignment1 = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, terminal_penalty=term, local=local, - max_number=1 + seq1, + seq2, + matrix, + gap_penalty=gap_penalty, + terminal_penalty=term, + local=local, + max_number=1, )[0] # Swap the sequences alignment2 = align.align_optimal( - seq2, seq1, matrix, - gap_penalty=gap_penalty, terminal_penalty=term, local=local, - max_number=1 + seq2, + seq1, + matrix, + gap_penalty=gap_penalty, + terminal_penalty=term, + local=local, + max_number=1, )[0] # Comparing all traces of both alignments to each other # would be unfeasible @@ -209,10 +211,12 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty, @pytest.mark.parametrize( - "gap_penalty, term, seq_indices", itertools.product( - [-10, (-10,-1)], [False, True], - [(i,j) for i in range(10) for j in range(i+1)] - ) + "gap_penalty, term, seq_indices", + itertools.product( + [-10, (-10, -1)], + [False, True], + [(i, j) for i in range(10) for j in range(i + 1)], + ), ) def test_scoring(sequences, gap_penalty, term, seq_indices): """ @@ -224,12 +228,10 @@ def test_scoring(sequences, gap_penalty, term, seq_indices): seq1 = sequences[index1] seq2 = sequences[index2] alignment = align.align_optimal( - seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, - max_number=1 + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, max_number=1 )[0] try: - assert align.score(alignment, matrix, gap_penalty, term) \ - == alignment.score + assert align.score(alignment, matrix, gap_penalty, term) == alignment.score except AssertionError: 
print(alignment) - raise \ No newline at end of file + raise diff --git a/tests/sequence/align/test_permutation.py b/tests/sequence/align/test_permutation.py index 9f9085f52..1b22b5579 100644 --- a/tests/sequence/align/test_permutation.py +++ b/tests/sequence/align/test_permutation.py @@ -3,9 +3,9 @@ # information. import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align -import pytest def _create_frequency_permutation(k): @@ -34,10 +34,7 @@ def test_random_permutation_modulo(): np.iinfo(np.int64).max + 1, size=SEQ_LENGTH, dtype=np.int64 ) - ref_order = [ - (LCG_A * kmer.item() + LCG_C) % LCG_M - for kmer in kmers - ] + ref_order = [(LCG_A * kmer.item() + LCG_C) % LCG_M for kmer in kmers] permutation = align.RandomPermutation() test_order = permutation.permute(kmers) @@ -60,11 +57,9 @@ def test_random_permutation_randomness(): kmers = np.arange(0, SEQ_LENGTH, dtype=np.int64) permutation = align.RandomPermutation() order = permutation.permute(kmers) - positive = (np.sign(order) == 1) - n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode='valid') - distribution, _ = np.histogram( - n_positive, bins=np.arange(0, 10 * FRAME_SIZE) - ) + positive = np.sign(order) == 1 + n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode="valid") + distribution, _ = np.histogram(n_positive, bins=np.arange(0, 10 * FRAME_SIZE)) # Since each value in the k-mer array is unique, # all mapped values should be unique as well @@ -76,9 +71,7 @@ def test_random_permutation_randomness(): def test_frequency_permutation(): K = 5 - kmer_alphabet = align.KmerAlphabet( - seq.NucleotideSequence.alphabet_unamb, K - ) + kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.alphabet_unamb, K) np.random.seed(0) # Generate a random count order for each k-mer # Use 'np.arange()' to generate a unique order, @@ -89,21 +82,24 @@ def test_frequency_permutation(): kmer_alphabet, # The actual k-mer positions are dummy values, # only the number of 
each k-mer is important for this test - {i: np.zeros((count, 2)) for i, count in enumerate(counts)} + {i: np.zeros((count, 2)) for i, count in enumerate(counts)}, ) permutation = align.FrequencyPermutation.from_table(kmer_table) kmers_sorted_by_frequency = np.argsort(counts) - assert permutation.permute(kmers_sorted_by_frequency).tolist() \ + assert ( + permutation.permute(kmers_sorted_by_frequency).tolist() == np.arange(len(kmer_alphabet), dtype=np.int64).tolist() + ) @pytest.mark.parametrize( - "kmer_range, permutation", [ + "kmer_range, permutation", + [ (np.iinfo(np.int64).max, align.RandomPermutation()), (int(4**5), _create_frequency_permutation(5)), (int(4**8), _create_frequency_permutation(8)), - ] + ], ) def test_min_max(kmer_range, permutation): """ diff --git a/tests/sequence/align/test_selector.py b/tests/sequence/align/test_selector.py index cd2bcc4bb..a062df7eb 100644 --- a/tests/sequence/align/test_selector.py +++ b/tests/sequence/align/test_selector.py @@ -11,12 +11,7 @@ @pytest.mark.parametrize( "seed, window, from_sequence, use_permutation", - itertools.product( - range(20), - [2, 5, 10, 25], - [False, True], - [False, True] - ) + itertools.product(range(20), [2, 5, 10, 25], [False, True], [False, True]), ) def test_minimizer(seed, window, from_sequence, use_permutation): """ @@ -40,23 +35,20 @@ def test_minimizer(seed, window, from_sequence, use_permutation): order = kmers # Use an inefficient but simple algorithm for comparison - ref_minimizer_pos = np.array([ - np.argmin(order[i : i + window]) + i - for i in range(len(order) - (window - 1)) - ]) + ref_minimizer_pos = np.array( + [np.argmin(order[i : i + window]) + i for i in range(len(order) - (window - 1))] + ) # Remove duplicates ref_minimizer_pos = np.unique(ref_minimizer_pos) ref_minimizers = kmers[ref_minimizer_pos] - minimizer_selector = align.MinimizerSelector( - kmer_alph, window, permutation - ) + minimizer_selector = align.MinimizerSelector(kmer_alph, window, permutation) if 
from_sequence: - test_minimizer_pos, test_minimizers \ - = minimizer_selector.select(sequence) + test_minimizer_pos, test_minimizers = minimizer_selector.select(sequence) else: - test_minimizer_pos, test_minimizers \ - = minimizer_selector.select_from_kmers(kmers) + test_minimizer_pos, test_minimizers = minimizer_selector.select_from_kmers( + kmers + ) assert test_minimizer_pos.tolist() == ref_minimizer_pos.tolist() assert test_minimizers.tolist() == ref_minimizers.tolist() @@ -69,10 +61,10 @@ def test_minimizer(seed, window, from_sequence, use_permutation): [2, 3, 5, 7], [(0,), (0, 1, 2), (0, -1), (-2, -1)], [False, True], - [False, True] + [False, True], ), # Print tuples in name of test - ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None + ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None, ) def test_syncmer(seed, s, offset, from_sequence, use_permutation): """ @@ -113,11 +105,9 @@ def test_syncmer(seed, s, offset, from_sequence, use_permutation): sequence.alphabet, K, s, permutation, offset ) if from_sequence: - test_syncmer_pos, test_syncmers \ - = syncmer_selector.select(sequence) + test_syncmer_pos, test_syncmers = syncmer_selector.select(sequence) else: - test_syncmer_pos, test_syncmers \ - = syncmer_selector.select_from_kmers(kmers) + test_syncmer_pos, test_syncmers = syncmer_selector.select_from_kmers(kmers) assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist() assert test_syncmers.tolist() == ref_syncmers.tolist() @@ -141,14 +131,10 @@ def test_cached_syncmer(): np.random.seed(0) sequence.code = np.random.randint(len(sequence.alphabet), size=LENGTH) - syncmer_selector = align.SyncmerSelector( - sequence.alphabet, K, S - ) + syncmer_selector = align.SyncmerSelector(sequence.alphabet, K, S) ref_syncmer_pos, ref_syncmers = syncmer_selector.select(sequence) - cached_syncmer_selector = align.CachedSyncmerSelector( - sequence.alphabet, K, S - ) + cached_syncmer_selector = 
align.CachedSyncmerSelector(sequence.alphabet, K, S) test_syncmer_pos, test_syncmers = cached_syncmer_selector.select(sequence) assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist() @@ -159,13 +145,13 @@ def test_cached_syncmer(): "offset, exception_type", [ # Duplicate values - ((1, 1), ValueError), + ((1, 1), ValueError), ((0, 2, 0), ValueError), - ((0, -10), ValueError), + ((0, -10), ValueError), # Offset out of window range - ((-11,), IndexError), - ((10,), IndexError), - ] + ((-11,), IndexError), + ((10,), IndexError), + ], ) def test_syncmer_invalid_offset(offset, exception_type): """ @@ -176,7 +162,10 @@ def test_syncmer_invalid_offset(offset, exception_type): with pytest.raises(exception_type): align.SyncmerSelector( # Any alphabet would work here - seq.NucleotideSequence.alphabet_unamb, K, S, offset=offset + seq.NucleotideSequence.alphabet_unamb, + K, + S, + offset=offset, ) @@ -205,12 +194,9 @@ def test_mincode(use_permutation): permutation_range = len(kmer_alph) order = kmers - mincode_selector = align.MincodeSelector( - kmer_alph, COMPRESSION, permutation - ) + mincode_selector = align.MincodeSelector(kmer_alph, COMPRESSION, permutation) _, mincode_pos = mincode_selector.select_from_kmers(kmers) threshold = permutation_offset + permutation_range / COMPRESSION assert mincode_pos.tolist() == np.where(order < threshold)[0].tolist() - assert len(mincode_pos) * COMPRESSION \ - == pytest.approx(len(kmers), rel=0.02) \ No newline at end of file + assert len(mincode_pos) * COMPRESSION == pytest.approx(len(kmers), rel=0.02) diff --git a/tests/sequence/align/test_statistics.py b/tests/sequence/align/test_statistics.py index c63b5513c..5d9f5a13a 100644 --- a/tests/sequence/align/test_statistics.py +++ b/tests/sequence/align/test_statistics.py @@ -2,49 +2,55 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align from biotite.sequence.align.statistics import EValueEstimator - -BACKGROUND = np.array(list({ - "A": 35155, - "C": 8669, - "D": 24161, - "E": 28354, - "F": 17367, - "G": 33229, - "H": 9906, - "I": 23161, - "K": 25872, - "L": 40625, - "M": 10101, - "N": 20212, - "P": 23435, - "Q": 19208, - "R": 23105, - "S": 32070, - "T": 26311, - "V": 29012, - "W": 5990, - "Y": 14488, - "B": 0, - "Z": 0, - "X": 0, - "*": 0, -}.values())) / 450431 +BACKGROUND = ( + np.array( + list( + { + "A": 35155, + "C": 8669, + "D": 24161, + "E": 28354, + "F": 17367, + "G": 33229, + "H": 9906, + "I": 23161, + "K": 25872, + "L": 40625, + "M": 10101, + "N": 20212, + "P": 23435, + "Q": 19208, + "R": 23105, + "S": 32070, + "T": 26311, + "V": 29012, + "W": 5990, + "Y": 14488, + "B": 0, + "Z": 0, + "X": 0, + "*": 0, + }.values() + ) + ) + / 450431 +) @pytest.mark.parametrize( "matrix_name, gap_penalty, ref_lam, ref_k", [ ("BLOSUM62", (-10000, -10000), 0.318, 0.130), - ("BLOSUM62", ( -12, -2), 0.300, 0.090), - ("BLOSUM62", ( -5, -5), 0.131, 0.009), - ( "PAM250", ( -16, -1), 0.172, 0.018), - ] + ("BLOSUM62", (-12, -2), 0.300, 0.090), + ("BLOSUM62", (-5, -5), 0.131, 0.009), + ("PAM250", (-16, -1), 0.172, 0.018), + ], ) def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k): """ @@ -60,8 +66,7 @@ def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k): np.random.seed(0) estimator = align.EValueEstimator.from_samples( - alphabet, matrix, gap_penalty, BACKGROUND, - SAMPLE_LENGTH, SAMPLE_SIZE + alphabet, matrix, gap_penalty, BACKGROUND, SAMPLE_LENGTH, SAMPLE_SIZE ) # Due to relatively low sample size, expect rather large deviation @@ -84,35 +89,29 @@ def test_evalue(): matrix = align.SubstitutionMatrix.std_protein_matrix() estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, - BACKGROUND + 
seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND ) # Generate large number of alignments of random sequences np.random.seed(0) random_sequence_code = np.random.choice( - len(seq.ProteinSequence.alphabet), - size=(N_SAMPLES, 2, SEQ_LENGTH), - p=BACKGROUND + len(seq.ProteinSequence.alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND ) sample_scores = np.zeros(N_SAMPLES, dtype=int) for i in range(N_SAMPLES): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_scores[i] = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0].score e_values = [ 10 ** estimator.log_evalue(score, SEQ_LENGTH, SEQ_LENGTH * N_SAMPLES) for score in TEST_SCORES ] - counts = [ - np.count_nonzero(sample_scores >= score) for score in TEST_SCORES - ] + counts = [np.count_nonzero(sample_scores >= score) for score in TEST_SCORES] assert e_values == pytest.approx(counts, rel=0.5) @@ -132,45 +131,50 @@ def test_score_scaling(sequences): np.random.seed(0) std_estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, - BACKGROUND + seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND ) scores = [ align.align_optimal( - sequences[i], sequences[i+1], matrix, GAP_PENALTY, local=True, - max_number=1 - )[0].score for i in range(9) + sequences[i], + sequences[i + 1], + matrix, + GAP_PENALTY, + local=True, + max_number=1, + )[0].score + for i in range(9) ] - std_log_evalues = std_estimator.log_evalue( - scores, SEQ_LENGTH, SEQ_LENGTH - ) + std_log_evalues = std_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH) scaled_matrix = align.SubstitutionMatrix( seq.ProteinSequence.alphabet, seq.ProteinSequence.alphabet, - matrix.score_matrix() * SCALING_FACTOR 
+ matrix.score_matrix() * SCALING_FACTOR, ) scaled_gap_penalty = ( GAP_PENALTY[0] * SCALING_FACTOR, - GAP_PENALTY[1] * SCALING_FACTOR + GAP_PENALTY[1] * SCALING_FACTOR, ) scaled_estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty, - BACKGROUND + seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty, BACKGROUND ) scores = [ align.align_optimal( - sequences[i], sequences[i+1], scaled_matrix, scaled_gap_penalty, - local=True, max_number=1 - )[0].score for i in range(9) + sequences[i], + sequences[i + 1], + scaled_matrix, + scaled_gap_penalty, + local=True, + max_number=1, + )[0].score + for i in range(9) ] - scaled_log_evalues = scaled_estimator.log_evalue( - scores, SEQ_LENGTH, SEQ_LENGTH - ) + scaled_log_evalues = scaled_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH) # Due to relatively low sample size, expect rather large deviation - assert std_log_evalues.tolist() \ - == pytest.approx(scaled_log_evalues.tolist(), rel=0.2) + assert std_log_evalues.tolist() == pytest.approx( + scaled_log_evalues.tolist(), rel=0.2 + ) def test_invalid_scoring_scheme(): @@ -186,4 +190,4 @@ def test_invalid_scoring_scheme(): freq = np.ones(len(alph)) with pytest.raises(ValueError): - estimator = EValueEstimator.from_samples(alph, matrix, -10, freq) \ No newline at end of file + estimator = EValueEstimator.from_samples(alph, matrix, -10, freq) diff --git a/tests/sequence/test_alphabet.py b/tests/sequence/test_alphabet.py index b99756e79..ba6ef023f 100644 --- a/tests/sequence/test_alphabet.py +++ b/tests/sequence/test_alphabet.py @@ -3,16 +3,19 @@ # information. 
import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq - test_cases = { - "A" : [0], - "D" : [3], - "ABC" : [0,1,2,], - "ABAFF" : [0,1,0,5,5] + "A": [0], + "D": [3], + "ABC": [ + 0, + 1, + 2, + ], + "ABAFF": [0, 1, 0, 5, 5], } @@ -24,17 +27,17 @@ def alphabet_symbols(): @pytest.mark.parametrize( "symbols, exp_code, use_letter_alphabet", zip( - list(test_cases.keys() ) * 2, + list(test_cases.keys()) * 2, list(test_cases.values()) * 2, - [False] * len(test_cases) + [True] * len(test_cases) - ) + [False] * len(test_cases) + [True] * len(test_cases), + ), ) def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet): if use_letter_alphabet: alph = seq.LetterAlphabet(alphabet_symbols) else: alph = seq.Alphabet(alphabet_symbols) - + if len(symbols) == 1: assert alph.encode(symbols[0]) == exp_code[0] else: @@ -44,17 +47,17 @@ def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet): @pytest.mark.parametrize( "exp_symbols, code, use_letter_alphabet", zip( - list(test_cases.keys() ) * 2, + list(test_cases.keys()) * 2, list(test_cases.values()) * 2, - [False] * len(test_cases) + [True] * len(test_cases) - ) + [False] * len(test_cases) + [True] * len(test_cases), + ), ) def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet): if use_letter_alphabet: alph = seq.LetterAlphabet(alphabet_symbols) else: alph = seq.Alphabet(alphabet_symbols) - + code = np.array(code, dtype=np.uint8) if len(code) == 1: assert alph.decode(code[0]) == exp_symbols[0] @@ -64,9 +67,7 @@ def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet): @pytest.mark.parametrize( "use_letter_alphabet, is_single_val", - itertools.product( - [False, True], [False, True] - ) + itertools.product([False, True], [False, True]), ) def test_error(alphabet_symbols, use_letter_alphabet, is_single_val): if use_letter_alphabet: @@ -96,8 +97,13 @@ def test_error(alphabet_symbols, use_letter_alphabet, 
is_single_val): @pytest.mark.parametrize( "symbols", - ["ABC", b"ABC", ["A","B","C"], - np.array(["A","B","C"]), np.array([b"A",b"B",b"C"])] + [ + "ABC", + b"ABC", + ["A", "B", "C"], + np.array(["A", "B", "C"]), + np.array([b"A", b"B", b"C"]), + ], ) def test_input_types(alphabet_symbols, symbols): """ @@ -108,13 +114,14 @@ def test_input_types(alphabet_symbols, symbols): alph = seq.LetterAlphabet(alphabet_symbols) code = alph.encode_multiple(symbols) conv_symbols = alph.decode_multiple(code) - - + if isinstance(symbols, bytes): symbols = symbols.decode("ASCII") assert list(conv_symbols) == list( - [symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol - for symbol in symbols] + [ + symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol + for symbol in symbols + ] ) @@ -137,26 +144,24 @@ def test_contains(alphabet_symbols, use_letter_alphabet): @pytest.mark.parametrize( - "source_alph_symbols, target_alph_symbols", + "source_alph_symbols, target_alph_symbols", [ ("A", "AB"), (["foo", "bar"], ["bar", "foo", 42]), ("ACGT", "AGTC"), ("ACGT", "ACGNT"), (np.arange(0, 1000), np.arange(999, -1, -1)), - ] + ], ) def test_alphabet_mapper(source_alph_symbols, target_alph_symbols): CODE_LENGTH = 10000 source_alph = seq.Alphabet(source_alph_symbols) target_alph = seq.Alphabet(target_alph_symbols) mapper = seq.AlphabetMapper(source_alph, target_alph) - + ref_sequence = seq.GeneralSequence(source_alph) np.random.seed(0) - ref_sequence.code = np.random.randint( - len(source_alph), size=CODE_LENGTH, dtype=int - ) + ref_sequence.code = np.random.randint(len(source_alph), size=CODE_LENGTH, dtype=int) test_sequence = seq.GeneralSequence(target_alph) test_sequence.code = mapper[ref_sequence.code] @@ -164,22 +169,25 @@ def test_alphabet_mapper(source_alph_symbols, target_alph_symbols): assert test_sequence.symbols == ref_sequence.symbols -@pytest.mark.parametrize("alphabets, common_alph", [ - ( - [ +@pytest.mark.parametrize( + "alphabets, common_alph", + [ + ( + [ + 
seq.NucleotideSequence.alphabet_amb, + seq.NucleotideSequence.alphabet_unamb, + ], seq.NucleotideSequence.alphabet_amb, - seq.NucleotideSequence.alphabet_unamb, - ], - seq.NucleotideSequence.alphabet_amb - ), - ( - [ - seq.NucleotideSequence.alphabet_unamb, + ), + ( + [ + seq.NucleotideSequence.alphabet_unamb, + seq.NucleotideSequence.alphabet_amb, + ], seq.NucleotideSequence.alphabet_amb, - ], - seq.NucleotideSequence.alphabet_amb - ), -]) + ), + ], +) def test_common_alphabet(alphabets, common_alph): """ Check if :func:`common_alphabet()` correctly identifies the common @@ -188,13 +196,14 @@ def test_common_alphabet(alphabets, common_alph): seq.common_alphabet(alphabets) == common_alph - def test_common_alphabet_no_common(): """ Check if :func:`common_alphabet()` correctly identifies that no common alphabet exists in a simple known test case. """ - assert seq.common_alphabet([ - seq.NucleotideSequence.alphabet_unamb, - seq.ProteinSequence.alphabet - ]) is None \ No newline at end of file + assert ( + seq.common_alphabet( + [seq.NucleotideSequence.alphabet_unamb, seq.ProteinSequence.alphabet] + ) + is None + ) diff --git a/tests/sequence/test_annotation.py b/tests/sequence/test_annotation.py index 4ce771692..c0f58e2b7 100644 --- a/tests/sequence/test_annotation.py +++ b/tests/sequence/test_annotation.py @@ -2,58 +2,62 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+from os.path import join import biotite.sequence as seq -from biotite.sequence import Location, Feature, Annotation, AnnotatedSequence import biotite.sequence.io.genbank as gb -import numpy as np -from os.path import join +from biotite.sequence import AnnotatedSequence, Annotation, Feature, Location from ..util import data_dir -import pytest def test_annotation_creation(): - feature1 = Feature("CDS", [seq.Location(1,2)], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [seq.Location(3,4)], qual={"gene" : "test2"}) + feature1 = Feature("CDS", [seq.Location(1, 2)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [seq.Location(3, 4)], qual={"gene": "test2"}) feature_list = [feature1, feature2] annotation = Annotation(feature_list) for feature in annotation: assert feature.key in [f.key for f in feature_list] - assert feature.qual["gene"] in [ - f.qual["gene"] for f in feature_list - ] + assert feature.qual["gene"] in [f.qual["gene"] for f in feature_list] + def test_annotation_concatenation(): - feature1 = Feature("CDS", [seq.Location(1,1)], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [seq.Location(2,2)], qual={"gene" : "test2"}) + feature1 = Feature("CDS", [seq.Location(1, 1)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [seq.Location(2, 2)], qual={"gene": "test2"}) annot1 = Annotation([feature1, feature2]) - feature3 = Feature("CDS", [seq.Location(3,3)], qual={"gene" : "test3"}) - feature4 = Feature("CDS", [seq.Location(4,4)], qual={"gene" : "test4"}) + feature3 = Feature("CDS", [seq.Location(3, 3)], qual={"gene": "test3"}) + feature4 = Feature("CDS", [seq.Location(4, 4)], qual={"gene": "test4"}) annot2 = Annotation([feature3, feature4]) - feature5 = Feature("CDS", [seq.Location(5,5)], qual={"gene" : "test5"}) + feature5 = Feature("CDS", [seq.Location(5, 5)], qual={"gene": "test5"}) concat = annot1 + annot2 + feature5 - assert set([f.qual["gene"] for f in concat]) \ - == set(["test1", "test2", "test3", "test4", "test5"]) + assert 
set([f.qual["gene"] for f in concat]) == set( + ["test1", "test2", "test3", "test4", "test5"] + ) + def test_annotation_indexing(): - feature1 = Feature("CDS", [Location(-10,30 )], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [Location(20, 50 )], qual={"gene" : "test2"}) - feature3 = Feature("CDS", [Location(100,130)], qual={"gene" : "test3"}) - feature4 = Feature("CDS", [Location(150,250)], qual={"gene" : "test4"}) - feature5 = Feature("CDS", [Location(-50,200)], qual={"gene" : "test5"}) - annotation = Annotation([feature1,feature2,feature3,feature4,feature5]) + feature1 = Feature("CDS", [Location(-10, 30)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [Location(20, 50)], qual={"gene": "test2"}) + feature3 = Feature("CDS", [Location(100, 130)], qual={"gene": "test3"}) + feature4 = Feature("CDS", [Location(150, 250)], qual={"gene": "test4"}) + feature5 = Feature("CDS", [Location(-50, 200)], qual={"gene": "test5"}) + annotation = Annotation([feature1, feature2, feature3, feature4, feature5]) sub_annot = annotation[40:150] # Only one location per feature - assert set([list(f.locs)[0].defect for f in sub_annot]) \ - == set([Location.Defect.MISS_LEFT, Location.Defect.NONE, - (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT)]) - assert set([f.qual["gene"] for f in sub_annot]) \ - == set(["test2", "test3", "test5"]) + assert set([list(f.locs)[0].defect for f in sub_annot]) == set( + [ + Location.Defect.MISS_LEFT, + Location.Defect.NONE, + (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT), + ] + ) + assert set([f.qual["gene"] for f in sub_annot]) == set(["test2", "test3", "test5"]) + def test_annotated_sequence(): sequence = seq.NucleotideSequence("ATGGCGTACGATTAGAAAAAAA") - feature1 = Feature("misc_feature", [Location(1,2), Location(11,12)], - {"note" : "walker"}) - feature2 = Feature("misc_feature", [Location(16,22)], {"note" : "poly-A"}) + feature1 = Feature( + "misc_feature", [Location(1, 2), Location(11, 12)], {"note": "walker"} + ) + 
feature2 = Feature("misc_feature", [Location(16, 22)], {"note": "poly-A"}) annotation = Annotation([feature1, feature2]) annot_seq = AnnotatedSequence(annotation, sequence) assert annot_seq[2] == "T" @@ -62,17 +66,19 @@ def test_annotated_sequence(): # test slicing with only stop annot_seq2 = annot_seq[:16] assert annot_seq2.sequence == seq.NucleotideSequence("ATGGCGTACGATTAG") - assert set([f.qual['note'] for f in annot_seq2.annotation]) == {'walker'} + assert set([f.qual["note"] for f in annot_seq2.annotation]) == {"walker"} # test slicing with only start annot_seq3 = annot_seq[16:] assert annot_seq3.sequence == seq.NucleotideSequence("AAAAAAA") - assert set([f.qual['note'] for f in annot_seq3.annotation]) == {'poly-A'} + assert set([f.qual["note"] for f in annot_seq3.annotation]) == {"poly-A"} # test slicing with start and stop annot_seq4 = annot_seq[1:17] - assert annot_seq4.sequence == seq.NucleotideSequence("ATGGCGTACGATTAGA") # sequences are 1-indexed - assert set([f.qual['note'] for f in annot_seq4.annotation]) == {'walker', 'poly-A'} + assert annot_seq4.sequence == seq.NucleotideSequence( + "ATGGCGTACGATTAGA" + ) # sequences are 1-indexed + assert set([f.qual["note"] for f in annot_seq4.annotation]) == {"walker", "poly-A"} assert annot_seq[feature1] == seq.NucleotideSequence("ATAT") assert annot_seq[feature2] == seq.NucleotideSequence("AAAAAAA") @@ -80,12 +86,17 @@ def test_annotated_sequence(): assert annot_seq.sequence == seq.NucleotideSequence("CCGGCGTACGCCTAGAAAAAAA") # test slicing with feature on minus strand - feature3 = Feature("misc_feature", [Location(1,4), Location(8,12)]) - feature4 = Feature("misc_feature_minus", [ - Location(1,4,strand=Location.Strand.REVERSE), - Location(8,12,strand=Location.Strand.REVERSE)]) + feature3 = Feature("misc_feature", [Location(1, 4), Location(8, 12)]) + feature4 = Feature( + "misc_feature_minus", + [ + Location(1, 4, strand=Location.Strand.REVERSE), + Location(8, 12, strand=Location.Strand.REVERSE), + ], + ) 
assert annot_seq[feature4] == annot_seq[feature3].reverse().complement() + def test_reverse_complement(): gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb")) annot_seq = gb.get_annotated_sequence(gb_file) diff --git a/tests/sequence/test_codon.py b/tests/sequence/test_codon.py index 315f2b0fc..8f0a799e8 100644 --- a/tests/sequence/test_codon.py +++ b/tests/sequence/test_codon.py @@ -2,12 +2,39 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.sequence as seq import pytest +import biotite.sequence as seq -@pytest.mark.parametrize("table_id", - [1,2,3,4,5,6,9,10,11,12,13,14,16,21,22,23,24,25,26,27,28,29,30,31]) +@pytest.mark.parametrize( + "table_id", + [ + 1, + 2, + 3, + 4, + 5, + 6, + 9, + 10, + 11, + 12, + 13, + 14, + 16, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + ], +) def test_table_load(table_id): table = seq.CodonTable.load(table_id) diff --git a/tests/sequence/test_fasta.py b/tests/sequence/test_fasta.py index 1b7103e30..bb7d9ab42 100644 --- a/tests/sequence/test_fasta.py +++ b/tests/sequence/test_fasta.py @@ -2,18 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import itertools import glob import io -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import numpy as np +import itertools import os import os.path -from ..util import data_dir +import numpy as np import pytest +import biotite.sequence as seq +import biotite.sequence.io.fasta as fasta +from ..util import data_dir + - def test_access_low_level(): path = os.path.join(data_dir("sequence"), "nuc.fasta") file = fasta.FastaFile.read(path) @@ -21,21 +21,21 @@ def test_access_low_level(): assert file["another dna sequence"] == "A" assert file["third dna sequence"] == "ACGT" assert dict(file.items()) == { - "dna sequence" : "ACGCTACGT", - "another dna sequence" : "A", - "third dna sequence" : "ACGT", - "rna sequence" : "ACGU", - "ambiguous rna sequence" : "ACGUNN", + "dna sequence": "ACGCTACGT", + "another dna sequence": "A", + "third dna sequence": "ACGT", + "rna sequence": "ACGU", + "ambiguous rna sequence": "ACGUNN", } file["another dna sequence"] = "AA" del file["dna sequence"] file["yet another sequence"] = "ACGT" assert dict(file.items()) == { - "another dna sequence" : "AA", - "third dna sequence" : "ACGT", - "rna sequence" : "ACGU", - "ambiguous rna sequence" : "ACGUNN", - "yet another sequence" : "ACGT", + "another dna sequence": "AA", + "third dna sequence": "ACGT", + "rna sequence": "ACGU", + "ambiguous rna sequence": "ACGUNN", + "yet another sequence": "ACGT", } @@ -45,16 +45,16 @@ def test_access_high_level(seq_type): file = fasta.FastaFile.read(path) sequences = fasta.get_sequences(file, seq_type=seq_type) assert sequences == { - "dna sequence" : seq.NucleotideSequence("ACGCTACGT", False), - "another dna sequence" : seq.NucleotideSequence("A", False), - "third dna sequence" : seq.NucleotideSequence("ACGT", False), - "rna sequence" : seq.NucleotideSequence("ACGT", False), - "ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True), + "dna sequence": seq.NucleotideSequence("ACGCTACGT", False), + "another dna sequence": 
seq.NucleotideSequence("A", False), + "third dna sequence": seq.NucleotideSequence("ACGT", False), + "rna sequence": seq.NucleotideSequence("ACGT", False), + "ambiguous rna sequence": seq.NucleotideSequence("ACGTNN", True), } @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_ambiguous(seq_type): path = os.path.join(data_dir("sequence"), "nuc.fasta") @@ -67,10 +67,8 @@ def test_sequence_conversion_ambiguous(seq_type): file, seq_type=None ) else: - assert seq_type(sequence) == fasta.get_sequence( - file, seq_type=seq_type - ) - + assert seq_type(sequence) == fasta.get_sequence(file, seq_type=seq_type) + seq_dict = fasta.get_sequences(file) file2 = fasta.FastaFile() fasta.set_sequences(file2, seq_dict) @@ -84,7 +82,7 @@ def test_sequence_conversion_ambiguous(seq_type): assert str(seq1) == str(seq2) else: assert seq_dict == seq_dict2 - + if seq_type is not None: sequence = "AACCTTGG" file3 = fasta.FastaFile() @@ -93,7 +91,7 @@ def test_sequence_conversion_ambiguous(seq_type): @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_protein(seq_type): path = os.path.join(data_dir("sequence"), "prot.fasta") @@ -112,7 +110,7 @@ def test_sequence_conversion_protein(seq_type): @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_invalid(seq_type): path = os.path.join(data_dir("sequence"), "invalid.fasta") @@ -134,32 +132,31 @@ def test_alignment_conversion(): path = os.path.join(data_dir("sequence"), "alignment.fasta") file = fasta.FastaFile.read(path) alignment = fasta.get_alignment(file) - assert str(alignment) == ("ADTRCGTARDCGTR-DRTCGRAGD\n" - 
"ADTRCGT---CGTRADRTCGRAGD\n" - "ADTRCGTARDCGTRADR--GRAGD") - + assert str(alignment) == ( + "ADTRCGTARDCGTR-DRTCGRAGD\n" + "ADTRCGT---CGTRADRTCGRAGD\n" + "ADTRCGTARDCGTRADR--GRAGD" + ) + file2 = fasta.FastaFile() - fasta.set_alignment(file2, alignment, seq_names=["seq1","seq2","seq3"]) + fasta.set_alignment(file2, alignment, seq_names=["seq1", "seq2", "seq3"]) alignment2 = fasta.get_alignment(file2) assert str(alignment) == str(alignment2) + @pytest.mark.parametrize( - "file_name", - glob.glob(os.path.join(data_dir("sequence"), "*.fasta")) + "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fasta")) ) def test_read_iter(file_name): ref_dict = dict(fasta.FastaFile.read(file_name).items()) - + test_dict = dict(fasta.FastaFile.read_iter(file_name)) assert test_dict == ref_dict @pytest.mark.parametrize( - "chars_per_line, n_sequences", itertools.product( - [80, 200], - [1, 10] - ) + "chars_per_line, n_sequences", itertools.product([80, 200], [1, 10]) ) def test_write_iter(chars_per_line, n_sequences): """ @@ -176,28 +173,24 @@ def test_write_iter(chars_per_line, n_sequences): for i in range(n_sequences): seq_length = np.random.randint(*LENGTH_RANGE) code = np.random.randint( - len(seq.NucleotideSequence.alphabet_unamb), - size=seq_length + len(seq.NucleotideSequence.alphabet_unamb), size=seq_length ) sequence = seq.NucleotideSequence() sequence.code = code sequences.append(sequence) - + fasta_file = fasta.FastaFile(chars_per_line) for i, sequence in enumerate(sequences): header = f"seq_{i}" fasta_file[header] = str(sequence) ref_file = io.StringIO() fasta_file.write(ref_file) - + test_file = io.StringIO() fasta.FastaFile.write_iter( test_file, - ( - (f"seq_{i}", str(sequence)) - for i, sequence in enumerate(sequences) - ), - chars_per_line + ((f"seq_{i}", str(sequence)) for i, sequence in enumerate(sequences)), + chars_per_line, ) - assert test_file.getvalue() == ref_file.getvalue() \ No newline at end of file + assert test_file.getvalue() == 
ref_file.getvalue() diff --git a/tests/sequence/test_fastq.py b/tests/sequence/test_fastq.py index d497787a8..644738b06 100644 --- a/tests/sequence/test_fastq.py +++ b/tests/sequence/test_fastq.py @@ -5,43 +5,40 @@ import glob import io import itertools +import os +import os.path from tempfile import TemporaryFile +import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.io.fastq as fastq -import numpy as np -import os -import os.path from ..util import data_dir -import pytest + @pytest.mark.parametrize("chars_per_line", [None, 80]) def test_access(chars_per_line): path = os.path.join(data_dir("sequence"), "random.fastq") - file = fastq.FastqFile.read( - path, offset=33, chars_per_line=chars_per_line - ) + file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line) assert len(file) == 20 assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20)] - del(file["Read:05"]) + del file["Read:05"] assert len(file) == 19 - assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20) - if i+1 != 5] + assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20) if i + 1 != 5] for seq_str, scores in file.values(): assert len(seq_str) == len(scores) assert (scores >= 0).all() seq_str = "ACTCGGT" - scores = np.array([10,12,20,11,0,80,42]) + scores = np.array([10, 12, 20, 11, 0, 80, 42]) file["test"] = seq_str, scores seq_str2, scores2 = file["test"] assert seq_str == seq_str2 assert np.array_equal(scores, scores2) + @pytest.mark.parametrize("chars_per_line", [None, 80]) def test_conversion(chars_per_line): path = os.path.join(data_dir("sequence"), "random.fastq") - fasta_file = fastq.FastqFile.read( - path, offset=33, chars_per_line=chars_per_line - ) + fasta_file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line) ref_content = dict(fasta_file.items()) fasta_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line) @@ -51,48 +48,46 @@ def test_conversion(chars_per_line): 
fasta_file.write(temp) temp.seek(0) - fasta_file = fastq.FastqFile.read( - temp, offset=33, chars_per_line=chars_per_line - ) + fasta_file = fastq.FastqFile.read(temp, offset=33, chars_per_line=chars_per_line) content = dict(fasta_file.items()) temp.close() - + for identifier in ref_content: ref_sequence, ref_scores = ref_content[identifier] test_sequence, test_scores = content[identifier] assert test_sequence == ref_sequence assert np.array_equal(test_scores, ref_scores) + def test_rna_conversion(): sequence = seq.NucleotideSequence("ACGT") scores = np.array([0, 0, 0, 0]) fastq_file = fastq.FastqFile(offset="Sanger") fastq.set_sequence(fastq_file, sequence, scores, "seq1", as_rna=False) fastq.set_sequence(fastq_file, sequence, scores, "seq2", as_rna=True) - assert fastq_file["seq1"][0] == "ACGT" + assert fastq_file["seq1"][0] == "ACGT" assert fastq_file["seq2"][0] == "ACGU" + @pytest.mark.parametrize( - "file_name", - glob.glob(os.path.join(data_dir("sequence"), "*.fastq")) + "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fastq")) ) def test_read_iter(file_name): ref_dict = dict(fastq.FastqFile.read(file_name, offset="Sanger").items()) - + test_dict = dict(fastq.FastqFile.read_iter(file_name, offset="Sanger")) - for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) \ - in zip(test_dict.items(), ref_dict.items()): - assert test_id == ref_id - assert test_seq == ref_seq - assert (test_sc == ref_sc).all() + for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) in zip( + test_dict.items(), ref_dict.items() + ): + assert test_id == ref_id + assert test_seq == ref_seq + assert (test_sc == ref_sc).all() + @pytest.mark.parametrize( - "offset, chars_per_line, n_sequences", itertools.product( - [33, 42, "Solexa"], - [None, 80], - [1, 10] - ) + "offset, chars_per_line, n_sequences", + itertools.product([33, 42, "Solexa"], [None, 80], [1, 10]), ) def test_write_iter(offset, chars_per_line, n_sequences): """ @@ -110,22 +105,21 @@ def 
test_write_iter(offset, chars_per_line, n_sequences): for i in range(n_sequences): seq_length = np.random.randint(*LENGTH_RANGE) code = np.random.randint( - len(seq.NucleotideSequence.alphabet_unamb), - size=seq_length + len(seq.NucleotideSequence.alphabet_unamb), size=seq_length ) sequence = seq.NucleotideSequence() sequence.code = code sequences.append(sequence) score = np.random.randint(*SCORE_RANGE, size=seq_length) scores.append(score) - + fastq_file = fastq.FastqFile(offset, chars_per_line) for i, (sequence, score) in enumerate(zip(sequences, scores)): identifier = f"seq_{i}" fastq_file[identifier] = (str(sequence), score) ref_file = io.StringIO() fastq_file.write(ref_file) - + test_file = io.StringIO() fastq.FastqFile.write_iter( test_file, @@ -133,7 +127,8 @@ def test_write_iter(offset, chars_per_line, n_sequences): (f"seq_{i}", (str(sequence), score)) for i, (sequence, score) in enumerate(zip(sequences, scores)) ), - offset, chars_per_line + offset, + chars_per_line, ) - assert test_file.getvalue() == ref_file.getvalue() \ No newline at end of file + assert test_file.getvalue() == ref_file.getvalue() diff --git a/tests/sequence/test_genbank.py b/tests/sequence/test_genbank.py index 6ecefd061..1f695e860 100644 --- a/tests/sequence/test_genbank.py +++ b/tests/sequence/test_genbank.py @@ -2,20 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import TemporaryFile import glob from os.path import join +from tempfile import TemporaryFile +import pytest import biotite.sequence as seq import biotite.sequence.io.genbank as gb -import numpy as np -import pytest from ..util import data_dir @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), "*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_contiguous_field_pos(path): """ @@ -25,7 +24,7 @@ def test_contiguous_field_pos(path): assert gb_file._field_pos[0][0] == 0 for i in range(1, len(gb_file._field_pos)): start, _, _ = gb_file._field_pos[i] - _, stop, _ = gb_file._field_pos[i-1] + _, stop, _ = gb_file._field_pos[i - 1] assert start == stop @@ -37,27 +36,23 @@ def test_file_access(): gb_file = gb.GenBankFile() gb_file.append("SOMEFIELD", ["Some content", "some other content"]) gb_file.insert(0, "OTHERFIELD", ["Additional content"]) - assert gb_file[1] \ - == ("SOMEFIELD", ["Some content", "some other content"], {}) - gb_file[1] \ - = "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]} + assert gb_file[1] == ("SOMEFIELD", ["Some content", "some other content"], {}) + gb_file[1] = "NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]} gb_file.append("THIRDFIELD", ["Supplementary content"]) assert len(gb_file) == 3 assert gb_file[0] == ("OTHERFIELD", ["Additional content"], {}) del gb_file[0] - assert gb_file[0] \ - == ("NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}) + assert gb_file[0] == ("NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]}) del gb_file[0] assert gb_file[0] == ("THIRDFIELD", ["Supplementary content"], {}) del gb_file[0] assert len(gb_file) == 0 - @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), 
"*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_conversion_lowlevel(path): """ @@ -72,7 +67,7 @@ def test_conversion_lowlevel(path): gb_file.append(name, content, subfields) temp = TemporaryFile("w+") gb_file.write(temp) - + temp.seek(0) gb_file = gb.GenBankFile.read(temp) temp.close() @@ -82,8 +77,8 @@ def test_conversion_lowlevel(path): @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), "*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_conversion_highlevel(path): """ @@ -101,44 +96,55 @@ def test_conversion_highlevel(path): gb.set_annotated_sequence(gb_file, ref_annot_seq) temp = TemporaryFile("w+") gb_file.write(temp) - + temp.seek(0) gb_file = gb.GenBankFile.read(temp) temp.close() test_locus = gb.get_locus(gb_file) test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix) assert test_locus == ref_locus - assert test_annot_seq.sequence == ref_annot_seq.sequence - assert test_annot_seq.annotation == ref_annot_seq.annotation + assert test_annot_seq.sequence == ref_annot_seq.sequence + assert test_annot_seq.annotation == ref_annot_seq.annotation assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start def test_genbank_utility_gb(): """ Check whether the high-level utility functions return the expected - content of a known GenBank file. + content of a known GenBank file. """ gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb")) - assert gb.get_locus(gb_file) \ - == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017") - assert gb.get_definition(gb_file) \ - == ("Escherichia coli BL21(DE3), complete genome.") + assert gb.get_locus(gb_file) == ( + "CP001509", + 4558953, + "DNA", + True, + "BCT", + "16-FEB-2017", + ) + assert gb.get_definition(gb_file) == ( + "Escherichia coli BL21(DE3), complete genome." 
+ ) assert gb.get_version(gb_file) == "CP001509.3" assert gb.get_gi(gb_file) == 296142109 - assert gb.get_db_link(gb_file) \ - == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"} + assert gb.get_db_link(gb_file) == { + "BioProject": "PRJNA20713", + "BioSample": "SAMN02603478", + } annotation = gb.get_annotation(gb_file, include_only=["CDS"]) feature = seq.Feature( "CDS", [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)], - {"gene": "yaaA", "transl_table": "11"} + {"gene": "yaaA", "transl_table": "11"}, ) in_annotation = False for f in annotation: - if f.key == feature.key and f.locs == feature.locs and \ - all([(key, val in f.qual.items()) - for key, val in feature.qual.items()]): - in_annotation = True + if ( + f.key == feature.key + and f.locs == feature.locs + and all([(key, val in f.qual.items()) for key, val in feature.qual.items()]) + ): + in_annotation = True assert in_annotation assert len(gb.get_sequence(gb_file, format="gb")) == 4558953 @@ -146,30 +152,34 @@ def test_genbank_utility_gb(): def test_genbank_utility_gp(): """ Check whether the high-level utility functions return the expected - content of a known GenPept file. + content of a known GenPept file. """ gp_file = gb.GenBankFile.read(join(data_dir("sequence"), "bt_lysozyme.gp")) - #[print(e) for e in gp_file._field_pos] - assert gb.get_locus(gp_file) \ - == ("AAC37312", 147, None, False, "MAM", "27-APR-1993") + # [print(e) for e in gp_file._field_pos] + assert gb.get_locus(gp_file) == ("AAC37312", 147, None, False, "MAM", "27-APR-1993") assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]." 
assert gb.get_version(gp_file) == "AAC37312.1" assert gb.get_gi(gp_file) == 163334 annotation = gb.get_annotation(gp_file) feature = seq.Feature( "Site", - [seq.Location(start, stop) for start, stop in zip( - [52,55,62,76,78,81,117,120,125], - [53,55,62,76,78,81,117,120,126] - )], - {"note": "lysozyme catalytic cleft [active]", "site_type": "active"} + [ + seq.Location(start, stop) + for start, stop in zip( + [52, 55, 62, 76, 78, 81, 117, 120, 125], + [53, 55, 62, 76, 78, 81, 117, 120, 126], + ) + ], + {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}, ) in_annotation = False for f in annotation: - if f.key == feature.key and f.locs == feature.locs and \ - all([(key, val in f.qual.items()) - for key, val in feature.qual.items()]): - in_annotation = True + if ( + f.key == feature.key + and f.locs == feature.locs + and all([(key, val in f.qual.items()) for key, val in feature.qual.items()]) + ): + in_annotation = True assert in_annotation assert len(gb.get_sequence(gp_file, format="gp")) == 147 @@ -184,21 +194,27 @@ def test_multi_file(): "locus_content, expected_result", [ ( - "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006", - ("AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID", 1224, "DNA", False, "VRT", "14-NOV-2006") + "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006", + ( + "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID", + 1224, + "DNA", + False, + "VRT", + "14-NOV-2006", + ), ), ( - "SCU49845 5028 bp DNA PLN 21-JUN-1999", - ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999") + "SCU49845 5028 bp DNA PLN 21-JUN-1999", + ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999"), ), ( - "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999", - ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999") - ) - ] + "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999", + ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999"), + ), + ], ) def test_parse_locus(locus_content, 
expected_result): gb_file = gb.GenBankFile() gb_file.append("LOCUS", [locus_content]) assert gb.get_locus(gb_file) == expected_result - \ No newline at end of file diff --git a/tests/sequence/test_generalio.py b/tests/sequence/test_generalio.py index a5b21315b..62c150698 100644 --- a/tests/sequence/test_generalio.py +++ b/tests/sequence/test_generalio.py @@ -2,33 +2,24 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile -import biotite -import biotite.sequence as seq -import biotite.sequence.io as seqio -import numpy as np import glob from os.path import join -from ..util import data_dir +from tempfile import NamedTemporaryFile import pytest +import biotite.sequence.io as seqio +from ..util import data_dir -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("sequence"), "random.*")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*"))) def test_loading_single(path): - ref_sequence = seqio.load_sequence( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta")) sequence = seqio.load_sequence(path) assert ref_sequence == sequence @pytest.mark.parametrize("suffix", ["fasta", "fastq"]) def test_saving_single(suffix): - ref_sequence = seqio.load_sequence( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") try: seqio.save_sequence(temp.name, ref_sequence) @@ -37,22 +28,16 @@ def test_saving_single(suffix): pytest.skip("Permission is denied") -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("sequence"), "random.*")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*"))) def test_loading_multiple(path): - ref_sequences = seqio.load_sequences( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequences = 
seqio.load_sequences(join(data_dir("sequence"), "random.fasta")) sequences = seqio.load_sequences(path) assert ref_sequences == sequences @pytest.mark.parametrize("suffix", ["fasta", "fastq"]) def test_saving_multiple(suffix): - ref_sequences = seqio.load_sequences( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequences = seqio.load_sequences(join(data_dir("sequence"), "random.fasta")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") try: seqio.save_sequences(temp.name, ref_sequences) @@ -60,6 +45,7 @@ def test_saving_multiple(suffix): # This error might occur on AppVeyor pytest.skip("Permission is denied") + @pytest.mark.parametrize("file_name", ["gg_avidin.gb", "bt_lysozyme.gp"]) def test_genbank(file_name): """ @@ -73,4 +59,4 @@ def test_genbank(file_name): seqio.save_sequence(temp.name, sequence) except PermissionError: # This error might occur on AppVeyor - pytest.skip("Permission is denied") \ No newline at end of file + pytest.skip("Permission is denied") diff --git a/tests/sequence/test_gff.py b/tests/sequence/test_gff.py index 5c6ee77b4..02dc3b615 100644 --- a/tests/sequence/test_gff.py +++ b/tests/sequence/test_gff.py @@ -2,19 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import TemporaryFile from os.path import join +from tempfile import TemporaryFile +import pytest import biotite.sequence as seq -import biotite.sequence.io.gff as gff import biotite.sequence.io.genbank as gb -import numpy as np -import pytest +import biotite.sequence.io.gff as gff from ..util import data_dir @pytest.mark.parametrize( - "path", - ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] + "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] ) def test_conversion_lowlevel(path): """ @@ -38,8 +36,7 @@ def test_conversion_lowlevel(path): @pytest.mark.parametrize( - "path", - ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] + "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] ) def test_conversion_highlevel(path): """ @@ -69,7 +66,7 @@ def test_conversion_highlevel(path): for _, _, type, _, _, _, _, phase, _ in gff_file: if type == "CDS": test_phases.append(phase) - + assert ref_annot == test_annot assert test_phases == ref_phases @@ -87,7 +84,7 @@ def test_genbank_consistency(path): gff_file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3")) test_annot = gff.get_annotation(gff_file) - + # Remove qualifiers, since they will be different # in GFF3 and GenBank ref_annot = seq.Annotation( @@ -115,7 +112,7 @@ def test_file_access(): file. 
""" file = gff.GFFFile() - entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) entry = ("a",) + entry_scaffold file.append(*entry) assert file[0] == entry @@ -124,8 +121,11 @@ def test_file_access(): file[1] = ("d",) + entry_scaffold file.insert(3, *(("e",) + entry_scaffold)) del file[2] - assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] \ - == ["a", "d", "e", ] + assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] == [ + "a", + "d", + "e", + ] def test_entry_indexing(): @@ -134,17 +134,14 @@ def test_entry_indexing(): test file with multiple directives, including '##FASTA'. """ with pytest.warns(UserWarning): - file = gff.GFFFile.read( - join(data_dir("sequence"), "indexing_test.gff3") - ) + file = gff.GFFFile.read(join(data_dir("sequence"), "indexing_test.gff3")) assert file._directives == [ ("directive 1", 1), ("directive 2", 2), ("directive 3", 7), ("FASTA", 8), ] - assert file._entries == [3,4,6] - + assert file._entries == [3, 4, 6] def test_percent_encoding(): @@ -153,21 +150,19 @@ def test_percent_encoding(): artificial test file. 
""" file = gff.GFFFile.read(join(data_dir("sequence"), "percent_test.gff3")) - seqid, source, type, start, end, score, strand, phase, attrib \ - = file[0] + seqid, source, type, start, end, score, strand, phase, attrib = file[0] assert seqid == "123,456" assert source == "ääh" assert type == "regi&n" assert attrib == { - "ID" : "AnID;AnotherID", - "Name" : "Ångström", - "c$l$r": "red\tgreen\tblue" + "ID": "AnID;AnotherID", + "Name": "Ångström", + "c$l$r": "red\tgreen\tblue", } file2 = gff.GFFFile() file.append(seqid, source, type, start, end, score, strand, phase, attrib) - assert (seqid, source, type, start, end, score, strand, phase, attrib) \ - == file[0] + assert (seqid, source, type, start, end, score, strand, phase, attrib) == file[0] def test_error(): @@ -177,16 +172,17 @@ def test_error(): file = gff.GFFFile() with pytest.raises(ValueError): # 'seqid' beginning with '>' is not legal - file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append("", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id":"foo"}) + file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id": "foo"}) + def test_feature_without_id(): """ @@ -194,12 +190,14 @@ def test_feature_without_id(): locations and consequently multiple entries in the GFF3 file. 
""" annot = seq.Annotation( - [seq.Feature( - key = "CDS", - locs = [seq.Location(1,2), seq.Location(4,5)], - qual = {"some" : "qualifiers"} - )] + [ + seq.Feature( + key="CDS", + locs=[seq.Location(1, 2), seq.Location(4, 5)], + qual={"some": "qualifiers"}, + ) + ] ) file = gff.GFFFile() with pytest.raises(ValueError): - gff.set_annotation(file, annot) \ No newline at end of file + gff.set_annotation(file, annot) diff --git a/tests/sequence/test_graphics.py b/tests/sequence/test_graphics.py index bfad27840..b59b6df50 100644 --- a/tests/sequence/test_graphics.py +++ b/tests/sequence/test_graphics.py @@ -2,23 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import dirname, abspath, join import glob +from os.path import abspath, dirname, join import pytest import biotite.sequence as seq from ..util import cannot_import -@pytest.mark.skipif( - cannot_import("matplotlib"), reason="Matplotlib is not installed" -) +@pytest.mark.skipif(cannot_import("matplotlib"), reason="Matplotlib is not installed") @pytest.mark.parametrize( - "scheme_path", glob.glob( - join( - dirname(abspath(seq.__file__)), - "graphics", "color_schemes", "*.json" - ) - ) + "scheme_path", + glob.glob( + join(dirname(abspath(seq.__file__)), "graphics", "color_schemes", "*.json") + ), ) def test_load_color_scheme(scheme_path): from matplotlib.colors import to_rgb @@ -27,9 +23,9 @@ def test_load_color_scheme(scheme_path): supported_alphabets = [ seq.NucleotideSequence.alphabet_amb, seq.ProteinSequence.alphabet, - seq.LetterAlphabet("abcdefghijklmnop") # Protein block alphabet + seq.LetterAlphabet("abcdefghijklmnop"), # Protein block alphabet ] - + test_scheme = graphics.load_color_scheme(scheme_path) assert test_scheme["alphabet"] in supported_alphabets @@ -37,4 +33,4 @@ def test_load_color_scheme(scheme_path): for color in test_scheme["colors"]: if color is not None: # Should not raise error - to_rgb(color) \ No newline at end of file + 
to_rgb(color) diff --git a/tests/sequence/test_phylo.py b/tests/sequence/test_phylo.py index d483d2534..8fde15f53 100644 --- a/tests/sequence/test_phylo.py +++ b/tests/sequence/test_phylo.py @@ -43,10 +43,12 @@ def test_upgma(tree, upgma_newick): for i in range(len(tree)): for j in range(len(tree)): # Check for equal distances and equal topologies - assert tree.get_distance(i,j) \ - == pytest.approx(ref_tree.get_distance(i,j), abs=1e-3) - assert tree.get_distance(i,j, topological=True) \ - == ref_tree.get_distance(i,j, topological=True) + assert tree.get_distance(i, j) == pytest.approx( + ref_tree.get_distance(i, j), abs=1e-3 + ) + assert tree.get_distance(i, j, topological=True) == ref_tree.get_distance( + i, j, topological=True + ) def test_neighbor_joining(): @@ -60,34 +62,36 @@ def test_neighbor_joining(): [ 7, 10, 7, 0, 5, 9], [ 6, 9, 6, 5, 0, 8], [ 8, 11, 8, 9, 8, 0], - ]) # fmt: skip - - ref_tree = phylo.Tree(phylo.TreeNode( - [ - phylo.TreeNode( - [ - phylo.TreeNode( - [ - phylo.TreeNode(index=0), - phylo.TreeNode(index=1), - ], - [1,4] - ), - phylo.TreeNode(index=2), - ], - [1, 2] - ), - phylo.TreeNode( - [ - phylo.TreeNode(index=3), - phylo.TreeNode(index=4), - ], - [3,2] - ), - phylo.TreeNode(index=5), - ], - [1,1,5] - )) + ]) # fmt: skip + + ref_tree = phylo.Tree( + phylo.TreeNode( + [ + phylo.TreeNode( + [ + phylo.TreeNode( + [ + phylo.TreeNode(index=0), + phylo.TreeNode(index=1), + ], + [1, 4], + ), + phylo.TreeNode(index=2), + ], + [1, 2], + ), + phylo.TreeNode( + [ + phylo.TreeNode(index=3), + phylo.TreeNode(index=4), + ], + [3, 2], + ), + phylo.TreeNode(index=5), + ], + [1, 1, 5], + ) + ) test_tree = phylo.neighbor_joining(dist) @@ -106,20 +110,20 @@ def test_node_distance(tree): assert leaf.distance_to(tree.root) == dist # Example topological distances assert tree.get_distance(0, 19, True) == 9 - assert tree.get_distance(4, 2, True) == 10 + assert tree.get_distance(4, 2, True) == 10 # All pairwise leaf node distances should be sufficient # to 
reconstruct the same tree via UPGMA ref_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - ref_dist_mat[i,j] = tree.get_distance(i,j) + ref_dist_mat[i, j] = tree.get_distance(i, j) assert np.allclose(ref_dist_mat, ref_dist_mat.T) new_tree = phylo.upgma(ref_dist_mat) test_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - test_dist_mat[i,j] = new_tree.get_distance(i,j) + test_dist_mat[i, j] = new_tree.get_distance(i, j) assert np.allclose(test_dist_mat, ref_dist_mat) @@ -136,14 +140,14 @@ def test_distances(tree): assert leaf.distance_to(tree.root) == dist # Example topological distances assert tree.get_distance(0, 19, True) == 9 - assert tree.get_distance(4, 2, True) == 10 + assert tree.get_distance(4, 2, True) == 10 def test_get_leaves(tree): # Manual example cases node = tree.leaves[6] assert set(tree.leaves[6].parent.get_indices()) == set( - [6,11,2,3,13,8,14,5,0,15,16] + [6, 11, 2, 3, 13, 8, 14, 5, 0, 15, 16] ) assert set(tree.leaves[10].get_indices()) == set([10]) assert tree.root.get_leaf_count() == 20 @@ -190,30 +194,33 @@ def test_immutability(): phylo.Tree(node1) -@pytest.mark.parametrize("newick, labels, error", [ - # Reference index out of range - ("((1,0),4),2);", None, biotite.InvalidFileError), - # Empty string - ("", None, biotite.InvalidFileError), - # Empty node - ("();", None, biotite.InvalidFileError), - # Missing brackets - ("((0,1,(2,3));", None, biotite.InvalidFileError), - # A node with three leaves - ("((0,1),(2,3),(4,5));", None, None), - # A node with one leaf - ("((0,1),(2,3),(4));", None, None), - # Named intermediate nodes - ("((0,1,3)A,2)B;", None, None), - # Named intermediate nodes and distances - ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None), - # Nodes with labels - ("((((A,B),(C,D)),E),F);", ["A","B","C","D","E","F"], None), - # Nodes with labels and distances - ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A","B","C","D","E","F"], 
None), - # Newick with spaces - (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None), -]) +@pytest.mark.parametrize( + "newick, labels, error", + [ + # Reference index out of range + ("((1,0),4),2);", None, biotite.InvalidFileError), + # Empty string + ("", None, biotite.InvalidFileError), + # Empty node + ("();", None, biotite.InvalidFileError), + # Missing brackets + ("((0,1,(2,3));", None, biotite.InvalidFileError), + # A node with three leaves + ("((0,1),(2,3),(4,5));", None, None), + # A node with one leaf + ("((0,1),(2,3),(4));", None, None), + # Named intermediate nodes + ("((0,1,3)A,2)B;", None, None), + # Named intermediate nodes and distances + ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None), + # Nodes with labels + ("((((A,B),(C,D)),E),F);", ["A", "B", "C", "D", "E", "F"], None), + # Nodes with labels and distances + ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A", "B", "C", "D", "E", "F"], None), + # Newick with spaces + (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None), + ], +) def test_newick_simple(newick, labels, error): # Read, write and read again a Newick notation and expect # the same reult from both reads @@ -223,8 +230,8 @@ def test_newick_simple(newick, labels, error): tree2 = phylo.Tree.from_newick(newick, labels) assert tree1 == tree2 else: - with pytest.raises(error): - tree1 = phylo.Tree.from_newick(newick, labels) + with pytest.raises(error): + tree1 = phylo.Tree.from_newick(newick, labels) @pytest.mark.parametrize("use_labels", [False, True]) @@ -243,14 +250,16 @@ def test_newick_complex(upgma_newick, use_labels): def test_newick_rounding(): # Create the distance matrix distances = np.array( - [[0. , 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76], - [0.53, 0. , 0.59, 0.41, 0.35, 0.87, 1.03, 0.83], - [0.93, 0.59, 0. , 0.16, 0.58, 0.55, 1.59, 1.19], - [0.78, 0.41, 0.16, 0. , 0.42, 0.69, 1.4 , 1.18], - [0.38, 0.35, 0.58, 0.42, 0. , 1.02, 1.11, 0.89], - [0.99, 0.87, 0.55, 0.69, 1.02, 0. , 1.47, 1.26], - [1.02, 1.03, 1.59, 1.4 , 1.11, 1.47, 0. 
, 1.39], - [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0. ]] + [ + [0.0, 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76], + [0.53, 0.0, 0.59, 0.41, 0.35, 0.87, 1.03, 0.83], + [0.93, 0.59, 0.0, 0.16, 0.58, 0.55, 1.59, 1.19], + [0.78, 0.41, 0.16, 0.0, 0.42, 0.69, 1.4, 1.18], + [0.38, 0.35, 0.58, 0.42, 0.0, 1.02, 1.11, 0.89], + [0.99, 0.87, 0.55, 0.69, 1.02, 0.0, 1.47, 1.26], + [1.02, 1.03, 1.59, 1.4, 1.11, 1.47, 0.0, 1.39], + [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0.0], + ] ) # Create the tree tree = phylo.neighbor_joining(distances) @@ -270,12 +279,15 @@ def test_newick_rounding(): ) -@pytest.mark.parametrize("newick_in, exp_newick_out", [ - ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;" ), - ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;" ), - ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"), - ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;" ), -]) +@pytest.mark.parametrize( + "newick_in, exp_newick_out", + [ + ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;"), + ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;"), + ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"), + ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;"), + ], +) def test_as_binary_cases(newick_in, exp_newick_out): """ Test the `as_binary()` function based on known cases. 
@@ -296,13 +308,13 @@ def test_as_binary_distances(): ref_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - ref_dist_mat[i,j] = tree.get_distance(i,j) + ref_dist_mat[i, j] = tree.get_distance(i, j) bin_tree = phylo.as_binary(tree) test_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - test_dist_mat[i,j] = bin_tree.get_distance(i,j) + test_dist_mat[i, j] = bin_tree.get_distance(i, j) assert np.allclose(test_dist_mat, ref_dist_mat) @@ -313,26 +325,27 @@ def test_equality(tree): """ assert tree == tree.copy() # Order of children is not important - assert tree == phylo.Tree(phylo.TreeNode( - [tree.root.children[1].copy(), tree.root.children[0].copy()], - [tree.root.children[1].distance, tree.root.children[0].distance] - )) + assert tree == phylo.Tree( + phylo.TreeNode( + [tree.root.children[1].copy(), tree.root.children[0].copy()], + [tree.root.children[1].distance, tree.root.children[0].distance], + ) + ) # Different distance -> Unequal tree - assert tree != phylo.Tree(phylo.TreeNode( - [tree.root.children[0].copy(), tree.root.children[1].copy()], - [tree.root.children[0].distance, 42] - )) + assert tree != phylo.Tree( + phylo.TreeNode( + [tree.root.children[0].copy(), tree.root.children[1].copy()], + [tree.root.children[0].distance, 42], + ) + ) # Additional node -> Unequal tree - assert tree != phylo.Tree(phylo.TreeNode( - [ - tree.root.children[0].copy(), - tree.root.children[1].copy(), - phylo.TreeNode(index=len(tree)) - ], - [ - tree.root.children[0].distance, - tree.root.children[1].distance, - 42 - ] - )) - + assert tree != phylo.Tree( + phylo.TreeNode( + [ + tree.root.children[0].copy(), + tree.root.children[1].copy(), + phylo.TreeNode(index=len(tree)), + ], + [tree.root.children[0].distance, tree.root.children[1].distance, 42], + ) + ) diff --git a/tests/sequence/test_profile.py b/tests/sequence/test_profile.py index 658779bd2..3f7669bbd 100644 --- 
a/tests/sequence/test_profile.py +++ b/tests/sequence/test_profile.py @@ -11,24 +11,43 @@ def test_from_alignment(): seq1 = seq.NucleotideSequence("CGTCAT") seq2 = seq.NucleotideSequence("TCATGC") - ali_str = ["CGTCAT--", - "--TCATGC"] + ali_str = ["CGTCAT--", "--TCATGC"] trace = align.Alignment.trace_from_strings(ali_str) alignment = align.Alignment([seq1, seq2], trace, None) profile = seq.SequenceProfile.from_alignment(alignment) - symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) assert np.array_equal(symbols, profile.symbols) assert np.array_equal(gaps, profile.gaps) - assert (alphabet == profile.alphabet) + assert alphabet == profile.alphabet def test_to_consensus_nuc(): - symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) profile = seq.SequenceProfile(symbols, gaps, alphabet) @@ -37,8 +56,18 @@ def test_to_consensus_nuc(): def test_to_consensus_nuc_ambiguous(): - symbols = np.array([[1, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [1, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) profile = seq.SequenceProfile(symbols, gaps, alphabet) @@ 
-48,45 +77,65 @@ def test_to_consensus_nuc_ambiguous(): def test_to_consensus_prot(): # Avidin protein sequence - seq1 = seq.ProteinSequence("MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP" - "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE") + seq1 = seq.ProteinSequence( + "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP" + "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE" + ) # Streptavidin protein sequence - seq2 = seq.ProteinSequence("MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA" - "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN" - "GNPLDAVQQ") + seq2 = seq.ProteinSequence( + "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA" + "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN" + "GNPLDAVQQ" + ) matrix = align.SubstitutionMatrix.std_protein_matrix() alignment = align.align_optimal(seq1, seq2, matrix)[0] profile = seq.SequenceProfile.from_alignment(alignment) - assert seq.ProteinSequence("MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD" - "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG" - "INIFNPLDAQKE") == profile.to_consensus() + assert ( + seq.ProteinSequence( + "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD" + "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG" + "INIFNPLDAQKE" + ) + == profile.to_consensus() + ) def test_new_position_matrices(): - seqs = [seq.NucleotideSequence("AAGAAT"), - seq.NucleotideSequence("ATCATA"), - seq.NucleotideSequence("AAGTAA"), - seq.NucleotideSequence("AACAAA"), - seq.NucleotideSequence("ATTAAA"), - seq.NucleotideSequence("AAGAAT")] + seqs = [ + seq.NucleotideSequence("AAGAAT"), + 
seq.NucleotideSequence("ATCATA"), + seq.NucleotideSequence("AAGTAA"), + seq.NucleotideSequence("AACAAA"), + seq.NucleotideSequence("ATTAAA"), + seq.NucleotideSequence("AAGAAT"), + ] alignment = align.Alignment( sequences=seqs, - trace=np.tile(np.arange(len(seqs[0])), len(seqs)) \ - .reshape(len(seqs), len(seqs[0])) \ - .transpose(), - score=0 + trace=np.tile(np.arange(len(seqs[0])), len(seqs)) + .reshape(len(seqs), len(seqs[0])) + .transpose(), + score=0, ) profile = seq.SequenceProfile.from_alignment(alignment) - probability_matrix = np.array([[1., 0., 0., 0., ], - [0.66666667, 0., 0., 0.33333333], - [0., 0.33333333, 0.5, 0.16666667], - [0.83333333, 0., 0., 0.16666667], - [0.83333333, 0., 0., 0.16666667], - [0.66666667, 0., 0., 0.33333333]]) + probability_matrix = np.array( + [ + [ + 1.0, + 0.0, + 0.0, + 0.0, + ], + [0.66666667, 0.0, 0.0, 0.33333333], + [0.0, 0.33333333, 0.5, 0.16666667], + [0.83333333, 0.0, 0.0, 0.16666667], + [0.83333333, 0.0, 0.0, 0.16666667], + [0.66666667, 0.0, 0.0, 0.33333333], + ] + ) ppm = profile.probability_matrix() @@ -98,25 +147,35 @@ def test_new_position_matrices(): ppm = profile.probability_matrix(pseudocount=1) - probability_matrix = np.array([[0.89285714, 0.03571429, 0.03571429, 0.03571429], - [0.60714286, 0.03571429, 0.03571429, 0.32142857], - [0.03571429, 0.32142857, 0.46428571, 0.17857143], - [0.75, 0.03571429, 0.03571429, 0.17857143], - [0.75, 0.03571429, 0.03571429, 0.17857143], - [0.60714286, 0.03571429, 0.03571429, 0.32142857]]) + probability_matrix = np.array( + [ + [0.89285714, 0.03571429, 0.03571429, 0.03571429], + [0.60714286, 0.03571429, 0.03571429, 0.32142857], + [0.03571429, 0.32142857, 0.46428571, 0.17857143], + [0.75, 0.03571429, 0.03571429, 0.17857143], + [0.75, 0.03571429, 0.03571429, 0.17857143], + [0.60714286, 0.03571429, 0.03571429, 0.32142857], + ] + ) assert np.allclose(probability_matrix, ppm, atol=1e-3) - probability = profile.sequence_probability(seq.NucleotideSequence("AAAAAA"), pseudocount=1) + 
probability = profile.sequence_probability( + seq.NucleotideSequence("AAAAAA"), pseudocount=1 + ) assert probability == pytest.approx(0.0066, abs=1e-3) - log_odds_matrix = np.array([[1.83650127, -2.80735492, -2.80735492, -2.80735492], - [1.28010792, -2.80735492, -2.80735492, 0.36257008], - [-2.80735492, 0.36257008, 0.8930848, -0.48542683], - [1.5849625, -2.80735492, -2.80735492, -0.48542683], - [1.5849625, -2.80735492, -2.80735492, -0.48542683], - [1.28010792, -2.80735492, -2.80735492, 0.36257008]]) + log_odds_matrix = np.array( + [ + [1.83650127, -2.80735492, -2.80735492, -2.80735492], + [1.28010792, -2.80735492, -2.80735492, 0.36257008], + [-2.80735492, 0.36257008, 0.8930848, -0.48542683], + [1.5849625, -2.80735492, -2.80735492, -0.48542683], + [1.5849625, -2.80735492, -2.80735492, -0.48542683], + [1.28010792, -2.80735492, -2.80735492, 0.36257008], + ] + ) pwm = profile.log_odds_matrix(pseudocount=1) diff --git a/tests/sequence/test_search.py b/tests/sequence/test_search.py index 7ef2b4618..c2150afac 100644 --- a/tests/sequence/test_search.py +++ b/tests/sequence/test_search.py @@ -3,8 +3,6 @@ # information. 
import biotite.sequence as seq -import numpy as np -import pytest def test_find_subsequence(): @@ -13,12 +11,13 @@ def test_find_subsequence(): main_seq = seq.NucleotideSequence(string) sub_seq = seq.NucleotideSequence(substring) matches = seq.find_subsequence(main_seq, sub_seq) - assert list(matches) == [4,8] - + assert list(matches) == [4, 8] + + def test_find_symbol(): string = "ATACGCTTGCT" symbol = "T" dna = seq.NucleotideSequence(string) - assert list(seq.find_symbol(dna, symbol)) == [1,6,7,10] + assert list(seq.find_symbol(dna, symbol)) == [1, 6, 7, 10] assert seq.find_symbol_first(dna, symbol) == 1 - assert seq.find_symbol_last(dna, symbol) == 10 \ No newline at end of file + assert seq.find_symbol_last(dna, symbol) == 10 diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py index 157d8d9ff..086f972a9 100644 --- a/tests/sequence/test_seqtypes.py +++ b/tests/sequence/test_seqtypes.py @@ -2,9 +2,8 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import biotite.sequence as seq -import numpy as np import pytest +import biotite.sequence as seq def test_nucleotide_construction(): @@ -23,26 +22,31 @@ def test_reverse_complement(): dna = seq.NucleotideSequence(string) assert str(dna.reverse().complement()) == "RNTAACGCATT" + def test_stop_removal(): string = "LYG*GR*" protein = seq.ProteinSequence(string) assert str(protein.remove_stops()) == string.replace("*", "") -@pytest.mark.parametrize("dna_str, protein_str", - [("CACATAGCATGA", "HIA*"), - ("ATGTAGCTA", "M*L")]) +@pytest.mark.parametrize( + "dna_str, protein_str", [("CACATAGCATGA", "HIA*"), ("ATGTAGCTA", "M*L")] +) def test_full_translation(dna_str, protein_str): dna = seq.NucleotideSequence(dna_str) protein = dna.translate(complete=True) assert protein_str == str(protein) -@pytest.mark.parametrize("dna_str, protein_str_list", - [("CA", []), - ("GAATGCACTGAGATGCAATAG", ["MH*","MQ*"]), - ("ATGCACATGTAGGG", ["MHM*","M*"]), - ("GATGCATGTGAAAA", ["MHVK","M*"])]) +@pytest.mark.parametrize( + "dna_str, protein_str_list", + [ + ("CA", []), + ("GAATGCACTGAGATGCAATAG", ["MH*", "MQ*"]), + ("ATGCACATGTAGGG", ["MHM*", "M*"]), + ("GATGCATGTGAAAA", ["MHVK", "M*"]), + ], +) def test_frame_translation(dna_str, protein_str_list): dna = seq.NucleotideSequence(dna_str) proteins, pos = dna.translate(complete=False) @@ -50,8 +54,8 @@ def test_frame_translation(dna_str, protein_str_list): assert set([str(protein) for protein in proteins]) == set(protein_str_list) # Test if the positions are also right # -> Get sequence slice and translate completely - assert set([str(dna[start : stop].translate(complete=True)) - for start, stop in pos] + assert set( + [str(dna[start:stop].translate(complete=True)) for start, stop in pos] ) == set(protein_str_list) @@ -76,7 +80,7 @@ def test_letter_conversion(): @pytest.mark.parametrize( "monoisotopic, expected_mol_weight_protein", # Reference values taken from https://web.expasy.org/compute_pi/ - [(True, 2231.06), (False, 2232.56)] + [(True, 
2231.06), (False, 2232.56)], ) def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein): """ @@ -84,8 +88,5 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein): correctly. """ protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW") - mol_weight_protein = protein.get_molecular_weight( - monoisotopic=monoisotopic) - assert mol_weight_protein == \ - pytest.approx(expected_mol_weight_protein, abs=1e-2) - + mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic) + assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2) diff --git a/tests/sequence/test_sequence.py b/tests/sequence/test_sequence.py index 78a815b5f..bfffaedb5 100644 --- a/tests/sequence/test_sequence.py +++ b/tests/sequence/test_sequence.py @@ -2,8 +2,8 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest import numpy as np +import pytest import biotite.sequence as seq @@ -13,13 +13,15 @@ def test_encoding(): string2 = str(dna) assert string1 == string2 + def test_validity_check(): dna = seq.NucleotideSequence() - dna.code = np.array([0,1,0,3,3]) + dna.code = np.array([0, 1, 0, 3, 3]) assert dna.is_valid() - dna.code = np.array([0,1,4,3,3]) + dna.code = np.array([0, 1, 4, 3, 3]) assert not dna.is_valid() - + + def test_access(): string = "AATGCGTTA" dna = seq.NucleotideSequence(string) @@ -28,6 +30,7 @@ def test_access(): dna = dna[3:-2] assert "GCGT" == str(dna) + def test_manipulation(): dna_seq = seq.NucleotideSequence("ACGTA") dna_copy = dna_seq.copy() @@ -37,12 +40,13 @@ def test_manipulation(): dna_copy[0:2] = dna_copy[3:5] assert "TAGTA" == str(dna_copy) dna_copy = dna_seq.copy() - dna_copy[np.array([True,False,False,False,True])] = "T" + dna_copy[np.array([True, False, False, False, True])] = "T" assert "TCGTT" == str(dna_copy) dna_copy = dna_seq.copy() - dna_copy[1:4] = np.array([0,1,2]) + dna_copy[1:4] = np.array([0, 1, 2]) assert "AACGA" == str(dna_copy) + def 
test_concatenation(): str1 = "AAGTTA" str2 = "CGA" @@ -54,16 +58,19 @@ def test_concatenation(): concat_seq = seq.NucleotideSequence(str3) + seq.NucleotideSequence(str1) assert str3 + str1 == str(concat_seq) + def test_frequency(): string = "ACGCGAGAAAGCGGG" dna = seq.NucleotideSequence(string) assert dna.get_symbol_frequency() == {"A": 5, "C": 3, "G": 7, "T": 0} - + + def test_alph_error(): string = "AATGCGTUTA" with pytest.raises(seq.AlphabetError): seq.NucleotideSequence(string) + def test_alphabet_extension(): alph1 = seq.Alphabet("abc") alph2 = seq.Alphabet("abc") @@ -73,4 +80,4 @@ def test_alphabet_extension(): assert alph2.extends(alph1) assert not alph3.extends(alph1) assert alph4.extends(alph1) - assert not alph1.extends(alph4) \ No newline at end of file + assert not alph1.extends(alph4) diff --git a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py index c81f9e050..e0c2fa1f4 100644 --- a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py +++ b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py @@ -1,35 +1,35 @@ -import pandas as pd import argparse -import numpy as np import json +import numpy as np +import pandas as pd + def process(input, output, chain): data = pd.read_csv(input) # Only retain rows with basepair annotation - data = data[data['Leontis-Westhof'].notna()] + data = data[data["Leontis-Westhof"].notna()] output_list = [] for _, row in data.iterrows(): - - nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]] # Extract the Leontis-Westhof annotation - lw_string = row['Leontis-Westhof'] + lw_string = row["Leontis-Westhof"] # Some interactions are labelled with `n` for near. 
These are # ignored - if lw_string[0] == 'n': + if lw_string[0] == "n": continue # Get sugar orientation from string (`c` = cis, `t` = trans) sugar_orientation = lw_string[0] # The residue ids of the nucleotides - res_ids = [None]*2 + res_ids = [None] * 2 for i, nucleotide in enumerate(nucleotides): - nucleotide_list = nucleotide.split('.') + nucleotide_list = nucleotide.split(".") # if the nucleotide is not part of the specified chain, skip # base pair @@ -41,37 +41,28 @@ def process(input, output, chain): if None in res_ids: continue - if sugar_orientation == 'c': + if sugar_orientation == "c": sugar_orientation = 1 - elif sugar_orientation == 't': + elif sugar_orientation == "t": sugar_orientation = 2 this_output = sorted((int(res_ids[0]), int(res_ids[1]))) this_output.append(int(sugar_orientation)) output_list.append(this_output) output_list = np.unique(output_list, axis=0).tolist() - with open(output, 'w') as f: + with open(output, "w") as f: json.dump(output_list, f, indent=1) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Parse the glycosidic bond orientation annotations in the " "NAKB-database for a specific chain. The annotations can be " "downloaded in the section 'Base Pairs'." ) - parser.add_argument( - "infile", - help="The path to the input file." - ) - parser.add_argument( - "outfile", - help="The path to the output JSON file." - ) - parser.add_argument( - "chain", - help="The chain ID to be extracted." 
- ) + parser.add_argument("infile", help="The path to the input file.") + parser.add_argument("outfile", help="The path to the output JSON file.") + parser.add_argument("chain", help="The chain ID to be extracted.") args = parser.parse_args() process(args.infile, args.outfile, args.chain) - diff --git a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py index 1a46eb4d3..bdcd1f586 100644 --- a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py +++ b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py @@ -1,36 +1,37 @@ -import pandas as pd import argparse import json import numpy as np +import pandas as pd + def process(input, output, chain): data = pd.read_csv(input) # Only retain rows with basepair annotation - data = data[data['Leontis-Westhof'].notna()] + data = data[data["Leontis-Westhof"].notna()] output_list = [] for _, row in data.iterrows(): - nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]] # Extract the Leontis-Westhof annotation - lw_string = row['Leontis-Westhof'] + lw_string = row["Leontis-Westhof"] # Some interactions are labelled with `n` for near. These are # ignored - if lw_string[0] == 'n': + if lw_string[0] == "n": continue # Get edge annotations from string edges = [lw_string[-2], lw_string[-1]] - + # Dont allow unspecified edges in test data - if '.' in edges: + if "." 
in edges: continue - res_ids = [None]*2 + res_ids = [None] * 2 for i, nucleotide in enumerate(nucleotides): - nucleotide_list = nucleotide.split('.') + nucleotide_list = nucleotide.split(".") # if the nucleotide is not part of the specified chain, skip # base pair @@ -43,11 +44,11 @@ def process(input, output, chain): continue for i, edge in enumerate(edges): - if edge == 'W': + if edge == "W": edges[i] = 1 - if edge == 'H': + if edge == "H": edges[i] = 2 - if edge == 'S': + if edge == "S": edges[i] = 3 # Lower residue id on the left, higher residue id on the right @@ -62,28 +63,19 @@ def process(input, output, chain): ) output_list = np.unique(output_list, axis=0).tolist() - with open(output, 'w') as f: + with open(output, "w") as f: json.dump(output_list, f, indent=1) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Parse the edge type annotations in the NAKB-database for " "a specific chain. The annotations can be downloaded in the section " "'Base Pairs'." ) - parser.add_argument( - "infile", - help="The path to the input file." - ) - parser.add_argument( - "outfile", - help="The path to the output JSON file." - ) - parser.add_argument( - "chain", - help="The chain ID to be extracted." 
- ) + parser.add_argument("infile", help="The path to the input file.") + parser.add_argument("outfile", help="The path to the output JSON file.") + parser.add_argument("chain", help="The chain ID to be extracted.") args = parser.parse_args() process(args.infile, args.outfile, args.chain) - diff --git a/tests/structure/data/create_test_structures.py b/tests/structure/data/create_test_structures.py index 4bf0ae175..da0f0ff48 100644 --- a/tests/structure/data/create_test_structures.py +++ b/tests/structure/data/create_test_structures.py @@ -1,12 +1,12 @@ import argparse -import subprocess -from os.path import join import logging +import subprocess import sys +from os.path import join import biotite -from biotite.database import RequestError import biotite.database.rcsb as rcsb import biotite.structure.io as strucio +from biotite.database import RequestError def create(pdb_id, directory, include_gro): @@ -18,7 +18,7 @@ def create(pdb_id, directory, include_gro): # PDB entry is not provided in this format pass try: - array = strucio.load_structure(join(directory, pdb_id+".pdb")) + array = strucio.load_structure(join(directory, pdb_id + ".pdb")) except biotite.InvalidFileError: # Structure probably contains multiple models with different # number of atoms @@ -31,41 +31,55 @@ def create(pdb_id, directory, include_gro): cleaned_file_name = biotite.temp_file("pdb") strucio.save_structure(cleaned_file_name, array) # Run GROMACS for file conversion - subprocess.run([ - "editconf", - "-f", cleaned_file_name, - "-o", join(directory, pdb_id+".gro") - ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run( + [ + "editconf", + "-f", + cleaned_file_name, + "-o", + join(directory, pdb_id + ".gro"), + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Create structure files for unit tests " - "in all supported formats from PDB ID " - "(excluding GROMACS trajectory files)" - ) - 
parser.add_argument( - "--dir", "-d", dest="directory", default=".", - help="the Biotite project directory to put the test files into" + "in all supported formats from PDB ID " + "(excluding GROMACS trajectory files)" ) parser.add_argument( - "--id", "-i", dest="id", - help="the PDB ID" + "--dir", + "-d", + dest="directory", + default=".", + help="the Biotite project directory to put the test files into", ) + parser.add_argument("--id", "-i", dest="id", help="the PDB ID") parser.add_argument( - "--file", "-f", dest="file", - help="read mutliple PDB IDs from text file (line break separated IDs)" + "--file", + "-f", + dest="file", + help="read mutliple PDB IDs from text file (line break separated IDs)", ) parser.add_argument( - "--gromacs", "-g", action="store_true", dest="include_gro", - help="Create '*.gro' files using the Gromacs software" + "--gromacs", + "-g", + action="store_true", + dest="include_gro", + help="Create '*.gro' files using the Gromacs software", ) args = parser.parse_args() if args.file is not None: with open(args.file, "r") as file: - pdb_ids = [pdb_id.strip().lower() for pdb_id - in file.read().split("\n") if len(pdb_id.strip()) != 0] + pdb_ids = [ + pdb_id.strip().lower() + for pdb_id in file.read().split("\n") + if len(pdb_id.strip()) != 0 + ] elif args.id is not None: pdb_ids = [args.id.lower()] else: @@ -78,4 +92,4 @@ def create(pdb_id, directory, include_gro): create(pdb_id, args.directory, args.include_gro) except: print() - raise \ No newline at end of file + raise diff --git a/tests/structure/data/molecules/create_v3000_sdf.py b/tests/structure/data/molecules/create_v3000_sdf.py index dc313722f..9630e71a1 100644 --- a/tests/structure/data/molecules/create_v3000_sdf.py +++ b/tests/structure/data/molecules/create_v3000_sdf.py @@ -11,4 +11,4 @@ writer.SetForceV3000(True) for molecule in supplier: writer.write(molecule) - writer.close() \ No newline at end of file + writer.close() diff --git a/tests/structure/test_atoms.py 
b/tests/structure/test_atoms.py index 93a94ac65..22b75c919 100644 --- a/tests/structure/test_atoms.py +++ b/tests/structure/test_atoms.py @@ -10,81 +10,88 @@ @pytest.fixture def atom_list(): - chain_id = ["A","A","B","B","B"] - res_id = [1,1,1,1,2] + chain_id = ["A", "A", "B", "B", "B"] + res_id = [1, 1, 1, 1, 2] ins_code = [""] * 5 - res_name = ["ALA","ALA","PRO","PRO","MSE"] + res_name = ["ALA", "ALA", "PRO", "PRO", "MSE"] hetero = [False, False, False, False, True] atom_name = ["N", "CA", "O", "CA", "SE"] - element = ["N","C","O","C","SE"] + element = ["N", "C", "O", "C", "SE"] atom_list = [] for i in range(5): - atom_list.append(struc.Atom([i,i,i], - chain_id = chain_id[i], - res_id = res_id[i], - ins_code = ins_code[i], - res_name = res_name[i], - hetero = hetero[i], - atom_name = atom_name[i], - element = element[i])) + atom_list.append( + struc.Atom( + [i, i, i], + chain_id=chain_id[i], + res_id=res_id[i], + ins_code=ins_code[i], + res_name=res_name[i], + hetero=hetero[i], + atom_name=atom_name[i], + element=element[i], + ) + ) return atom_list + @pytest.fixture def atom(atom_list): return atom_list[2] + @pytest.fixture def array(atom_list): return struc.array(atom_list) + @pytest.fixture def stack(array): return struc.stack([array, array.copy(), array.copy()]) + @pytest.fixture def array_box(): - return np.array([ - [1,0,0], - [0,2,0], - [0,0,3] - ]) + return np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]]) + @pytest.fixture def stack_box(stack, array_box): return np.array([array_box] * stack.stack_depth()) + def test_shape(array, stack): assert array.shape == (5,) assert stack.shape == (3, 5) + def test_access(array): - chain_id = ["A","A","B","B","B"] - assert array.coord.shape == (5,3) + chain_id = ["A", "A", "B", "B", "B"] + assert array.coord.shape == (5, 3) assert array.chain_id.tolist() == chain_id assert array.get_annotation("chain_id").tolist() == chain_id array.add_annotation("test1", dtype=int) - assert array.test1.tolist() == [0,0,0,0,0] + assert 
array.test1.tolist() == [0, 0, 0, 0, 0] with pytest.raises(IndexError): - array.set_annotation("test2", np.array([0,1,2,3])) + array.set_annotation("test2", np.array([0, 1, 2, 3])) def test_modification(atom, array, stack): new_atom = atom new_atom.chain_id = "C" del array[2] - assert array.chain_id.tolist() == ["A","A","B","B"] + assert array.chain_id.tolist() == ["A", "A", "B", "B"] array[-1] = new_atom - assert array.chain_id.tolist() == ["A","A","B","C"] + assert array.chain_id.tolist() == ["A", "A", "B", "C"] del stack[1] assert stack.stack_depth() == 2 def test_array_indexing(atom, array): filtered_array = array[array.chain_id == "B"] - assert filtered_array.res_name.tolist() == ["PRO","PRO","MSE"] + assert filtered_array.res_name.tolist() == ["PRO", "PRO", "MSE"] assert atom == filtered_array[0] - filtered_array = array[[0,2,4]] - assert filtered_array.element.tolist() == ["N","O","SE"] + filtered_array = array[[0, 2, 4]] + assert filtered_array.element.tolist() == ["N", "O", "SE"] def test_stack_indexing(stack): @@ -93,22 +100,22 @@ def test_stack_indexing(stack): filtered_stack = stack[0] assert type(filtered_stack) == struc.AtomArray filtered_stack = stack[0:2, stack.res_name == "PRO"] - assert filtered_stack.atom_name.tolist() == ["O","CA"] - filtered_stack = stack[np.array([True,False,True])] + assert filtered_stack.atom_name.tolist() == ["O", "CA"] + filtered_stack = stack[np.array([True, False, True])] assert filtered_stack.stack_depth() == 2 assert filtered_stack.array_length() == 5 - filtered_stack = stack[:,0] + filtered_stack = stack[:, 0] assert filtered_stack.stack_depth() == 3 assert filtered_stack.array_length() == 1 - + def test_concatenation(array, stack): concat_array = array[2:] + array[:2] - assert concat_array.chain_id.tolist() == ["B","B","B","A","A"] - assert concat_array.coord.shape == (5,3) - concat_stack = stack[:,2:] + stack[:,:2] - assert concat_array.chain_id.tolist() == ["B","B","B","A","A"] - assert concat_stack.coord.shape == 
(3,5,3) + assert concat_array.chain_id.tolist() == ["B", "B", "B", "A", "A"] + assert concat_array.coord.shape == (5, 3) + concat_stack = stack[:, 2:] + stack[:, :2] + assert concat_array.chain_id.tolist() == ["B", "B", "B", "A", "A"] + assert concat_stack.coord.shape == (3, 5, 3) def test_comparison(array): @@ -129,23 +136,26 @@ def test_bonds(array): with pytest.raises(ValueError): # Expect a BondList with array length as atom count array.bonds = struc.BondList(13) - array.bonds = struc.BondList(5, np.array([(0,1),(0,2),(2,3),(2,4)])) - assert array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0], - [2, 3, 0], - [2, 4, 0],] + array.bonds = struc.BondList(5, np.array([(0, 1), (0, 2), (2, 3), (2, 4)])) + assert array.bonds.as_array().tolist() == [ + [0, 1, 0], + [0, 2, 0], + [2, 3, 0], + [2, 4, 0], + ] filtered_array = array[array.chain_id == "B"] - assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0]] + assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0], [0, 2, 0]] concat_array = array + array - assert concat_array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0], - [2, 3, 0], - [2, 4, 0], - [5, 6, 0], - [5, 7, 0], - [7, 8, 0], - [7, 9, 0]] + assert concat_array.bonds.as_array().tolist() == [ + [0, 1, 0], + [0, 2, 0], + [2, 3, 0], + [2, 4, 0], + [5, 6, 0], + [5, 7, 0], + [7, 8, 0], + [7, 9, 0], + ] def test_box(array, stack, array_box, stack_box): @@ -193,4 +203,4 @@ def test_pickle(atom, array, stack): assert test_array == array test_stack = pickle.loads(pickle.dumps(stack)) - assert test_stack == stack \ No newline at end of file + assert test_stack == stack diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py index d0b554f27..16c2bdd32 100644 --- a/tests/structure/test_basepairs.py +++ b/tests/structure/test_basepairs.py @@ -2,23 +2,22 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import pytest import json -import warnings +from os.path import join import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from biotite.structure.info import residue -from biotite.structure.residues import get_residue_masks -from biotite.structure.hbond import hbond -from os.path import join -from ..util import data_dir + # For ``base_pairs_edge()`` differences to a reference can be ambiguous # as the number hydrogen bonds between two different edges can be equal. # In order to distinguish ambiguously identified edges from wrongfully # identified edges the full edge matrix, listing the number of hydrogen # bonds for each edge has to be considered. from biotite.structure.basepairs import _get_edge_matrix +from biotite.structure.info import residue +from biotite.structure.residues import get_residue_masks +from ..util import data_dir def reversed_iterator(iter): @@ -30,9 +29,7 @@ def reversed_iterator(iter): @pytest.fixture def nuc_sample_array(): - return strucio.load_structure( - join(data_dir("structure"), "base_pairs", "1qxb.cif") - ) + return strucio.load_structure(join(data_dir("structure"), "base_pairs", "1qxb.cif")) @pytest.fixture @@ -40,11 +37,10 @@ def basepairs(nuc_sample_array): """ Generate a test output for the base_pairs function. """ - residue_indices, residue_names = struc.residues.get_residues( - nuc_sample_array - )[0:24] + residue_indices, residue_names = struc.residues.get_residues(nuc_sample_array)[0:24] return np.vstack((residue_indices[:12], np.flip(residue_indices)[:12])).T + def check_residue_starts(computed_starts, nuc_sample_array): """ Assert that computed starts are residue starts. @@ -53,6 +49,7 @@ def check_residue_starts(computed_starts, nuc_sample_array): for start in computed_starts.flatten(): assert start in residue_starts + def check_output(computed_basepairs, basepairs): """ Check the output of base_pairs. 
@@ -60,16 +57,17 @@ def check_output(computed_basepairs, basepairs): # Check if base pairs are unique in computed_basepairs seen = set() - assert (not any( - (base1, base2) in seen) or (base2, base1 in seen) - or seen.add((base1, base2)) for base1, base2 in computed_basepairs - ) + assert ( + not any((base1, base2) in seen) + or (base2, base1 in seen) + or seen.add((base1, base2)) + for base1, base2 in computed_basepairs + ) # Check if the right number of base pairs is in computed_base pairs - assert(len(computed_basepairs) == len(basepairs)) + assert len(computed_basepairs) == len(basepairs) # Check if the right base pairs are in computed_basepairs for comp_basepair in computed_basepairs: - assert ((comp_basepair in basepairs) \ - or (comp_basepair in np.flip(basepairs))) + assert (comp_basepair in basepairs) or (comp_basepair in np.flip(basepairs)) @pytest.mark.parametrize("unique_bool", [False, True]) @@ -105,13 +103,9 @@ def test_base_pairs_reverse(nuc_sample_array, basepairs, unique_bool): for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)): reversed_nuc_sample_array = reversed_nuc_sample_array + residue - computed_basepairs = struc.base_pairs( - reversed_nuc_sample_array, unique=unique_bool - ) + computed_basepairs = struc.base_pairs(reversed_nuc_sample_array, unique=unique_bool) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) - check_output( - reversed_nuc_sample_array[computed_basepairs].res_id, basepairs - ) + check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs) def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs): @@ -128,9 +122,8 @@ def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs): computed_basepairs = struc.base_pairs(reversed_nuc_sample_array) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) - check_output( - reversed_nuc_sample_array[computed_basepairs].res_id, basepairs - ) + 
check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs) + def test_base_pairs_incomplete_structure(nuc_sample_array): """ @@ -142,14 +135,15 @@ def test_base_pairs_incomplete_structure(nuc_sample_array): """ nuc_sample_array = nuc_sample_array[ - ~ np.isin( + ~np.isin( nuc_sample_array.atom_name, - ['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9', 'O2'] + ["N1", "C2", "N3", "C4", "C5", "C6", "N7", "C8", "N9", "O2"], ) ] with pytest.warns(struc.IncompleteStructureWarning): assert len(struc.base_pairs(nuc_sample_array)) == 0 + @pytest.mark.parametrize("seed", range(10)) def test_base_pairs_reordered(nuc_sample_array, seed): """ @@ -162,50 +156,47 @@ def test_base_pairs_reordered(nuc_sample_array, seed): for residue in struc.residue_iter(nuc_sample_array): bound = residue.array_length() - indices = np.random.choice( - np.arange(bound), bound,replace=False - ) + indices = np.random.choice(np.arange(bound), bound, replace=False) nuc_sample_array_reordered += residue[..., indices] - assert(np.all( + assert np.all( struc.base_pairs(nuc_sample_array) == struc.base_pairs(nuc_sample_array_reordered) - )) + ) def test_map_nucleotide(): - """Test the function map_nucleotide with some examples. 
- """ - pyrimidines = ['C', 'T', 'U'] - purines = ['A', 'G'] + """Test the function map_nucleotide with some examples.""" + pyrimidines = ["C", "T", "U"] + purines = ["A", "G"] # Test that the standard bases are correctly identified - assert struc.map_nucleotide(residue('U')) == ('U', True) - assert struc.map_nucleotide(residue('A')) == ('A', True) - assert struc.map_nucleotide(residue('T')) == ('T', True) - assert struc.map_nucleotide(residue('G')) == ('G', True) - assert struc.map_nucleotide(residue('C')) == ('C', True) + assert struc.map_nucleotide(residue("U")) == ("U", True) + assert struc.map_nucleotide(residue("A")) == ("A", True) + assert struc.map_nucleotide(residue("T")) == ("T", True) + assert struc.map_nucleotide(residue("G")) == ("G", True) + assert struc.map_nucleotide(residue("C")) == ("C", True) # Test that some non_standard nucleotides are mapped correctly to # pyrimidine/purine references - psu_tuple = struc.map_nucleotide(residue('PSU')) + psu_tuple = struc.map_nucleotide(residue("PSU")) assert psu_tuple[0] in pyrimidines assert psu_tuple[1] == False - psu_tuple = struc.map_nucleotide(residue('3MC')) + psu_tuple = struc.map_nucleotide(residue("3MC")) assert psu_tuple[0] in pyrimidines assert psu_tuple[1] == False - i_tuple = struc.map_nucleotide(residue('I')) + i_tuple = struc.map_nucleotide(residue("I")) assert i_tuple[0] in purines assert i_tuple[1] == False - m7g_tuple = struc.map_nucleotide(residue('M7G')) + m7g_tuple = struc.map_nucleotide(residue("M7G")) assert m7g_tuple[0] in purines assert m7g_tuple[1] == False with pytest.warns(struc.IncompleteStructureWarning): - assert struc.map_nucleotide(residue('ALA')) == (None, False) + assert struc.map_nucleotide(residue("ALA")) == (None, False) def get_reference(pdb_id, suffix): @@ -218,12 +209,13 @@ def get_reference(pdb_id, suffix): ) with open( - join(data_dir("structure"), "base_pairs", f"{pdb_id}_{suffix}.json" - ), "r") as file: + join(data_dir("structure"), "base_pairs", 
f"{pdb_id}_{suffix}.json"), "r" + ) as file: reference = np.array(json.load(file)) return structure, reference + def get_reference_index(pair, array): """ Get the index of the row in a reference array, where the first two @@ -236,10 +228,7 @@ def get_reference_index(pair, array): return None - -def check_edge_plausibility( - reference_structure, pair, reference_edges, output_edges -): +def check_edge_plausibility(reference_structure, pair, reference_edges, output_edges): """ Checks if the difference to a reference edge is at least ambiguous. A difference is defined as ambiguous, if the number of hydrogen @@ -280,8 +269,9 @@ def test_base_pairs_edge(pdb_id): pair_res_ids = reference_structure[pair].res_id index = get_reference_index(pair_res_ids, reference_edges) if index is not None: - pair_reference_edges = [ - reference_edges[index, 2], reference_edges[index, 3] + pair_reference_edges = [ + reference_edges[index, 2], + reference_edges[index, 3], ] check_edge_plausibility( reference_structure, pair, pair_reference_edges, pair_edges @@ -309,9 +299,7 @@ def test_base_pairs_glycosidic_bond(pdb_id): pair_res_ids = reference_structure[pair].res_id index = get_reference_index(pair_res_ids, reference_gly_bonds) if index is not None: - reference_orientation = struc.GlycosidicBond( - reference_gly_bonds[index, 2] - ) + reference_orientation = struc.GlycosidicBond(reference_gly_bonds[index, 2]) assert reference_orientation == pair_orientation @@ -333,7 +321,7 @@ def test_base_stacking(): # stacked. expected_stackings = [] for i in range(1, 24): - expected_stackings.append([i, i+1]) + expected_stackings.append([i, i + 1]) # Due to distortions in the helix not all adjacent bases have a # geometry that meets the criteria of `base_stacking`. 
@@ -353,5 +341,3 @@ def test_base_stacking(): # Assert the stacking interactions are correct for interaction in helix[stacking].res_id: assert list(interaction) in expected_stackings - - diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index a2c940f8f..17293ca60 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -23,20 +23,22 @@ def generate_random_bond_list(atom_count, bond_count, seed=0): # Clip bond types to allowed BondType values bonds[:, 2] %= len(struc.BondType) # Remove bonds of atoms to itself - bonds = bonds[bonds[:,0] != bonds[:,1]] + bonds = bonds[bonds[:, 0] != bonds[:, 1]] assert len(bonds) > 0 return struc.BondList(atom_count, bonds) @pytest.fixture( - params=[False, True] # as_negative + params=[False, True] # as_negative ) def bond_list(request): """ A toy :class:`BondList`. """ as_negative = request.param - bond_array = np.array([(0,1),(2,1),(3,1),(3,4),(3,1),(1,2),(4,0),(6,4)]) + bond_array = np.array( + [(0, 1), (2, 1), (3, 1), (3, 4), (3, 1), (1, 2), (4, 0), (6, 4)] + ) if as_negative: return struc.BondList(7, -7 + bond_array) else: @@ -48,12 +50,14 @@ def test_creation(bond_list): Test creating a :class:`BondList` on a known example. 
""" # Test includes redundancy removal and max bonds calculation - assert bond_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0]] + assert bond_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + ] assert bond_list._max_bonds_per_atom == 3 assert bond_list._atom_count == 7 @@ -65,46 +69,44 @@ def test_invalid_creation(): """ # Test invalid input shapes with pytest.raises(ValueError): - struc.BondList( - 5, - np.array([ - [1,2,3,4] - ]) - ) + struc.BondList(5, np.array([[1, 2, 3, 4]])) with pytest.raises(ValueError): - struc.BondList( - 5, - np.array([1,2]) - ) + struc.BondList(5, np.array([1, 2])) # Test invalid atom indices with pytest.raises(IndexError): struc.BondList( 5, - np.array([ - [1,2], - # 5 is an invalid index for an atom count of 5 - [5,2] - ]) + np.array( + [ + [1, 2], + # 5 is an invalid index for an atom count of 5 + [5, 2], + ] + ), ) with pytest.raises(IndexError): struc.BondList( 5, - np.array([ - # Index -6 is invalid for an atom count of 5 - [-6,3], - [3,4] - ]) + np.array( + [ + # Index -6 is invalid for an atom count of 5 + [-6, 3], + [3, 4], + ] + ), ) # Test invalid BondType with pytest.raises(ValueError): struc.BondList( 5, - np.array([ - # BondType '8' does not exist - [1,2,8] - ]) + np.array( + [ + # BondType '8' does not exist + [1, 2, 8] + ] + ), ) @@ -126,25 +128,21 @@ def test_modification(bond_list): # Not in list -> Do nothing bond_list.remove_bond(0, 3) # Remove mutliple bonds, one of them is not in list - bond_list.remove_bonds(struc.BondList(10, np.array([(1,0),(1,2),(8,9)]))) - assert bond_list.as_array().tolist() == [[1, 3, 1], - [3, 4, 0], - [4, 6, 0], - [1, 4, 0]] + bond_list.remove_bonds(struc.BondList(10, np.array([(1, 0), (1, 2), (8, 9)]))) + assert bond_list.as_array().tolist() == [[1, 3, 1], [3, 4, 0], [4, 6, 0], [1, 4, 0]] def test_add_two_bond_list(): """ Test adding two `BondList` objects. 
""" - bond_list1 = struc.BondList(2, np.array([(0,1)])) # max_bond_per_atom=1 - bond_list2 = struc.BondList(3, np.array([(0,1),(0,2)])) # max_bond_per_atom=2 + bond_list1 = struc.BondList(2, np.array([(0, 1)])) # max_bond_per_atom=1 + bond_list2 = struc.BondList(3, np.array([(0, 1), (0, 2)])) # max_bond_per_atom=2 added_list = bond_list1 + bond_list2 assert added_list._max_bonds_per_atom == 2 assert added_list.get_bonds(2)[0].tolist() == [3, 4] - assert added_list.as_array().tolist() == [[0, 1, 0], - [2, 3, 0], - [2, 4, 0]] + assert added_list.as_array().tolist() == [[0, 1, 0], [2, 3, 0], [2, 4, 0]] + def test_contains(bond_list): """ @@ -185,29 +183,33 @@ def test_merge(bond_list): """ Test merging two `BondList` objects on a known example. """ - merged_list = struc.BondList(8, np.array([(4,6),(6,7)])).merge(bond_list) - assert merged_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0], - [6, 7, 0]] + merged_list = struc.BondList(8, np.array([(4, 6), (6, 7)])).merge(bond_list) + assert merged_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + [6, 7, 0], + ] def test_concatenation(bond_list): """ Test concatenation of two `BondList` objects on a known example. 
""" - bond_list += struc.BondList(3, np.array([(0,1,2),(1,2,2)])) - assert bond_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0], - [7, 8, 2], - [8, 9, 2]] + bond_list += struc.BondList(3, np.array([(0, 1, 2), (1, 2, 2)])) + assert bond_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + [7, 8, 2], + [8, 9, 2], + ] assert bond_list._max_bonds_per_atom == 3 assert bond_list._atom_count == 10 @@ -219,30 +221,27 @@ def test_indexing(bond_list): sub_list = bond_list[:] assert sub_list.as_array().tolist() == bond_list.as_array().tolist() sub_list = bond_list[::-1] - assert sub_list.as_array().tolist() == [[5, 6, 0], - [4, 5, 0], - [3, 5, 0], - [2, 3, 0], - [2, 6, 0], - [0, 2, 0]] + assert sub_list.as_array().tolist() == [ + [5, 6, 0], + [4, 5, 0], + [3, 5, 0], + [2, 3, 0], + [2, 6, 0], + [0, 2, 0], + ] sub_list = bond_list[1:6:2] assert sub_list.as_array().tolist() == [[0, 1, 0]] sub_list = bond_list[:4] - assert sub_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0]] + assert sub_list.as_array().tolist() == [[0, 1, 0], [1, 2, 0], [1, 3, 0]] sub_list = bond_list[2:] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [2, 4, 0]] + assert sub_list.as_array().tolist() == [[1, 2, 0], [2, 4, 0]] - sub_list = bond_list[[0,3,4]] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [0, 2, 0]] + sub_list = bond_list[[0, 3, 4]] + assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0]] + + sub_list = bond_list[np.array([True, False, False, True, True, False, True])] + assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0], [2, 3, 0]] - sub_list = bond_list[np.array([True,False,False,True,True,False,True])] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [0, 2, 0], - [2, 3, 0]] def test_get_all_bonds(): """ @@ -261,17 +260,13 @@ def test_get_all_bonds(): assert (bond_types != -1).all(axis=1).any(axis=0) test_bonds = [ - ( - 
bonded_i[bonded_i != -1].tolist(), - bond_type[bond_type != -1].tolist() - ) + (bonded_i[bonded_i != -1].tolist(), bond_type[bond_type != -1].tolist()) for bonded_i, bond_type in zip(bonds, bond_types) ] ref_bonds = [bond_list.get_bonds(i) for i in range(ATOM_COUNT)] ref_bonds = [ - (bonded_i.tolist(), bond_type.tolist()) - for bonded_i, bond_type in ref_bonds + (bonded_i.tolist(), bond_type.tolist()) for bonded_i, bond_type in ref_bonds ] assert test_bonds == ref_bonds @@ -330,9 +325,9 @@ def test_sorted_array_indexing(): # Create a sorted array of random indices for the BondList # Indices may not occur multiple times -> 'replace=False' - index_array = np.sort(np.random.choice( - np.arange(ATOM_COUNT), INDEX_SIZE, replace=False - )) + index_array = np.sort( + np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False) + ) test_bonds = bonds[index_array] # Create a boolean mask that indexes the same elements as the array @@ -363,15 +358,13 @@ def test_unsorted_array_indexing(): # Create random bonds between the reference integers bonds = np.random.randint(ATOM_COUNT, size=(BOND_COUNT, 2)) # Remove bonds of elements to itself - bonds = bonds[bonds[:,0] != bonds[:,1]] + bonds = bonds[bonds[:, 0] != bonds[:, 1]] assert len(bonds) > 0 bonds = struc.BondList(ATOM_COUNT, bonds) # Create an unsorted array of random indices for the BondList # Indices should be unsorted -> 'replace=False' - unsorted_index = np.random.choice( - np.arange(ATOM_COUNT), INDEX_SIZE, replace=False - ) + unsorted_index = np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False) test_bonds = bonds[unsorted_index] # Create a sorted variant of the index array @@ -385,14 +378,18 @@ def test_unsorted_array_indexing(): # Get the 'atoms', in this case integers, that are connected with a bond # Use a set for simpler comparison between the sorted and unsorted variant # Omit the bond type -> 'bonds.as_array()[:, :2]' - test_integer_pairs = set([ - frozenset((unsorted_indexed_integers[i], 
unsorted_indexed_integers[j])) - for i, j in test_bonds.as_array()[:, :2] - ]) - ref_integer_pairs = set([ - frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j])) - for i, j in ref_bonds.as_array()[:, :2] - ]) + test_integer_pairs = set( + [ + frozenset((unsorted_indexed_integers[i], unsorted_indexed_integers[j])) + for i, j in test_bonds.as_array()[:, :2] + ] + ) + ref_integer_pairs = set( + [ + frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j])) + for i, j in ref_bonds.as_array()[:, :2] + ] + ) # The BondList entries should be different, # since they point to different positions in the reference array @@ -415,18 +412,21 @@ def test_atom_array_consistency(): array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0] ca = array[array.atom_name == "CA"] # Just for testing, does not reflect real bonds - bond_list = struc.BondList(ca.array_length(), - np.array([(0,1),(2,8),(5,15),(1,5),(0,9),(3,18),(2,9)]) + bond_list = struc.BondList( + ca.array_length(), + np.array([(0, 1), (2, 8), (5, 15), (1, 5), (0, 9), (3, 18), (2, 9)]), ) ca.bonds = bond_list - ref_ids = ca.res_id[bond_list.as_array()[:,:2].flatten()] + ref_ids = ca.res_id[bond_list.as_array()[:, :2].flatten()] # Some random boolean mask as index, # but all bonded atoms are included - mask = np.array([1,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,1,1], dtype=bool) + mask = np.array( + [1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=bool + ) masked_ca = ca[mask] - test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:,:2].flatten()] + test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:, :2].flatten()] # The bonds, should always point to the same atoms (same res_id), # irrespective of indexing @@ -442,9 +442,7 @@ def test_method_consistency(periodic): THRESHOLD_PERCENTAGE = 0.99 # Structure with peptide, nucleotide, small molecules and water - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "5ugo.bcif") - ) + pdbx_file = 
pdbx.BinaryCIFFile.read(join(data_dir("structure"), "5ugo.bcif")) atoms = pdbx.get_structure(pdbx_file, model=1) if periodic: # Add large dummy box to test parameter @@ -454,22 +452,22 @@ def test_method_consistency(periodic): bonds_from_names = struc.connect_via_residue_names(atoms) bonds_from_names.remove_bond_order() - bonds_from_distances = struc.connect_via_distances( - atoms, periodic=periodic - ) + bonds_from_distances = struc.connect_via_distances(atoms, periodic=periodic) # The distance based method may not detect all bonds assert bonds_from_distances.as_set().issubset(bonds_from_names.as_set()) - assert len(bonds_from_distances.as_array()) \ + assert ( + len(bonds_from_distances.as_array()) >= len(bonds_from_names.as_array()) * THRESHOLD_PERCENTAGE + ) def test_find_connected(bond_list): """ Find all connected atoms to an atom in a known example. """ - for index in (0,1,2,3,4,6): - assert struc.find_connected(bond_list, index).tolist() == [0,1,2,3,4,6] + for index in (0, 1, 2, 3, 4, 6): + assert struc.find_connected(bond_list, index).tolist() == [0, 1, 2, 3, 4, 6] assert struc.find_connected(bond_list, 5).tolist() == [5] @@ -498,7 +496,7 @@ def test_find_connected(bond_list): ("C17", "C22"), ]), ] -) # fmt: skip +) # fmt: skip def test_find_rotatable_bonds(res_name, expected_bonds): """ Check the :func:`find_rotatable_bonds()` function based on @@ -513,11 +511,9 @@ def test_find_rotatable_bonds(res_name, expected_bonds): rotatable_bonds = struc.find_rotatable_bonds(molecule.bonds) test_bond_set = set() for i, j, _ in rotatable_bonds.as_array(): - test_bond_set.add( - tuple(sorted((molecule.atom_name[i], molecule.atom_name[j]))) - ) + test_bond_set.add(tuple(sorted((molecule.atom_name[i], molecule.atom_name[j])))) # Compare with reference bonded atom names assert test_bond_set == ref_bond_set # All rotatable bonds must be single bonds - assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) \ No newline at end of file + assert 
np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) diff --git a/tests/structure/test_box.py b/tests/structure/test_box.py index b280b85d0..f9e2e35a2 100644 --- a/tests/structure/test_box.py +++ b/tests/structure/test_box.py @@ -2,16 +2,15 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join import itertools import warnings +from os.path import join import numpy as np import pytest import biotite.structure as struc import biotite.structure.io.pdbx as pdbx from biotite.structure.io import load_structure -from ..util import data_dir, cannot_import - +from ..util import cannot_import, data_dir SAMPLE_BOXES = [ (1, 1, 1, 90, 90, 90), @@ -21,86 +20,82 @@ (2, 4, 6, 100, 110, 120), (9, 9, 9, 90, 90, 170), (9, 8, 7, 50, 80, 50), -] # fmt: skip +] # fmt: skip SAMPLE_COORD = [ ( 1, 1, 1), ( 5, 10, 20), (-1, 5, 8), ( 3, 1, 54) -] # fmt: skip - +] # fmt: skip # Ignore warning about dummy unit cell vector @pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "len_a, len_b, len_c, alpha, beta, gamma", SAMPLE_BOXES -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") +@pytest.mark.parametrize("len_a, len_b, len_c, alpha, beta, gamma", SAMPLE_BOXES) def test_box_vector_calculation(len_a, len_b, len_c, alpha, beta, gamma): box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) + len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) ) from mdtraj.utils import lengths_and_angles_to_box_vectors + ref_box = np.stack( - lengths_and_angles_to_box_vectors( - len_a, len_b, len_c, alpha, beta, gamma - ) + lengths_and_angles_to_box_vectors(len_a, len_b, len_c, alpha, beta, gamma) ) assert np.allclose(box, ref_box) assert struc.unitcell_from_vectors(box) == pytest.approx( - (len_a, len_b, len_c, - alpha * 
2*np.pi / 360, beta * 2*np.pi / 360, gamma * 2*np.pi / 360) + ( + len_a, + len_b, + len_c, + alpha * 2 * np.pi / 360, + beta * 2 * np.pi / 360, + gamma * 2 * np.pi / 360, + ) ) def test_volume(): # Very rudimentary test - box = np.array([ - [5,0,0], - [0,8,0], - [0,0,2], - ]) + box = np.array( + [ + [5, 0, 0], + [0, 8, 0], + [0, 0, 2], + ] + ) assert struc.box_volume(box) == pytest.approx(80) boxes = np.stack([box, box]) - assert struc.box_volume(boxes) == pytest.approx(80,80) + assert struc.box_volume(boxes) == pytest.approx(80, 80) @pytest.mark.parametrize( "len_a, len_b, len_c, alpha, beta, gamma, x, y,z", - [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)] + [box + coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)], ) def test_move_into_box(len_a, len_b, len_c, alpha, beta, gamma, x, y, z): box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) + len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) ) - coord = np.array([x,y,z]) + coord = np.array([x, y, z]) moved_coord = struc.move_inside_box(coord, box) fractions = struc.coord_to_fraction(moved_coord, box) - assert ((fractions >= 0) & (fractions <=1)).all() + assert ((fractions >= 0) & (fractions <= 1)).all() @pytest.mark.parametrize( "len_a, len_b, len_c, alpha, beta, gamma, x, y,z", - [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)] + [box + coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)], ) -def test_conversion_to_fraction(len_a, len_b, len_c, - alpha, beta, gamma, - x, y, z): +def test_conversion_to_fraction(len_a, len_b, len_c, alpha, beta, gamma, x, y, z): box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) + len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) ) - coord = np.array([x,y,z]) + coord = np.array([x, y, z]) fractions = 
struc.coord_to_fraction(coord, box) if struc.is_orthogonal(box): @@ -119,12 +114,11 @@ def test_conversion_to_fraction(len_a, len_b, len_c, def test_repeat_box(multi_model): model = None if multi_model else 1 array = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), - model=model + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), model=model ) repeat_array, _ = struc.repeat_box(array) assert repeat_array.array_length() == array.array_length() * 27 - assert repeat_array[..., :array.array_length()] == array + assert repeat_array[..., : array.array_length()] == array @pytest.mark.parametrize("multi_model", [True, False]) @@ -135,14 +129,12 @@ def test_remove_pbc_unsegmented(multi_model): """ model = None if multi_model else 1 ref_array = load_structure( - join(data_dir("structure"), "3o5r.bcif"), - model=model, - include_bonds=True + join(data_dir("structure"), "3o5r.bcif"), model=model, include_bonds=True ) # Center structure in box centroid = struc.centroid(ref_array) box_center = np.diag(ref_array.box) / 2 - ref_array = struc.translate(ref_array, box_center-centroid) + ref_array = struc.translate(ref_array, box_center - centroid) test_array = struc.remove_pbc(ref_array) assert ref_array.equal_annotation_categories(test_array) @@ -150,11 +142,7 @@ def test_remove_pbc_unsegmented(multi_model): @pytest.mark.parametrize( - "multi_model, seed", - itertools.product( - [False, True], - range(10) - ) + "multi_model, seed", itertools.product([False, True], range(10)) ) def test_remove_pbc_restore(multi_model, seed): BUFFER = 5 @@ -162,14 +150,12 @@ def test_remove_pbc_restore(multi_model, seed): def get_distance_matrices(array): if isinstance(array, struc.AtomArray): matrix = struc.distance( - array.coord[:, np.newaxis, :], - array.coord[np.newaxis, :, :], - box=None + array.coord[:, np.newaxis, :], array.coord[np.newaxis, :, :], box=None ) matrix_pbc = struc.distance( array.coord[:, np.newaxis, :], 
array.coord[np.newaxis, :, :], - box=array.box + box=array.box, ) elif isinstance(array, struc.AtomArrayStack): matrices = [get_distance_matrices(model) for model in array] @@ -177,9 +163,7 @@ def get_distance_matrices(array): matrix_pbc = np.stack([m[1] for m in matrices]) return matrix, matrix_pbc - stack = load_structure( - join(data_dir("structure"), "1l2y.bcif"), include_bonds=True - ) + stack = load_structure(join(data_dir("structure"), "1l2y.bcif"), include_bonds=True) # Only consider a single molecule # -> remove all other atoms (in this case some unbound hydrogen) @@ -188,10 +172,12 @@ def get_distance_matrices(array): stack = stack[..., largest_mask] # Create a relatively tight box around the protein - stack.box = np.array([ - np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER) - for coord in stack.coord - ]) + stack.box = np.array( + [ + np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER) + for coord in stack.coord + ] + ) stack.coord -= np.min(stack.coord, axis=-2)[:, np.newaxis, :] + BUFFER / 2 if multi_model: array = stack @@ -203,8 +189,7 @@ def get_distance_matrices(array): np.random.seed(seed) size = (array.stack_depth(), 3) if isinstance(array, struc.AtomArrayStack) else 3 translation_vector = np.sum( - np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box, - axis=-2 + np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box, axis=-2 )[..., np.newaxis, :] # Move atoms over periodic boundary... 
array = struc.translate(array, translation_vector) @@ -226,10 +211,7 @@ def get_distance_matrices(array): # The centroid of the structure should be inside the box dimensions centroid = struc.centroid(array) - assert np.all( - (centroid > np.zeros(3)) & - (centroid < np.sum(array.box, axis=-2)) - ) + assert np.all((centroid > np.zeros(3)) & (centroid < np.sum(array.box, axis=-2))) @pytest.mark.parametrize("multi_model", [True, False]) @@ -249,4 +231,4 @@ def test_remove_pbc_selection(multi_model): # A warning due to a zero-division (centroid of empty list of # atoms) is raised here warnings.simplefilter("ignore") - assert struc.remove_pbc(array, select_none) == array \ No newline at end of file + assert struc.remove_pbc(array, select_none) == array diff --git a/tests/structure/test_celllist.py b/tests/structure/test_celllist.py index 13267ce74..cfbf1c486 100644 --- a/tests/structure/test_celllist.py +++ b/tests/structure/test_celllist.py @@ -2,8 +2,8 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join import itertools +from os.path import join import numpy as np import pytest import biotite.structure as struc @@ -19,28 +19,20 @@ def test_get_atoms(cell_size): with known solutions. 
""" array = struc.AtomArray(length=5) - array.coord = np.array([[0,0,i] for i in range(5)]) + array.coord = np.array([[0, 0, i] for i in range(5)]) cell_list = struc.CellList(array, cell_size=cell_size) - assert cell_list.get_atoms(np.array([0,0,0.1]), 1).tolist() == [0,1] - assert cell_list.get_atoms(np.array([0,0,1.1]), 1).tolist() == [1,2] - assert cell_list.get_atoms(np.array([0,0,1.1]), 2).tolist() == [0,1,2,3] + assert cell_list.get_atoms(np.array([0, 0, 0.1]), 1).tolist() == [0, 1] + assert cell_list.get_atoms(np.array([0, 0, 1.1]), 1).tolist() == [1, 2] + assert cell_list.get_atoms(np.array([0, 0, 1.1]), 2).tolist() == [0, 1, 2, 3] # Multiple positions - pos = np.array([[0,0,0.1], - [0,0,1.1], - [0,0,4.1]]) - expected_indices = [0, 1, 2, - 0, 1, 2, 3, - 3, 4] + pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]]) + expected_indices = [0, 1, 2, 0, 1, 2, 3, 3, 4] indices = cell_list.get_atoms(pos, 2) assert indices[indices != -1].tolist() == expected_indices # Multiple positions and multiple radii - pos = np.array([[0,0,0.1], - [0,0,1.1], - [0,0,4.1]]) + pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]]) rad = np.array([1.0, 2.0, 3.0]) - expected_indices = [0, 1, - 0, 1, 2, 3, - 2, 3, 4] + expected_indices = [0, 1, 0, 1, 2, 3, 2, 3, 4] indices = cell_list.get_atoms(pos, rad) assert indices[indices != -1].tolist() == expected_indices @@ -52,7 +44,7 @@ def test_get_atoms(cell_size): [2, 5, 10], [False, True], [False, True], - ) + ), ) def test_adjacency_matrix(cell_size, threshold, periodic, use_selection): """ @@ -64,9 +56,7 @@ def test_adjacency_matrix(cell_size, threshold, periodic, use_selection): if periodic: # Create an orthorhombic box # with the outer coordinates as bounds - array.box = np.diag( - np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2) - ) + array.box = np.diag(np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2)) if use_selection: np.random.seed(0) @@ -83,17 +73,14 @@ def test_adjacency_matrix(cell_size, threshold, 
periodic, use_selection): distance = struc.index_distance( array, np.stack( - [ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], - axis=-1 + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=-1, ), - periodic + periodic, ) distance = np.reshape(distance, (length, length)) # Create adjacency matrix from distance matrix - exp_matrix = (distance <= threshold) + exp_matrix = distance <= threshold if use_selection: # Set rows and columns to False for filtered out atoms exp_matrix[~selection, :] = False @@ -145,12 +132,10 @@ def test_empty_coordinates(): array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) cell_list = struc.CellList(array, cell_size=10) - for method in ( - struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells - ): + for method in (struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells): indices = method(cell_list, np.array([]), 1, as_mask=False) mask = method(cell_list, np.array([]), 1, as_mask=True) assert len(indices) == 0 assert len(mask) == 0 assert indices.dtype == np.int32 - assert mask.dtype == bool \ No newline at end of file + assert mask.dtype == bool diff --git a/tests/structure/test_chains.py b/tests/structure/test_chains.py index ffd5f682b..379258db8 100644 --- a/tests/structure/test_chains.py +++ b/tests/structure/test_chains.py @@ -2,18 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+from os.path import join +import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -import numpy as np -from os.path import join from ..util import data_dir -import pytest @pytest.fixture def array(): return strucio.load_structure(join(data_dir("structure"), "1igy.bcif")) + def test_get_chain_starts(array): """ Compare :func:`test_get_chain_starts()` with :func:`np.unique` in a @@ -24,6 +25,7 @@ def test_get_chain_starts(array): # All first occurences of a chain id are automatically chain starts assert set(ref_starts).issubset(set(test_starts)) + def test_get_chain_starts_same_id(array): """ Expect correct number of chains in a case where two successive @@ -34,18 +36,20 @@ def test_get_chain_starts_same_id(array): merged = array + array assert struc.get_chain_starts(merged).tolist() == [0, array.array_length()] + def test_apply_chain_wise(array): data = struc.apply_chain_wise(array, np.ones(len(array)), np.sum) assert data.tolist() == [ - len(array[array.chain_id == chain_id]) - for chain_id in np.unique(array.chain_id) + len(array[array.chain_id == chain_id]) for chain_id in np.unique(array.chain_id) ] + def test_spread_chain_wise(array): input_data = np.unique(array.chain_id) output_data = struc.spread_chain_wise(array, input_data) assert output_data.tolist() == array.chain_id.tolist() + def test_get_chain_masks(array): SAMPLE_SIZE = 100 np.random.seed(0) @@ -55,26 +59,29 @@ def test_get_chain_masks(array): ref_mask = array.chain_id == array.chain_id[index] assert test_mask.tolist() == ref_mask.tolist() + def test_get_chain_starts_for(array): SAMPLE_SIZE = 100 np.random.seed(0) indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE) ref_starts = np.array( - [np.where(mask)[0][0] for mask - in struc.get_chain_masks(array, indices)] + [np.where(mask)[0][0] for mask in struc.get_chain_masks(array, indices)] ) test_starts = struc.get_chain_starts_for(array, indices) assert test_starts.tolist() == 
ref_starts.tolist() + def test_get_chains(array): assert struc.get_chains(array).tolist() == ["A", "B", "C", "D", "E", "F"] + def test_get_chain_count(array): assert struc.get_chain_count(array) == 6 + def test_chain_iter(array): n = 0 for chain in struc.get_chains(array): n += 1 assert isinstance(array, struc.AtomArray) - assert n == 6 \ No newline at end of file + assert n == 6 diff --git a/tests/structure/test_charges.py b/tests/structure/test_charges.py index 4d85d411b..35a99f11a 100644 --- a/tests/structure/test_charges.py +++ b/tests/structure/test_charges.py @@ -3,13 +3,9 @@ # information. import warnings -import pytest import numpy as np -from biotite.structure import Atom -from biotite.structure import array -from biotite.structure import BondList -from biotite.structure import partial_charges - +import pytest +from biotite.structure import Atom, BondList, array, partial_charges # Test the partial charge of carbon in the molecules given in table # 3 of the Gasteiger-Marsili publication @@ -19,236 +15,236 @@ # the relevant information is the BondList # Creating atoms to build molecules with -carbon = Atom([0, 0, 0], element="C") +carbon = Atom([0, 0, 0], element="C") hydrogen = Atom([0, 0, 0], element="H") -oxygen = Atom([0, 0, 0], element="O") +oxygen = Atom([0, 0, 0], element="O") nitrogen = Atom([0, 0, 0], element="N") fluorine = Atom([0, 0, 0], element="F") -sulfur = Atom([0, 0, 0], element="S") +sulfur = Atom([0, 0, 0], element="S") # Building molecules methane = array([carbon, hydrogen, hydrogen, hydrogen, hydrogen]) methane.bonds = BondList( - methane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + methane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]) ) mol_length = methane.array_length() methane.charge = np.array([0] * mol_length) ethane = array( - [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, - hydrogen] + [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) 
ethane.bonds = BondList( ethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = ethane.array_length() ethane.charge = np.array([0] * mol_length) -ethylene = array( - [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen] -) +ethylene = array([carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen]) ethylene.bonds = BondList( ethylene.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1], [1,4,1], [1,5,1]]) + np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1], [1, 4, 1], [1, 5, 1]]), ) mol_length = ethylene.array_length() ethylene.charge = np.array([0] * mol_length) -acetylene = array( - [carbon, carbon, hydrogen, hydrogen] -) +acetylene = array([carbon, carbon, hydrogen, hydrogen]) acetylene.bonds = BondList( - acetylene.array_length(), - np.array([[0,1,3], [0,2,1], [1,3,1]]) + acetylene.array_length(), np.array([[0, 1, 3], [0, 2, 1], [1, 3, 1]]) ) mol_length = acetylene.array_length() acetylene.charge = np.array([0] * mol_length) -fluoromethane = array( - [carbon, fluorine, hydrogen, hydrogen, hydrogen] -) +fluoromethane = array([carbon, fluorine, hydrogen, hydrogen, hydrogen]) fluoromethane.bonds = BondList( - fluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + fluoromethane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]) ) mol_length = fluoromethane.array_length() fluoromethane.charge = np.array([0] * mol_length) -difluoromethane = array( - [carbon, fluorine, fluorine, hydrogen, hydrogen] -) +difluoromethane = array([carbon, fluorine, fluorine, hydrogen, hydrogen]) difluoromethane.bonds = BondList( difluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = difluoromethane.array_length() difluoromethane.charge = np.array([0] * mol_length) -trifluoromethane = array( - 
[carbon, fluorine, fluorine, fluorine, hydrogen] -) +trifluoromethane = array([carbon, fluorine, fluorine, fluorine, hydrogen]) trifluoromethane.bonds = BondList( trifluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = trifluoromethane.array_length() trifluoromethane.charge = np.array([0] * mol_length) -tetrafluoromethane = array( - [carbon, fluorine, fluorine, fluorine, fluorine] -) +tetrafluoromethane = array([carbon, fluorine, fluorine, fluorine, fluorine]) tetrafluoromethane.bonds = BondList( tetrafluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = tetrafluoromethane.array_length() tetrafluoromethane.charge = np.array([0] * mol_length) fluoroethane = array( - [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) fluoroethane.bonds = BondList( fluoroethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = fluoroethane.array_length() fluoroethane.charge = np.array([0] * mol_length) trifluoroethane = array( - [carbon, carbon, fluorine, fluorine, fluorine, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, fluorine, fluorine, fluorine, hydrogen, hydrogen, hydrogen] ) trifluoroethane.bonds = BondList( trifluoroethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = trifluoroethane.array_length() trifluoroethane.charge = np.array([0] * mol_length) -methanole = array( - [carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen] -) +methanole = 
array([carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]) methanole.bonds = BondList( methanole.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1]]), ) mol_length = methanole.array_length() methanole.charge = np.array([0] * mol_length) dimethyl_ether = array( - [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) dimethyl_ether.bonds = BondList( dimethyl_ether.array_length(), - np.array([ - [0,2,1], [1,2,1], [0,3,1], [0,4,1], [0,5,1], [1,6,1], [1,7,1], - [1,8,1] - ]) + np.array( + [ + [0, 2, 1], + [1, 2, 1], + [0, 3, 1], + [0, 4, 1], + [0, 5, 1], + [1, 6, 1], + [1, 7, 1], + [1, 8, 1], + ] + ), ) mol_length = dimethyl_ether.array_length() dimethyl_ether.charge = np.array([0] * mol_length) -formaldehyde = array( - [carbon, oxygen, hydrogen, hydrogen] -) +formaldehyde = array([carbon, oxygen, hydrogen, hydrogen]) formaldehyde.bonds = BondList( - formaldehyde.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1]]) + formaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]]) ) mol_length = formaldehyde.array_length() formaldehyde.charge = np.array([0] * mol_length) -acetaldehyde = array( - [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen] -) +acetaldehyde = array([carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]) acetaldehyde.bonds = BondList( acetaldehyde.array_length(), - np.array([[0,1,1], [1,2,2], [0,3,1], [0,4,1], [0,5,1], [1,6,1]]) + np.array([[0, 1, 1], [1, 2, 2], [0, 3, 1], [0, 4, 1], [0, 5, 1], [1, 6, 1]]), ) mol_length = acetaldehyde.array_length() acetaldehyde.charge = np.array([0] * mol_length) acetone = array( - [carbon, carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen, hydrogen] + [ + carbon, + carbon, + carbon, + oxygen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + 
hydrogen, + hydrogen, + ] ) acetone.bonds = BondList( acetone.array_length(), - np.array([ - [0,1,1], [1,2,1], [1,3,2], [0,4,1], [0,5,1], [0,6,1], [2,7,1], - [2,8,1], [2,9,1] - ]) + np.array( + [ + [0, 1, 1], + [1, 2, 1], + [1, 3, 2], + [0, 4, 1], + [0, 5, 1], + [0, 6, 1], + [2, 7, 1], + [2, 8, 1], + [2, 9, 1], + ] + ), ) mol_length = acetone.array_length() acetone.charge = np.array([0] * mol_length) -hydrogen_cyanide = array( - [carbon, nitrogen, hydrogen] -) +hydrogen_cyanide = array([carbon, nitrogen, hydrogen]) hydrogen_cyanide.bonds = BondList( - hydrogen_cyanide.array_length(), - np.array([[0,1,3], [0,2,1]]) + hydrogen_cyanide.array_length(), np.array([[0, 1, 3], [0, 2, 1]]) ) mol_length = hydrogen_cyanide.array_length() hydrogen_cyanide.charge = np.array([0] * mol_length) -acetonitrile = array( - [carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen] -) +acetonitrile = array([carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen]) acetonitrile.bonds = BondList( acetonitrile.array_length(), - np.array([[0,1,1], [1,2,3], [0,3,1], [0,4,1], [0,5,1]]) + np.array([[0, 1, 1], [1, 2, 3], [0, 3, 1], [0, 4, 1], [0, 5, 1]]), ) mol_length = acetonitrile.array_length() acetonitrile.charge = np.array([0] * mol_length) + # For this purpose, parametrization via pytest is performed -@pytest.mark.parametrize("molecule, expected_results", [ - (methane, (-0.078,)), - (ethane, (-0.068, -0.068)), - (ethylene, (-0.106, -0.106)), - (acetylene, (-0.122, -0.122)), - (fluoromethane, (0.079,)), - (difluoromethane, (0.23,)), - (trifluoromethane, (0.38,)), - (tetrafluoromethane, (0.561,)), - (fluoroethane, (0.087, -0.037)), - (trifluoroethane, (0.387, 0.039)), - (methanole, (0.033,)), - (dimethyl_ether, (0.036, 0.036)), - (formaldehyde, (0.115,)), - (acetaldehyde, (-0.009, 0.123)), - (acetone, (-0.006, 0.131, -0.006)), - (hydrogen_cyanide, (0.051,)), - (acetonitrile, (0.023, 0.06)) -]) +@pytest.mark.parametrize( + "molecule, expected_results", + [ + (methane, (-0.078,)), + (ethane, 
(-0.068, -0.068)), + (ethylene, (-0.106, -0.106)), + (acetylene, (-0.122, -0.122)), + (fluoromethane, (0.079,)), + (difluoromethane, (0.23,)), + (trifluoromethane, (0.38,)), + (tetrafluoromethane, (0.561,)), + (fluoroethane, (0.087, -0.037)), + (trifluoroethane, (0.387, 0.039)), + (methanole, (0.033,)), + (dimethyl_ether, (0.036, 0.036)), + (formaldehyde, (0.115,)), + (acetaldehyde, (-0.009, 0.123)), + (acetone, (-0.006, 0.131, -0.006)), + (hydrogen_cyanide, (0.051,)), + (acetonitrile, (0.023, 0.06)), + ], +) def test_partial_charges(molecule, expected_results): """ Test whether the partial charges of the carbon atoms comprised in @@ -257,29 +253,33 @@ def test_partial_charges(molecule, expected_results): within a certain tolerance range. """ charges = partial_charges(molecule) - assert charges[molecule.element == "C"].tolist() == \ - pytest.approx(expected_results, abs=1e-2) - - -@pytest.mark.parametrize("molecule", [ - methane, - ethane, - ethylene, - acetylene, - fluoromethane, - difluoromethane, - trifluoromethane, - tetrafluoromethane, - fluoroethane, - trifluoroethane, - methanole, - dimethyl_ether, - formaldehyde, - acetaldehyde, - acetone, - hydrogen_cyanide, - acetonitrile -]) + assert charges[molecule.element == "C"].tolist() == pytest.approx( + expected_results, abs=1e-2 + ) + + +@pytest.mark.parametrize( + "molecule", + [ + methane, + ethane, + ethylene, + acetylene, + fluoromethane, + difluoromethane, + trifluoromethane, + tetrafluoromethane, + fluoroethane, + trifluoroethane, + methanole, + dimethyl_ether, + formaldehyde, + acetaldehyde, + acetone, + hydrogen_cyanide, + acetonitrile, + ], +) def test_total_charge_zero(molecule): """ In the case of the 17 molecules given in table 3, it is verified @@ -302,14 +302,8 @@ def test_pos_formal_charge(): pos_methane = methane.copy() pos_methane.charge = np.array([1, 0, 0, 0, 0]) - ref_carb_part_charge = partial_charges( - methane, - iteration_step_num=6 - )[0] - pos_carb_part_charge = partial_charges( - 
pos_methane, - iteration_step_num=6 - )[0] + ref_carb_part_charge = partial_charges(methane, iteration_step_num=6)[0] + pos_carb_part_charge = partial_charges(pos_methane, iteration_step_num=6)[0] assert pos_carb_part_charge < 1 assert pos_carb_part_charge > ref_carb_part_charge @@ -331,16 +325,12 @@ def test_valence_state_not_parametrized(): with pytest.warns( UserWarning, match=( - "Parameters for specific valence states of some atoms " - "are not available" - ) + "Parameters for specific valence states of some atoms " "are not available" + ), ): - thioformaldehyde = array( - [carbon, sulfur, hydrogen, hydrogen] - ) + thioformaldehyde = array([carbon, sulfur, hydrogen, hydrogen]) thioformaldehyde.bonds = BondList( - thioformaldehyde.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1]]) + thioformaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]]) ) mol_length = thioformaldehyde.array_length() thioformaldehyde.charge = np.array([0] * mol_length) @@ -368,9 +358,7 @@ def test_correct_output_ions(): sodium_array.bonds = BondList(sodium_array.array_length()) with warnings.catch_warnings(): warnings.simplefilter("error") - sodium_charge = partial_charges( - sodium_array, iteration_step_num=1 - )[0] + sodium_charge = partial_charges(sodium_array, iteration_step_num=1)[0] assert sodium_charge == 1 @@ -414,51 +402,72 @@ def test_correct_output_charged_aa(): unspecified bond types throughout the whole AtomArray is raised. 
""" - glycine_charge = np.array( - [+1, 0, 0, 0, -1, 0, 0, 0, 0, 0] - ) + glycine_charge = np.array([+1, 0, 0, 0, -1, 0, 0, 0, 0, 0]) glycine_with_btype = array( - [nitrogen, carbon, carbon, oxygen, oxygen, hydrogen, hydrogen, - hydrogen, hydrogen, hydrogen] + [ + nitrogen, + carbon, + carbon, + oxygen, + oxygen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + ] ) glycine_with_btype.charge = glycine_charge glycine_with_btype.bonds = BondList( glycine_with_btype.array_length(), - np.array([ - [0,1,1], [0,5,1], [0,6,1], [0,7,1], [1,2,1], [1,8,1], - [1,9,1], [2,3,2], [2,4,1] - ]) + np.array( + [ + [0, 1, 1], + [0, 5, 1], + [0, 6, 1], + [0, 7, 1], + [1, 2, 1], + [1, 8, 1], + [1, 9, 1], + [2, 3, 2], + [2, 4, 1], + ] + ), ) glycine_without_btype = glycine_with_btype.copy() glycine_without_btype.charge = glycine_charge glycine_without_btype.bonds = BondList( glycine_without_btype.array_length(), - np.array([ - [0,1,0], [0,5,0], [0,6,0], [0,7,0], [1,2,0], [1,8,0], - [1,9,0], [2,3,0], [2,4,0] - ]) + np.array( + [ + [0, 1, 0], + [0, 5, 0], + [0, 6, 0], + [0, 7, 0], + [1, 2, 0], + [1, 8, 0], + [1, 9, 0], + [2, 3, 0], + [2, 4, 0], + ] + ), ) part_charges_with_btype = partial_charges(glycine_with_btype) with pytest.warns(UserWarning, match="Each atom's bond type is 0"): - part_charges_without_btype = partial_charges( - glycine_without_btype - ) + part_charges_without_btype = partial_charges(glycine_without_btype) # Nitrogen of the amino group has the index 0 nitr_charge_with_btype = part_charges_with_btype[0] nitr_charge_without_btype = part_charges_without_btype[0] - assert nitr_charge_with_btype == pytest.approx( - nitr_charge_without_btype, abs=5e-4 - ) + assert nitr_charge_with_btype == pytest.approx(nitr_charge_without_btype, abs=5e-4) # Oxygen of the hydroxyl group in the carboxyl group has the index 2 oxyg_charge_with_btype = part_charges_with_btype[2] oxyg_charge_without_btype = part_charges_without_btype[2] assert oxyg_charge_with_btype < 
oxyg_charge_without_btype # Assert that difference between the two values is significant - difference_oxyg_charges = abs(oxyg_charge_with_btype - - oxyg_charge_without_btype) - assert difference_oxyg_charges > 3e-2 \ No newline at end of file + difference_oxyg_charges = abs(oxyg_charge_with_btype - oxyg_charge_without_btype) + assert difference_oxyg_charges > 3e-2 diff --git a/tests/structure/test_compare.py b/tests/structure/test_compare.py index a4cf024a1..6d2652066 100644 --- a/tests/structure/test_compare.py +++ b/tests/structure/test_compare.py @@ -2,17 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io as strucio from os.path import join import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io as strucio from ..util import data_dir + @pytest.fixture def stack(): stack = struc.AtomArrayStack(depth=3, length=5) - stack.coord = np.arange(45).reshape((3,5,3)) + stack.coord = np.arange(45).reshape((3, 5, 3)) return stack @@ -20,92 +21,178 @@ def stack(): def test_rmsd(stack, as_coord): if as_coord: stack = stack.coord - assert struc.rmsd(stack[0], stack).tolist() \ - == pytest.approx([0.0, 25.98076211, 51.96152423]) - assert struc.rmsd(stack[0], stack[1]) \ - == pytest.approx(25.9807621135) + assert struc.rmsd(stack[0], stack).tolist() == pytest.approx( + [0.0, 25.98076211, 51.96152423] + ) + assert struc.rmsd(stack[0], stack[1]) == pytest.approx(25.9807621135) @pytest.mark.parametrize("as_coord", [False, True]) def test_rmsf(stack, as_coord): if as_coord: stack = stack.coord - assert struc.rmsf(struc.average(stack), stack).tolist() \ - == pytest.approx([21.21320344] * 5) + assert struc.rmsf(struc.average(stack), stack).tolist() == pytest.approx( + [21.21320344] * 5 + ) + @pytest.fixture def load_stack_superimpose(): - stack = strucio.load_structure(join( - data_dir("structure"), "1l2y.bcif" - )) + stack = 
strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) # Superimpose with first frame bb_mask = struc.filter_peptide_backbone(stack[0]) supimp, _ = struc.superimpose(stack[0], stack, atom_mask=bb_mask) return stack, supimp + def test_rmsd_gmx(load_stack_superimpose): """ Comparison of RMSD values computed with Biotite with results obtained from GROMACS 2021.5. """ stack, supimp = load_stack_superimpose - rmsd = struc.rmsd(stack[0], supimp)/10 + rmsd = struc.rmsd(stack[0], supimp) / 10 # Gromacs RMSDs -> Without mass-weighting: # echo "Backbone Protein" | \ # gmx rms -s 1l2y.gro -f 1l2y.xtc -o rmsd.xvg -mw no - rmsd_gmx = np.array([ - 0.0005037, 0.1957698, 0.2119313, 0.2226127, 0.184382, - 0.2210998, 0.2712815, 0.1372861, 0.2348654, 0.1848784, - 0.1893576, 0.2500543, 0.1946374, 0.2101624, 0.2180645, - 0.1836762, 0.1681345, 0.2363865, 0.2287371, 0.2546207, - 0.1604872, 0.2167119, 0.2176063, 0.2069806, 0.2535706, - 0.2682233, 0.2252388, 0.2419151, 0.2343987, 0.1902994, - 0.2334525, 0.2010523, 0.215444, 0.1786632, 0.2652018, - 0.174061, 0.2591569, 0.2602662 - ]) + rmsd_gmx = np.array( + [ + 0.0005037, + 0.1957698, + 0.2119313, + 0.2226127, + 0.184382, + 0.2210998, + 0.2712815, + 0.1372861, + 0.2348654, + 0.1848784, + 0.1893576, + 0.2500543, + 0.1946374, + 0.2101624, + 0.2180645, + 0.1836762, + 0.1681345, + 0.2363865, + 0.2287371, + 0.2546207, + 0.1604872, + 0.2167119, + 0.2176063, + 0.2069806, + 0.2535706, + 0.2682233, + 0.2252388, + 0.2419151, + 0.2343987, + 0.1902994, + 0.2334525, + 0.2010523, + 0.215444, + 0.1786632, + 0.2652018, + 0.174061, + 0.2591569, + 0.2602662, + ] + ) assert np.allclose(rmsd, rmsd_gmx, atol=1e-03) + def test_rmspd_gmx(load_stack_superimpose): """ Comparison of the RMSPD computed with Biotite with results obtained from GROMACS 2021.5. 
""" stack, _ = load_stack_superimpose - rmspd = struc.rmspd(stack[0], stack)/10 + rmspd = struc.rmspd(stack[0], stack) / 10 # Gromacs RMSDist: # echo "Protein" | \ # gmx rmsdist -f 1l2y.xtc -s 1l2y.gro -o rmsdist.xvg -sumh no -pbc no - rmspd_gmx = np.array([ - 0.000401147, 0.125482, 0.138913, 0.138847, 0.113917, - 0.132915, 0.173084, 0.103089, 0.156309, 0.114694, - 0.12964, 0.15875, 0.12876, 0.128983, 0.137031, - 0.126059, 0.106726, 0.154244, 0.144405, 0.174041, - 0.10417, 0.130936, 0.141216, 0.125559, 0.171342, - 0.165306, 0.137616, 0.154447, 0.146337, 0.116433, - 0.154976, 0.128477, 0.150537, 0.111494, 0.173234, - 0.116638, 0.169524, 0.15953 - ]) + rmspd_gmx = np.array( + [ + 0.000401147, + 0.125482, + 0.138913, + 0.138847, + 0.113917, + 0.132915, + 0.173084, + 0.103089, + 0.156309, + 0.114694, + 0.12964, + 0.15875, + 0.12876, + 0.128983, + 0.137031, + 0.126059, + 0.106726, + 0.154244, + 0.144405, + 0.174041, + 0.10417, + 0.130936, + 0.141216, + 0.125559, + 0.171342, + 0.165306, + 0.137616, + 0.154447, + 0.146337, + 0.116433, + 0.154976, + 0.128477, + 0.150537, + 0.111494, + 0.173234, + 0.116638, + 0.169524, + 0.15953, + ] + ) assert np.allclose(rmspd, rmspd_gmx, atol=1e-03) + def test_rmsf_gmx(load_stack_superimpose): """ Comparison of RMSF values computed with Biotite with results obtained from GROMACS 2021.5. 
""" stack, supimp = load_stack_superimpose - ca_mask = ((stack[0].atom_name == "CA") & (stack[0].element == "C")) - rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask])/10 + ca_mask = (stack[0].atom_name == "CA") & (stack[0].element == "C") + rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask]) / 10 # Gromacs RMSF: # echo "C-alpha" | gmx rmsf -s 1l2y.gro -f 1l2y.xtc -o rmsf.xvg -res - rmsf_gmx = np.array([ - 0.1379, 0.036, 0.0261, 0.0255, 0.029, 0.0204, 0.0199, - 0.0317, 0.0365, 0.0249, 0.0269, 0.032, 0.0356, 0.0446, - 0.059, 0.037, 0.0331, 0.0392, 0.0403, 0.0954 - ]) - - assert np.allclose(rmsf, rmsf_gmx, atol=1e-02) \ No newline at end of file + rmsf_gmx = np.array( + [ + 0.1379, + 0.036, + 0.0261, + 0.0255, + 0.029, + 0.0204, + 0.0199, + 0.0317, + 0.0365, + 0.0249, + 0.0269, + 0.032, + 0.0356, + 0.0446, + 0.059, + 0.037, + 0.0331, + 0.0392, + 0.0403, + 0.0954, + ] + ) + + assert np.allclose(rmsf, rmsf_gmx, atol=1e-02) diff --git a/tests/structure/test_density.py b/tests/structure/test_density.py index bfbb3e1e4..012b5eb02 100644 --- a/tests/structure/test_density.py +++ b/tests/structure/test_density.py @@ -2,10 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import biotite.structure as struc -from biotite.structure import Atom import numpy as np import pytest +import biotite.structure as struc +from biotite.structure import Atom + @pytest.fixture def array(): @@ -18,52 +19,56 @@ def array(): atom_list.append(Atom([2.5, 0.5, 1.1])) return struc.array(atom_list) + @pytest.fixture def stack(array): return struc.stack([array, array.copy()]) - def test_density(array, stack): density, (x, y, z) = struc.density(array) assert np.array_equal(x, [0.5, 1.5, 2.5]) assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5]) assert np.array_equal(z, [1.0, 2.0]) assert density.sum() == 6 - assert density[0,2] == 2 - assert density[1,0] == 3 - assert density[1,1] == 1 + assert density[0, 2] == 2 + assert density[1, 0] == 3 + assert density[1, 1] == 1 density, (x, y, z) = struc.density(stack) assert np.array_equal(x, [0.5, 1.5, 2.5]) assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5]) assert np.array_equal(z, [1.0, 2.0]) assert density.sum() == 12 - assert density[0,2] == 4 - assert density[1,0] == 6 - assert density[1,1] == 2 + assert density[0, 2] == 4 + assert density[1, 0] == 6 + assert density[1, 1] == 2 + def test_density_with_bins(array): - bins = np.array([[0, 1, 2, 3],[0, 1, 2, 3],[0, 1, 2, 3]]) + bins = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]) density, (x, y, z) = struc.density(array, bins=bins) - assert np.array_equal(x, [0,1,2,3]) - assert np.array_equal(y, [0,1,2,3]) - assert np.array_equal(z, [0,1,2,3]) + assert np.array_equal(x, [0, 1, 2, 3]) + assert np.array_equal(y, [0, 1, 2, 3]) + assert np.array_equal(z, [0, 1, 2, 3]) assert density.sum() == 6 - assert density[0,2,1] == 2 - assert density[1,1,1] == 1 - assert density[2,0,1] == 3 + assert density[0, 2, 1] == 2 + assert density[1, 1, 1] == 1 + assert density[2, 0, 1] == 3 + def test_density_with_delta(array): density, (x, y, z) = struc.density(array, delta=5.0) assert density.shape == (1, 1, 1) assert density.sum() == 6 - assert density[0,0,0] == 6 + assert density[0, 0, 
0] == 6 + def test_density_normalized(array): density, (x, y, z) = struc.density(array, density=True) assert np.abs(density.sum() - 1.0) < 0.0001 - assert np.abs(density[0,2] - 2.0/6.0) < 0.0001 + assert np.abs(density[0, 2] - 2.0 / 6.0) < 0.0001 + def test_density_weights(array, stack): # assign weights to coordinates @@ -74,15 +79,15 @@ def test_density_weights(array, stack): assert density.sum() == atomic_weights.sum() assert density[0, 2] == atomic_weights[0] + atomic_weights[1] assert density[1, 0] == atomic_weights[3:].sum() - assert density[1,1] == atomic_weights[2] + assert density[1, 1] == atomic_weights[2] # weights should be repeated along stack dimensions and lead to the same # result independent of shape density, (x, y, z) = struc.density(stack, weights=atomic_weights) - density2, (x, y, z) = struc.density(stack, - weights=np.array([atomic_weights, atomic_weights])) + density2, (x, y, z) = struc.density( + stack, weights=np.array([atomic_weights, atomic_weights]) + ) assert density.sum() == density2.sum() - assert density[0,2] == density2[0,2] - assert density[1,0] == density2[1,0] - assert density[1,1] == density2[1,1] - + assert density[0, 2] == density2[0, 2] + assert density[1, 0] == density2[1, 0] + assert density[1, 1] == density2[1, 1] diff --git a/tests/structure/test_dotbracket.py b/tests/structure/test_dotbracket.py index 4e6827cd7..46001ce83 100644 --- a/tests/structure/test_dotbracket.py +++ b/tests/structure/test_dotbracket.py @@ -2,11 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest +from os.path import join import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from os.path import join from ..util import data_dir @@ -15,11 +15,10 @@ def nuc_sample_array(): """ Sample structure. 
""" - nuc_sample_array = strucio.load_structure( - join(data_dir("structure"), "4p5j.cif") - ) + nuc_sample_array = strucio.load_structure(join(data_dir("structure"), "4p5j.cif")) return nuc_sample_array[struc.filter_nucleotides(nuc_sample_array)] + @pytest.fixture def expected_output(): """ @@ -29,47 +28,51 @@ def expected_output(): ".[(((((.[{...)))))(((((((.......)))))))...(((((]}.)..))))[[[...((((((" "]]].].))))))(.)", ".[(((((.{[...)))))(((((((.......)))))))...(((((}].)..))))[[[...((((((" - "]]].].))))))(.)" + "]]].].))))))(.)", ] + @pytest.fixture def basepair_residue_positions(): """ The base pairs in the sample array by their residue postions. """ return np.array( - [[1, 73], - [2, 17], - [3, 16], - [4, 15], - [5, 14], - [6, 13], - [8, 47], - [9, 48], - [18, 38], - [19, 37], - [20, 36], - [21, 35], - [22, 34], - [23, 33], - [24, 32], - [42, 56], - [43, 55], - [44, 54], - [45, 53], - [46, 50], - [57, 71], - [58, 70], - [59, 69], - [63, 80], - [64, 79], - [65, 78], - [66, 77], - [67, 76], - [68, 75], - [81, 83]] + [ + [1, 73], + [2, 17], + [3, 16], + [4, 15], + [5, 14], + [6, 13], + [8, 47], + [9, 48], + [18, 38], + [19, 37], + [20, 36], + [21, 35], + [22, 34], + [23, 33], + [24, 32], + [42, 56], + [43, 55], + [44, 54], + [45, 53], + [46, 50], + [57, 71], + [58, 70], + [59, 69], + [63, 80], + [64, 79], + [65, 78], + [66, 77], + [67, 76], + [68, 75], + [81, 83], + ] ) + def verify_dot_bracket_notation(output, expected_output): """ Ensure that the dot_bracket notation matches a reference. @@ -82,6 +85,7 @@ def verify_dot_bracket_notation(output, expected_output): unique_solutions = set(output) assert len(output) == len(unique_solutions) + def test_dot_bracket_from_structure(nuc_sample_array, expected_output): """ Check the output of ``dot_bracket_from_structure()``. 
@@ -89,22 +93,20 @@ def test_dot_bracket_from_structure(nuc_sample_array, expected_output): output = struc.dot_bracket_from_structure(nuc_sample_array) verify_dot_bracket_notation(output, expected_output) + def test_dot_bracket(basepair_residue_positions, expected_output): """ Check the output of ``dot_bracket()``. """ - output = struc.dot_bracket( - basepair_residue_positions, len(expected_output[0]) - ) + output = struc.dot_bracket(basepair_residue_positions, len(expected_output[0])) verify_dot_bracket_notation(output, expected_output) -def test_base_pairs_from_dot_bracket( - basepair_residue_positions, expected_output -): + +def test_base_pairs_from_dot_bracket(basepair_residue_positions, expected_output): """ Ensure that the base pairs are correctly extracted from the DBL-notation """ for notation in expected_output: test_residue_positions = struc.base_pairs_from_dot_bracket(notation) - assert np.all(test_residue_positions == basepair_residue_positions) \ No newline at end of file + assert np.all(test_residue_positions == basepair_residue_positions) diff --git a/tests/structure/test_filter.py b/tests/structure/test_filter.py index 996593831..e1dc20c49 100644 --- a/tests/structure/test_filter.py +++ b/tests/structure/test_filter.py @@ -2,110 +2,125 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+from os.path import join +import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -import numpy as np -from os.path import join from ..util import data_dir -import pytest + @pytest.fixture def canonical_sample_protein(): - return strucio.load_structure( - join(data_dir("structure"), "3o5r.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) + @pytest.fixture def sample_protein(): - return strucio.load_structure( - join(data_dir("structure"), "5eil.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "5eil.bcif")) + @pytest.fixture def canonical_sample_nucleotide(): - return strucio.load_structure( - join(data_dir("structure"), "5ugo.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "5ugo.bcif")) + @pytest.fixture def sample_nucleotide(): - return strucio.load_structure( - join(data_dir("structure"), "4p5j.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "4p5j.bcif")) + @pytest.fixture def sample_carbohydrate(): - return strucio.load_structure( - join(data_dir("structure"), "2d0f.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "2d0f.bcif")) + @pytest.fixture def all_atloc_structure(): return strucio.load_structure( join(data_dir("structure"), "1o1z.bcif"), - extra_fields = ["occupancy"], - altloc="all" + extra_fields=["occupancy"], + altloc="all", ) + def test_solvent_filter(canonical_sample_protein): - assert len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)]) == 287 + assert ( + len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)]) + == 287 + ) + def test_canonical_amino_acid_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[ - struc.filter_canonical_amino_acids(canonical_sample_protein) - ]) == 982 + len( + canonical_sample_protein[ + struc.filter_canonical_amino_acids(canonical_sample_protein) + ] + ) + == 982 ) + def 
test_amino_acid_filter(sample_protein): assert ( - struc.get_residue_count((sample_protein[ - struc.filter_amino_acids(sample_protein) - ])) == - struc.get_residue_count((sample_protein[ - struc.filter_canonical_amino_acids(sample_protein) - ])) + 3 + struc.get_residue_count( + (sample_protein[struc.filter_amino_acids(sample_protein)]) + ) + == struc.get_residue_count( + (sample_protein[struc.filter_canonical_amino_acids(sample_protein)]) + ) + + 3 ) + def test_canonical_nucleotide_filter(canonical_sample_nucleotide): assert ( - len(canonical_sample_nucleotide[ - struc.filter_canonical_nucleotides(canonical_sample_nucleotide) - ]) == 651 + len( + canonical_sample_nucleotide[ + struc.filter_canonical_nucleotides(canonical_sample_nucleotide) + ] + ) + == 651 ) + def test_nucleotide_filter(sample_nucleotide): assert ( - struc.get_residue_count((sample_nucleotide[ - struc.filter_nucleotides(sample_nucleotide) - ])) == - struc.get_residue_count((sample_nucleotide[ - struc.filter_canonical_nucleotides(sample_nucleotide) - ])) + 1 + struc.get_residue_count( + (sample_nucleotide[struc.filter_nucleotides(sample_nucleotide)]) + ) + == struc.get_residue_count( + (sample_nucleotide[struc.filter_canonical_nucleotides(sample_nucleotide)]) + ) + + 1 ) + def test_carbohydrate_filter(sample_carbohydrate): assert ( - struc.get_residue_count((sample_carbohydrate[ - struc.filter_carbohydrates(sample_carbohydrate) - ])) == 8 + struc.get_residue_count( + (sample_carbohydrate[struc.filter_carbohydrates(sample_carbohydrate)]) + ) + == 8 ) def test_peptide_backbone_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[ - struc.filter_peptide_backbone(canonical_sample_protein) - ]) == 384 + len( + canonical_sample_protein[ + struc.filter_peptide_backbone(canonical_sample_protein) + ] + ) + == 384 ) def test_phosphate_backbone_filter(canonical_sample_nucleotide): # take a chain D with five canonical nucleotides # => there should be 5 x 6 = 30 backbone atoms - chain_d = 
canonical_sample_nucleotide[ - canonical_sample_nucleotide.chain_id == 'D' - ] + chain_d = canonical_sample_nucleotide[canonical_sample_nucleotide.chain_id == "D"] assert len(chain_d[struc.filter_phosphate_backbone(chain_d)]) == 30 @@ -139,39 +154,45 @@ def test_polymer_filter(canonical_sample_nucleotide, sample_carbohydrate): a = canonical_sample_nucleotide # Check for nucleotide filtering - a_nuc = a[struc.filter_polymer(a, pol_type='n')] + a_nuc = a[struc.filter_polymer(a, pol_type="n")] # Take three nucleic acids chains and remove solvent => the result should # encompass all nucleotide polymer atoms, which is exactly the output of the # `filter_polymer()`. In the structure file, the filtered atoms are 1-651. - a_nuc_manual = a[np.isin(a.chain_id, ['D', 'P', 'T']) & ~struc.filter_solvent(a)] + a_nuc_manual = a[np.isin(a.chain_id, ["D", "P", "T"]) & ~struc.filter_solvent(a)] assert len(a_nuc) == len(a_nuc_manual) == 651 - assert set(a_nuc.chain_id) == {'D', 'P', 'T'} + assert set(a_nuc.chain_id) == {"D", "P", "T"} # chain D should be absent - a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type='n')] - assert set(a_nuc.chain_id) == {'P', 'T'} + a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type="n")] + assert set(a_nuc.chain_id) == {"P", "T"} # Single protein chain A: residues 10-335 - a_pep = a[struc.filter_polymer(a, pol_type='p')] - assert len(a_pep) == len(a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == 'A')]) + a_pep = a[struc.filter_polymer(a, pol_type="p")] + assert len(a_pep) == len( + a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == "A")] + ) # Chain B has five carbohydrate residues # Chain C has four # => Only chain B is selected a = sample_carbohydrate - a_carb = a[struc.filter_polymer(a, min_size=4, pol_type='carb')] - assert set(a_carb.chain_id) == {'B'} + a_carb = a[struc.filter_polymer(a, min_size=4, pol_type="carb")] + assert set(a_carb.chain_id) == {"B"} assert struc.get_residue_count(a_carb) == 5 def 
test_intersection_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[:200][ - struc.filter_intersection( - canonical_sample_protein[:200],canonical_sample_protein[100:] - ) - ]) == 100 + len( + canonical_sample_protein[:200][ + struc.filter_intersection( + canonical_sample_protein[:200], canonical_sample_protein[100:] + ) + ] + ) + == 100 ) + @pytest.mark.parametrize("filter_func", ["first", "occupancy"]) def test_filter_altloc(all_atloc_structure, filter_func): """ @@ -183,21 +204,22 @@ def test_filter_altloc(all_atloc_structure, filter_func): all_atloc_structure.chain_id, all_atloc_structure.res_id, all_atloc_structure.ins_code, - all_atloc_structure.atom_name + all_atloc_structure.atom_name, ): ref_atom_set.add(atom_tuple) if filter_func == "first": - filtered_structure = all_atloc_structure[struc.filter_first_altloc( - all_atloc_structure, - all_atloc_structure.altloc_id - )] + filtered_structure = all_atloc_structure[ + struc.filter_first_altloc( + all_atloc_structure, all_atloc_structure.altloc_id + ) + ] elif filter_func == "occupancy": filtered_structure = all_atloc_structure[ struc.filter_highest_occupancy_altloc( all_atloc_structure, all_atloc_structure.altloc_id, - all_atloc_structure.occupancy + all_atloc_structure.occupancy, ) ] @@ -206,7 +228,7 @@ def test_filter_altloc(all_atloc_structure, filter_func): filtered_structure.chain_id, filtered_structure.res_id, filtered_structure.ins_code, - filtered_structure.atom_name + filtered_structure.atom_name, ): try: # No atom should be present twice @@ -230,10 +252,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure): all_atloc_structure.occupancy[all_atloc_structure.altloc_id == "B"] = 1.0 # filter_first_altloc - filtered_structure = all_atloc_structure[struc.filter_first_altloc( - all_atloc_structure, - all_atloc_structure.altloc_id - )] + filtered_structure = all_atloc_structure[ + struc.filter_first_altloc(all_atloc_structure, all_atloc_structure.altloc_id) + ] 
ref_occupancy_sum = np.average(filtered_structure.occupancy) # filter_highest_occupancy_altloc @@ -241,9 +262,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure): struc.filter_highest_occupancy_altloc( all_atloc_structure, all_atloc_structure.altloc_id, - all_atloc_structure.occupancy + all_atloc_structure.occupancy, ) ] test_occupancy_sum = np.average(filtered_structure.occupancy) - assert test_occupancy_sum > ref_occupancy_sum \ No newline at end of file + assert test_occupancy_sum > ref_occupancy_sum diff --git a/tests/structure/test_generalio.py b/tests/structure/test_generalio.py index 1dbeb73a6..a7b4a6063 100644 --- a/tests/structure/test_generalio.py +++ b/tests/structure/test_generalio.py @@ -2,23 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile -import biotite.structure as struc -import biotite.structure.io as strucio import glob import os from os.path import join, splitext -from ..util import data_dir, cannot_import +from tempfile import NamedTemporaryFile import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from ..util import cannot_import, data_dir -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("structure"), "1l2y.*")) -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") +@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "1l2y.*"))) def test_loading(path): """ Just check if :func:`load_structure()` does not raise an exception @@ -26,9 +21,7 @@ def test_loading(path): """ suffix = splitext(path)[1] if suffix in [".trr", ".xtc", ".tng", ".dcd", ".netcdf"]: - template = strucio.load_structure( - join(data_dir("structure"), "1l2y.bcif") - ) + template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) array = strucio.load_structure(path, template) else: 
array = strucio.load_structure(path) @@ -40,10 +33,7 @@ def test_loading(path): assert isinstance(array, struc.AtomArrayStack) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") def test_loading_template_with_trj(): """ Check if :func:`load_structure()` using a trajectory file does not @@ -57,10 +47,7 @@ def test_loading_template_with_trj(): assert len(stack) > 1 -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") def test_loading_with_extra_args(): """ Check if :func:`load_structure()` witt optional arguments does not @@ -74,9 +61,7 @@ def test_loading_with_extra_args(): assert "b_factor" in structure.get_annotation_categories() # test if arguments are passed to read for trajectories - stack = strucio.load_structure( - trajectory, template=structure[0], start=5, stop=6 - ) + stack = strucio.load_structure(trajectory, template=structure[0], start=5, stop=6) assert len(stack) == 1 # loading should fail with wrong arguments @@ -88,16 +73,10 @@ def test_loading_with_extra_args(): assert stack.shape[1] == 2 -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "suffix", - [ - "pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", - "dcd", "netcdf" - ] + ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", "dcd", "netcdf"], ) def test_saving(suffix): """ @@ -124,23 +103,19 @@ def test_saving(suffix): if category == "chain_id" and suffix == "gro": # The chain ID is not written to GRO files continue - assert test_array.get_annotation(category).tolist() \ - == ref_array.get_annotation(category).tolist() + assert ( + test_array.get_annotation(category).tolist() + == 
ref_array.get_annotation(category).tolist() + ) assert test_array.coord.flatten().tolist() == pytest.approx( - ref_array.coord.flatten().tolist(), abs=1e-2 + ref_array.coord.flatten().tolist(), abs=1e-2 ) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "suffix", - [ - "pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", - "dcd", "netcdf" - ] + ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "tng", "dcd", "netcdf"], ) def test_saving_with_extra_args(suffix): """ @@ -150,9 +125,7 @@ def test_saving_with_extra_args(suffix): array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") with pytest.raises(TypeError): - strucio.save_structure( - temp.name, array, answer=42 - ) + strucio.save_structure(temp.name, array, answer=42) temp.close() diff --git a/tests/structure/test_geometry.py b/tests/structure/test_geometry.py index 3239d43b4..7924bec8a 100644 --- a/tests/structure/test_geometry.py +++ b/tests/structure/test_geometry.py @@ -2,44 +2,43 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import NamedTemporaryFile -import itertools import glob +import itertools from os.path import join +from tempfile import NamedTemporaryFile import numpy as np import numpy.random as random import pytest import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx -from ..util import data_dir, cannot_import +from ..util import cannot_import, data_dir def test_distance(): - coord1 = struc.coord([0,1,1]) - coord2 = struc.coord([0,2,2]) + coord1 = struc.coord([0, 1, 1]) + coord2 = struc.coord([0, 2, 2]) assert struc.distance(coord1, coord2) == pytest.approx(np.sqrt(2)) def test_centroid(): - coord = struc.coord([[1,1,1],[0,-1,-1],[-1,0,0]]) - assert struc.centroid(coord).tolist() == [0,0,0] + coord = struc.coord([[1, 1, 1], [0, -1, -1], [-1, 0, 0]]) + assert struc.centroid(coord).tolist() == [0, 0, 0] def test_angle(): - coord1 = struc.coord([0,0,1]) - coord2 = struc.coord([0,0,0]) - coord3 = struc.coord([0,1,1]) - assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25*np.pi) + coord1 = struc.coord([0, 0, 1]) + coord2 = struc.coord([0, 0, 0]) + coord3 = struc.coord([0, 1, 1]) + assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25 * np.pi) def test_dihedral(): - coord1 = struc.coord([-0.5,-1,0]) - coord2 = struc.coord([0,0,0]) - coord3 = struc.coord([1,0,0]) - coord4 = struc.coord([0,0,-1]) - assert struc.dihedral(coord1, coord2, coord3, coord4) \ - == pytest.approx(0.5*np.pi) + coord1 = struc.coord([-0.5, -1, 0]) + coord2 = struc.coord([0, 0, 0]) + coord3 = struc.coord([1, 0, 0]) + coord4 = struc.coord([0, 0, -1]) + assert struc.dihedral(coord1, coord2, coord3, coord4) == pytest.approx(0.5 * np.pi) @pytest.mark.parametrize("multiple_chains", [False, True]) @@ -55,17 +54,18 @@ def test_dihedral_backbone_general(multiple_chains): array = stack[0] # Test array phi, psi, omega = struc.dihedral_backbone(array) - assert phi.shape == (n_res,) - assert psi.shape == (n_res,) + assert 
phi.shape == (n_res,) + assert psi.shape == (n_res,) assert omega.shape == (n_res,) _assert_plausible_omega(omega) # Test stack phi, psi, omega = struc.dihedral_backbone(stack) - assert phi.shape == (n_models, n_res) - assert psi.shape == (n_models, n_res) + assert phi.shape == (n_models, n_res) + assert psi.shape == (n_models, n_res) assert omega.shape == (n_models, n_res) _assert_plausible_omega(omega) + def _assert_plausible_omega(omega): # Remove nan values omega = omega.flatten() @@ -74,13 +74,8 @@ def _assert_plausible_omega(omega): assert omega.tolist() == pytest.approx([np.pi] * len(omega), rel=0.6) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "file_name", glob.glob(join(data_dir("structure"), "*.bcif")) -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") +@pytest.mark.parametrize("file_name", glob.glob(join(data_dir("structure"), "*.bcif"))) def test_dihedral_backbone_result(file_name): import mdtraj @@ -113,12 +108,11 @@ def test_dihedral_backbone_result(file_name): _, ref_ome = mdtraj.compute_omega(traj) ref_phi, ref_psi, ref_ome = ref_phi[0], ref_psi[0], ref_ome[0] - assert test_phi[1: ] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3) + assert test_phi[1:] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3) assert test_psi[:-1] == pytest.approx(ref_psi, abs=1e-5, rel=5e-3) assert test_ome[:-1] == pytest.approx(ref_ome, abs=1e-5, rel=5e-3) - def test_index_distance_non_periodic(): """ Without PBC the result should be equal to the normal distance @@ -126,26 +120,21 @@ def test_index_distance_non_periodic(): """ array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) ref_dist = struc.distance( - array.coord[np.newaxis, :, :], - array.coord[:, np.newaxis, :] + array.coord[np.newaxis, :, :], array.coord[:, np.newaxis, :] ).flatten() length = array.array_length() dist = struc.index_distance( array, - indices = np.stack([ - 
np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) + indices=np.stack( + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=1, + ), ) assert np.allclose(dist, ref_dist) @pytest.mark.parametrize( - "shift", [ - np.array([10, 20, 30]), - np.array([-8, 12, 28]), - np.array([ 0, 99, 54]) - ] + "shift", [np.array([10, 20, 30]), np.array([-8, 12, 28]), np.array([0, 99, 54])] ) def test_index_distance_periodic_orthogonal(shift): """ @@ -155,15 +144,13 @@ def test_index_distance_periodic_orthogonal(shift): array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) # Use a box based on the boundaries of the structure # '+1' to add a margin - array.box = np.diag( - np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1 - ) + array.box = np.diag(np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1) length = array.array_length() - dist_indices = np.stack([ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) + dist_indices = np.stack( + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=1, + ) ref_dist = struc.index_distance(array, dist_indices, periodic=True) array.coord += shift @@ -173,23 +160,13 @@ def test_index_distance_periodic_orthogonal(shift): @pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( - "shift, angles", itertools.product( - [ - np.array([10, 20, 30]), - np.array([-8, 12, 28]), - np.array([ 0, 99, 54]) - ], - [ - np.array([ 50, 90, 90]), - np.array([ 90, 90, 120]), - np.array([ 60, 60, 60]) - ] - ) + "shift, angles", + itertools.product( + [np.array([10, 20, 30]), np.array([-8, 12, 28]), np.array([0, 99, 54])], + [np.array([50, 90, 90]), np.array([90, 90, 120]), np.array([60, 60, 60])], + ), ) def 
test_index_distance_periodic_triclinic(shift, angles): """ @@ -202,15 +179,14 @@ def test_index_distance_periodic_triclinic(shift, angles): boundaries = np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1 angles = np.deg2rad(angles) array.box = struc.vectors_from_unitcell( - boundaries[0], boundaries[1], boundaries[2], - angles[0], angles[1], angles[2] + boundaries[0], boundaries[1], boundaries[2], angles[0], angles[1], angles[2] ) length = array.array_length() - dist_indices = np.stack([ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) + dist_indices = np.stack( + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=1, + ) # index_distance() creates a large ndarray try: ref_dist = struc.index_distance(array, dist_indices, periodic=True) @@ -219,6 +195,7 @@ def test_index_distance_periodic_triclinic(shift, angles): # Compare with MDTraj import mdtraj + traj = mdtraj.load(join(data_dir("structure"), "3o5r.pdb")) # Angstrom to Nanometers traj.unitcell_vectors = array.box[np.newaxis, :, :] / 10 @@ -249,38 +226,35 @@ def test_index_functions(): samples = (array, stack, struc.coord(array), struc.coord(stack)) # Generate random indices random.seed(42) - indices = random.randint(array.array_length(), size=(100,4), dtype=int) + indices = random.randint(array.array_length(), size=(100, 4), dtype=int) for sample in samples: if isinstance(sample, np.ndarray): - atoms1 = sample[..., indices[:,0], :] - atoms2 = sample[..., indices[:,1], :] - atoms3 = sample[..., indices[:,2], :] - atoms4 = sample[..., indices[:,3], :] + atoms1 = sample[..., indices[:, 0], :] + atoms2 = sample[..., indices[:, 1], :] + atoms3 = sample[..., indices[:, 2], :] + atoms4 = sample[..., indices[:, 3], :] else: - atoms1 = sample[..., indices[:,0]] - atoms2 = sample[..., indices[:,1]] - atoms3 = sample[..., indices[:,2]] - atoms4 = sample[..., indices[:,3]] + atoms1 = sample[..., indices[:, 0]] + atoms2 = sample[..., 
indices[:, 1]] + atoms3 = sample[..., indices[:, 2]] + atoms4 = sample[..., indices[:, 3]] assert np.allclose( struc.displacement(atoms1, atoms2), - struc.index_displacement(sample, indices[:,:2]), - atol=1e-5 + struc.index_displacement(sample, indices[:, :2]), + atol=1e-5, ) assert np.allclose( struc.distance(atoms1, atoms2), - struc.index_distance(sample, indices[:,:2]), - atol=1e-5 + struc.index_distance(sample, indices[:, :2]), + atol=1e-5, ) assert np.allclose( struc.angle(atoms1, atoms2, atoms3), - struc.index_angle(sample, indices[:,:3]), - atol=1e-5 + struc.index_angle(sample, indices[:, :3]), + atol=1e-5, ) assert np.allclose( struc.dihedral(atoms1, atoms2, atoms3, atoms4), - struc.index_dihedral(sample, indices[:,:4]), - atol=1e-5 + struc.index_dihedral(sample, indices[:, :4]), + atol=1e-5, ) - - - diff --git a/tests/structure/test_gro.py b/tests/structure/test_gro.py index 02faf6f68..4970cb709 100644 --- a/tests/structure/test_gro.py +++ b/tests/structure/test_gro.py @@ -2,13 +2,13 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import TemporaryFile import glob import itertools from os.path import join, splitext +from tempfile import TemporaryFile +import numpy as np import pytest from pytest import approx -import numpy as np import biotite import biotite.structure.io.gro as gro import biotite.structure.io.pdb as pdb @@ -25,10 +25,7 @@ def test_get_model_count(): @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.gro")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.gro")), [None, 1, -1]), ) def test_array_conversion(path, model): gro_file = gro.GROFile.read(path) @@ -40,8 +37,10 @@ def test_array_conversion(path, model): assert np.allclose(array1.box, array2.box) assert array1.bonds == array2.bonds for category in array1.get_annotation_categories(): - assert array1.get_annotation(category).tolist() == \ - array2.get_annotation(category).tolist() + assert ( + array1.get_annotation(category).tolist() + == array2.get_annotation(category).tolist() + ) assert array1.coord.tolist() == array2.coord.tolist() @@ -58,20 +57,17 @@ def test_pdb_consistency(path): assert a1.array_length() == a2.array_length() for category in ["res_id", "res_name", "atom_name"]: - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() + assert ( + a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist() + ) # Mind rounding errors when converting pdb to gro (A -> nm) - assert a1.coord.flatten().tolist() \ - == approx(a2.coord.flatten().tolist(), abs=1e-2) + assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2) @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_pdb_to_gro(path, model): """ @@ -105,20 +101,24 @@ def test_pdb_to_gro(path, model): assert 
a1.array_length() == a2.array_length() for category in ["res_id", "res_name", "atom_name"]: - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() + assert ( + a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist() + ) # Mind rounding errors when converting pdb to gro (A -> nm) - assert a1.coord.flatten().tolist() \ - == approx(a2.coord.flatten().tolist(), abs=1e-2) + assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2) def test_gro_id_overflow(): # Create an oversized AtomArray where atom_id > 100000 and res_id > 10000 num_atoms = 100005 - atoms = array([Atom([1,2,3], atom_name="CA", element="C", res_name="X", - res_id=i+1) for i in range(num_atoms)]) - atoms.box = np.array([[1,0,0], [0,1,0], [0,0,1]]) + atoms = array( + [ + Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=i + 1) + for i in range(num_atoms) + ] + ) + atoms.box = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) # Write .gro file temp = TemporaryFile("w+") @@ -143,7 +143,7 @@ def test_gro_no_box(): """ # Create an AtomArray - atom = Atom([1,2,3], atom_name="CA", element="C", res_name="X", res_id=1) + atom = Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=1) atoms = array([atom]) # Write .gro file @@ -151,7 +151,7 @@ def test_gro_no_box(): gro_file = gro.GROFile() gro_file.set_structure(atoms) gro_file.write(temp) - + # Read in file temp.seek(0) gro_file = gro.GROFile.read(temp) @@ -159,4 +159,4 @@ def test_gro_no_box(): s = gro_file.get_structure() # Assert no box with 0 dimension - assert s.box is None \ No newline at end of file + assert s.box is None diff --git a/tests/structure/test_hbond.py b/tests/structure/test_hbond.py index 64d4068f9..4c0c6ecb7 100644 --- a/tests/structure/test_hbond.py +++ b/tests/structure/test_hbond.py @@ -3,20 +3,18 @@ # information. 
import itertools -from tempfile import NamedTemporaryFile from os.path import join +from tempfile import NamedTemporaryFile import numpy as np import pytest import biotite.structure as struc from biotite.structure.io import load_structure, save_structure -from ..util import data_dir, cannot_import +from ..util import cannot_import, data_dir @pytest.fixture() def stack(request): - stack = load_structure( - join(data_dir("structure"), "1l2y.bcif") - ) + stack = load_structure(join(data_dir("structure"), "1l2y.bcif")) if request.param: # Use connect_via_distances, since 1l2y has invalidly bonded # N-terminal hydrogen atoms @@ -26,21 +24,15 @@ def stack(request): # Ignore warning about dummy unit cell vector @pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( - "pdb_id, use_bond_list", itertools.product( - ["1l2y", "1gya", "1igy"], - [False, True] - ) + "pdb_id, use_bond_list", itertools.product(["1l2y", "1gya", "1igy"], [False, True]) ) def test_hbond_structure(pdb_id, use_bond_list): """ Compare hydrogen bond detection with MDTraj """ - file_name = join(data_dir("structure"), pdb_id+".bcif") + file_name = join(data_dir("structure"), pdb_id + ".bcif") array = load_structure(file_name) if use_bond_list: @@ -58,9 +50,9 @@ def test_hbond_structure(pdb_id, use_bond_list): if isinstance(array, struc.AtomArrayStack): # For consistency with MDTraj 'S' cannot be acceptor element # https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/hbond.py#L365 - triplets, mask = struc.hbond(array, acceptor_elements=("O","N")) + triplets, mask = struc.hbond(array, acceptor_elements=("O", "N")) else: - triplets = struc.hbond(array, acceptor_elements=("O","N")) + triplets = struc.hbond(array, acceptor_elements=("O", "N")) # Save to new pdb file for consistent treatment of inscode/altloc # im MDTraj @@ -69,11 
+61,10 @@ def test_hbond_structure(pdb_id, use_bond_list): # Compare with MDTraj import mdtraj + traj = mdtraj.load(temp.name) temp.close() - triplets_ref = mdtraj.baker_hubbard( - traj, freq=0, periodic=False - ) + triplets_ref = mdtraj.baker_hubbard(traj, freq=0, periodic=False) # Both packages may use different order # -> use set for comparison @@ -122,28 +113,27 @@ def test_hbond_with_selections(stack): of this boundary should be found. Also, hbond should respect the selection type. """ - selection1 = (stack.res_id == 3) & (stack.atom_name == 'O') # 3TYR BB Ox + selection1 = (stack.res_id == 3) & (stack.atom_name == "O") # 3TYR BB Ox selection2 = stack.res_id == 7 # backbone hbond should be found if selection1/2 type is both - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="both") + triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="both") assert len(triplets) == 1 assert triplets[0][0] == 116 assert triplets[0][2] == 38 # backbone hbond should be found if selection1 is acceptor and # selection2 is donor - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="acceptor") + triplets, mask = struc.hbond( + stack, selection1, selection2, selection1_type="acceptor" + ) assert len(triplets) == 1 assert triplets[0][0] == 116 assert triplets[0][2] == 38 # no hbond should be found, # because the backbone oxygen cannot be a donor - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="donor") + triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="donor") assert len(triplets) == 0 @@ -164,18 +154,20 @@ def test_hbond_single_selection(stack): def test_hbond_frequency(): - mask = np.array([ - [True, True, True, True, True], # 1.0 - [False, False, False, False, False], # 0.0 - [False, False, False, True, True] # 0.4 - ]).T + mask = np.array( + [ + [True, True, True, True, True], # 1.0 + [False, False, False, False, False], # 0.0 + [False, False, 
False, True, True], # 0.4 + ] + ).T freq = struc.hbond_frequency(mask) assert not np.isin(False, np.isclose(freq, np.array([1.0, 0.0, 0.4]))) # Ignore warning about missing BondList @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize("translation_vector", [(10,20,30), (-5, 3, 18)]) +@pytest.mark.parametrize("translation_vector", [(10, 20, 30), (-5, 3, 18)]) def test_hbond_periodicity(translation_vector): """ Test whether hydrogen bond identification uses periodic boundary @@ -197,4 +189,4 @@ def test_hbond_periodicity(translation_vector): array.coord = struc.move_inside_box(array.coord, array.box) hbonds = struc.hbond(array, periodic=True) hbonds = set([tuple(triplet) for triplet in hbonds]) - assert ref_hbonds == hbonds \ No newline at end of file + assert ref_hbonds == hbonds diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py index f23c75030..805fcd65f 100644 --- a/tests/structure/test_info.py +++ b/tests/structure/test_info.py @@ -18,7 +18,7 @@ (strucinfo.amino_acid_names, ["ALA", "ARG", "ASN", "ASP"], ["HOH"]), (strucinfo.nucleotide_names, ["A", "C", "G", "U"], ["HOH", "ALA"]), (strucinfo.carbohydrate_names, ["GLC", "RIB"], ["HOH", "ALA"]), - ] + ], ) def test_group_names(function, included, excluded): """ @@ -49,9 +49,9 @@ def test_mass(): ref_masses = [strucinfo.mass(res) for res in struc.residue_iter(array)] # Up to three additional/missing hydrogens are allowed # (protonation state) - mass_diff = np.abs(np.array( - [mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)] - )) + mass_diff = np.abs( + np.array([mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)]) + ) assert (mass_diff // strucinfo.mass("H") <= 3).all() # Check if the mass difference is a multiple of the hydrogen mass multiple_of_h_masses = mass_diff / strucinfo.mass("H") @@ -105,12 +105,16 @@ def test_link_type(): [ (strucinfo.amino_acid_names(), True, 0.4), (strucinfo.nucleotide_names(), True, 0.4), - (sorted( - 
set(strucinfo.all_residues()) - - set(strucinfo.amino_acid_names()) - - set(strucinfo.nucleotide_names()) - ), False, 0.01), - ] + ( + sorted( + set(strucinfo.all_residues()) + - set(strucinfo.amino_acid_names()) + - set(strucinfo.nucleotide_names()) + ), + False, + 0.01, + ), + ], ) def test_one_letter_code(residues, should_have_one_letter, exception_ratio): """ @@ -145,14 +149,13 @@ def test_standardize_order(multi_model, seed): reordered = struc.AtomArray(0) for residue in struc.residue_iter(original): bound = residue.array_length() - indices = np.random.choice( - np.arange(bound), bound,replace=False - ) + indices = np.random.choice(np.arange(bound), bound, replace=False) reordered += residue[..., indices] # Restore the original PDB standard order restored = reordered[..., strucinfo.standardize_order(reordered)] assert restored.shape == original.shape - assert restored[..., restored.element != "H"] \ - == original[..., original.element != "H"] + assert ( + restored[..., restored.element != "H"] == original[..., original.element != "H"] + ) diff --git a/tests/structure/test_integrity.py b/tests/structure/test_integrity.py index b8fbbb89d..822187cee 100644 --- a/tests/structure/test_integrity.py +++ b/tests/structure/test_integrity.py @@ -2,51 +2,56 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+from os.path import join +import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import numpy as np -from os.path import join from ..util import data_dir -import pytest @pytest.fixture def sample_array(): - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "1l2y.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) return pdbx.get_structure(pdbx_file, model=1) + @pytest.fixture def gapped_sample_array(sample_array): - atom_ids = np.arange(1, sample_array.shape[0]+1) + atom_ids = np.arange(1, sample_array.shape[0] + 1) sample_array.add_annotation("atom_id", dtype=int) sample_array.atom_id = atom_ids sample_array = sample_array[sample_array.res_id != 5] - sample_array = sample_array[(sample_array.res_id != 9) | - (sample_array.atom_name != "N")] + sample_array = sample_array[ + (sample_array.res_id != 9) | (sample_array.atom_name != "N") + ] return sample_array + @pytest.fixture def duplicate_sample_array(sample_array): sample_array[42] = sample_array[10] sample_array[234] = sample_array[123] return sample_array + def test_atom_id_continuity_check(gapped_sample_array): discon = struc.check_atom_id_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] assert discon_array.atom_id.tolist() == [93, 159] + def test_res_id_continuity_check(gapped_sample_array): discon = struc.check_res_id_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] assert discon_array.res_id.tolist() == [6] + def test_linear_continuity_check(gapped_sample_array): # Take the first ASN residue and remove hydrogens asn = gapped_sample_array[ - (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != 'H')] + (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != "H") + ] # The consecutive atom groups are # (1) N, CA, C, O # - break @@ -57,11 +62,13 @@ def test_linear_continuity_check(gapped_sample_array): discon = 
struc.check_linear_continuity(asn) assert discon.tolist() == [4, 7] + def test_bond_continuity_check(gapped_sample_array): discon = struc.check_backbone_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] - assert discon_array.res_id.tolist() == [6,9] + assert discon_array.res_id.tolist() == [6, 9] + def test_duplicate_atoms_check(duplicate_sample_array): discon = struc.check_duplicate_atoms(duplicate_sample_array) - assert discon.tolist() == [42,234] \ No newline at end of file + assert discon.tolist() == [42, 234] diff --git a/tests/structure/test_mechanics.py b/tests/structure/test_mechanics.py index 7be195882..30d0a55f9 100644 --- a/tests/structure/test_mechanics.py +++ b/tests/structure/test_mechanics.py @@ -1,25 +1,57 @@ +from os.path import join +import pytest import biotite.structure as struc import biotite.structure.io as strucio -import numpy as np -from os.path import join from ..util import data_dir -import pytest + def test_gyration_radius(): stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) radii = struc.gyration_radius(stack) # Compare with results from MDTraj - exp_radii = \ - [7.30527532, 7.34189463, 7.21863721, 7.29877736, 7.25389752, 7.22292189, - 7.20646252, 7.27215909, 7.30437723, 7.30455437, 7.37979331, 7.14176259, - 7.20674397, 7.27594995, 7.31665835, 7.29850786, 7.34378951, 7.2642137, - 7.20727158, 7.16336879, 7.3479218, 7.19362027, 7.24841519, 7.29229237, - 7.15243826, 7.31285673, 7.22585756, 7.25467109, 7.3493648, 7.34203588, - 7.3310182, 7.29236536, 7.20527373, 7.33138918, 7.2284936, 7.40374312, - 7.24856173, 7.25581809] + exp_radii = [ + 7.30527532, + 7.34189463, + 7.21863721, + 7.29877736, + 7.25389752, + 7.22292189, + 7.20646252, + 7.27215909, + 7.30437723, + 7.30455437, + 7.37979331, + 7.14176259, + 7.20674397, + 7.27594995, + 7.31665835, + 7.29850786, + 7.34378951, + 7.2642137, + 7.20727158, + 7.16336879, + 7.3479218, + 7.19362027, + 7.24841519, + 7.29229237, + 7.15243826, + 7.31285673, + 
7.22585756, + 7.25467109, + 7.3493648, + 7.34203588, + 7.3310182, + 7.29236536, + 7.20527373, + 7.33138918, + 7.2284936, + 7.40374312, + 7.24856173, + 7.25581809, + ] assert radii.tolist() == pytest.approx(exp_radii, abs=2e-2) # Same for atom array instead of stack array = stack[0] radius = struc.gyration_radius(array) - assert radius == pytest.approx(exp_radii[0], abs=2e-2) \ No newline at end of file + assert radius == pytest.approx(exp_radii[0], abs=2e-2) diff --git a/tests/structure/test_mol.py b/tests/structure/test_mol.py index 55ce15f04..c4662e2d7 100644 --- a/tests/structure/test_mol.py +++ b/tests/structure/test_mol.py @@ -19,11 +19,12 @@ def list_v2000_sdf_files(): return [ - path for path - in glob.glob(join(data_dir("structure"), "molecules", "*.sdf")) - if not "v3000" in path + path + for path in glob.glob(join(data_dir("structure"), "molecules", "*.sdf")) + if "v3000" not in path ] + def list_v3000_sdf_files(): return glob.glob(join(data_dir("structure"), "molecules", "*v3000.sdf")) @@ -79,11 +80,12 @@ def test_header_conversion(): list_v2000_sdf_files(), ["V2000", "V3000"], [False, True], - [False, True] - ) + [False, True], + ), ) -def test_structure_conversion(FileClass, path, version, omit_charge, - use_charge_property): +def test_structure_conversion( + FileClass, path, version, omit_charge, use_charge_property +): """ After reading a file, writing the structure back to a new file and reading it again should give the same structure. 
@@ -123,9 +125,10 @@ def test_structure_conversion(FileClass, path, version, omit_charge, @pytest.mark.parametrize( "path", [ - file for file in list_v2000_sdf_files() + list_v3000_sdf_files() + file + for file in list_v2000_sdf_files() + list_v3000_sdf_files() if file.split(".")[0] + ".cif" in list_cif_files() - ] + ], ) def test_pdbx_consistency(path): """ @@ -145,20 +148,17 @@ def test_pdbx_consistency(path): test_atoms = mol.get_structure(sdf_file) assert test_atoms.coord.shape == ref_atoms.coord.shape - assert test_atoms.coord.flatten().tolist() \ - == ref_atoms.coord.flatten().tolist() + assert test_atoms.coord.flatten().tolist() == ref_atoms.coord.flatten().tolist() assert test_atoms.element.tolist() == ref_atoms.element.tolist() assert test_atoms.charge.tolist() == ref_atoms.charge.tolist() - assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) \ - == set(tuple(bond) for bond in ref_atoms.bonds.as_array()) + assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) == set( + tuple(bond) for bond in ref_atoms.bonds.as_array() + ) @pytest.mark.parametrize( "v2000_path, v3000_path", - zip( - sorted(list_v2000_sdf_files()), - sorted(list_v3000_sdf_files()) - ) + zip(sorted(list_v2000_sdf_files()), sorted(list_v3000_sdf_files())), ) def test_version_consistency(v2000_path, v3000_path): """ @@ -198,10 +198,7 @@ def test_multi_record_files(): temp.seek(0) sdf_file = mol.SDFile.read(temp) - test_atom_arrays = [ - sdf_file[res_name].get_structure() - for res_name in RES_NAMES - ] + test_atom_arrays = [sdf_file[res_name].get_structure() for res_name in RES_NAMES] assert test_atom_arrays == ref_atom_arrays @@ -210,9 +207,7 @@ def test_metadata_parsing(): """ Check if metadata is parsed correctly based on a known example. 
""" - sdf_file = mol.SDFile.read( - join(data_dir("structure"), "molecules", "13136.sdf") - ) + sdf_file = mol.SDFile.read(join(data_dir("structure"), "molecules", "13136.sdf")) metadata = sdf_file.record.metadata assert metadata["PUBCHEM_COMPOUND_CID"] == "13136" @@ -224,10 +219,7 @@ def test_metadata_conversion(): """ Writing metadata and reading it again should give the same data. """ - ref_metadata = { - "test_1": "value 1", - "test_2": "value 2\nvalue 3" - } + ref_metadata = {"test_1": "value 1", "test_2": "value 2\nvalue 3"} record = mol.SDRecord(metadata=ref_metadata) sdf_file = mol.SDFile({"Molecule": record}) @@ -236,9 +228,7 @@ def test_metadata_conversion(): temp.seek(0) sdf_file = mol.SDFile.read(temp) - test_metadata = { - key.name: val for key, val in sdf_file.record.metadata.items() - } + test_metadata = {key.name: val for key, val in sdf_file.record.metadata.items()} temp.close() assert test_metadata == ref_metadata @@ -248,18 +238,10 @@ def test_metadata_conversion(): "key_string, ref_key_attributes", [ # Cases from Dalby1992 - ( - "> ", - (None, "MELTING.POINT", None, None) - ), - ( - "> 55 (MD-08974) DT12", - (12, "BOILING.POINT", 55, "MD-08974") - ), - ( - "> DT12 55", (12, None, 55, None) - ), - ] + ("> ", (None, "MELTING.POINT", None, None)), + ("> 55 (MD-08974) DT12", (12, "BOILING.POINT", 55, "MD-08974")), + ("> DT12 55", (12, None, 55, None)), + ], ) def test_metadata_key_parsing(key_string, ref_key_attributes): """ @@ -270,7 +252,7 @@ def test_metadata_key_parsing(key_string, ref_key_attributes): number=number, name=name, registry_internal=registry_internal, - registry_external=registry_external + registry_external=registry_external, ) test_key = mol.Metadata.Key.deserialize(key_string) @@ -292,7 +274,7 @@ def test_structure_bond_type_fallback(path): # the default bond type ref_atoms.bonds.add_bond(0, 1, BondType.QUADRUPLE) updated_bond = ref_atoms.bonds.as_array()[ - np.all(ref_atoms.bonds.as_array()[:,[0,1]] == [0,1], axis=1) + 
np.all(ref_atoms.bonds.as_array()[:, [0, 1]] == [0, 1], axis=1) ] assert updated_bond.tolist()[0][2] == BondType.QUADRUPLE test_mol_file = mol.MOLFile() @@ -300,21 +282,16 @@ def test_structure_bond_type_fallback(path): # Test bond type fallback to BondType.ANY value (8) in # MolFile.set_structure during mol_file.lines formatting updated_line = [ - mol_line - for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ') + mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ") ].pop() - assert int(updated_line[8]) == \ - BOND_TYPE_MAPPING_REV[BondType.ANY] + assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.ANY] # Test bond type fallback to BondType.SINGLE value (1) in # MolFile.set_structure during mol_file.lines formatting - mol.set_structure(test_mol_file, ref_atoms, - default_bond_type=BondType.SINGLE) + mol.set_structure(test_mol_file, ref_atoms, default_bond_type=BondType.SINGLE) updated_line = [ - mol_line - for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ') + mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ") ].pop() - assert int(updated_line[8]) == \ - BOND_TYPE_MAPPING_REV[BondType.SINGLE] + assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.SINGLE] @pytest.mark.parametrize("atom_type", ["", " ", "A ", " A"]) @@ -396,4 +373,4 @@ def _delete_charge_property(file): lines = [line for line in lines if not line.startswith("M CHG")] file.seek(0) file.truncate() - file.write("\n".join(lines) + "\n") \ No newline at end of file + file.write("\n".join(lines) + "\n") diff --git a/tests/structure/test_molecules.py b/tests/structure/test_molecules.py index d983f9f83..91447febc 100644 --- a/tests/structure/test_molecules.py +++ b/tests/structure/test_molecules.py @@ -18,26 +18,24 @@ def array(): :class:`AtomArray`. 
""" MOL_NAMES = [ - "ARG", # Molecule with multiple branches - "TRP", # Molecule with a cycle - "GLC", # Molecule with a cycle + "ARG", # Molecule with multiple branches + "TRP", # Molecule with a cycle + "GLC", # Molecule with a cycle "NA", # A single atom - "ATP" # Larger molecule + "ATP", # Larger molecule ] N_MOLECULES = 20 np.random.seed(0) - + atom_array = struc.AtomArray(0) for i, mol_name in enumerate(np.random.choice(MOL_NAMES, N_MOLECULES)): molecule = info.residue(mol_name) - molecule.res_id[:] = i+1 + molecule.res_id[:] = i + 1 atom_array += molecule - + reordered_indices = np.random.choice( - np.arange(atom_array.array_length()), - atom_array.array_length(), - replace=False + np.arange(atom_array.array_length()), atom_array.array_length(), replace=False ) atom_array = atom_array[reordered_indices] @@ -45,12 +43,7 @@ def array(): @pytest.mark.parametrize( - "as_stack, as_bonds", - [ - (False, False), - (True, False), - (False, True ) - ] + "as_stack, as_bonds", [(False, False), (True, False), (False, True)] ) def test_get_molecule_indices(array, as_stack, as_bonds): """ @@ -59,12 +52,12 @@ def test_get_molecule_indices(array, as_stack, as_bonds): """ if as_stack: array = struc.stack([array]) - + if as_bonds: test_indices = struc.get_molecule_indices(array.bonds) else: test_indices = struc.get_molecule_indices(array) - + seen_atoms = 0 for indices in test_indices: molecule = array[..., indices] @@ -72,20 +65,16 @@ def test_get_molecule_indices(array, as_stack, as_bonds): # -> all atoms from the same molecule assert (molecule.res_id == molecule.res_id[0]).all() # Assert that no atom is missing from the molecule - assert molecule.array_length() \ - == info.residue(molecule.res_name[0]).array_length() + assert ( + molecule.array_length() == info.residue(molecule.res_name[0]).array_length() + ) seen_atoms += molecule.array_length() # Assert that all molecules are fond assert seen_atoms == array.array_length() @pytest.mark.parametrize( - "as_stack, as_bonds", 
- [ - (False, False), - (True, False), - (False, True ) - ] + "as_stack, as_bonds", [(False, False), (True, False), (False, True)] ) def test_get_molecule_masks(array, as_stack, as_bonds): """ @@ -95,14 +84,14 @@ def test_get_molecule_masks(array, as_stack, as_bonds): """ if as_stack: array = struc.stack([array]) - + if as_bonds: ref_indices = struc.get_molecule_indices(array.bonds) test_masks = struc.get_molecule_masks(array.bonds) else: ref_indices = struc.get_molecule_indices(array) test_masks = struc.get_molecule_masks(array) - + for i in range(len(test_masks)): # Assert that the mask is 'True' for all indices # and that these 'True' values are the only ones in the mask @@ -123,4 +112,4 @@ def test_molecule_iter(array, as_stack): test_iterator = struc.molecule_iter(array) for i, molecule in enumerate(test_iterator): - assert molecule == array[..., ref_indices[i]] \ No newline at end of file + assert molecule == array[..., ref_indices[i]] diff --git a/tests/structure/test_pdb.py b/tests/structure/test_pdb.py index d93974f72..5a1d7926c 100644 --- a/tests/structure/test_pdb.py +++ b/tests/structure/test_pdb.py @@ -2,21 +2,20 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import TemporaryFile -import warnings -import itertools import glob -from os.path import join, splitext +import itertools import sys +import warnings +from os.path import join, splitext +from tempfile import TemporaryFile +import numpy as np import pytest from pytest import approx -import numpy as np import biotite import biotite.structure as struc import biotite.structure.io.pdb as pdb import biotite.structure.io.pdb.hybrid36 as hybrid36 import biotite.structure.io.pdbx as pdbx -import biotite.structure.io as io from ..util import data_dir @@ -35,17 +34,15 @@ def test_get_model_count(): glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1], [False, True], - [False, True] - ) + [False, True], + ), ) def test_array_conversion(path, model, hybrid36, include_bonds): pdb_file = pdb.PDBFile.read(path) # Test also the thin wrapper around the methods # 'get_structure()' and 'set_structure()' try: - array1 = pdb.get_structure( - pdb_file, model=model, include_bonds=include_bonds - ) + array1 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -58,8 +55,7 @@ def test_array_conversion(path, model, hybrid36, include_bonds): if hybrid36 and (array1.res_id < 0).any(): with pytest.raises( ValueError, - match="Only positive integers can be converted " - "into hybrid-36 notation" + match="Only positive integers can be converted " "into hybrid-36 notation", ): pdb_file = pdb.PDBFile() pdb.set_structure(pdb_file, array1, hybrid36=hybrid36) @@ -68,33 +64,28 @@ def test_array_conversion(path, model, hybrid36, include_bonds): pdb_file = pdb.PDBFile() pdb.set_structure(pdb_file, array1, hybrid36=hybrid36) - array2 = pdb.get_structure( - pdb_file, model=model, include_bonds=include_bonds - ) + array2 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds) if array1.box is not None: assert np.allclose(array1.box, array2.box) 
assert array1.bonds == array2.bonds for category in array1.get_annotation_categories(): - assert array1.get_annotation(category).tolist() == \ - array2.get_annotation(category).tolist() + assert ( + array1.get_annotation(category).tolist() + == array2.get_annotation(category).tolist() + ) assert array1.coord.tolist() == array2.coord.tolist() @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_pdbx_consistency(path, model): bcif_path = splitext(path)[0] + ".bcif" pdbx_file = pdbx.BinaryCIFFile.read(bcif_path) try: - ref_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -134,17 +125,16 @@ def test_pdbx_consistency(path, model): print(file=sys.stderr) raise for category in ref_atoms.get_annotation_categories(): - assert test_atoms.get_annotation(category).tolist() == \ - ref_atoms.get_annotation(category).tolist() + assert ( + test_atoms.get_annotation(category).tolist() + == ref_atoms.get_annotation(category).tolist() + ) assert test_atoms.coord.tolist() == ref_atoms.coord.tolist() @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1]), ) def test_pdbx_consistency_assembly(path, model): """ @@ -168,10 +158,13 @@ def test_pdbx_consistency_assembly(path, model): ref_assembly = pdbx.get_assembly(pdbx_file, model=model) for category in ref_assembly.get_annotation_categories(): - assert test_assembly.get_annotation(category).tolist() == \ - ref_assembly.get_annotation(category).tolist() - assert test_assembly.coord.flatten().tolist() == \ - 
approx(ref_assembly.coord.flatten().tolist(), abs=1e-3) + assert ( + test_assembly.get_annotation(category).tolist() + == ref_assembly.get_annotation(category).tolist() + ) + assert test_assembly.coord.flatten().tolist() == approx( + ref_assembly.coord.flatten().tolist(), abs=1e-3 + ) @pytest.mark.parametrize("hybrid36", [False, True]) @@ -179,9 +172,7 @@ def test_extra_fields(hybrid36): path = join(data_dir("structure"), "1l2y.pdb") pdb_file = pdb.PDBFile.read(path) stack1 = pdb_file.get_structure( - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] + extra_fields=["atom_id", "b_factor", "occupancy", "charge"] ) with pytest.raises(ValueError): @@ -196,9 +187,7 @@ def test_extra_fields(hybrid36): pdb_file.set_structure(stack1, hybrid36=hybrid36) stack2 = pdb_file.get_structure( - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] + extra_fields=["atom_id", "b_factor", "occupancy", "charge"] ) assert stack1.ins_code.tolist() == stack2.ins_code.tolist() @@ -218,7 +207,7 @@ def test_inferred_elements(): # Remove all elements removed_stack = stack.copy() - removed_stack.element[:] = '' + removed_stack.element[:] = "" # Save stack without elements to tmp file temp = TemporaryFile("w+") @@ -237,10 +226,7 @@ def test_inferred_elements(): @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_box_shape(path, model): pdb_file = pdb.PDBFile.read(path) @@ -266,14 +252,11 @@ def test_box_parsing(): path = join(data_dir("structure"), "1igy.pdb") pdb_file = pdb.PDBFile.read(path) a = pdb_file.get_structure() - expected_box = np.array([[ - [66.65, 0.00, 0.00], - [0.00, 190.66, 0.00], - [-24.59, 0.00, 68.84] - ]]) + expected_box = np.array( + [[[66.65, 0.00, 0.00], [0.00, 190.66, 0.00], [-24.59, 0.00, 68.84]]] + ) - assert expected_box.flatten().tolist() \ - == 
approx(a.box.flatten().tolist(), abs=1e-2) + assert expected_box.flatten().tolist() == approx(a.box.flatten().tolist(), abs=1e-2) def test_id_overflow(): @@ -283,7 +266,7 @@ def test_id_overflow(): a.coord = np.zeros(a.coord.shape) a.chain_id = np.full(length, "A") # Create residue IDs over 10000 - a.res_id = np.arange(1, length+1) + a.res_id = np.arange(1, length + 1) a.res_name = np.full(length, "GLY") a.hetero = np.full(length, False) a.atom_name = np.full(length, "CA") @@ -299,13 +282,13 @@ def test_id_overflow(): # Assert file can be read properly temp.seek(0) a2 = pdb.get_structure(pdb.PDBFile.read(temp)) - assert(a2.array_length() == a.array_length()) + assert a2.array_length() == a.array_length() # Manually check if the written atom id is correct temp.seek(0) last_line = temp.readlines()[-1] atom_id = int(last_line.split()[1]) - assert(atom_id == 1) + assert atom_id == 1 temp.close() @@ -321,9 +304,9 @@ def test_id_overflow(): temp.seek(0) last_line = temp.readlines()[-1] atom_id = last_line.split()[1] - assert(atom_id == "A0000") + assert atom_id == "A0000" res_id = last_line.split()[4][1:] - assert(res_id == "BXG0") + assert res_id == "BXG0" temp.close() @@ -353,38 +336,41 @@ def test_get_b_factor(model): if model is None: # The B-factor is an annotation category # -> it can only be extracted in a per-model basis - ref_b_factor = np.stack([ - pdb_file.get_structure( - model=m, extra_fields=["b_factor"] - ).b_factor - for m in range(1, pdb_file.get_model_count() + 1) - ]) + ref_b_factor = np.stack( + [ + pdb_file.get_structure(model=m, extra_fields=["b_factor"]).b_factor + for m in range(1, pdb_file.get_model_count() + 1) + ] + ) else: ref_b_factor = pdb_file.get_structure( model=model, extra_fields=["b_factor"] ).b_factor - test_b_factor= pdb_file.get_b_factor(model=model) + test_b_factor = pdb_file.get_b_factor(model=model) assert test_b_factor.shape == ref_b_factor.shape assert (test_b_factor == ref_b_factor).all() - np.random.seed(0) N = 200 LENGTHS = 
[3, 4, 5] + + @pytest.mark.parametrize( "number, length", zip( - list(itertools.chain(*[ - np.random.randint(0, hybrid36.max_hybrid36_number(length), N) - for length in LENGTHS - ])), - list(itertools.chain(*[ - [length] * N for length in LENGTHS - ])) - ) + list( + itertools.chain( + *[ + np.random.randint(0, hybrid36.max_hybrid36_number(length), N) + for length in LENGTHS + ] + ) + ), + list(itertools.chain(*[[length] * N for length in LENGTHS])), + ), ) def test_hybrid36_codec(number, length): """ @@ -401,7 +387,6 @@ def test_max_hybrid36_number(): assert hybrid36.max_hybrid36_number(5) == 87440031 - @pytest.mark.parametrize("hybrid36", [False, True]) def test_bond_records(hybrid36): """ @@ -420,7 +405,7 @@ def test_bond_records(hybrid36): np.random.seed(0) # Create random bonds four times the number of atoms - bond_array = np.random.randint(n_atoms, size=(4*n_atoms, 2)) + bond_array = np.random.randint(n_atoms, size=(4 * n_atoms, 2)) # Remove bonds of atoms to themselves bond_array = bond_array[bond_array[:, 0] != bond_array[:, 1]] ref_bonds = struc.BondList(n_atoms, bond_array) @@ -459,8 +444,8 @@ def test_get_symmetry_mates(model): Test generated symmetry mates on a known example with a simple space group and a single chain. 
""" - INVERSION_AXES = [(0,0,0), (0,0,1), (0,1,0), (1,0,0)] - TRANSLATION_AXES = [(0,0,0), (1,0,1), (0,1,1), (1,1,0)] + INVERSION_AXES = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0)] + TRANSLATION_AXES = [(0, 0, 0), (1, 0, 1), (0, 1, 1), (1, 1, 0)] path = join(data_dir("structure"), "1aki.pdb") pdb_file = pdb.PDBFile.read(path) @@ -475,8 +460,7 @@ def test_get_symmetry_mates(model): symmetry_mates = pdb_file.get_symmetry_mates(model=model) # Space group has 4 copies in a unit cell - assert symmetry_mates.array_length() \ - == original_structure.array_length() * 4 + assert symmetry_mates.array_length() == original_structure.array_length() * 4 if model is None: assert symmetry_mates.stack_depth() == original_structure.stack_depth() for chain, inv_axes, trans_axes in zip( @@ -490,10 +474,13 @@ def test_get_symmetry_mates(model): chain = struc.rotate(chain, angles) # Now both mates should be equal for category in original_structure.get_annotation_categories(): - assert chain.get_annotation(category).tolist() == \ - original_structure.get_annotation(category).tolist() - assert chain.coord.flatten().tolist() == \ - approx(original_structure.coord.flatten().tolist(), abs=1e-3) + assert ( + chain.get_annotation(category).tolist() + == original_structure.get_annotation(category).tolist() + ) + assert chain.coord.flatten().tolist() == approx( + original_structure.coord.flatten().tolist(), abs=1e-3 + ) @pytest.mark.parametrize( @@ -512,7 +499,7 @@ def test_get_symmetry_mates(model): ("occupancy", 1000, False), ("charge", -10, False), ("charge", 10, False), - ] + ], ) def test_setting_incompatible_structure(annotation, value, warning_only): """ @@ -535,7 +522,7 @@ def test_setting_incompatible_structure(annotation, value, warning_only): # Set one annotation to a value that exceeds the number of columns if annotation == "coord": - atoms.coord[0,0] = value + atoms.coord[0, 0] = value else: atoms.get_annotation(annotation)[0] = value diff --git a/tests/structure/test_pdbqt.py 
b/tests/structure/test_pdbqt.py index 1a7c2e049..bcbfeb1f7 100644 --- a/tests/structure/test_pdbqt.py +++ b/tests/structure/test_pdbqt.py @@ -2,12 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import warnings -from tempfile import TemporaryFile import glob +import warnings from os.path import join -import pytest +from tempfile import TemporaryFile import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdbqt as pdbqt import biotite.structure.io.pdbx as pdbx @@ -17,16 +17,15 @@ @pytest.mark.parametrize( "path", [ - path for path in glob.glob(join(data_dir("structure"), "*.bcif")) + path + for path in glob.glob(join(data_dir("structure"), "*.bcif")) # Skip this PDB ID as it contains 5-character residue names if "7gsa" not in path - ] + ], ) def test_array_conversion(path): pdbx_file = pdbx.BinaryCIFFile.read(path) - ref_structure = pdbx.get_structure( - pdbx_file, model=1, extra_fields=["charge"] - ) + ref_structure = pdbx.get_structure(pdbx_file, model=1, extra_fields=["charge"]) ref_structure.bonds = struc.connect_via_residue_names(ref_structure) pdbqt_file = pdbqt.PDBQTFile() @@ -53,7 +52,7 @@ def test_array_conversion(path): try: assert np.array_equal( test_structure.get_annotation(category), - ref_structure.get_annotation(category) + ref_structure.get_annotation(category), ) except AssertionError: print(f"Inequality in '{category}' category") diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py index a3f88d44c..faa32ad1e 100644 --- a/tests/structure/test_pdbx.py +++ b/tests/structure/test_pdbx.py @@ -2,9 +2,9 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-import warnings import glob import itertools +import warnings from os.path import join, splitext import numpy as np import pytest @@ -22,7 +22,7 @@ def test_get_model_count(format): Check of :func:`get_model_count()`gives the same number of models as :func:`get_structure()`. """ - base_path = join(data_dir("structure"), f"1l2y") + base_path = join(data_dir("structure"), "1l2y") if format == "cif": pdbx_file = pdbx.CIFFile.read(base_path + ".cif") else: @@ -35,8 +35,17 @@ def test_get_model_count(format): @pytest.mark.parametrize( "string, looped", itertools.product( - ["", " ", " ", "te xt", "'", '"' ,"te\nxt", "\t",], - [False, True] + [ + "", + " ", + " ", + "te xt", + "'", + '"', + "te\nxt", + "\t", + ], + [False, True], ), ) def test_escape(string, looped): @@ -60,9 +69,7 @@ def test_escape(string, looped): @pytest.mark.parametrize( "format, path, model", itertools.product( - ["cif", "bcif"], - glob.glob(join(data_dir("structure"), "*.cif")), - [None, 1, -1] + ["cif", "bcif"], glob.glob(join(data_dir("structure"), "*.cif")), [None, 1, -1] ), ) def test_conversion(tmpdir, format, path, model): @@ -82,9 +89,7 @@ def test_conversion(tmpdir, format, path, model): pdbx_file = File.read(data_path) try: - ref_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -103,9 +108,7 @@ def test_conversion(tmpdir, format, path, model): # Remove one label section to test fallback to auth fields del pdbx_file.block["atom_site"][DELETED_ANNOTATION] with pytest.warns(UserWarning, match=f"'{DELETED_ANNOTATION}' not found"): - test_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + test_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) assert ref_atoms.array_length() > 0 if ref_atoms.box is not None: @@ -144,9 +147,7 @@ def 
test_bond_conversion(tmpdir, format, path): File = pdbx.BinaryCIFFile pdbx_file = File.read(data_path) - atoms = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ) + atoms = pdbx.get_structure(pdbx_file, model=1, include_bonds=True) ref_bonds = atoms.bonds pdbx_file = File() @@ -160,16 +161,12 @@ def test_bond_conversion(tmpdir, format, path): # i.e. the bonds can be properly read from ``chem_comp_bond`` with warnings.catch_warnings(): warnings.simplefilter("error") - test_bonds = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ).bonds + test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_extra_fields(tmpdir, format): path = join(data_dir("structure"), f"1l2y.{format}") if format == "cif": @@ -208,9 +205,7 @@ def test_intra_bond_residue_parsing(): """ cif_path = join(data_dir("structure"), "1l2y.cif") cif_file = pdbx.CIFFile.read(cif_path) - ref_bonds = pdbx.get_structure( - cif_file, model=1, include_bonds=True - ).bonds + ref_bonds = pdbx.get_structure(cif_file, model=1, include_bonds=True).bonds nextgen_cif_path = join( data_dir("structure"), "nextgen", "pdb_00001l2y_xyz-enrich.cif" @@ -227,9 +222,7 @@ def test_intra_bond_residue_parsing(): assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_any_bonds(tmpdir, format): """ Check if ``BondType.ANY`` bonds can be written and read from a PDBx @@ -266,16 +259,12 @@ def test_any_bonds(tmpdir, format): # i.e. 
the bonds can be properly read from ``chem_comp_bond`` with warnings.catch_warnings(): warnings.simplefilter("error") - test_bonds = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ).bonds + test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_unequal_lengths(format): """ Check if setting columns with unequal lengths in the same category @@ -299,9 +288,7 @@ def test_setting_empty_column(): """ Check if setting an empty column raises an exception. """ - with pytest.raises( - ValueError, match="Array must contain at least one element" - ): + with pytest.raises(ValueError, match="Array must contain at least one element"): pdbx.CIFCategory({"foo": []}) @@ -324,9 +311,7 @@ def test_setting_empty_structure(): pdbx.set_structure(pdbx.CIFFile(), atoms, include_bonds=True) -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_list_assemblies(format): """ Test the :func:`list_assemblies()` function based on a known @@ -351,11 +336,10 @@ def test_list_assemblies(format): } -@pytest.mark.parametrize("format, pdb_id, model", itertools.product( - ["cif", "bcif"], - ["1f2n", "5zng"], - [None, 1, -1] -)) +@pytest.mark.parametrize( + "format, pdb_id, model", + itertools.product(["cif", "bcif"], ["1f2n", "5zng"], [None, 1, -1]), +) def test_get_assembly(format, pdb_id, model): """ Test whether the :func:`get_assembly()` function produces the same @@ -376,13 +360,11 @@ def test_get_assembly(format, pdb_id, model): # Test each available assembly for id, ref_oligomer_count in zip( assembly_category["id"].as_array(str), - assembly_category["oligomeric_count"].as_array(int) + assembly_category["oligomeric_count"].as_array(int), ): print("Assembly ID:", id) try: - assembly = pdbx.get_assembly( - pdbx_file, assembly_id=id, model=model - 
) + assembly = pdbx.get_assembly(pdbx_file, assembly_id=id, model=model) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -409,8 +391,7 @@ def test_get_assembly(format, pdb_id, model): @pytest.mark.parametrize( "path, use_ideal_coord", itertools.product( - glob.glob(join(data_dir("structure"), "molecules", "*.cif")), - [False, True] + glob.glob(join(data_dir("structure"), "molecules", "*.cif")), [False, True] ), ) def test_component_conversion(tmpdir, path, use_ideal_coord): @@ -420,9 +401,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord): structure. """ cif_file = pdbx.CIFFile.read(path) - ref_atoms = pdbx.get_component( - cif_file, use_ideal_coord=use_ideal_coord - ) + ref_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord) cif_file = pdbx.CIFFile() pdbx.set_component(cif_file, ref_atoms, data_block="test") @@ -430,9 +409,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord): cif_file.write(file_path) cif_file = pdbx.CIFFile.read(path) - test_atoms = pdbx.get_component( - cif_file, use_ideal_coord=use_ideal_coord - ) + test_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord) assert test_atoms == ref_atoms @@ -452,14 +429,14 @@ def test_get_sequence(format): sequences_1 = pdbx.get_sequence(pdbx_file) pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}")) sequences_2 = pdbx.get_sequence(pdbx_file) - assert str(sequences_1['T']) == "CCGACGGCGCATCAGC" - assert type(sequences_1['T']) is seq.NucleotideSequence - assert str(sequences_1['P']) == "GCTGATGCGCC" - assert type(sequences_1['P']) is seq.NucleotideSequence - assert str(sequences_1['D']) == "GTCGG" - assert type(sequences_1['D']) is seq.NucleotideSequence + assert str(sequences_1["T"]) == "CCGACGGCGCATCAGC" + assert type(sequences_1["T"]) is seq.NucleotideSequence + assert str(sequences_1["P"]) == "GCTGATGCGCC" + assert type(sequences_1["P"]) is seq.NucleotideSequence + assert 
str(sequences_1["D"]) == "GTCGG" + assert type(sequences_1["D"]) is seq.NucleotideSequence assert ( - str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" + str(sequences_1["A"]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD" "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD" "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS" @@ -467,14 +444,14 @@ def test_get_sequence(format): "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA" "GEPLPVDSEKDIFDYIQWKYREPKDRSE" ) - assert type(sequences_1['A']) is seq.ProteinSequence + assert type(sequences_1["A"]) is seq.ProteinSequence assert ( - str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" + str(sequences_2["A"]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC" "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT" "CCGGAGTCAGGAAACCTGCCTGCCGTC" ) - assert type(sequences_2['A']) is seq.NucleotideSequence + assert type(sequences_2["A"]) is seq.NucleotideSequence def test_bcif_encoding(): @@ -485,21 +462,20 @@ def test_bcif_encoding(): PDB_ID = "1aki" encodings_used = { - encoding: False for encoding in [ + encoding: False + for encoding in [ pdbx.ByteArrayEncoding, pdbx.FixedPointEncoding, # This encoding is not used in the test file - #pdbx.IntervalQuantizationEncoding, + # pdbx.IntervalQuantizationEncoding, pdbx.RunLengthEncoding, pdbx.DeltaEncoding, pdbx.IntegerPackingEncoding, - pdbx.StringArrayEncoding + pdbx.StringArrayEncoding, ] } - bcif_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), f"{PDB_ID}.bcif") - ) + bcif_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{PDB_ID}.bcif")) for category_name, category in bcif_file[PDB_ID.upper()].items(): for column_name in category.keys(): try: @@ -520,9 +496,7 @@ def test_bcif_encoding(): assert test_msgpack == ref_msgpack except: - raise Exception( - f"Encoding failed for '{category_name}.{column_name}'" - 
) + raise Exception(f"Encoding failed for '{category_name}.{column_name}'") # Check if each encoding was used at least once # to ensure that the test was thorough @@ -587,13 +561,16 @@ def test_bcif_cif_consistency(): if cif_column.mask is None: assert bcif_column.mask is None else: - assert cif_column.mask.array.tolist() \ + assert ( + cif_column.mask.array.tolist() == bcif_column.mask.array.tolist() + ) # In CIF format, all vales are strings # -> ensure consistency dtype = bcif_column.data.array.dtype - assert cif_column.as_array(dtype).tolist() \ - == pytest.approx(bcif_column.as_array(dtype).tolist()) + assert cif_column.as_array(dtype).tolist() == pytest.approx( + bcif_column.as_array(dtype).tolist() + ) except: raise Exception( f"Comparison failed for '{category_name}.{column_name}'" @@ -606,7 +583,7 @@ def test_bcif_cif_consistency(): ("cif", None), ("bcif", False), ("bcif", True), - ] + ], ) def test_serialization_consistency(format, create_new_encoding): """ @@ -626,18 +603,14 @@ def test_serialization_consistency(format, create_new_encoding): for category_name, ref_category in file.block.items(): if format == "cif": - test_category = pdbx.CIFCategory.deserialize( - ref_category.serialize() - ) + test_category = pdbx.CIFCategory.deserialize(ref_category.serialize()) elif format == "bcif": # Access each column to force otherwise lazy deserialization for _ in ref_category.values(): pass if create_new_encoding: ref_category = _clear_encoding(ref_category) - test_category = pdbx.BinaryCIFCategory.deserialize( - ref_category.serialize() - ) + test_category = pdbx.BinaryCIFCategory.deserialize(ref_category.serialize()) try: for key in test_category.keys(): assert ref_category[key] == test_category[key] diff --git a/tests/structure/test_pseudoknots.py b/tests/structure/test_pseudoknots.py index d7a594bcf..4d6992551 100644 --- a/tests/structure/test_pseudoknots.py +++ b/tests/structure/test_pseudoknots.py @@ -2,13 +2,12 @@ # under the 3-Clause BSD License. 
Please see 'LICENSE.rst' for further # information. -import pytest import json +from os.path import join import numpy as np -import pickle as pkl +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from os.path import join from ..util import data_dir @@ -19,6 +18,7 @@ def nuc_sample_array(): """ return strucio.load_structure(join(data_dir("structure"), "4p5j.cif")) + def test_pseudoknots(nuc_sample_array): """ Check the output of :func:`pseudoknots()`. @@ -26,11 +26,9 @@ def test_pseudoknots(nuc_sample_array): # Known base pairs with pseudoknot-order = 1: pseudoknot_order_one = [{2, 74}, {58, 72}, {59, 71}, {60, 70}] # Known base pairs that can either be of order one or two - pseudoknot_order_one_or_two = [{9, 48}, {10, 49}] - order_one_count = ( - len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two)/2) - ) - order_two_count = len(pseudoknot_order_one_or_two)/2 + pseudoknot_order_one_or_two = [{9, 48}, {10, 49}] + order_one_count = len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two) / 2) + order_two_count = len(pseudoknot_order_one_or_two) / 2 base_pairs = struc.base_pairs(nuc_sample_array) pseudoknot_order = struc.pseudoknots(base_pairs) @@ -51,15 +49,14 @@ def test_pseudoknots(nuc_sample_array): for base_pair, order in zip( nuc_sample_array[base_pairs].res_id, optimal_solution ): - if(order == 1): + if order == 1: assert ( - set(base_pair) in pseudoknot_order_one or - set(base_pair) in pseudoknot_order_one_or_two - ) - elif (order == 2): - assert ( - set(base_pair) in pseudoknot_order_one_or_two + set(base_pair) in pseudoknot_order_one + or set(base_pair) in pseudoknot_order_one_or_two ) + elif order == 2: + assert set(base_pair) in pseudoknot_order_one_or_two + def load_test(name): """ @@ -67,20 +64,19 @@ def load_test(name): """ # Base pairs as numpy array (input for `pseudoknots()`) with open( - join(data_dir("structure"), "pseudoknots", f"{name}_knotted.json"), - "r" + join(data_dir("structure"), 
"pseudoknots", f"{name}_knotted.json"), "r" ) as f: basepairs = np.array(json.load(f)) # List of solutions (set of tuples) with open( - join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"), - "rb" + join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"), "rb" ) as f: solutions = json.load(f) for i, solution in enumerate(solutions): solutions[i] = set([tuple(pair) for pair in solution]) return basepairs, solutions + @pytest.mark.parametrize("name", [f"test{x}" for x in range(21)]) def test_pseudoknot_removal(name): """ @@ -116,6 +112,7 @@ def test_pseudoknot_removal(name): # Verify that the number of solutions matches the reference assert len(reference_solutions) == solutions_count + @pytest.mark.parametrize("seed", range(10)) def test_pseudoknot_orders(seed): """ @@ -136,7 +133,7 @@ def test_pseudoknot_orders(seed): for solution in solutions: # Number of base pairs in the previous order previous_order = -1 - for order in range(np.max(solution)+1): + for order in range(np.max(solution) + 1): # Ensure that the base pairs of the same order are unknotted assert (struc.pseudoknots(basepairs[solution == order]) == 0).all() @@ -148,9 +145,10 @@ def test_pseudoknot_orders(seed): assert this_order <= previous_order previous_order = this_order + def test_empty_base_pairs(): """ Assert than an empty array of base pairs generates an empty array of - pseudoknot orders. + pseudoknot orders. 
""" - assert struc.pseudoknots([]).shape == (1,0) \ No newline at end of file + assert struc.pseudoknots([]).shape == (1, 0) diff --git a/tests/structure/test_rdf.py b/tests/structure/test_rdf.py index bd072fbbe..5476e5c22 100644 --- a/tests/structure/test_rdf.py +++ b/tests/structure/test_rdf.py @@ -2,147 +2,170 @@ from os.path import join import numpy as np import pytest +from biotite.structure.box import vectors_from_unitcell from biotite.structure.io import load_structure from biotite.structure.rdf import rdf -from biotite.structure.box import vectors_from_unitcell -from ..util import data_dir, cannot_import - +from ..util import cannot_import, data_dir TEST_FILE = join(data_dir("structure"), "waterbox.gro") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") def test_rdf(): - """ General test to reproduce oxygen RDF for a box of water""" + """General test to reproduce oxygen RDF for a box of water""" test_file = TEST_FILE stack = load_structure(test_file) # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - bins, g_r = rdf(oxygen[:, 0].coord, oxygen, interval=interval, - bins=n_bins, periodic=False) + bins, g_r = rdf( + oxygen[:, 0].coord, oxygen, interval=interval, bins=n_bins, periodic=False + ) # Compare with MDTraj import mdtraj + traj = mdtraj.load(TEST_FILE) - ow = [a.index for a in traj.topology.atoms if a.name == 'O'] + ow = [a.index for a in traj.topology.atoms if a.name == "O"] pairs = itertools.product([ow[0]], ow) - mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs), - r_range=interval/10, n_bins=n_bins, - periodic=False) + mdt_bins, mdt_g_r = mdtraj.compute_rdf( + traj, list(pairs), r_range=interval / 10, n_bins=n_bins, periodic=False + ) - assert np.allclose(bins, mdt_bins*10) + assert np.allclose(bins, mdt_bins * 
10) assert np.allclose(g_r, mdt_g_r, rtol=0.0001) def test_rdf_bins(): - """ Test if RDF produce correct bin ranges """ + """Test if RDF produce correct bin ranges""" stack = load_structure(TEST_FILE) center = stack[:, 0] num_bins = 44 bin_range = (0, 11.7) bins, g_r = rdf(center, stack, bins=num_bins, interval=bin_range) - assert(len(bins) == num_bins) - assert(bins[0] > bin_range[0]) - assert(bins[1] < bin_range[1]) + assert len(bins) == num_bins + assert bins[0] > bin_range[0] + assert bins[1] < bin_range[1] def test_rdf_with_selection(): - """ Test if the selection argument of rdf function works as expected """ + """Test if the selection argument of rdf function works as expected""" stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - sele = (stack.atom_name == 'OW') & (stack.res_id >= 3) - bins, g_r = rdf(oxygen[:, 0].coord, stack, selection=sele, - interval=interval, bins=n_bins, periodic=False) - - nosel_bins, nosel_g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], - interval=interval, bins=n_bins, periodic=False) + sele = (stack.atom_name == "OW") & (stack.res_id >= 3) + bins, g_r = rdf( + oxygen[:, 0].coord, + stack, + selection=sele, + interval=interval, + bins=n_bins, + periodic=False, + ) + + nosel_bins, nosel_g_r = rdf( + oxygen[:, 0].coord, + oxygen[:, 1:], + interval=interval, + bins=n_bins, + periodic=False, + ) assert np.allclose(bins, nosel_bins) assert np.allclose(g_r, nosel_g_r) def test_rdf_atom_argument(): - """ Test if the first argument allows to use AtomArrayStack """ + """Test if the first argument allows to use AtomArrayStack""" stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - bins, g_r = 
rdf(oxygen[:, 0], stack, interval=interval, - bins=n_bins, periodic=False) + bins, g_r = rdf(oxygen[:, 0], stack, interval=interval, bins=n_bins, periodic=False) - atom_bins, atoms_g_r = rdf(oxygen[:, 0].coord, stack, interval=interval, - bins=n_bins, periodic=False) + atom_bins, atoms_g_r = rdf( + oxygen[:, 0].coord, stack, interval=interval, bins=n_bins, periodic=False + ) assert np.allclose(g_r, atoms_g_r) def test_rdf_multiple_center(): - """ Test if the first argument allows to use multiple centers""" + """Test if the first argument allows to use multiple centers""" stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 # averaging individual calculations - bins1, g_r1 = rdf(oxygen[:, 1].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) - bins2, g_r2 = rdf(oxygen[:, 0].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) + bins1, g_r1 = rdf( + oxygen[:, 1].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) + bins2, g_r2 = rdf( + oxygen[:, 0].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) mean = np.mean([g_r1, g_r2], axis=0) # this should give the same result as averaging for oxygen 0 and 1 - bins, g_r = rdf(oxygen[:, 0:2].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) + bins, g_r = rdf( + oxygen[:, 0:2].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) assert np.allclose(g_r, mean, rtol=0.0001) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") def test_rdf_periodic(): - """ Test if the periodic argument gives the correct results""" + """Test if the periodic argument gives the correct results""" test_file = 
TEST_FILE stack = load_structure(test_file) # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - bins, g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], interval=interval, - bins=n_bins, periodic=True) + bins, g_r = rdf( + oxygen[:, 0].coord, oxygen[:, 1:], interval=interval, bins=n_bins, periodic=True + ) # Compare with MDTraj import mdtraj + traj = mdtraj.load(TEST_FILE) - ow = [a.index for a in traj.topology.atoms if a.name == 'O'] + ow = [a.index for a in traj.topology.atoms if a.name == "O"] pairs = itertools.product([ow[0]], ow[1:]) - mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs), - r_range=interval/10, n_bins=n_bins, - periodic=True) + mdt_bins, mdt_g_r = mdtraj.compute_rdf( + traj, list(pairs), r_range=interval / 10, n_bins=n_bins, periodic=True + ) - assert np.allclose(bins, mdt_bins*10) + assert np.allclose(bins, mdt_bins * 10) assert np.allclose(g_r, mdt_g_r, rtol=0.0001) def test_rdf_box(): - """ Test correct use of simulation boxes """ + """Test correct use of simulation boxes""" stack = load_structure(TEST_FILE) box = vectors_from_unitcell(1, 1, 1, 90, 90, 90) box_stack = np.repeat(box[np.newaxis, :, :], len(stack), axis=0) @@ -169,16 +192,14 @@ def test_rdf_box(): def test_rdf_normalized(): - """ Assert that the RDF tail is normalized to 1""" + """Assert that the RDF tail is normalized to 1""" test_file = TEST_FILE stack = load_structure(test_file) # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 5]) n_bins = 100 - bins, g_r = rdf(oxygen.coord, oxygen, interval=interval, - bins=n_bins, periodic=True) + bins, g_r = rdf(oxygen.coord, oxygen, interval=interval, bins=n_bins, periodic=True) assert np.allclose(g_r[-10:], np.ones(10), atol=0.1) - diff --git a/tests/structure/test_repair.py b/tests/structure/test_repair.py index 
34ba9f622..a4b2b32e2 100644 --- a/tests/structure/test_repair.py +++ b/tests/structure/test_repair.py @@ -2,19 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +from os.path import join +import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import numpy as np -from os.path import join from ..util import data_dir -import pytest @pytest.fixture def single_chain(): - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "1l2y.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) return pdbx.get_structure(pdbx_file, model=1) @@ -40,35 +38,37 @@ def test_create_continuous_res_ids(multi_chain, restart_each_chain): test_res_ids, _ = struc.get_residues(multi_chain) if restart_each_chain: - assert test_res_ids.tolist() == np.concatenate( - [np.arange(len(test_res_ids) // 2) + 1] * 2 - ).tolist() + assert ( + test_res_ids.tolist() + == np.concatenate([np.arange(len(test_res_ids) // 2) + 1] * 2).tolist() + ) else: - assert test_res_ids.tolist() \ - == (np.arange(len(test_res_ids)) + 1).tolist() + assert test_res_ids.tolist() == (np.arange(len(test_res_ids)) + 1).tolist() @pytest.mark.parametrize( "name,expected", - [("CA", "C"), - ("C", "C"), - ("CB", "C"), - ("OD1", "O"), - ("HD21", "H"), - ("1H", "H"), - #("CL", "CL"), # This is an edge case where inference is difficult - ("HE", "H"), - ("SD", "S"), - ("NA", "N"), - ("NX", "N"), - ("BE", "BE"), - ("BEA", "BE"), - ("K", "K"), - ("KA", "K"), - ("QWERT", "")] + [ + ("CA", "C"), + ("C", "C"), + ("CB", "C"), + ("OD1", "O"), + ("HD21", "H"), + ("1H", "H"), + # ("CL", "CL"), # This is an edge case where inference is difficult + ("HE", "H"), + ("SD", "S"), + ("NA", "N"), + ("NX", "N"), + ("BE", "BE"), + ("BEA", "BE"), + ("K", "K"), + ("KA", "K"), + ("QWERT", ""), + ], ) def test_infer_elements(name, expected): """ Check if elements are correctly guessed based on known examples. 
""" - assert struc.infer_elements([name])[0] == expected \ No newline at end of file + assert struc.infer_elements([name])[0] == expected diff --git a/tests/structure/test_residues.py b/tests/structure/test_residues.py index c3597a73f..9d6927118 100644 --- a/tests/structure/test_residues.py +++ b/tests/structure/test_residues.py @@ -2,12 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +from os.path import join +import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -import numpy as np -from os.path import join from ..util import data_dir -import pytest @pytest.fixture @@ -17,11 +17,11 @@ def array(): def test_apply_residue_wise(array): data = struc.apply_residue_wise(array, np.ones(len(array)), np.sum) - assert data.tolist() == [len(array[array.res_id == i]) - for i in range(1, 21)] + assert data.tolist() == [len(array[array.res_id == i]) for i in range(1, 21)] + def test_spread_residue_wise(array): - input_data = np.arange(1,21) + input_data = np.arange(1, 21) output_data = struc.spread_residue_wise(array, input_data) assert output_data.tolist() == array.res_id.tolist() @@ -41,8 +41,7 @@ def test_get_residue_starts_for(array): np.random.seed(0) indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE) ref_starts = np.array( - [np.where(mask)[0][0] for mask - in struc.get_residue_masks(array, indices)] + [np.where(mask)[0][0] for mask in struc.get_residue_masks(array, indices)] ) test_starts = struc.get_residue_starts_for(array, indices) assert test_starts.tolist() == ref_starts.tolist() @@ -51,16 +50,32 @@ def test_get_residue_starts_for(array): def test_get_residues(array): ids, names = struc.get_residues(array) assert ids.tolist() == list(range(1, 21)) - assert names.tolist() == ["ASN","LEU","TYR","ILE","GLN","TRP","LEU","LYS", - "ASP","GLY","GLY","PRO","SER","SER","GLY","ARG", - "PRO","PRO","PRO","SER"] + assert names.tolist() == [ + "ASN", + "LEU", + 
"TYR", + "ILE", + "GLN", + "TRP", + "LEU", + "LYS", + "ASP", + "GLY", + "GLY", + "PRO", + "SER", + "SER", + "GLY", + "ARG", + "PRO", + "PRO", + "PRO", + "SER", + ] assert len(ids) == struc.get_residue_count(array) def test_residue_iter(array): - centroid = [struc.centroid(res).tolist() - for res in struc.residue_iter(array)] - ref_centroid = struc.apply_residue_wise( - array, array.coord, np.average, axis=0 - ) - assert centroid == ref_centroid.tolist() \ No newline at end of file + centroid = [struc.centroid(res).tolist() for res in struc.residue_iter(array)] + ref_centroid = struc.apply_residue_wise(array, array.coord, np.average, axis=0) + assert centroid == ref_centroid.tolist() diff --git a/tests/structure/test_sasa.py b/tests/structure/test_sasa.py index 12827f533..944f7a3c1 100644 --- a/tests/structure/test_sasa.py +++ b/tests/structure/test_sasa.py @@ -3,58 +3,56 @@ # information. from os.path import join -import pytest import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx -from ..util import data_dir, cannot_import +from ..util import cannot_import, data_dir # Ignore warning about dummy unit cell vector @pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"]) def test_single(pdb_id): - file_name = join(data_dir("structure"), pdb_id+".pdb") + file_name = join(data_dir("structure"), pdb_id + ".pdb") # Single atom SASA, compare with MDTraj file = pdb.PDBFile.read(file_name) array = file.get_structure(model=1) sasa = struc.sasa(array, vdw_radii="Single", point_number=5000) - from biotite.structure.info.radii import _SINGLE_RADII as radii import mdtraj + from biotite.structure.info.radii import _SINGLE_RADII as radii + # Use the same atom radii - radii = 
{element.capitalize() : radius / 10 - for element, radius in radii.items()} + radii = {element.capitalize(): radius / 10 for element, radius in radii.items()} traj = mdtraj.load(file_name) # Conversion from nm^2 to A^2 - sasa_exp = mdtraj.shrake_rupley( - traj, change_radii=radii, n_sphere_points=5000 - )[0] * 100 - + sasa_exp = ( + mdtraj.shrake_rupley(traj, change_radii=radii, n_sphere_points=5000)[0] * 100 + ) # Assert that more than 90% of atoms # have less than 10% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1e-1) - ) / len(sasa) > 0.9 + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1e-1)) / len(sasa) + > 0.9 + ) # Assert that more than 98% of atoms # have less than 1% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-2, atol=1e-1) - ) / len(sasa) > 0.98 + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-2, atol=1e-1)) / len(sasa) + > 0.98 + ) @pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"]) def test_coarse_grained(pdb_id): # Multi atom SASA (ProtOr), compare with single atom SASA # on residue level - file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id+".bcif")) + file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id + ".bcif")) array = pdbx.get_structure(file, model=1) array = array[struc.filter_amino_acids(array)] sasa = struc.apply_residue_wise( @@ -66,11 +64,13 @@ def test_coarse_grained(pdb_id): # Assert that more than 90% of atoms # have less than 10% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1) - ) / len(sasa) > 0.9 + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1)) / len(sasa) + > 0.9 + ) # Assert that more than 98% of atoms # have less than 40% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1) - ) / len(sasa) > 0.98 \ No newline at end of file + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, 
rtol=4e-1, atol=1)) / len(sasa) + > 0.98 + ) diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py index 098958824..934176045 100644 --- a/tests/structure/test_sequence.py +++ b/tests/structure/test_sequence.py @@ -5,16 +5,14 @@ import glob from os.path import join import pytest -import biotite.structure as struc import biotite.sequence as seq import biotite.sequence.align as align +import biotite.structure as struc import biotite.structure.io.pdbx as pdbx from ..util import data_dir -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("structure"), "*.bcif")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "*.bcif"))) def test_pdbx_sequence_consistency(path): """ Check if sequences created with :func:`to_sequence()` are equal to @@ -61,8 +59,7 @@ def _find_best_match(sequence, ref_sequences): else: matrix = align.SubstitutionMatrix.std_nucleotide_matrix() alignment = align.align_optimal( - sequence, ref_sequence, matrix, - terminal_penalty=False, max_number=1 + sequence, ref_sequence, matrix, terminal_penalty=False, max_number=1 )[0] # The 'shortest' identity is 1.0, if every residue in the # test sequence is aligned to an identical residue @@ -70,4 +67,4 @@ def _find_best_match(sequence, ref_sequences): if identity > best_identity: best_alignment = alignment best_identity = identity - return best_alignment, best_identity \ No newline at end of file + return best_alignment, best_identity diff --git a/tests/structure/test_sse.py b/tests/structure/test_sse.py index 30b6d75cf..160ba26d7 100644 --- a/tests/structure/test_sse.py +++ b/tests/structure/test_sse.py @@ -6,9 +6,9 @@ from os.path import join import numpy as np import pytest +import biotite.sequence.io.fasta as fasta import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.sequence.io.fasta as fasta from ..util import data_dir @@ -23,18 +23,14 @@ def test_sse(): matches = 0 total = 0 - ref_psea_file = 
fasta.FastaFile.read( - join(data_dir("structure"), "psea.fasta") - ) + ref_psea_file = fasta.FastaFile.read(join(data_dir("structure"), "psea.fasta")) for pdb_id in ref_psea_file: ref_sse = np.array(list(ref_psea_file[pdb_id])) atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read( - join(data_dir("structure"), f"{pdb_id}.bcif") - ), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{pdb_id}.bcif")), + model=1, ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] if atoms.array_length() == 0: @@ -51,9 +47,9 @@ def test_sse(): np.random.seed(0) -@pytest.mark.parametrize( - "discont_pos", np.random.randint(2, 105, size=100) -) + + +@pytest.mark.parametrize("discont_pos", np.random.randint(2, 105, size=100)) def test_sse_discontinuity(discont_pos): """ Check if discontinuities are properly handled by inserting a @@ -61,8 +57,7 @@ def test_sse_discontinuity(discont_pos): proximity becomes 'coil'. """ atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")), model=1 ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] @@ -72,7 +67,7 @@ def test_sse_discontinuity(discont_pos): assert len(struc.check_res_id_continuity(atoms)) == 0 # Introduce discontinuity res_starts = struc.get_residue_starts(atoms) - atoms.res_id[res_starts[discont_pos]:] += 1 + atoms.res_id[res_starts[discont_pos] :] += 1 test_sse = struc.annotate_sse(atoms) assert len(test_sse) == len(ref_sse) @@ -89,9 +84,7 @@ def test_sse_discontinuity(discont_pos): assert (test_sse[discont_proximity] == "c").all() -@pytest.mark.parametrize( - "file_name", glob.glob(join(data_dir("structure"), "*.bcif")) -) +@pytest.mark.parametrize("file_name", glob.glob(join(data_dir("structure"), "*.bcif"))) def test_sse_non_peptide(file_name): """ Test whether only amino acids get SSE annotated. 
@@ -101,9 +94,7 @@ def test_sse_non_peptide(file_name): # Special case for PDB 5EIL: # The residue BP5 is an amino acid, but has no CA # -> rename analogous atom - atoms.atom_name[ - (atoms.res_name == "BP5") & (atoms.atom_name == "C13") - ] = "CA" + atoms.atom_name[(atoms.res_name == "BP5") & (atoms.atom_name == "C13")] = "CA" sse = struc.annotate_sse(atoms) peptide_mask = struc.filter_amino_acids(atoms) @@ -111,4 +102,4 @@ def test_sse_non_peptide(file_name): peptide_mask = peptide_mask[struc.get_residue_starts(atoms)] assert np.all(np.isin(sse[peptide_mask], ["a", "b", "c"])) - assert np.all(sse[~peptide_mask] == "") \ No newline at end of file + assert np.all(sse[~peptide_mask] == "") diff --git a/tests/structure/test_superimpose.py b/tests/structure/test_superimpose.py index fd9514734..9c5f5dd7a 100755 --- a/tests/structure/test_superimpose.py +++ b/tests/structure/test_superimpose.py @@ -9,7 +9,6 @@ import pytest import biotite.structure as struc import biotite.structure.io as strucio -import biotite.structure as struc from biotite.structure.superimpose import _multi_matmul as multi_matmul from ..util import data_dir @@ -30,7 +29,7 @@ def test_transform_as_matrix(): # This is not really a rotation matrix, # but the same maths apply rotation=np.random.rand(N_MODELS, 3, 3), - target_translation=np.random.rand(N_MODELS, 3) + target_translation=np.random.rand(N_MODELS, 3), ) ref_coord = transform.apply(orig_coord) @@ -41,15 +40,13 @@ def test_transform_as_matrix(): test_coord_4 = multi_matmul(transform.as_matrix(), orig_coord_4) test_coord = test_coord_4[..., :3] - assert test_coord.flatten().tolist() \ - == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6) + assert test_coord.flatten().tolist() == pytest.approx( + ref_coord.flatten().tolist(), abs=1e-6 + ) @pytest.mark.parametrize( - "seed, multi_model", itertools.product( - range(10), - [False, True] - ) + "seed, multi_model", itertools.product(range(10), [False, True]) ) def test_restoration(seed, 
multi_model): """ @@ -70,8 +67,9 @@ def test_restoration(seed, multi_model): test_coord = _transform_random_affine(ref_coord) test_coord, _ = struc.superimpose(ref_coord, test_coord) - assert test_coord.flatten().tolist() \ - == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6) + assert test_coord.flatten().tolist() == pytest.approx( + ref_coord.flatten().tolist(), abs=1e-6 + ) def test_rotation_matrix(): @@ -83,28 +81,23 @@ def test_rotation_matrix(): N_COORD = 100 # A rotation matrix that rotates 90 degrees around the z-axis - ref_rotation = np.array([ - [0, -1, 0], - [1, 0, 0], - [0, 0, 1] - ]) + ref_rotation = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) np.random.seed(0) original_coord = np.random.rand(N_COORD, 3) # Rotate about 90 degrees around z-axis - rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi/2)) + rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi / 2)) _, transform = struc.superimpose(rotated_coord, original_coord) test_rotation = transform.rotation - assert test_rotation.flatten().tolist() \ - == pytest.approx(ref_rotation.flatten().tolist(), abs=1e-6) + assert test_rotation.flatten().tolist() == pytest.approx( + ref_rotation.flatten().tolist(), abs=1e-6 + ) @pytest.mark.parametrize( - "path, coord_only", itertools.product( - glob.glob(join(data_dir("structure"), "*.bcif")), - [False, True] - ) + "path, coord_only", + itertools.product(glob.glob(join(data_dir("structure"), "*.bcif")), [False, True]), ) def test_superimposition_array(path, coord_only): """ @@ -116,16 +109,14 @@ def test_superimposition_array(path, coord_only): fixed = strucio.load_structure(path, model=1) mobile = fixed.copy() - mobile = struc.rotate(mobile, (1,2,3)) - mobile = struc.translate(mobile, (1,2,3)) + mobile = struc.rotate(mobile, (1, 2, 3)) + mobile = struc.translate(mobile, (1, 2, 3)) if coord_only: fixed = fixed.coord mobile = mobile.coord - fitted, transformation = struc.superimpose( - fixed, mobile - ) + fitted, transformation = 
struc.superimpose(fixed, mobile) if coord_only: assert isinstance(fitted, np.ndarray) @@ -150,7 +141,7 @@ def test_superimposition_stack(ca_only): fixed = stack[0] mobile = stack[1:] if ca_only: - mask = (mobile.atom_name == "CA") + mask = mobile.atom_name == "CA" else: mask = None @@ -160,15 +151,13 @@ def test_superimposition_stack(ca_only): # The superimpositions are better for most cases than the # superimpositions in the structure file # -> Use average - assert np.mean(struc.rmsd(fixed, fitted)) \ - < np.mean(struc.rmsd(fixed, mobile)) + assert np.mean(struc.rmsd(fixed, fitted)) < np.mean(struc.rmsd(fixed, mobile)) else: # The superimpositions are better than the superimpositions # in the structure file assert (struc.rmsd(fixed, fitted) < struc.rmsd(fixed, mobile)).all() - @pytest.mark.parametrize("seed", range(5)) def test_masked_superimposition(seed): """ @@ -188,25 +177,19 @@ def test_masked_superimposition(seed): # The distance between the atom in both models should not be # already 0 prior to superimposition - assert struc.distance(fixed[mask], mobile[mask])[0] \ - != pytest.approx(0, abs=5e-4) + assert struc.distance(fixed[mask], mobile[mask])[0] != pytest.approx(0, abs=5e-4) - fitted, transformation = struc.superimpose( - fixed, mobile, mask - ) + fitted, transformation = struc.superimpose(fixed, mobile, mask) - assert struc.distance(fixed[mask], fitted[mask])[0] \ - == pytest.approx(0, abs=5e-4) + assert struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4) fitted = transformation.apply(mobile) - struc.distance(fixed[mask], fitted[mask])[0] \ - == pytest.approx(0, abs=5e-4) + struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4) @pytest.mark.parametrize( - "single_model, single_atom", - itertools.product([False, True], [False, True]) + "single_model, single_atom", itertools.product([False, True], [False, True]) ) def test_input_shapes(single_model, single_atom): """ @@ -258,24 +241,25 @@ def 
test_outlier_detection(seed): superimposed_coord, _, anchors = struc.superimpose_without_outliers( # Increase the threshold a bit, # to ensure that no inlier is classified as outlier - fixed_coord, mobile_coord, outlier_threshold=3.0 + fixed_coord, + mobile_coord, + outlier_threshold=3.0, ) test_outlier_mask = np.full(N_COORD, True) test_outlier_mask[anchors] = False assert test_outlier_mask.tolist() == ref_outlier_mask.tolist() # Without the outliers, the RMSD should be in the noise range - assert struc.rmsd( - fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask] - ) < NOISE + assert ( + struc.rmsd( + fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask] + ) + < NOISE + ) @pytest.mark.parametrize( - "multi_model, coord_only", - itertools.product( - [False, True], - [False, True] - ) + "multi_model, coord_only", itertools.product([False, True], [False, True]) ) def test_superimpose_without_outliers_inputs(multi_model, coord_only): """ @@ -289,9 +273,7 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only): if coord_only: atoms = atoms.coord - superimposed, transform, _ = struc.superimpose_without_outliers( - atoms, atoms - ) + superimposed, transform, _ = struc.superimpose_without_outliers(atoms, atoms) assert type(superimposed) == type(atoms) assert superimposed.shape == atoms.shape @@ -313,7 +295,7 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only): ("1aki", "A", True), ("4gxy", "A", False), # is a nucleic acid ("4gxy", "A", True), - ] + ], ) def test_superimpose_homologs(pdb_id, chain_id, as_stack): """ @@ -342,8 +324,10 @@ def test_superimpose_homologs(pdb_id, chain_id, as_stack): ) # Check if corresponding residues were superimposed - assert fixed_atoms.res_id[fix_anchors].tolist() \ + assert ( + fixed_atoms.res_id[fix_anchors].tolist() == mobile_atoms.res_id[mob_anchors].tolist() + ) # If a stack, it only contains one model if as_stack: fixed_atoms = fixed_atoms[0] @@ -355,15 +339,14 @@ 
def test_superimpose_homologs(pdb_id, chain_id, as_stack): def _transform_random_affine(coord): coord = struc.translate(coord, np.random.rand(3)) - coord = struc.rotate(coord, np.random.uniform(low=0, high=2*np.pi, size=3)) + coord = struc.rotate(coord, np.random.uniform(low=0, high=2 * np.pi, size=3)) return coord def _delete_random_residues(atoms, p_conservation): residue_starts = struc.get_residue_starts(atoms) conserved_residue_starts = np.random.choice( - residue_starts, size=int(p_conservation * len(residue_starts)), - replace=False + residue_starts, size=int(p_conservation * len(residue_starts)), replace=False ) conservation_mask = np.any( struc.get_residue_masks(atoms, conserved_residue_starts), axis=0 diff --git a/tests/structure/test_trajectory.py b/tests/structure/test_trajectory.py index e4a9a1ba3..dee46db46 100644 --- a/tests/structure/test_trajectory.py +++ b/tests/structure/test_trajectory.py @@ -2,33 +2,27 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
-from tempfile import NamedTemporaryFile import itertools -import glob -from os.path import join, basename +from os.path import join +from tempfile import NamedTemporaryFile import numpy as np import pytest import biotite.structure as struc import biotite.structure.io as strucio -import biotite.structure.io.xtc as xtc -import biotite.structure.io.trr as trr -import biotite.structure.io.tng as tng import biotite.structure.io.dcd as dcd import biotite.structure.io.netcdf as netcdf -from ..util import data_dir, cannot_import +import biotite.structure.io.tng as tng +import biotite.structure.io.trr as trr +import biotite.structure.io.xtc as xtc +from ..util import cannot_import, data_dir -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize("format", ["trr", "xtc", "tng", "dcd", "netcdf"]) def test_array_conversion(format): - template = strucio.load_structure( - join(data_dir("structure"), "1l2y.bcif") - )[0] + template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0] # Add fake box - template.box = np.diag([1,2,3]) + template.box = np.diag([1, 2, 3]) if format == "trr": traj_file_cls = trr.TRRFile if format == "xtc": @@ -39,9 +33,7 @@ def test_array_conversion(format): traj_file_cls = dcd.DCDFile if format == "netcdf": traj_file_cls = netcdf.NetCDFFile - traj_file = traj_file_cls.read( - join(data_dir("structure"), f"1l2y.{format}") - ) + traj_file = traj_file_cls.read(join(data_dir("structure"), f"1l2y.{format}")) ref_array = traj_file.get_structure(template) traj_file = traj_file_cls() @@ -58,10 +50,7 @@ def test_array_conversion(format): assert ref_array.coord == pytest.approx(array.coord, abs=1e-2) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "format, start, stop, 
step, chunk_size", itertools.product( @@ -69,8 +58,8 @@ def test_array_conversion(format): [None, 2], [None, 17], [None, 2], - [None, 3] - ) + [None, 3], + ), ) def test_bcif_consistency(format, start, stop, step, chunk_size): if format == "netcdf" and stop is not None and step is not None: @@ -97,7 +86,10 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): traj_file_cls = netcdf.NetCDFFile traj_file = traj_file_cls.read( join(data_dir("structure"), f"1l2y.{format}"), - start, stop, step, chunk_size=chunk_size + start, + stop, + step, + chunk_size=chunk_size, ) test_traj = traj_file.get_structure(template) test_traj_time = traj_file.get_time() @@ -108,10 +100,9 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): # Shift to ensure time starts at 0 test_traj_time -= 1 start = start if start is not None else 0 - stop = stop if stop is not None else 38 # 38 models in 1l2y + stop = stop if stop is not None else 38 # 38 models in 1l2y step = step if step is not None else 1 - assert test_traj_time.astype(int).tolist() \ - == list(range(start, stop, step)) + assert test_traj_time.astype(int).tolist() == list(range(start, stop, step)) assert test_traj.stack_depth() == ref_traj.stack_depth() # 1l2y has no box @@ -121,10 +112,7 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): assert test_traj.coord == pytest.approx(ref_traj.coord, abs=1e-2) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "format, start, stop, step, stack_size", itertools.product( @@ -132,8 +120,8 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): [None, 2], [None, 17], [None, 2], - [None, 2, 3] - ) + [None, 2, 3], + ), ) def test_read_iter(format, start, stop, step, stack_size): """ @@ -176,7 +164,7 @@ def test_read_iter(format, start, stop, step, stack_size): # Convert list to NumPy array 
combination_func = np.stack if stack_size is None else np.concatenate - test_coord =combination_func(test_coord) + test_coord = combination_func(test_coord) if test_box[0] is not None: test_box = combination_func(test_box) else: @@ -197,10 +185,7 @@ def test_read_iter(format, start, stop, step, stack_size): assert test_time.tolist() == ref_time.tolist() -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "format, start, stop, step, stack_size", itertools.product( @@ -208,8 +193,8 @@ def test_read_iter(format, start, stop, step, stack_size): [None, 2], [None, 17], [None, 2], - [None, 2, 3] - ) + [None, 2, 3], + ), ) def test_read_iter_structure(format, start, stop, step, stack_size): """ @@ -241,9 +226,12 @@ def test_read_iter_structure(format, start, stop, step, stack_size): traj_file = traj_file_cls.read(file_name, start, stop, step) ref_traj = traj_file.get_structure(template) - frames = [frame for frame in traj_file_cls.read_iter_structure( - file_name, template, start, stop, step, stack_size=stack_size - )] + frames = [ + frame + for frame in traj_file_cls.read_iter_structure( + file_name, template, start, stop, step, stack_size=stack_size + ) + ] if stack_size is None: assert isinstance(frames[0], struc.AtomArray) @@ -255,10 +243,7 @@ def test_read_iter_structure(format, start, stop, step, stack_size): assert test_traj == ref_traj -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) +@pytest.mark.skipif(cannot_import("mdtraj"), reason="MDTraj is not installed") @pytest.mark.parametrize( "format, n_models, n_atoms, include_box, include_time", itertools.product( @@ -267,7 +252,7 @@ def test_read_iter_structure(format, start, stop, step, stack_size): [1, 1000], [False, True], [False, True], - ) + ), ) def test_write_iter(format, n_models, n_atoms, include_box, include_time): """ @@ -297,7 
+282,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): # time is evenly spaced for TNG compatibility time = np.linspace(0, 10, n_models) if include_time else None - ref_file = NamedTemporaryFile("w+b") traj_file = traj_file_cls() traj_file.set_coord(coord) @@ -311,7 +295,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): ref_time = traj_file.get_time() ref_file.close() - test_file = NamedTemporaryFile("w+b") traj_file_cls.write_iter(test_file.name, coord, box, time) @@ -321,9 +304,8 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): test_time = traj_file.get_time() test_file.close() - assert np.allclose(test_coord, ref_coord, atol=1e-2) if include_box: assert np.allclose(test_box, ref_box, atol=1e-2) if include_time: - assert np.allclose(test_time, ref_time, atol=1e-2) \ No newline at end of file + assert np.allclose(test_time, ref_time, atol=1e-2) diff --git a/tests/structure/test_transform.py b/tests/structure/test_transform.py index 12c42aa0b..3824498c8 100644 --- a/tests/structure/test_transform.py +++ b/tests/structure/test_transform.py @@ -13,8 +13,8 @@ @pytest.fixture( params=itertools.product( - [1, 2, 3], # ndim - [False, True] # as_coord + [1, 2, 3], # ndim + [False, True], # as_coord ) ) def input_atoms(request): @@ -28,7 +28,7 @@ def input_atoms(request): atoms = atoms[0] elif ndim == 1: # Only one atom - atoms = atoms[0,0] + atoms = atoms[0, 0] if as_coord: return atoms.coord @@ -62,13 +62,11 @@ def test_translate(input_atoms, ndim, as_list, random_seed): assert type(restored) == type(input_atoms) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) @pytest.mark.parametrize("as_list", [False, True]) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", 
[0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) @pytest.mark.parametrize("centered", [False, True]) def test_rotate(input_atoms, as_list, axis, random_seed, centered): @@ -78,7 +76,7 @@ def test_rotate(input_atoms, as_list, axis, random_seed, centered): """ np.random.seed(random_seed) angles = np.zeros(3) - angles[axis] = np.random.rand() * 2*np.pi + angles[axis] = np.random.rand() * 2 * np.pi neg_angles = -angles if as_list: angles = angles.tolist() @@ -91,18 +89,16 @@ def test_rotate(input_atoms, as_list, axis, random_seed, centered): assert type(restored) == type(input_atoms) assert struc.coord(restored).shape == struc.coord(input_atoms).shape print(np.max(np.abs(struc.coord(restored) - struc.coord(input_atoms)))) - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) if centered and struc.coord(input_atoms).ndim > 1: assert np.allclose( struc.centroid(restored), struc.centroid(input_atoms), atol=1e-5 ) -@pytest.mark.parametrize("x", [0, 2*np.pi]) -@pytest.mark.parametrize("y", [0, 2*np.pi]) -@pytest.mark.parametrize("z", [0, 2*np.pi]) +@pytest.mark.parametrize("x", [0, 2 * np.pi]) +@pytest.mark.parametrize("y", [0, 2 * np.pi]) +@pytest.mark.parametrize("z", [0, 2 * np.pi]) @pytest.mark.parametrize("centered", [False, True]) def test_rotate_360(input_atoms, x, y, z, centered): """ @@ -114,9 +110,7 @@ def test_rotate_360(input_atoms, x, y, z, centered): assert type(rotated) == type(input_atoms) assert struc.coord(rotated).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(rotated), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5) if centered and struc.coord(input_atoms).ndim > 1: assert np.allclose( struc.centroid(rotated), struc.centroid(input_atoms), atol=1e-5 @@ -129,7 +123,7 @@ def test_rotate_known(ndim): Rotate a vector at 
the Y-axis about the X-axis by 90 degrees and expect a rotated vector at the Z-axis. """ - shape = (1,) * (ndim-1) + (3,) + shape = (1,) * (ndim - 1) + (3,) vector = np.zeros(shape) vector[...] = [0, 1, 0] @@ -143,7 +137,7 @@ def test_rotate_known(ndim): assert np.allclose(test_rotated, exp_rotated, atol=1e-5) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) def test_rotate_measure(axis, random_seed): """ @@ -166,8 +160,7 @@ def test_rotate_measure(axis, random_seed): test_angle = struc.angle(rotated, 0, input_coord) # Vector length should be unchanged - assert np.linalg.norm(rotated) \ - == pytest.approx(np.linalg.norm(input_coord)) + assert np.linalg.norm(rotated) == pytest.approx(np.linalg.norm(input_coord)) assert test_angle == pytest.approx(ref_angle) @@ -193,12 +186,10 @@ def test_rotate_about_axis(input_atoms, as_list, use_support, random_seed): assert type(restored) == type(input_atoms) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) def test_rotate_about_axis_consistency(input_atoms, axis, random_seed): """ @@ -215,13 +206,15 @@ def test_rotate_about_axis_consistency(input_atoms, axis, random_seed): rot_axis = np.zeros(3) # Length of axis should be irrelevant rot_axis[axis] = np.random.rand() - test_rotated = struc.rotate_about_axis(input_atoms, rot_axis, angle,) + test_rotated = struc.rotate_about_axis( + input_atoms, + rot_axis, + angle, + ) assert type(test_rotated) == type(ref_rotated) assert struc.coord(test_rotated).shape == struc.coord(ref_rotated).shape - assert np.allclose( - 
struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5 - ) + assert np.allclose(struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5) @pytest.mark.parametrize("random_seed", np.arange(5)) @@ -233,26 +226,27 @@ def test_rotate_about_axis_360(input_atoms, random_seed, use_support): """ np.random.seed(random_seed) axis = np.random.rand(3) - support = np.random.rand(3) if use_support else None + support = np.random.rand(3) if use_support else None - rotated = struc.rotate_about_axis(input_atoms, axis, 2*np.pi, support) + rotated = struc.rotate_about_axis(input_atoms, axis, 2 * np.pi, support) assert type(rotated) == type(input_atoms) assert struc.coord(rotated).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(rotated), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5) @pytest.mark.parametrize("as_list", [False, True]) -@pytest.mark.parametrize("order", ( - np.array([0, 1, 2]), - np.array([0, 2, 1]), - np.array([1, 0, 2]), - np.array([2, 0, 1]), - np.array([2, 1, 0]), - np.array([1, 2, 0]), -)) +@pytest.mark.parametrize( + "order", + ( + np.array([0, 1, 2]), + np.array([0, 2, 1]), + np.array([1, 0, 2]), + np.array([2, 0, 1]), + np.array([2, 1, 0]), + np.array([1, 2, 0]), + ), +) def test_orient_principal_components(input_atoms, as_list, order): """ Orient atoms such that the variance in each axis is greatest @@ -295,8 +289,8 @@ def test_align_vectors(input_atoms, as_list, use_support, random_seed): source_direction = np.random.rand(3) target_direction = np.random.rand(3) if use_support: - source_position = np.random.rand(3) - target_position = np.random.rand(3) + source_position = np.random.rand(3) + target_position = np.random.rand(3) else: source_position = None target_position = None @@ -310,20 +304,22 @@ def test_align_vectors(input_atoms, as_list, use_support, random_seed): transformed = struc.align_vectors( input_atoms, - source_direction, 
target_direction, - source_position, target_position + source_direction, + target_direction, + source_position, + target_position, ) restored = struc.align_vectors( transformed, - target_direction, source_direction, - target_position, source_position + target_direction, + source_direction, + target_position, + source_position, ) assert type(restored) == type(input_atoms) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) def test_align_vectors_non_vector_inputs(input_atoms): diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 6b6792b69..1d4ddd0c7 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -5,15 +5,14 @@ __author__ = "Patrick Kunzmann" import doctest -from os.path import join import tempfile from importlib import import_module +from os.path import join import numpy as np import pytest -import biotite.structure.io as strucio import biotite.structure as struc -from .util import is_not_installed, cannot_import, cannot_connect_to - +import biotite.structure.io as strucio +from .util import cannot_connect_to, cannot_import, is_not_installed NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/" RCSB_URL = "https://www.rcsb.org/" @@ -23,129 +22,93 @@ # Keep test parameters in separate variable to generate IDs from them TEST_PARAMETERS = [ - pytest.param( - "biotite", - [] - ), - pytest.param( - "biotite.sequence", - [] - ), - pytest.param( - "biotite.sequence.align", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.phylo", - ["biotite.sequence"] - ), + pytest.param("biotite", []), + pytest.param("biotite.sequence", []), + pytest.param("biotite.sequence.align", ["biotite.sequence"]), + pytest.param("biotite.sequence.phylo", ["biotite.sequence"]), pytest.param( "biotite.sequence.graphics", ["biotite.sequence"], - marks = pytest.mark.skipif( + 
marks=pytest.mark.skipif( cannot_import("matplotlib"), reason="Matplotlib is not installed" - ) - ), - pytest.param( - "biotite.sequence.io", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.io.fasta", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.io.fastq", - ["biotite.sequence"] + ), ), + pytest.param("biotite.sequence.io", ["biotite.sequence"]), + pytest.param("biotite.sequence.io.fasta", ["biotite.sequence"]), + pytest.param("biotite.sequence.io.fastq", ["biotite.sequence"]), pytest.param( "biotite.sequence.io.genbank", ["biotite.sequence", "biotite.database.entrez"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available" - ) + ), ), pytest.param( "biotite.sequence.io.gff", ["biotite.sequence", "biotite.sequence.io.fasta"], - marks = pytest.mark.filterwarnings("ignore:") + marks=pytest.mark.filterwarnings("ignore:"), ), pytest.param( - "biotite.structure", - ["biotite.structure.io", "biotite.structure.info"] + "biotite.structure", ["biotite.structure.io", "biotite.structure.info"] ), pytest.param( "biotite.structure.graphics", ["biotite.structure"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_import("matplotlib"), reason="Matplotlib is not installed" ), ), + pytest.param("biotite.structure.io", ["biotite.structure"]), + pytest.param("biotite.structure.io.pdb", ["biotite.structure", "biotite"]), + pytest.param("biotite.structure.io.pdbx", ["biotite.structure"]), pytest.param( - "biotite.structure.io", - ["biotite.structure"] - ), - pytest.param( - "biotite.structure.io.pdb", - ["biotite.structure", "biotite"] - ), - pytest.param( - "biotite.structure.io.pdbx", - ["biotite.structure"] + "biotite.structure.io.pdbqt", ["biotite.structure", "biotite.structure.info"] ), pytest.param( - "biotite.structure.io.pdbqt", - ["biotite.structure", "biotite.structure.info"] - ), - pytest.param( - "biotite.structure.io.mol", - ["biotite.structure", 
"biotite.structure.info"] - ), - pytest.param( - "biotite.structure.info", - ["biotite.structure"] + "biotite.structure.io.mol", ["biotite.structure", "biotite.structure.info"] ), + pytest.param("biotite.structure.info", ["biotite.structure"]), pytest.param( "biotite.database.entrez", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available" - ) + ), ), pytest.param( "biotite.database.rcsb", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" - ) + ), ), pytest.param( "biotite.database.uniprot", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(UNIPROT_URL), reason="UniProt is not available" - ) + ), ), pytest.param( "biotite.database.pubchem", ["biotite.structure.info"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available" - ) + ), ), pytest.param( "biotite.application", ["biotite.application.clustalo", "biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("clustalo"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.blast", [], ), # Do not test Muscle due to version clash - #pytest.param( + # pytest.param( # "biotite.application.muscle", # ["biotite.sequence"], # marks = pytest.mark.skipif( @@ -154,50 +117,52 @@ pytest.param( "biotite.application.clustalo", ["biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("clustalo"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.mafft", ["biotite.sequence"], - marks = pytest.mark.skipif( - is_not_installed("mafft"), reason="Software is not installed") + marks=pytest.mark.skipif( + is_not_installed("mafft"), reason="Software is not installed" ), + ), pytest.param( - "biotite.application.sra", ["biotite.sequence"], - marks = 
pytest.mark.skipif( - is_not_installed("fasterq-dump"), - reason="Software is not installed" - ) + "biotite.application.sra", + ["biotite.sequence"], + marks=pytest.mark.skipif( + is_not_installed("fasterq-dump"), reason="Software is not installed" + ), ), pytest.param( "biotite.application.tantan", ["biotite.sequence"], - marks = pytest.mark.skipif( - is_not_installed("tantan"), reason="Software is not installed") + marks=pytest.mark.skipif( + is_not_installed("tantan"), reason="Software is not installed" ), + ), pytest.param( "biotite.application.viennarna", ["biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("RNAfold") | is_not_installed("RNAplot"), - reason="Software is not installed" - ) + reason="Software is not installed", + ), ), pytest.param( "biotite.application.dssp", ["biotite.structure"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("mkdssp"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.autodock", ["biotite.structure", "biotite.structure.info"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("vina"), reason="Software is not installed" - ) + ), ), ] @@ -205,7 +170,7 @@ @pytest.mark.parametrize( "package_name, context_package_names", TEST_PARAMETERS, - ids=[param.values[0] for param in TEST_PARAMETERS] + ids=[param.values[0] for param in TEST_PARAMETERS], ) def test_doctest(package_name, context_package_names): """ @@ -214,18 +179,17 @@ def test_doctest(package_name, context_package_names): # Collect all attributes of this package and its subpackages # as globals for the doctests globs = {} - #The package itself is also used as context + # The package itself is also used as context for name in context_package_names + [package_name]: context_package = import_module(name) globs.update( - {attr : getattr(context_package, attr) - for attr in dir(context_package)} + {attr: getattr(context_package, attr) for attr in 
dir(context_package)} ) # Add fixed names for certain paths - globs["path_to_directory"] = tempfile.gettempdir() + globs["path_to_directory"] = tempfile.gettempdir() globs["path_to_structures"] = join(".", "tests", "structure", "data") - globs["path_to_sequences"] = join(".", "tests", "sequence", "data") + globs["path_to_sequences"] = join(".", "tests", "sequence", "data") # Add frequently used modules globs["np"] = np # Add frequently used objects @@ -245,14 +209,14 @@ def test_doctest(package_name, context_package_names): # More information below package = import_module(package_name) runner = doctest.DocTestRunner( - verbose = False, - optionflags = - doctest.ELLIPSIS | - doctest.REPORT_ONLY_FIRST_FAILURE | - doctest.NORMALIZE_WHITESPACE + verbose=False, + optionflags=doctest.ELLIPSIS + | doctest.REPORT_ONLY_FIRST_FAILURE + | doctest.NORMALIZE_WHITESPACE, ) for test in doctest.DocTestFinder(exclude_empty=False).find( - package, package.__name__, + package, + package.__name__, # It is necessary to set 'module' to 'False', as otherwise # Cython functions and classes would be falsely identified # as members of an external module by 'DocTestFinder._find()' @@ -263,7 +227,7 @@ def test_doctest(package_name, context_package_names): # ('__init__.py' modules) should only contain attributes, that # are part of the package itself. 
module=False, - extraglobs=globs + extraglobs=globs, ): runner.run(test) results = doctest.TestResults(runner.failures, runner.tries) @@ -271,4 +235,4 @@ def test_doctest(package_name, context_package_names): assert results.failed == 0 except AssertionError: print(f"Failing doctest in module {package}") - raise \ No newline at end of file + raise diff --git a/tests/test_init.py b/tests/test_init.py index 644659ce9..39e9617e2 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -5,9 +5,8 @@ __author__ = "Daniel Bauer" import biotite -import pytest def test_version_number(): version = biotite.__version__ - assert hasattr(biotite, "__version__") \ No newline at end of file + assert hasattr(biotite, "__version__") diff --git a/tests/test_modname.py b/tests/test_modname.py index 808625f4b..6145b2441 100644 --- a/tests/test_modname.py +++ b/tests/test_modname.py @@ -2,9 +2,9 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pkgutil -from os.path import dirname, join, isdir, splitext import importlib +import pkgutil +from os.path import dirname, join import pytest from .util import cannot_import @@ -18,10 +18,9 @@ def find_all_modules(package_name, src_dir): for _, module_name, is_package in pkgutil.iter_modules([src_dir]): full_module_name = f"{package_name}.{module_name}" if is_package: - module_names.extend(find_all_modules( - full_module_name, - join(src_dir, module_name) - )) + module_names.extend( + find_all_modules(full_module_name, join(src_dir, module_name)) + ) else: module_names.append(full_module_name) return module_names @@ -29,14 +28,11 @@ def find_all_modules(package_name, src_dir): @pytest.mark.skipif( cannot_import("matplotlib") | cannot_import("mdtraj"), - reason="Optional dependencies are not met" + reason="Optional dependencies are not met", ) @pytest.mark.parametrize( "module_name", - find_all_modules( - "biotite", - join(dirname(dirname(__file__)), "src", "biotite") - ) + 
find_all_modules("biotite", join(dirname(dirname(__file__)), "src", "biotite")), ) def test_module_name(module_name): """ @@ -55,4 +51,4 @@ def test_module_name(module_name): # Autogenerated module from hatch-vcs # # It contains no '__name__' attribute on purpose return - assert module.__name__ == package_name \ No newline at end of file + assert module.__name__ == package_name diff --git a/tests/test_repr.py b/tests/test_repr.py index 0d986a38f..f8bf319c4 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -2,51 +2,85 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from biotite.sequence import NucleotideSequence -from biotite.sequence import ProteinSequence -from biotite.sequence import Alphabet -from biotite.sequence import GeneralSequence -from biotite.sequence import LetterAlphabet -from biotite.sequence import Location -from biotite.sequence import Feature -from biotite.sequence import Annotation -from biotite.sequence import AnnotatedSequence -from biotite.sequence.align import Alignment -from biotite.structure import Atom import numpy as np -from numpy import float32, int32 # noqa: F401 -from biotite.sequence import CodonTable -from biotite.sequence.align import SubstitutionMatrix -from biotite.sequence import SequenceProfile import pytest +from numpy import float32, int32 # noqa: F401 +from biotite.sequence import ( + Alphabet, + AnnotatedSequence, + Annotation, + CodonTable, + Feature, + GeneralSequence, + LetterAlphabet, + Location, + NucleotideSequence, + ProteinSequence, + SequenceProfile, +) +from biotite.sequence.align import Alignment, SubstitutionMatrix +from biotite.structure import Atom __author__ = "Maximilian Greil" -@pytest.mark.parametrize("repr_object", - [NucleotideSequence("AACTGCTA"), - NucleotideSequence("AACTGCTA", ambiguous=True), - ProteinSequence("BIQTITE"), - Alphabet(["X", "Y", "Z"]), - GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), - LetterAlphabet(["X", "Y", "Z"]), - 
Location(98, 178), - Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}), - Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), - AnnotatedSequence(Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), - NucleotideSequence("AACTGCTA")), - Alignment([NucleotideSequence("CGTCAT", ambiguous=False), - NucleotideSequence("TCATGC", ambiguous=False)], - np.array([[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]]), - score=-20), - Atom([1, 2, 3], chain_id="A"), - CodonTable.default_table(), - SubstitutionMatrix(Alphabet(["foo", "bar"]), Alphabet([1, 2, 3]), - {("foo", 1): 5, ("foo", 2): 10, ("foo", 3): 15, ("bar", 1): 42, - ("bar", 2): 42, ("bar", 3): 42}), - SequenceProfile(np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]), - np.array([1, 1, 0, 0, 0, 0, 1, 1]), - Alphabet(["A", "C", "G", "T"]))]) +@pytest.mark.parametrize( + "repr_object", + [ + NucleotideSequence("AACTGCTA"), + NucleotideSequence("AACTGCTA", ambiguous=True), + ProteinSequence("BIQTITE"), + Alphabet(["X", "Y", "Z"]), + GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), + LetterAlphabet(["X", "Y", "Z"]), + Location(98, 178), + Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}), + Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), + AnnotatedSequence( + Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), + NucleotideSequence("AACTGCTA"), + ), + Alignment( + [ + NucleotideSequence("CGTCAT", ambiguous=False), + NucleotideSequence("TCATGC", ambiguous=False), + ], + np.array( + [[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]] + ), + score=-20, + ), + Atom([1, 2, 3], chain_id="A"), + CodonTable.default_table(), + SubstitutionMatrix( + Alphabet(["foo", "bar"]), + Alphabet([1, 2, 3]), + { + ("foo", 1): 5, + ("foo", 2): 10, + ("foo", 3): 15, + ("bar", 1): 42, + ("bar", 2): 42, + ("bar", 
3): 42, + }, + ), + SequenceProfile( + np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ), + np.array([1, 1, 0, 0, 0, 0, 1, 1]), + Alphabet(["A", "C", "G", "T"]), + ), + ], +) def test_repr(repr_object): assert eval(repr(repr_object)) == repr_object diff --git a/tests/test_version.py b/tests/test_version.py index 5f11daa2a..ec7bca6f9 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -6,4 +6,4 @@ def test_version(): """ Check if version imported from version.py is correct. """ - assert biotite.__version__ == version("biotite") \ No newline at end of file + assert biotite.__version__ == version("biotite") diff --git a/tests/util.py b/tests/util.py index e72cc5cb5..99cf24741 100644 --- a/tests/util.py +++ b/tests/util.py @@ -2,11 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join, dirname, realpath -import urllib.error -import urllib.request import importlib import shutil +import urllib.error +import urllib.request +from os.path import dirname, join, realpath def data_dir(subdir): @@ -16,6 +16,8 @@ def data_dir(subdir): ### Functions for conditional test skips ### tested_urls = {} + + def cannot_connect_to(url): if url not in tested_urls: try: @@ -25,8 +27,10 @@ def cannot_connect_to(url): tested_urls[url] = True return tested_urls[url] + def cannot_import(module): return importlib.util.find_spec(module) is None + def is_not_installed(program): - return shutil.which(program) is None \ No newline at end of file + return shutil.which(program) is None From 8b93aedd23b3f58b9963df51120884d7f0c290fb Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 3 Jul 2024 10:21:34 +0200 Subject: [PATCH 7/9] Replace relative imports by absolute imports --- src/biotite/application/autodock/app.py | 12 ++--- src/biotite/application/blast/alignment.py | 2 +- src/biotite/application/blast/webapp.py | 
16 +++--- src/biotite/application/clustalo/app.py | 8 +-- src/biotite/application/dssp/app.py | 8 +-- src/biotite/application/localapp.py | 7 ++- src/biotite/application/mafft/app.py | 6 +-- src/biotite/application/msaapp.py | 12 ++--- src/biotite/application/muscle/app3.py | 8 +-- src/biotite/application/muscle/app5.py | 6 +-- src/biotite/application/sra/app.py | 17 +++--- src/biotite/application/tantan/app.py | 10 ++-- src/biotite/application/util.py | 4 +- .../application/viennarna/rnaalifold.py | 12 ++--- src/biotite/application/viennarna/rnafold.py | 10 ++-- src/biotite/application/viennarna/rnaplot.py | 6 +-- src/biotite/application/viennarna/util.py | 2 +- src/biotite/application/webapp.py | 2 +- src/biotite/database/entrez/check.py | 2 +- src/biotite/database/entrez/download.py | 8 +-- src/biotite/database/entrez/query.py | 8 +-- src/biotite/database/pubchem/download.py | 6 +-- src/biotite/database/pubchem/query.py | 8 +-- src/biotite/database/rcsb/download.py | 2 +- src/biotite/database/rcsb/query.py | 4 +- src/biotite/database/uniprot/check.py | 2 +- src/biotite/database/uniprot/download.py | 2 +- src/biotite/database/uniprot/query.py | 2 +- src/biotite/file.py | 2 +- src/biotite/sequence/align/alignment.py | 2 +- src/biotite/sequence/align/cigar.py | 2 +- src/biotite/sequence/align/matrix.py | 2 +- src/biotite/sequence/align/statistics.py | 4 +- src/biotite/sequence/alphabet.py | 2 +- src/biotite/sequence/annotation.py | 2 +- src/biotite/sequence/codon.py | 2 +- src/biotite/sequence/graphics/alignment.py | 4 +- src/biotite/sequence/graphics/colorschemes.py | 2 +- src/biotite/sequence/graphics/features.py | 4 +- src/biotite/sequence/graphics/logo.py | 6 +-- src/biotite/sequence/graphics/plasmid.py | 4 +- src/biotite/sequence/io/fasta/convert.py | 6 +-- src/biotite/sequence/io/fasta/file.py | 2 +- src/biotite/sequence/io/fastq/convert.py | 2 +- src/biotite/sequence/io/fastq/file.py | 2 +- src/biotite/sequence/io/genbank/annotation.py | 4 +- 
src/biotite/sequence/io/genbank/file.py | 2 +- src/biotite/sequence/io/genbank/metadata.py | 2 +- src/biotite/sequence/io/genbank/sequence.py | 8 +-- src/biotite/sequence/io/general.py | 24 ++++----- src/biotite/sequence/io/gff/convert.py | 2 +- src/biotite/sequence/io/gff/file.py | 4 +- src/biotite/sequence/profile.py | 10 ++-- src/biotite/sequence/seqtypes.py | 6 +-- src/biotite/sequence/sequence.py | 4 +- src/biotite/structure/atoms.py | 4 +- src/biotite/structure/basepairs.py | 20 +++---- src/biotite/structure/box.py | 14 ++--- src/biotite/structure/chains.py | 2 +- src/biotite/structure/compare.py | 6 +-- src/biotite/structure/density.py | 2 +- src/biotite/structure/dotbracket.py | 6 +-- src/biotite/structure/filter.py | 12 +++-- src/biotite/structure/geometry.py | 12 ++--- src/biotite/structure/graphics/rna.py | 4 +- src/biotite/structure/hbond.py | 6 +-- src/biotite/structure/info/atoms.py | 4 +- src/biotite/structure/info/bonds.py | 4 +- src/biotite/structure/info/ccd.py | 2 +- src/biotite/structure/info/masses.py | 4 +- src/biotite/structure/info/misc.py | 2 +- src/biotite/structure/info/radii.py | 2 +- src/biotite/structure/info/standardize.py | 6 +-- src/biotite/structure/integrity.py | 4 +- src/biotite/structure/io/dcd/file.py | 4 +- src/biotite/structure/io/general.py | 52 +++++++++---------- src/biotite/structure/io/gro/file.py | 10 ++-- src/biotite/structure/io/mol/convert.py | 6 +-- src/biotite/structure/io/mol/ctab.py | 8 +-- src/biotite/structure/io/mol/mol.py | 11 ++-- src/biotite/structure/io/mol/sdf.py | 13 +++-- src/biotite/structure/io/netcdf/file.py | 4 +- src/biotite/structure/io/pdb/file.py | 22 ++++---- src/biotite/structure/io/pdbqt/file.py | 15 ++++-- src/biotite/structure/io/pdbx/bcif.py | 10 ++-- src/biotite/structure/io/pdbx/cif.py | 4 +- src/biotite/structure/io/pdbx/component.py | 2 +- src/biotite/structure/io/pdbx/convert.py | 33 +++++++----- src/biotite/structure/io/tng/file.py | 2 +- src/biotite/structure/io/trajfile.py | 4 +- 
src/biotite/structure/io/trr/file.py | 2 +- src/biotite/structure/io/xtc/file.py | 2 +- src/biotite/structure/mechanics.py | 4 +- src/biotite/structure/molecules.py | 4 +- src/biotite/structure/rdf.py | 10 ++-- src/biotite/structure/repair.py | 6 +-- src/biotite/structure/residues.py | 2 +- src/biotite/structure/sequence.py | 12 ++--- src/biotite/structure/sse.py | 10 ++-- src/biotite/structure/superimpose.py | 14 ++--- src/biotite/structure/transform.py | 6 +-- tests/application/test_autodock.py | 2 +- tests/application/test_blast.py | 2 +- tests/application/test_dssp.py | 2 +- tests/application/test_msa.py | 2 +- tests/application/test_rnaalifold.py | 2 +- tests/application/test_rnafold.py | 2 +- tests/application/test_rnaplot.py | 2 +- tests/application/test_tantan.py | 2 +- tests/database/test_entrez.py | 2 +- tests/database/test_pubchem.py | 2 +- tests/database/test_rcsb.py | 2 +- tests/database/test_uniprot.py | 2 +- tests/sequence/align/conftest.py | 2 +- tests/sequence/align/test_multiple.py | 2 +- tests/sequence/align/test_pairwise.py | 2 +- tests/sequence/test_annotation.py | 2 +- tests/sequence/test_fasta.py | 2 +- tests/sequence/test_fastq.py | 2 +- tests/sequence/test_genbank.py | 2 +- tests/sequence/test_generalio.py | 2 +- tests/sequence/test_gff.py | 2 +- tests/sequence/test_graphics.py | 2 +- tests/sequence/test_phylo.py | 2 +- tests/structure/test_basepairs.py | 2 +- tests/structure/test_bonds.py | 2 +- tests/structure/test_box.py | 2 +- tests/structure/test_celllist.py | 2 +- tests/structure/test_chains.py | 2 +- tests/structure/test_compare.py | 2 +- tests/structure/test_dotbracket.py | 2 +- tests/structure/test_filter.py | 2 +- tests/structure/test_generalio.py | 2 +- tests/structure/test_geometry.py | 2 +- tests/structure/test_gro.py | 2 +- tests/structure/test_hbond.py | 2 +- tests/structure/test_info.py | 2 +- tests/structure/test_integrity.py | 2 +- tests/structure/test_mechanics.py | 2 +- tests/structure/test_mol.py | 2 +- 
tests/structure/test_pdb.py | 2 +- tests/structure/test_pdbqt.py | 2 +- tests/structure/test_pdbx.py | 2 +- tests/structure/test_pseudoknots.py | 2 +- tests/structure/test_rdf.py | 2 +- tests/structure/test_repair.py | 2 +- tests/structure/test_residues.py | 2 +- tests/structure/test_sasa.py | 2 +- tests/structure/test_sequence.py | 2 +- tests/structure/test_sse.py | 2 +- tests/structure/test_superimpose.py | 2 +- tests/structure/test_trajectory.py | 2 +- tests/structure/test_transform.py | 2 +- tests/test_doctest.py | 2 +- tests/test_modname.py | 2 +- 155 files changed, 424 insertions(+), 380 deletions(-) diff --git a/src/biotite/application/autodock/app.py b/src/biotite/application/autodock/app.py index 9b3bf6d72..babd860ae 100644 --- a/src/biotite/application/autodock/app.py +++ b/src/biotite/application/autodock/app.py @@ -9,12 +9,12 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ...structure.bonds import find_connected -from ...structure.error import BadStructureError -from ...structure.io.pdbqt import PDBQTFile -from ...structure.residues import get_residue_masks, get_residue_starts_for -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.bonds import find_connected +from biotite.structure.error import BadStructureError +from biotite.structure.io.pdbqt import PDBQTFile +from biotite.structure.residues import get_residue_masks, get_residue_starts_for class VinaApp(LocalApp): diff --git a/src/biotite/application/blast/alignment.py b/src/biotite/application/blast/alignment.py index 251520d2e..85890df66 100644 --- a/src/biotite/application/blast/alignment.py +++ b/src/biotite/application/blast/alignment.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["BlastAlignment"] -from ...sequence.align.alignment import 
Alignment +from biotite.sequence.align.alignment import Alignment class BlastAlignment(Alignment): diff --git a/src/biotite/application/blast/webapp.py b/src/biotite/application/blast/webapp.py index 58a98d269..06dbda091 100644 --- a/src/biotite/application/blast/webapp.py +++ b/src/biotite/application/blast/webapp.py @@ -9,14 +9,14 @@ import time from xml.etree import ElementTree import requests -from ...sequence.align.alignment import Alignment -from ...sequence.io.fasta.convert import get_sequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.sequence import Sequence -from ..application import AppState, requires_state -from ..webapp import WebApp -from .alignment import BlastAlignment +from biotite.application.application import AppState, requires_state +from biotite.application.blast.alignment import BlastAlignment +from biotite.application.webapp import WebApp +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.io.fasta.convert import get_sequence +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.sequence import Sequence _ncbi_url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" diff --git a/src/biotite/application/clustalo/app.py b/src/biotite/application/clustalo/app.py index a24112461..228300984 100644 --- a/src/biotite/application/clustalo/app.py +++ b/src/biotite/application/clustalo/app.py @@ -8,10 +8,10 @@ from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.phylo.tree import Tree -from ..application import AppState, requires_state -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import cleanup_tempfile +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree 
class ClustalOmegaApp(MSAApp): diff --git a/src/biotite/application/dssp/app.py b/src/biotite/application/dssp/app.py index e4d84e87a..57e4ac0f3 100644 --- a/src/biotite/application/dssp/app.py +++ b/src/biotite/application/dssp/app.py @@ -8,10 +8,10 @@ from tempfile import NamedTemporaryFile import numpy as np -from ...structure.io.pdbx.cif import CIFFile -from ...structure.io.pdbx.convert import set_structure -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.io.pdbx.cif import CIFFile +from biotite.structure.io.pdbx.convert import set_structure class DsspApp(LocalApp): diff --git a/src/biotite/application/localapp.py b/src/biotite/application/localapp.py index d52dc0632..2fa7ee1d3 100644 --- a/src/biotite/application/localapp.py +++ b/src/biotite/application/localapp.py @@ -10,7 +10,12 @@ import copy from os import chdir, getcwd, remove from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired -from .application import Application, AppState, AppStateError, requires_state +from biotite.application.application import ( + Application, + AppState, + AppStateError, + requires_state, +) class LocalApp(Application, metaclass=abc.ABCMeta): diff --git a/src/biotite/application/mafft/app.py b/src/biotite/application/mafft/app.py index 562dcf1ae..84f3f6b9b 100644 --- a/src/biotite/application/mafft/app.py +++ b/src/biotite/application/mafft/app.py @@ -8,9 +8,9 @@ import os import re -from ...sequence.phylo.tree import Tree -from ..application import AppState, requires_state -from ..msaapp import MSAApp +from biotite.application.application import AppState, requires_state +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree _prefix_pattern = re.compile(r"\d*_") diff --git a/src/biotite/application/msaapp.py 
b/src/biotite/application/msaapp.py index d70a3012f..31eb0064c 100644 --- a/src/biotite/application/msaapp.py +++ b/src/biotite/application/msaapp.py @@ -10,12 +10,12 @@ from collections import OrderedDict from tempfile import NamedTemporaryFile import numpy as np -from ..sequence.align.alignment import Alignment -from ..sequence.io.fasta.file import FastaFile -from ..sequence.seqtypes import NucleotideSequence, ProteinSequence -from .application import AppState, requires_state -from .localapp import LocalApp, cleanup_tempfile -from .util import map_matrix, map_sequence +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.util import map_matrix, map_sequence +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence class MSAApp(LocalApp, metaclass=abc.ABCMeta): diff --git a/src/biotite/application/muscle/app3.py b/src/biotite/application/muscle/app3.py index 86a883afa..0bb05bc4b 100644 --- a/src/biotite/application/muscle/app3.py +++ b/src/biotite/application/muscle/app3.py @@ -11,10 +11,10 @@ import subprocess import warnings from tempfile import NamedTemporaryFile -from ...sequence.phylo.tree import Tree -from ..application import AppState, VersionError, requires_state -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp +from biotite.application.application import AppState, VersionError, requires_state +from biotite.application.localapp import cleanup_tempfile +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree class MuscleApp(MSAApp): diff --git a/src/biotite/application/muscle/app5.py b/src/biotite/application/muscle/app5.py index 94a1f54bf..cc1ef5e2a 100644 --- a/src/biotite/application/muscle/app5.py +++ b/src/biotite/application/muscle/app5.py @@ -6,9 +6,9 @@ 
__author__ = "Patrick Kunzmann" __all__ = ["Muscle5App"] -from ..application import AppState, VersionError, requires_state -from ..msaapp import MSAApp -from .app3 import get_version +from biotite.application.application import AppState, VersionError, requires_state +from biotite.application.msaapp import MSAApp +from biotite.application.muscle.app3 import get_version class Muscle5App(MSAApp): diff --git a/src/biotite/application/sra/app.py b/src/biotite/application/sra/app.py index bda5be577..d1d99834f 100644 --- a/src/biotite/application/sra/app.py +++ b/src/biotite/application/sra/app.py @@ -11,12 +11,17 @@ from os.path import join from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired from tempfile import TemporaryDirectory -from ...sequence.io.fasta.convert import get_sequences -from ...sequence.io.fasta.file import FastaFile -from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores -from ...sequence.io.fastq.file import FastqFile -from ...sequence.seqtypes import NucleotideSequence -from ..application import Application, AppState, AppStateError, requires_state +from biotite.application.application import ( + Application, + AppState, + AppStateError, + requires_state, +) +from biotite.sequence.io.fasta.convert import get_sequences +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.io.fastq.convert import get_sequences as get_sequences_and_scores +from biotite.sequence.io.fastq.file import FastqFile +from biotite.sequence.seqtypes import NucleotideSequence # Do not use LocalApp, as two programs are executed diff --git a/src/biotite/application/tantan/app.py b/src/biotite/application/tantan/app.py index 23416b83f..6d7020569 100644 --- a/src/biotite/application/tantan/app.py +++ b/src/biotite/application/tantan/app.py @@ -10,11 +10,11 @@ from collections.abc import Sequence as SequenceABC from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.alphabet import common_alphabet 
-from ...sequence.io.fasta.file import FastaFile -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.sequence.alphabet import common_alphabet +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence MASKING_LETTER = "!" diff --git a/src/biotite/application/util.py b/src/biotite/application/util.py index af92a1354..4da2a342f 100644 --- a/src/biotite/application/util.py +++ b/src/biotite/application/util.py @@ -8,8 +8,8 @@ import numpy as np -from ..sequence.align.matrix import SubstitutionMatrix -from ..sequence.seqtypes import ProteinSequence +from biotite.sequence.align.matrix import SubstitutionMatrix +from biotite.sequence.seqtypes import ProteinSequence def map_sequence(sequence): diff --git a/src/biotite/application/viennarna/rnaalifold.py b/src/biotite/application/viennarna/rnaalifold.py index 1eebe573e..4604780aa 100644 --- a/src/biotite/application/viennarna/rnaalifold.py +++ b/src/biotite/application/viennarna/rnaalifold.py @@ -9,12 +9,12 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.io.fasta import FastaFile, set_alignment -from ...structure.bonds import BondList -from ...structure.dotbracket import base_pairs_from_dot_bracket -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile -from .util import build_constraint_string +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.viennarna.util import build_constraint_string +from biotite.sequence.io.fasta import FastaFile, set_alignment +from biotite.structure.bonds 
import BondList +from biotite.structure.dotbracket import base_pairs_from_dot_bracket class RNAalifoldApp(LocalApp): diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py index c636fb285..37fb0e3d7 100644 --- a/src/biotite/application/viennarna/rnafold.py +++ b/src/biotite/application/viennarna/rnafold.py @@ -8,11 +8,11 @@ from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.io.fasta import FastaFile, set_sequence -from ...structure.dotbracket import base_pairs_from_dot_bracket -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile -from .util import build_constraint_string +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.viennarna.util import build_constraint_string +from biotite.sequence.io.fasta import FastaFile, set_sequence +from biotite.structure.dotbracket import base_pairs_from_dot_bracket class RNAfoldApp(LocalApp): diff --git a/src/biotite/application/viennarna/rnaplot.py b/src/biotite/application/viennarna/rnaplot.py index acb23e74d..1f36f9142 100644 --- a/src/biotite/application/viennarna/rnaplot.py +++ b/src/biotite/application/viennarna/rnaplot.py @@ -10,9 +10,9 @@ from os import remove from tempfile import NamedTemporaryFile import numpy as np -from ...structure.dotbracket import dot_bracket as dot_bracket_ -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.dotbracket import dot_bracket as dot_bracket_ class RNAplotApp(LocalApp): diff --git a/src/biotite/application/viennarna/util.py b/src/biotite/application/viennarna/util.py index fa9336e08..90bcd6c4e 100644 --- 
a/src/biotite/application/viennarna/util.py +++ b/src/biotite/application/viennarna/util.py @@ -7,7 +7,7 @@ __all__ = ["build_constraint_string"] import numpy as np -from ...structure.pseudoknots import pseudoknots +from biotite.structure.pseudoknots import pseudoknots def build_constraint_string( diff --git a/src/biotite/application/webapp.py b/src/biotite/application/webapp.py index d40e5ed22..6e76eb1cd 100644 --- a/src/biotite/application/webapp.py +++ b/src/biotite/application/webapp.py @@ -7,7 +7,7 @@ __all__ = ["WebApp", "RuleViolationError"] import abc -from .application import Application +from biotite.application.application import Application class WebApp(Application, metaclass=abc.ABCMeta): diff --git a/src/biotite/database/entrez/check.py b/src/biotite/database/entrez/check.py index 063ecdf03..a9e2db5e9 100644 --- a/src/biotite/database/entrez/check.py +++ b/src/biotite/database/entrez/check.py @@ -7,7 +7,7 @@ __all__ = ["check_for_errors"] import json -from ..error import RequestError +from biotite.database.error import RequestError # Taken from https://github.com/kblin/ncbi-entrez-error-messages _error_messages = [ diff --git a/src/biotite/database/entrez/download.py b/src/biotite/database/entrez/download.py index e2239b925..2c2438d8e 100644 --- a/src/biotite/database/entrez/download.py +++ b/src/biotite/database/entrez/download.py @@ -10,10 +10,10 @@ import os from os.path import getsize, isdir, isfile, join import requests -from ..error import RequestError -from .check import check_for_errors -from .dbnames import sanitize_database_name -from .key import get_api_key +from biotite.database.entrez.check import check_for_errors +from biotite.database.entrez.dbnames import sanitize_database_name +from biotite.database.entrez.key import get_api_key +from biotite.database.error import RequestError _fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" diff --git a/src/biotite/database/entrez/query.py 
b/src/biotite/database/entrez/query.py index 18b49d8fa..f9b4867ea 100644 --- a/src/biotite/database/entrez/query.py +++ b/src/biotite/database/entrez/query.py @@ -9,10 +9,10 @@ import abc from xml.etree import ElementTree import requests -from ..error import RequestError -from .check import check_for_errors -from .dbnames import sanitize_database_name -from .key import get_api_key +from biotite.database.entrez.check import check_for_errors +from biotite.database.entrez.dbnames import sanitize_database_name +from biotite.database.entrez.key import get_api_key +from biotite.database.error import RequestError _search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" diff --git a/src/biotite/database/pubchem/download.py b/src/biotite/database/pubchem/download.py index bc9e97d0d..85fa09e9e 100644 --- a/src/biotite/database/pubchem/download.py +++ b/src/biotite/database/pubchem/download.py @@ -11,9 +11,9 @@ import os from os.path import getsize, isdir, isfile, join import requests -from ..error import RequestError -from .error import parse_error_details -from .throttle import ThrottleStatus +from biotite.database.error import RequestError +from biotite.database.pubchem.error import parse_error_details +from biotite.database.pubchem.throttle import ThrottleStatus _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" _binary_formats = ["png", "asnb"] diff --git a/src/biotite/database/pubchem/query.py b/src/biotite/database/pubchem/query.py index 9d54d4f1c..31a030e4a 100644 --- a/src/biotite/database/pubchem/query.py +++ b/src/biotite/database/pubchem/query.py @@ -22,10 +22,10 @@ import collections import copy import requests -from ...structure.io.mol.mol import MOLFile -from ..error import RequestError -from .error import parse_error_details -from .throttle import ThrottleStatus +from biotite.database.error import RequestError +from biotite.database.pubchem.error import parse_error_details +from biotite.database.pubchem.throttle import ThrottleStatus 
+from biotite.structure.io.mol.mol import MOLFile _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" diff --git a/src/biotite/database/rcsb/download.py b/src/biotite/database/rcsb/download.py index 2af19a3e5..230792dae 100644 --- a/src/biotite/database/rcsb/download.py +++ b/src/biotite/database/rcsb/download.py @@ -10,7 +10,7 @@ import os from os.path import getsize, isfile, join import requests -from ..error import RequestError +from biotite.database.error import RequestError _standard_url = "https://files.rcsb.org/download/" _bcif_url = "https://models.rcsb.org/" diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 155ab25d8..95d59703e 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -28,8 +28,8 @@ from datetime import datetime import numpy as np import requests -from ...sequence.seqtypes import NucleotideSequence -from ..error import RequestError +from biotite.database.error import RequestError +from biotite.sequence.seqtypes import NucleotideSequence _search_url = "https://search.rcsb.org/rcsbsearch/v2/query" _scope_to_target = { diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py index bbd2db470..a1782e1ba 100644 --- a/src/biotite/database/uniprot/check.py +++ b/src/biotite/database/uniprot/check.py @@ -6,7 +6,7 @@ __author__ = "Maximilian Greil" __all__ = ["assert_valid_response"] -from ..error import RequestError +from biotite.database.error import RequestError # Taken from https://www.uniprot.org/help/api_retrieve_entries diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py index 42d3cca58..bacb40e96 100644 --- a/src/biotite/database/uniprot/download.py +++ b/src/biotite/database/uniprot/download.py @@ -10,7 +10,7 @@ import os from os.path import getsize, isdir, isfile, join import requests -from .check import assert_valid_response +from biotite.database.uniprot.check import 
assert_valid_response _fetch_url = "https://rest.uniprot.org/" diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py index 922749626..687c61f5f 100644 --- a/src/biotite/database/uniprot/query.py +++ b/src/biotite/database/uniprot/query.py @@ -8,7 +8,7 @@ import abc import requests -from .check import assert_valid_response +from biotite.database.uniprot.check import assert_valid_response _base_url = "https://rest.uniprot.org/uniprotkb/search/" diff --git a/src/biotite/file.py b/src/biotite/file.py index fc7a8f7aa..ec7047db6 100644 --- a/src/biotite/file.py +++ b/src/biotite/file.py @@ -16,7 +16,7 @@ import copy import io from os import PathLike -from .copyable import Copyable +from biotite.copyable import Copyable class File(Copyable, metaclass=abc.ABCMeta): diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index b416cba43..f29ac0a50 100644 --- a/src/biotite/sequence/align/alignment.py +++ b/src/biotite/sequence/align/alignment.py @@ -8,7 +8,7 @@ import numbers import textwrap import numpy as np -from ..alphabet import LetterAlphabet +from biotite.sequence.alphabet import LetterAlphabet __all__ = [ "Alignment", diff --git a/src/biotite/sequence/align/cigar.py b/src/biotite/sequence/align/cigar.py index 2bd0de6b2..60366e897 100644 --- a/src/biotite/sequence/align/cigar.py +++ b/src/biotite/sequence/align/cigar.py @@ -8,7 +8,7 @@ import enum import numpy as np -from .alignment import Alignment, get_codes +from biotite.sequence.align.alignment import Alignment, get_codes class CigarOp(enum.IntEnum): diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index f53b2f223..2a7d23437 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -7,7 +7,7 @@ import os import numpy as np -from ..seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence __all__ 
= ["SubstitutionMatrix"] diff --git a/src/biotite/sequence/align/statistics.py b/src/biotite/sequence/align/statistics.py index a62eae224..72a783ac5 100644 --- a/src/biotite/sequence/align/statistics.py +++ b/src/biotite/sequence/align/statistics.py @@ -7,8 +7,8 @@ __all__ = ["EValueEstimator"] import numpy as np -from ..seqtypes import GeneralSequence -from .pairwise import align_optimal +from biotite.sequence.align.pairwise import align_optimal +from biotite.sequence.seqtypes import GeneralSequence class EValueEstimator: diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py index 39c82a752..4231817bd 100644 --- a/src/biotite/sequence/alphabet.py +++ b/src/biotite/sequence/alphabet.py @@ -16,7 +16,7 @@ import string from numbers import Integral import numpy as np -from .codec import decode_to_chars, encode_chars, map_sequence_code +from biotite.sequence.codec import decode_to_chars, encode_chars, map_sequence_code class Alphabet(object): diff --git a/src/biotite/sequence/annotation.py b/src/biotite/sequence/annotation.py index 21d10768e..5843e6bb8 100644 --- a/src/biotite/sequence/annotation.py +++ b/src/biotite/sequence/annotation.py @@ -11,7 +11,7 @@ import sys from enum import Enum, Flag, auto import numpy as np -from ..copyable import Copyable +from biotite.copyable import Copyable class Location: diff --git a/src/biotite/sequence/codon.py b/src/biotite/sequence/codon.py index 5380706a3..5e7f59343 100644 --- a/src/biotite/sequence/codon.py +++ b/src/biotite/sequence/codon.py @@ -10,7 +10,7 @@ from numbers import Integral from os.path import dirname, join, realpath import numpy as np -from .seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence # Abbreviations _NUC_ALPH = NucleotideSequence.alphabet_unamb diff --git a/src/biotite/sequence/graphics/alignment.py b/src/biotite/sequence/graphics/alignment.py index aeaaf5b24..f3bdb6380 100644 --- 
a/src/biotite/sequence/graphics/alignment.py +++ b/src/biotite/sequence/graphics/alignment.py @@ -18,8 +18,8 @@ import abc import numpy as np -from ...visualize import colors -from .colorschemes import get_color_scheme +from biotite.sequence.graphics.colorschemes import get_color_scheme +from biotite.visualize import colors class SymbolPlotter(metaclass=abc.ABCMeta): diff --git a/src/biotite/sequence/graphics/colorschemes.py b/src/biotite/sequence/graphics/colorschemes.py index 88044e0f4..d38879c91 100644 --- a/src/biotite/sequence/graphics/colorschemes.py +++ b/src/biotite/sequence/graphics/colorschemes.py @@ -10,7 +10,7 @@ import json import os from os.path import dirname, join, realpath -from ..alphabet import Alphabet +from biotite.sequence.alphabet import Alphabet def load_color_scheme(file_name): diff --git a/src/biotite/sequence/graphics/features.py b/src/biotite/sequence/graphics/features.py index 031ab04dc..6fe25fa41 100644 --- a/src/biotite/sequence/graphics/features.py +++ b/src/biotite/sequence/graphics/features.py @@ -15,8 +15,8 @@ ] import abc -from ...visualize import AdaptiveFancyArrow, colors -from ..annotation import Location +from biotite.sequence.annotation import Location +from biotite.visualize import AdaptiveFancyArrow, colors def plot_feature_map( diff --git a/src/biotite/sequence/graphics/logo.py b/src/biotite/sequence/graphics/logo.py index 42995814a..3fc32a052 100644 --- a/src/biotite/sequence/graphics/logo.py +++ b/src/biotite/sequence/graphics/logo.py @@ -7,9 +7,9 @@ __all__ = ["plot_sequence_logo"] import numpy as np -from ...visualize import set_font_size_in_coord -from ..alphabet import LetterAlphabet -from .colorschemes import get_color_scheme +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.graphics.colorschemes import get_color_scheme +from biotite.visualize import set_font_size_in_coord def plot_sequence_logo(axes, profile, scheme=None, **kwargs): diff --git a/src/biotite/sequence/graphics/plasmid.py 
b/src/biotite/sequence/graphics/plasmid.py index 3869c36d2..f15f776f8 100644 --- a/src/biotite/sequence/graphics/plasmid.py +++ b/src/biotite/sequence/graphics/plasmid.py @@ -9,8 +9,8 @@ import re import warnings import numpy as np -from ...visualize import colors -from ..annotation import Feature, Location +from biotite.sequence.annotation import Feature, Location +from biotite.visualize import colors def plot_plasmid_map( diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py index 2cf33f198..d128309d1 100644 --- a/src/biotite/sequence/io/fasta/convert.py +++ b/src/biotite/sequence/io/fasta/convert.py @@ -7,9 +7,9 @@ import warnings from collections import OrderedDict -from ...align.alignment import Alignment -from ...alphabet import AlphabetError, LetterAlphabet -from ...seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.alphabet import AlphabetError, LetterAlphabet +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence __all__ = [ "get_sequence", diff --git a/src/biotite/sequence/io/fasta/file.py b/src/biotite/sequence/io/fasta/file.py index ab465c0e9..e0fe20ad7 100644 --- a/src/biotite/sequence/io/fasta/file.py +++ b/src/biotite/sequence/io/fasta/file.py @@ -8,7 +8,7 @@ from collections import OrderedDict from collections.abc import MutableMapping -from ....file import InvalidFileError, TextFile, wrap_string +from biotite.file import InvalidFileError, TextFile, wrap_string class FastaFile(TextFile, MutableMapping): diff --git a/src/biotite/sequence/io/fastq/convert.py b/src/biotite/sequence/io/fastq/convert.py index 0ccc97c22..5b743fcd7 100644 --- a/src/biotite/sequence/io/fastq/convert.py +++ b/src/biotite/sequence/io/fastq/convert.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" from collections import OrderedDict -from ...seqtypes import NucleotideSequence +from biotite.sequence.seqtypes import NucleotideSequence 
__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"] diff --git a/src/biotite/sequence/io/fastq/file.py b/src/biotite/sequence/io/fastq/file.py index 5ac85c7b1..c6c85c6cb 100644 --- a/src/biotite/sequence/io/fastq/file.py +++ b/src/biotite/sequence/io/fastq/file.py @@ -9,7 +9,7 @@ from collections.abc import MutableMapping from numbers import Integral import numpy as np -from ....file import InvalidFileError, TextFile, wrap_string +from biotite.file import InvalidFileError, TextFile, wrap_string __all__ = ["FastqFile"] diff --git a/src/biotite/sequence/io/genbank/annotation.py b/src/biotite/sequence/io/genbank/annotation.py index 1a28ae3af..4300c41a4 100644 --- a/src/biotite/sequence/io/genbank/annotation.py +++ b/src/biotite/sequence/io/genbank/annotation.py @@ -12,8 +12,8 @@ import re import warnings -from ....file import InvalidFileError -from ...annotation import Annotation, Feature, Location +from biotite.file import InvalidFileError +from biotite.sequence.annotation import Annotation, Feature, Location _KEY_START = 5 _QUAL_START = 21 diff --git a/src/biotite/sequence/io/genbank/file.py b/src/biotite/sequence/io/genbank/file.py index 4bbbaff02..d76a4b63d 100644 --- a/src/biotite/sequence/io/genbank/file.py +++ b/src/biotite/sequence/io/genbank/file.py @@ -12,7 +12,7 @@ # import re import io from collections import OrderedDict -from ....file import InvalidFileError, TextFile +from biotite.file import InvalidFileError, TextFile # from ...annotation import Location, Feature, Annotation, AnnotatedSequence # from ...seqtypes import NucleotideSequence, ProteinSequence diff --git a/src/biotite/sequence/io/genbank/metadata.py b/src/biotite/sequence/io/genbank/metadata.py index 8654e42c4..477c0fbf2 100644 --- a/src/biotite/sequence/io/genbank/metadata.py +++ b/src/biotite/sequence/io/genbank/metadata.py @@ -19,7 +19,7 @@ "set_locus", ] -from ....file import InvalidFileError +from biotite.file import InvalidFileError def get_locus(gb_file): diff 
--git a/src/biotite/sequence/io/genbank/sequence.py b/src/biotite/sequence/io/genbank/sequence.py index 1504bd9bf..f5b194746 100644 --- a/src/biotite/sequence/io/genbank/sequence.py +++ b/src/biotite/sequence/io/genbank/sequence.py @@ -17,10 +17,10 @@ ] import re -from ....file import InvalidFileError -from ...annotation import AnnotatedSequence -from ...seqtypes import NucleotideSequence, ProteinSequence -from .annotation import get_annotation, set_annotation +from biotite.file import InvalidFileError +from biotite.sequence.annotation import AnnotatedSequence +from biotite.sequence.io.genbank.annotation import get_annotation, set_annotation +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence _SYMBOLS_PER_CHUNK = 10 _SEQ_CHUNKS_PER_LINE = 6 diff --git a/src/biotite/sequence/io/general.py b/src/biotite/sequence/io/general.py index 54ed5bf4c..c76e11b72 100644 --- a/src/biotite/sequence/io/general.py +++ b/src/biotite/sequence/io/general.py @@ -14,7 +14,7 @@ import os.path from collections import OrderedDict import numpy as np -from ..seqtypes import NucleotideSequence +from biotite.sequence.seqtypes import NucleotideSequence def load_sequence(file_path): @@ -38,12 +38,12 @@ def load_sequence(file_path): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, get_sequence + from biotite.sequence.io.fasta import FastaFile, get_sequence file = FastaFile.read(file_path) return get_sequence(file) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile # Quality scores are irrelevant for this function # -> Offset is irrelevant @@ -54,7 +54,7 @@ def load_sequence(file_path): break return sequence elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import GenBankFile, get_sequence + from biotite.sequence.io.genbank import GenBankFile, get_sequence format = "gp" if suffix == ".gp" 
else "gb" file = GenBankFile.read(file_path) @@ -81,13 +81,13 @@ def save_sequence(file_path, sequence): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, set_sequence + from biotite.sequence.io.fasta import FastaFile, set_sequence file = FastaFile() set_sequence(file, sequence) file.write(file_path) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile # Quality scores are irrelevant for this function # -> Offset is irrelevant @@ -97,7 +97,7 @@ def save_sequence(file_path, sequence): file["sequence"] = str(sequence), scores file.write(file_path) elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import GenBankFile, set_locus, set_sequence + from biotite.sequence.io.genbank import GenBankFile, set_locus, set_sequence file = GenBankFile() set_locus(file, "sequence", len(sequence)) @@ -130,12 +130,12 @@ def load_sequences(file_path): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, get_sequences + from biotite.sequence.io.fasta import FastaFile, get_sequences file = FastaFile.read(file_path) return get_sequences(file) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile # Quality scores are irrelevant for this function # -> Offset is irrelevant @@ -145,7 +145,7 @@ def load_sequences(file_path): for identifier, (seq_str, scores) in file.items() } elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import MultiFile, get_definition, get_sequence + from biotite.sequence.io.genbank import MultiFile, get_definition, get_sequence file = MultiFile.read(file_path) format = "gp" if suffix == ".gp" else "gb" @@ -176,13 +176,13 @@ def save_sequences(file_path, sequences): # We only need the suffix here filename, suffix = 
os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, set_sequences + from biotite.sequence.io.fasta import FastaFile, set_sequences file = FastaFile() set_sequences(file, sequences) file.write(file_path) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile # Quality scores are irrelevant for this function # -> Offset is irrelevant diff --git a/src/biotite/sequence/io/gff/convert.py b/src/biotite/sequence/io/gff/convert.py index 51971d799..8f3fb75f2 100644 --- a/src/biotite/sequence/io/gff/convert.py +++ b/src/biotite/sequence/io/gff/convert.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_annotation", "set_annotation"] -from ...annotation import Annotation, Feature, Location +from biotite.sequence.annotation import Annotation, Feature, Location def get_annotation(gff_file): diff --git a/src/biotite/sequence/io/gff/file.py b/src/biotite/sequence/io/gff/file.py index dff00822d..c151bd869 100644 --- a/src/biotite/sequence/io/gff/file.py +++ b/src/biotite/sequence/io/gff/file.py @@ -9,8 +9,8 @@ import string import warnings from urllib.parse import quote, unquote -from ....file import InvalidFileError, TextFile -from ...annotation import Location +from biotite.file import InvalidFileError, TextFile +from biotite.sequence.annotation import Location # All punctuation characters except # percent, semicolon, equals, ampersand, comma diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py index d8320107b..d208b2b3f 100644 --- a/src/biotite/sequence/profile.py +++ b/src/biotite/sequence/profile.py @@ -4,9 +4,13 @@ import warnings import numpy as np -from .align.alignment import get_codes -from .alphabet import LetterAlphabet -from .seqtypes import GeneralSequence, NucleotideSequence, ProteinSequence +from biotite.sequence.align.alignment import get_codes +from biotite.sequence.alphabet import LetterAlphabet 
+from biotite.sequence.seqtypes import ( + GeneralSequence, + NucleotideSequence, + ProteinSequence, +) __name__ = "biotite.sequence" __author__ = "Maximilian Greil" diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py index 2df25aaa4..f72c2c737 100644 --- a/src/biotite/sequence/seqtypes.py +++ b/src/biotite/sequence/seqtypes.py @@ -7,8 +7,8 @@ __all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"] import numpy as np -from .alphabet import AlphabetError, AlphabetMapper, LetterAlphabet -from .sequence import Sequence +from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet +from biotite.sequence.sequence import Sequence class GeneralSequence(Sequence): @@ -261,7 +261,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): # Determine codon_table if codon_table is None: # Import at this position to avoid circular import - from .codon import CodonTable + from biotite.sequence.codon import CodonTable codon_table = CodonTable.default_table() diff --git a/src/biotite/sequence/sequence.py b/src/biotite/sequence/sequence.py index 6acdcfb8a..4040fcc0e 100644 --- a/src/biotite/sequence/sequence.py +++ b/src/biotite/sequence/sequence.py @@ -13,8 +13,8 @@ import abc import numbers import numpy as np -from ..copyable import Copyable -from .alphabet import LetterAlphabet +from biotite.copyable import Copyable +from biotite.sequence.alphabet import LetterAlphabet _size_uint8 = np.iinfo(np.uint8).max + 1 _size_uint16 = np.iinfo(np.uint16).max + 1 diff --git a/src/biotite/structure/atoms.py b/src/biotite/structure/atoms.py index 3c344063e..ea9f2ef68 100644 --- a/src/biotite/structure/atoms.py +++ b/src/biotite/structure/atoms.py @@ -23,8 +23,8 @@ import abc import numbers import numpy as np -from ..copyable import Copyable -from .bonds import BondList +from biotite.copyable import Copyable +from biotite.structure.bonds import BondList class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta): 
diff --git a/src/biotite/structure/basepairs.py b/src/biotite/structure/basepairs.py index 02b7a4a0e..19265c756 100644 --- a/src/biotite/structure/basepairs.py +++ b/src/biotite/structure/basepairs.py @@ -21,20 +21,20 @@ import warnings from enum import IntEnum import numpy as np -from .atoms import Atom, array -from .celllist import CellList -from .compare import rmsd -from .error import ( +from biotite.structure.atoms import Atom, array +from biotite.structure.celllist import CellList +from biotite.structure.compare import rmsd +from biotite.structure.error import ( BadStructureError, IncompleteStructureWarning, UnexpectedStructureWarning, ) -from .filter import filter_nucleotides -from .hbond import hbond -from .info.standardize import standardize_order -from .residues import get_residue_masks, get_residue_starts_for -from .superimpose import superimpose -from .util import distance, norm_vector +from biotite.structure.filter import filter_nucleotides +from biotite.structure.hbond import hbond +from biotite.structure.info.standardize import standardize_order +from biotite.structure.residues import get_residue_masks, get_residue_starts_for +from biotite.structure.superimpose import superimpose +from biotite.structure.util import distance, norm_vector def _get_std_adenine(): diff --git a/src/biotite/structure/box.py b/src/biotite/structure/box.py index a04400b84..41349bb9d 100644 --- a/src/biotite/structure/box.py +++ b/src/biotite/structure/box.py @@ -26,11 +26,11 @@ from numbers import Integral import numpy as np import numpy.linalg as linalg -from .atoms import repeat -from .chains import get_chain_masks, get_chain_starts -from .error import BadStructureError -from .molecules import get_molecule_masks -from .util import vector_dot +from biotite.structure.atoms import repeat +from biotite.structure.chains import get_chain_masks, get_chain_starts +from biotite.structure.error import BadStructureError +from biotite.structure.molecules import get_molecule_masks 
+from biotite.structure.util import vector_dot def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): @@ -395,7 +395,7 @@ def remove_pbc(atoms, selection=None): half box size. """ # Avoid circular import - from .geometry import centroid + from biotite.structure.geometry import centroid if atoms.box is None: raise BadStructureError("The 'box' attribute must be set in the structure") @@ -462,7 +462,7 @@ def remove_pbc_from_coord(coord, box): """ # Import in function to avoid circular import - from .geometry import index_displacement + from biotite.structure.geometry import index_displacement # Get the PBC-sanitized displacements of all coordinates # to the respective next coordinate diff --git a/src/biotite/structure/chains.py b/src/biotite/structure/chains.py index 2778855c2..f38ccc05a 100644 --- a/src/biotite/structure/chains.py +++ b/src/biotite/structure/chains.py @@ -23,7 +23,7 @@ ] import numpy as np -from .resutil import * +from biotite.structure.resutil import * def get_chain_starts(array, add_exclusive_stop=False): diff --git a/src/biotite/structure/compare.py b/src/biotite/structure/compare.py index 0a07ea383..bdce1d7a0 100644 --- a/src/biotite/structure/compare.py +++ b/src/biotite/structure/compare.py @@ -12,9 +12,9 @@ __all__ = ["rmsd", "rmspd", "rmsf", "average"] import numpy as np -from .atoms import AtomArrayStack, coord -from .geometry import index_distance -from .util import vector_dot +from biotite.structure.atoms import AtomArrayStack, coord +from biotite.structure.geometry import index_distance +from biotite.structure.util import vector_dot def rmsd(reference, subject): diff --git a/src/biotite/structure/density.py b/src/biotite/structure/density.py index 9065672dd..86f24d53e 100644 --- a/src/biotite/structure/density.py +++ b/src/biotite/structure/density.py @@ -11,7 +11,7 @@ __all__ = ["density"] import numpy as np -from .atoms import coord +from biotite.structure.atoms import coord def density(atoms, selection=None, delta=1.0, 
bins=None, density=False, weights=None): diff --git a/src/biotite/structure/dotbracket.py b/src/biotite/structure/dotbracket.py index 0d208cc19..66d8af441 100644 --- a/src/biotite/structure/dotbracket.py +++ b/src/biotite/structure/dotbracket.py @@ -12,9 +12,9 @@ __all__ = ["dot_bracket_from_structure", "dot_bracket", "base_pairs_from_dot_bracket"] import numpy as np -from .basepairs import base_pairs -from .pseudoknots import pseudoknots -from .residues import get_residue_count, get_residue_positions +from biotite.structure.basepairs import base_pairs +from biotite.structure.pseudoknots import pseudoknots +from biotite.structure.residues import get_residue_count, get_residue_positions _OPENING_BRACKETS = "([{ Date: Wed, 3 Jul 2024 10:31:45 +0200 Subject: [PATCH 8/9] Apply ruff unsafe fixes --- .../sequence/homology/bionigma_alignment.py | 2 -- .../scripts/sequence/homology/plotepiscan.py | 12 +++---- .../structure/contacts/contact_sites.py | 6 ++-- doc/viewcode.py | 2 -- src/biotite/application/blast/webapp.py | 2 +- src/biotite/application/localapp.py | 2 +- src/biotite/application/sra/app.py | 2 +- src/biotite/sequence/graphics/plasmid.py | 10 ------ src/biotite/sequence/io/fasta/convert.py | 35 ++++++++++--------- src/biotite/sequence/seqtypes.py | 1 - src/biotite/structure/atoms.py | 2 +- src/biotite/structure/io/pdbqt/file.py | 3 +- src/biotite/structure/io/pdbx/bcif.py | 2 +- src/biotite/structure/io/pdbx/convert.py | 4 --- src/biotite/structure/resutil.py | 2 +- tests/application/test_blast.py | 6 ++-- tests/database/test_entrez.py | 4 +-- tests/database/test_pubchem.py | 2 +- tests/database/test_uniprot.py | 2 +- tests/sequence/align/test_matrix.py | 2 +- tests/sequence/align/test_statistics.py | 2 +- tests/sequence/test_codon.py | 2 +- tests/sequence/test_fasta.py | 1 - tests/sequence/test_gff.py | 4 +-- tests/sequence/test_phylo.py | 1 - tests/structure/test_basepairs.py | 8 ++--- tests/structure/test_geometry.py | 1 - tests/structure/test_info.py | 4 
+-- tests/structure/test_molecules.py | 2 +- tests/test_init.py | 12 ------- 30 files changed, 54 insertions(+), 86 deletions(-) delete mode 100644 tests/test_init.py diff --git a/doc/examples/scripts/sequence/homology/bionigma_alignment.py b/doc/examples/scripts/sequence/homology/bionigma_alignment.py index 4b7fe5be5..c2275b2fe 100644 --- a/doc/examples/scripts/sequence/homology/bionigma_alignment.py +++ b/doc/examples/scripts/sequence/homology/bionigma_alignment.py @@ -271,7 +271,6 @@ def plot_alignment_shapes( label_size=None, show_line_position=False, spacing=1, - color_symbols=False, symbol_size=None, symbol_param=None, ): @@ -279,7 +278,6 @@ def plot_alignment_shapes( A thin wrapper around the 'ShapePlotter' and 'plot_alignment()' function. """ - alphabet = alignment.sequences[0].get_alphabet() symbol_plotter = ShapePlotter(axes, font_size=symbol_size, font_param=symbol_param) graphics.plot_alignment( axes=axes, diff --git a/doc/examples/scripts/sequence/homology/plotepiscan.py b/doc/examples/scripts/sequence/homology/plotepiscan.py index c4edc9173..140f078ca 100644 --- a/doc/examples/scripts/sequence/homology/plotepiscan.py +++ b/doc/examples/scripts/sequence/homology/plotepiscan.py @@ -239,7 +239,7 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): for b in range(len(lk1)): for a in template[x:]: if c < plen - 1: - if a == None: + if a is None: gapped.insert(x, (template[x], 0)) x = x + 1 elif a != lk1[b]: @@ -267,7 +267,7 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): for b in range(len(lk1)): for a in template[x:]: if c < plen - 1 and p == 0: - if a == None: + if a is None: gapped.insert(x, (template[x], 0)) x = x + 1 else: @@ -280,11 +280,11 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): x = x + 1 break if p != 0: - if a == None and c == 0: + if a is None and c == 0: gapped.insert(x, (template[x], 0)) x = x + 1 elif c % 2 == 0: - if a == None: + if a is None: gapped.insert(x, (template[x], 0)) x = x + 1 
else: @@ -292,7 +292,7 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): x = x + 1 c = c + 1 elif c % 2 != 0: - if a == None: + if a is None: gapped.insert(x, (template[x], 0)) x = x + 1 elif a != lk1[b]: @@ -305,7 +305,7 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): break # For terminal gaps - if len(gapped) < len(template) and template[len(gapped) + 1] == None: + if len(gapped) < len(template) and template[len(gapped) + 1] is None: gapped_tail = [] for n in range(len(template) - len(gapped)): gapped_tail.append(("None", 0)) diff --git a/doc/examples/scripts/structure/contacts/contact_sites.py b/doc/examples/scripts/structure/contacts/contact_sites.py index 1c3856a34..94fb7d975 100644 --- a/doc/examples/scripts/structure/contacts/contact_sites.py +++ b/doc/examples/scripts/structure/contacts/contact_sites.py @@ -29,9 +29,9 @@ # Separate structure into the DNA and the two identical protein chains -dna = structure[np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False)] -protein_l = structure[(structure.chain_id == "L") & (structure.hetero == False)] -protein_r = structure[(structure.chain_id == "R") & (structure.hetero == False)] +dna = structure[np.isin(structure.chain_id, ["A", "B"]) & ~structure.hetero] +protein_l = structure[(structure.chain_id == "L") & ~structure.hetero] +protein_r = structure[(structure.chain_id == "R") & ~structure.hetero] # Quick check if the two protein chains are really identical assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r)) diff --git a/doc/viewcode.py b/doc/viewcode.py index 10f5d9870..ec0b28974 100644 --- a/doc/viewcode.py +++ b/doc/viewcode.py @@ -144,7 +144,6 @@ def _index_cython_code(code_lines): continue if line.startswith(("def")): - attr_type = "def" # Get name of the function: # Remove 'def' from line...
cropped_line = stripped_line[3:].strip() @@ -153,7 +152,6 @@ def _index_cython_code(code_lines): cropped_line = cropped_line[: cropped_line.index("(")].strip() attr_name = cropped_line elif line.startswith(("class", "cdef class")): - attr_type = "class" cropped_line = stripped_line # Get name of the class: # Remove potential 'cdef' from line... diff --git a/src/biotite/application/blast/webapp.py b/src/biotite/application/blast/webapp.py index 06dbda091..f8d6b09d1 100644 --- a/src/biotite/application/blast/webapp.py +++ b/src/biotite/application/blast/webapp.py @@ -305,7 +305,7 @@ def clean_up(self): param_dict = {} param_dict["CMD"] = "Delete" param_dict["RID"] = self._rid - request = requests.get(self.app_url(), params=param_dict) + requests.get(self.app_url(), params=param_dict) def evaluate(self): param_dict = {} diff --git a/src/biotite/application/localapp.py b/src/biotite/application/localapp.py index 2fa7ee1d3..990f7ce0a 100644 --- a/src/biotite/application/localapp.py +++ b/src/biotite/application/localapp.py @@ -236,7 +236,7 @@ def run(self): def is_finished(self): code = self._process.poll() - if code == None: + if code is None: return False else: self._stdout, self._stderr = self._process.communicate() diff --git a/src/biotite/application/sra/app.py b/src/biotite/application/sra/app.py index d1d99834f..7fc39ab4c 100644 --- a/src/biotite/application/sra/app.py +++ b/src/biotite/application/sra/app.py @@ -110,7 +110,7 @@ def run(self): def is_finished(self): code = self._process.poll() - if code == None: + if code is None: return False else: _, self._stderr = self._process.communicate() diff --git a/src/biotite/sequence/graphics/plasmid.py b/src/biotite/sequence/graphics/plasmid.py index f15f776f8..c1ebe020d 100644 --- a/src/biotite/sequence/graphics/plasmid.py +++ b/src/biotite/sequence/graphics/plasmid.py @@ -332,7 +332,6 @@ def draw(self, renderer, *args, **kwargs): else: for loc, indicator in zip(feature.locs, indicators_for_feature): # Calculate 
arrow shape parameters - row_center = row_bottom + self._feature_width / 2 row_top = row_bottom + self._feature_width start_ang = _loc_to_rad(loc.first, self._plasmid_size) stop_ang = _loc_to_rad(loc.last, self._plasmid_size) @@ -425,7 +424,6 @@ def set_bbox(self, bbox): def draw(self, renderer, *args, **kwargs): bbox = self._bbox - center_x = (bbox.x0 + bbox.x1) / 2 center_y = (bbox.y0 + bbox.y1) / 2 # Constant absolute width for all arrows @@ -516,14 +514,6 @@ def get_word_angles(self, renderer): ax_px_radius * 2 * np.pi * (self._radius / ax_unit_radius) ) - rad_angle = 360 - np.rad2deg(self._angle) - # Avoid to draw the text upside down, when drawn on the - # bottom half of the map - if rad_angle > 90 and rad_angle < 270: - turn_around = True - else: - turn_around = False - angles = [] for text in self._texts: orig_rot = text.get_rotation() diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py index d128309d1..0a73240dd 100644 --- a/src/biotite/sequence/io/fasta/convert.py +++ b/src/biotite/sequence/io/fasta/convert.py @@ -219,41 +219,29 @@ def set_alignment(fasta_file, alignment, seq_names): def _convert_to_sequence(seq_str, seq_type=None): - # Define preprocessing of preimplemented sequence types - - # Replace selenocysteine with cysteine - # and pyrrolysine with lysine - process_protein_sequence = lambda x: x.upper().replace("U", "C").replace("O", "K") - # For nucleotides uracil is represented by thymine and there is only - # one letter for completely unknown nucleotides - process_nucleotide_sequence = ( - lambda x: x.upper().replace("U", "T").replace("X", "N") - ) - # Set manually selected sequence type - if seq_type is not None: # Do preprocessing as done without manual selection if seq_type == NucleotideSequence: - seq_str = process_nucleotide_sequence(seq_str) + seq_str = _process_nucleotide_sequence(seq_str) elif seq_type == ProteinSequence: if "U" in seq_str: warnings.warn( "ProteinSequence objects do not 
support selenocysteine " "(U), occurrences were substituted by cysteine (C)" ) - seq_str = process_protein_sequence(seq_str) + seq_str = _process_protein_sequence(seq_str) # Return the converted sequence return seq_type(seq_str) # Attempt to automatically determine sequence type try: - return NucleotideSequence(process_nucleotide_sequence(seq_str)) + return NucleotideSequence(_process_nucleotide_sequence(seq_str)) except AlphabetError: pass try: - prot_seq = ProteinSequence(process_protein_sequence(seq_str)) + prot_seq = ProteinSequence(_process_protein_sequence(seq_str)) # Raise Warning after conversion into 'ProteinSequence' # to wait for potential 'AlphabetError' if "U" in seq_str: @@ -269,6 +257,21 @@ def _convert_to_sequence(seq_str, seq_type=None): ) +def _process_protein_sequence(x): + """ + Replace selenocysteine with cysteine and pyrrolysine with lysine. + """ + return x.upper().replace("U", "C").replace("O", "K") + + +def _process_nucleotide_sequence(x): + """ + For nucleotides uracil is represented by thymine and there is only + one letter for completely unknown nucleotides + """ + return x.upper().replace("U", "T").replace("X", "N") + + def _convert_to_string(sequence, as_rna): if not isinstance(sequence.get_alphabet(), LetterAlphabet): raise ValueError( diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py index f72c2c737..e09527c35 100644 --- a/src/biotite/sequence/seqtypes.py +++ b/src/biotite/sequence/seqtypes.py @@ -501,7 +501,6 @@ class ProteinSequence(Sequence): def __init__(self, sequence=()): dict_3to1 = ProteinSequence._dict_3to1 - alph = ProteinSequence.alphabet # Convert 3-letter codes to single letter codes, # if list contains 3-letter codes sequence = [ diff --git a/src/biotite/structure/atoms.py b/src/biotite/structure/atoms.py index ea9f2ef68..d0641a125 100644 --- a/src/biotite/structure/atoms.py +++ b/src/biotite/structure/atoms.py @@ -923,7 +923,7 @@ class AtomArrayStack(_AtomArrayBase): def __init__(self, 
depth, length): super().__init__(length) - if depth == None or length == None: + if depth is None or length is None: self._coord = None else: self._coord = np.full((depth, length, 3), np.nan, dtype=np.float32) diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py index 454b0cdac..21f883c0a 100644 --- a/src/biotite/structure/io/pdbqt/file.py +++ b/src/biotite/structure/io/pdbqt/file.py @@ -149,7 +149,6 @@ def get_remarks(self, model=None): if model is None: # Add exclusive end of file model_start_i = np.concatenate((model_start_i, [len(self.lines)])) - model_i = 0 remarks = [] for i in range(len(model_start_i) - 1): start = model_start_i[i] @@ -453,7 +452,7 @@ def set_structure( # for simple branch determination in '_write_atoms()' atoms.bonds.remove_bonds(rotatable_bonds) - hetero = ["ATOM" if e == False else "HETATM" for e in atoms.hetero] + hetero = ["HETATM" if e else "ATOM" for e in atoms.hetero] if "atom_id" in atoms.get_annotation_categories(): atom_id = atoms.atom_id else: diff --git a/src/biotite/structure/io/pdbx/bcif.py b/src/biotite/structure/io/pdbx/bcif.py index 881911c32..4f3aef3a5 100644 --- a/src/biotite/structure/io/pdbx/bcif.py +++ b/src/biotite/structure/io/pdbx/bcif.py @@ -564,7 +564,7 @@ def serialize(self): return {"dataBlocks": self._serialize_elements("header")} @classmethod - def read(self, file): + def read(cls, file): """ Read a *BinaryCIF* file. diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 790f76f6c..1a74ea1dc 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -676,10 +676,6 @@ def _filter_model(atom_site, model_starts, model): Reduce the ``atom_site`` category to the values for the given model. 
""" - Category = type(atom_site) - Column = Category.subcomponent_class() - Data = Column.subcomponent_class() - # Append exclusive stop model_starts = np.append(model_starts, [atom_site.row_count]) # Indexing starts at 0, but model number starts at 1 diff --git a/src/biotite/structure/resutil.py b/src/biotite/structure/resutil.py index d35e6fb6f..5841346b3 100644 --- a/src/biotite/structure/resutil.py +++ b/src/biotite/structure/resutil.py @@ -32,7 +32,7 @@ def apply_segment_wise(starts, data, function, axis): processed_data = None for i in range(len(starts) - 1): segment = data[starts[i] : starts[i + 1]] - if axis == None: + if axis is None: value = function(segment) else: value = function(segment, axis=axis) diff --git a/tests/application/test_blast.py b/tests/application/test_blast.py index ab36698a6..d9bb69f3a 100644 --- a/tests/application/test_blast.py +++ b/tests/application/test_blast.py @@ -84,14 +84,14 @@ def test_tblastn(): def test_file_input(): path = os.path.join(data_dir("sequence"), "prot.fasta") - app = blast.BlastWebApp("blastp", path, obey_rules=False) + blast.BlastWebApp("blastp", path, obey_rules=False) def test_invalid_query(): with pytest.raises(ValueError): - app = blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False) + blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False) with pytest.raises(ValueError): - app = blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False) + blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False) @pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") diff --git a/tests/database/test_entrez.py b/tests/database/test_entrez.py index 465995fb0..bc1e94f34 100644 --- a/tests/database/test_entrez.py +++ b/tests/database/test_entrez.py @@ -50,7 +50,7 @@ def test_fetch_single_file(as_file_like): def test_fetch_invalid(): with pytest.raises(RequestError): # Empty ID list - file = entrez.fetch_single_file([], None, "protein", "fasta", 
overwrite=True) + entrez.fetch_single_file([], None, "protein", "fasta", overwrite=True) with pytest.raises(RequestError): # Nonexisting ID - file = entrez.fetch("xxxx", None, "fa", "protein", "fasta", overwrite=True) + entrez.fetch("xxxx", None, "fa", "protein", "fasta", overwrite=True) diff --git a/tests/database/test_pubchem.py b/tests/database/test_pubchem.py index c0234f204..ed84809e3 100644 --- a/tests/database/test_pubchem.py +++ b/tests/database/test_pubchem.py @@ -45,7 +45,7 @@ def test_fetch_structural_formula(as_structural_formula): CID = 2244 mol_file = mol.MOLFile.read( - pubchem.fetch(2244, as_structural_formula=as_structural_formula) + pubchem.fetch(CID, as_structural_formula=as_structural_formula) ) atoms = mol_file.get_structure() diff --git a/tests/database/test_uniprot.py b/tests/database/test_uniprot.py index 09a2ee15b..7af70393a 100644 --- a/tests/database/test_uniprot.py +++ b/tests/database/test_uniprot.py @@ -41,7 +41,7 @@ def test_fetch(as_file_like): @pytest.mark.parametrize("format", ["fasta", "gff", "txt", "xml", "rdf", "tab"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - file = uniprot.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) + uniprot.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) @pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 5916b23d9..570878945 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -22,7 +22,7 @@ def test_matrices(db_entry): """ alph1 = seq.ProteinSequence.alphabet alph2 = seq.ProteinSequence.alphabet - matrix = align.SubstitutionMatrix(alph1, alph2, db_entry) + align.SubstitutionMatrix(alph1, alph2, db_entry) def test_matrix_str(): diff --git a/tests/sequence/align/test_statistics.py b/tests/sequence/align/test_statistics.py index 5d9f5a13a..cb0840a16 100644 --- 
a/tests/sequence/align/test_statistics.py +++ b/tests/sequence/align/test_statistics.py @@ -190,4 +190,4 @@ def test_invalid_scoring_scheme(): freq = np.ones(len(alph)) with pytest.raises(ValueError): - estimator = EValueEstimator.from_samples(alph, matrix, -10, freq) + EValueEstimator.from_samples(alph, matrix, -10, freq) diff --git a/tests/sequence/test_codon.py b/tests/sequence/test_codon.py index 8f0a799e8..fe8d38eb4 100644 --- a/tests/sequence/test_codon.py +++ b/tests/sequence/test_codon.py @@ -36,7 +36,7 @@ ], ) def test_table_load(table_id): - table = seq.CodonTable.load(table_id) + seq.CodonTable.load(table_id) def test_table_indexing(): diff --git a/tests/sequence/test_fasta.py b/tests/sequence/test_fasta.py index c4de5c944..68133f44b 100644 --- a/tests/sequence/test_fasta.py +++ b/tests/sequence/test_fasta.py @@ -165,7 +165,6 @@ def test_write_iter(chars_per_line, n_sequences): random sequences. """ LENGTH_RANGE = (50, 150) - SCORE_RANGE = (10, 60) # Generate random sequences and scores np.random.seed(0) diff --git a/tests/sequence/test_gff.py b/tests/sequence/test_gff.py index 392a6e32a..0713a8324 100644 --- a/tests/sequence/test_gff.py +++ b/tests/sequence/test_gff.py @@ -161,8 +161,8 @@ def test_percent_encoding(): } file2 = gff.GFFFile() - file.append(seqid, source, type, start, end, score, strand, phase, attrib) - assert (seqid, source, type, start, end, score, strand, phase, attrib) == file[0] + file2.append(seqid, source, type, start, end, score, strand, phase, attrib) + assert (seqid, source, type, start, end, score, strand, phase, attrib) == file2[0] def test_error(): diff --git a/tests/sequence/test_phylo.py b/tests/sequence/test_phylo.py index 385a7157c..0943d6002 100644 --- a/tests/sequence/test_phylo.py +++ b/tests/sequence/test_phylo.py @@ -145,7 +145,6 @@ def test_distances(tree): def test_get_leaves(tree): # Manual example cases - node = tree.leaves[6] assert set(tree.leaves[6].parent.get_indices()) == set( [6, 11, 2, 3, 13, 8, 14, 5, 0, 
15, 16] ) diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py index 3b8fc55e3..e817c82d2 100644 --- a/tests/structure/test_basepairs.py +++ b/tests/structure/test_basepairs.py @@ -181,19 +181,19 @@ def test_map_nucleotide(): # pyrimidine/purine references psu_tuple = struc.map_nucleotide(residue("PSU")) assert psu_tuple[0] in pyrimidines - assert psu_tuple[1] == False + assert psu_tuple[1] is False psu_tuple = struc.map_nucleotide(residue("3MC")) assert psu_tuple[0] in pyrimidines - assert psu_tuple[1] == False + assert psu_tuple[1] is False i_tuple = struc.map_nucleotide(residue("I")) assert i_tuple[0] in purines - assert i_tuple[1] == False + assert i_tuple[1] is False m7g_tuple = struc.map_nucleotide(residue("M7G")) assert m7g_tuple[0] in purines - assert m7g_tuple[1] == False + assert m7g_tuple[1] is False with pytest.warns(struc.IncompleteStructureWarning): assert struc.map_nucleotide(residue("ALA")) == (None, False) diff --git a/tests/structure/test_geometry.py b/tests/structure/test_geometry.py index aeea36000..d5ab03fdc 100644 --- a/tests/structure/test_geometry.py +++ b/tests/structure/test_geometry.py @@ -201,7 +201,6 @@ def test_index_distance_periodic_triclinic(shift, angles): traj.unitcell_vectors = array.box[np.newaxis, :, :] / 10 # Nanometers to Angstrom mdtraj_dist = mdtraj.compute_distances(traj, dist_indices)[0] * 10 - ind = np.where(~np.isclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3))[0] assert np.allclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3) # Compare with shifted variant diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py index 06c80f923..2fbf8afe8 100644 --- a/tests/structure/test_info.py +++ b/tests/structure/test_info.py @@ -72,7 +72,7 @@ def test_protOr_radii(): for res_name, atom_name in zip(array.res_name, array.atom_name): radius = strucinfo.vdw_radius_protor(res_name, atom_name) assert isinstance(radius, float) - assert radius != None + assert radius is not None def 
test_protor_radii_invalid(): @@ -83,7 +83,7 @@ def test_protor_radii_invalid(): # Expect raised exception when a residue does not contain an atom strucinfo.vdw_radius_protor("ALA", "K") # For all other unknown radii expect None - assert strucinfo.vdw_radius_protor("HOH", "O") == None + assert strucinfo.vdw_radius_protor("HOH", "O") is None def test_single_radii(): diff --git a/tests/structure/test_molecules.py b/tests/structure/test_molecules.py index 91447febc..6880cd8cd 100644 --- a/tests/structure/test_molecules.py +++ b/tests/structure/test_molecules.py @@ -95,7 +95,7 @@ def test_get_molecule_masks(array, as_stack, as_bonds): for i in range(len(test_masks)): # Assert that the mask is 'True' for all indices # and that these 'True' values are the only ones in the mask - assert (test_masks[i, ref_indices[i]] == True).all() + assert test_masks[i, ref_indices[i]].all() assert np.count_nonzero(test_masks[i]) == len(ref_indices[i]) diff --git a/tests/test_init.py b/tests/test_init.py deleted file mode 100644 index 39e9617e2..000000000 --- a/tests/test_init.py +++ /dev/null @@ -1,12 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. 
- -__author__ = "Daniel Bauer" - -import biotite - - -def test_version_number(): - version = biotite.__version__ - assert hasattr(biotite, "__version__") From 85eaf6196dddb29e05ff6bfbbb760d505f17222a Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Wed, 3 Jul 2024 11:30:01 +0200 Subject: [PATCH 9/9] Fix remaining linter complaints --- doc/apidoc.py | 2 +- doc/bibliography.py | 2 +- doc/conf.py | 1 - .../misc/local_alignment_statistics.py | 36 ++++---- .../modeling/water_exchange_noexec.py | 4 +- .../structure/protein/peptide_assembly.py | 83 +------------------ doc/scraper.py | 4 +- doc/switcher.py | 2 +- setup_ccd.py | 19 ++--- src/biotite/__init__.py | 2 +- src/biotite/application/muscle/app3.py | 3 +- src/biotite/sequence/align/alignment.py | 5 +- src/biotite/sequence/codon.py | 4 +- src/biotite/sequence/graphics/plasmid.py | 4 +- src/biotite/sequence/io/genbank/annotation.py | 2 +- src/biotite/sequence/io/genbank/file.py | 4 +- src/biotite/structure/__init__.py | 2 +- src/biotite/structure/chains.py | 9 +- src/biotite/structure/filter.py | 4 +- src/biotite/structure/io/mol/sdf.py | 8 +- src/biotite/structure/io/pdbx/cif.py | 8 +- src/biotite/structure/io/pdbx/component.py | 6 +- src/biotite/structure/io/pdbx/convert.py | 2 +- src/biotite/structure/residues.py | 9 +- .../structure/{resutil.py => segments.py} | 0 src/biotite/visualize.py | 4 +- tests/structure/test_basepairs.py | 14 ++-- tests/structure/test_info.py | 2 +- tests/structure/test_mol.py | 6 +- tests/structure/test_pdbx.py | 8 +- tests/structure/test_sasa.py | 6 +- 31 files changed, 102 insertions(+), 163 deletions(-) rename src/biotite/structure/{resutil.py => segments.py} (100%) diff --git a/doc/apidoc.py b/doc/apidoc.py index 07c152620..8f10f0923 100644 --- a/doc/apidoc.py +++ b/doc/apidoc.py @@ -3,7 +3,7 @@ # information. 
__author__ = "Patrick Kunzmann" -__all__ = ["create_api_doc", "skip_non_methods"] +__all__ = ["create_api_doc", "skip_nonrelevant"] import enum import json diff --git a/doc/bibliography.py b/doc/bibliography.py index 2d9093adf..cf44587cc 100644 --- a/doc/bibliography.py +++ b/doc/bibliography.py @@ -74,6 +74,6 @@ def format_article(self, param): return Text(*authors, title, journal, volume, pages, date, doi) - except: + except Exception: warnings.warn(f"Invalid BibTeX entry '{entry.key}'") return Text(entry.key) diff --git a/doc/conf.py b/doc/conf.py index e3a785800..7f19bc67c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -190,7 +190,6 @@ # Do not run example scripts with a trailing '_noexec' "filename_pattern": "^((?!_noexec).)*$", "ignore_pattern": r"(.*ignore\.py)|(.*pymol\.py)", - "backreferences_dir": None, "download_all_examples": False, # Never report run time "min_reported_time": sys.maxsize, diff --git a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py index 5c576c651..aa3fd533e 100644 --- a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py +++ b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py @@ -106,9 +106,9 @@ # The probability density function of the extreme value distribution -def pdf(x, l, u): - t = np.exp(-l * (x - u)) - return l * t * np.exp(-t) +def pdf(x, lam, u): + t = np.exp(-lam * (x - u)) + return lam * t * np.exp(-t) x = np.linspace(-5, 10, 1000) @@ -240,15 +240,15 @@ def pdf(x, l, u): # respectively. 
# Use method of moments to estimate distribution parameters -l = np.pi / np.sqrt(6 * np.var(sample_scores)) -u = np.mean(sample_scores) - np.euler_gamma / l +lam = np.pi / np.sqrt(6 * np.var(sample_scores)) +u = np.mean(sample_scores) - np.euler_gamma / lam # Score frequencies for the histogram freqs = np.bincount(sample_scores) / SAMPLE_SIZE # Coordinates for the fit x = np.linspace(0, len(freqs) - 1, 1000) -y = pdf(x, l, u) +y = pdf(x, lam, u) fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.scatter( @@ -291,7 +291,7 @@ def pdf(x, l, u): # The sequence lengths to be sampled length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE).astype(int) u_series = np.zeros(LENGTH_SAMPLE_SIZE) -l_series = np.zeros(LENGTH_SAMPLE_SIZE) +lam_series = np.zeros(LENGTH_SAMPLE_SIZE) for i, length in enumerate(length_samples): # The same procedure from above random_sequence_code = np.random.choice( @@ -311,8 +311,8 @@ def pdf(x, l, u): )[0] scores[j] = sample_alignment.score - l_series[i] = np.pi / np.sqrt(6 * np.var(scores)) - u_series[i] = np.mean(scores) - np.euler_gamma / l_series[i] + lam_series[i] = np.pi / np.sqrt(6 * np.var(scores)) + u_series[i] = np.mean(scores) - np.euler_gamma / lam_series[i] ######################################################################## # Now we use a linear fit of :math:`u` to check if there is a linear @@ -325,8 +325,8 @@ def pdf(x, l, u): slope, intercept, r, _, _ = linregress(ln_mn, u_series) # More precise parameter estimation from fit -l = 1 / slope -k = np.exp(intercept * l) +lam = 1 / slope +k = np.exp(intercept * lam) # Coordinates for fit x_fit = np.linspace(0, 16, 100) @@ -347,12 +347,12 @@ def pdf(x, l, u): ) ax2 = ax.twinx() -ax2.scatter(ln_mn, l_series, color=biotite.colors["lightgreen"], s=8) -ax2.axhline(l, color=biotite.colors["darkgreen"], linestyle=":") +ax2.scatter(ln_mn, lam_series, color=biotite.colors["lightgreen"], s=8) +ax2.axhline(lam, color=biotite.colors["darkgreen"], linestyle=":") x_annot = 2 
ax2.annotate( - f"λ = {l:.3f}", - xy=(x_annot, l), + f"λ = {lam:.3f}", + xy=(x_annot, lam), xytext=(0, -50), textcoords="offset pixels", arrowprops=arrowprops, @@ -438,11 +438,11 @@ def pdf(x, l, u): DATABASE_SIZE = 1_000_000 -def e_value(score, length1, length2, k, l): - return k * length1 * length2 * np.exp(-l * score) +def e_value(score, length1, length2, k, lam): + return k * length1 * length2 * np.exp(-lam * score) -e = e_value(alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, l) +e = e_value(alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, lam) print(f"E-value = {e:.2e}") ######################################################################## diff --git a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py index d5566aac4..f62ca18ee 100644 --- a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py +++ b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py @@ -27,6 +27,7 @@ import matplotlib.pyplot as plt import numpy as np +from pylab import polyfit import biotite import biotite.structure as struct import biotite.structure.io.gro as gro @@ -83,12 +84,9 @@ def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)): # Linear fitting -from pylab import polyfit - open_fit = polyfit(time, counts[0], 1) closed_fit = polyfit(time, counts[1], 1) - fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.plot(time, counts[0], label="open pore", color=biotite.colors["dimgreen"]) ax.plot( diff --git a/doc/examples/scripts/structure/protein/peptide_assembly.py b/doc/examples/scripts/structure/protein/peptide_assembly.py index 71ed3469e..de9f24704 100644 --- a/doc/examples/scripts/structure/protein/peptide_assembly.py +++ b/doc/examples/scripts/structure/protein/peptide_assembly.py @@ -40,84 +40,7 @@ # Reference peptide bond atom coordinates taken from 1l2y: # CA, C, N, O, H -peptide_coord = np.array( - [ - [-8.608, 3.135, -1.618], - [-7.117, 
2.964, -1.897], - [-6.379, 4.031, -2.228], - [-6.634, 1.849, -1.758], - [-6.821, 4.923, -2.394], - ] -) - - -def create_raw_backbone_coord(number_of_res): - """ - Create coordinates for straight peptide chain in z-plane. - The peptide bonds are in trans configuration. - """ - coord = np.zeros((number_of_res * 3, 3)) - for i, angle, angle_direction, length in zip( - range(len(coord)), - itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]), - itertools.cycle([1, -1]), - itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]), - ): - if i == 0: - coord[i] = [0, 0, 0] - elif i == 1: - coord[i] = [0, length, 0] - else: - # Rotate about z-axis -> backbone lies in z-plane - rot_axis = [0, 0, angle_direction] - # Calculate the coordinates of a new atoms by rotating the previous - # bond by the given angle - new_coord = struc.rotate_about_axis( - coord[i - 2], - axis=rot_axis, - angle=np.deg2rad(angle), - support=coord[i - 1], - ) - # Scale bond to correct bond length - bond_vector = new_coord - coord[i - 1] - coord[i] = coord[i - 1] + bond_vector * length / norm(bond_vector) - return coord - - -def append_residue(chain, residue): - """ - Append a residue to an existing chain. - Modify annotation arrays and remove atoms as necessary. - The atom coordinates are not altered. 
- """ - if chain.array_length() == 0: - # Chain is empty - residue.res_id[:] = 1 - return residue - - last_res_id = chain.res_id[-1] - - # Remove atoms removed by peptide bond - chain = chain[ - (chain.res_id != last_res_id) | ~np.isin(chain.atom_name, ["OXT", "HXT"]) - ] - residue = residue[~np.isin(residue.atom_name, ["H2", "H3"])] - - # Increment residue ID for attached residue - residue.res_id[:] = last_res_id + 1 - - -C_N_LENGTH = 1.34 -N_CA_LENGTH = 1.46 -CA_C_LENGTH = 1.54 - -CA_C_N_ANGLE = 114 -C_N_CA_ANGLE = 123 -N_CA_C_ANGLE = 110 - -# Reference peptide bond atom coordinates taken from 1l2y: -# CA, C, N, O, H -peptide_coord = np.array( +PEPTIDE_COORD = np.array( [ [-8.608, 3.135, -1.618], [-7.117, 2.964, -1.897], @@ -221,9 +144,9 @@ def assemble_peptide(sequence): for atom_name in ["N", "H"] ] _, transformation = struc.superimpose( - chain.coord[[ca_i, c_i, n_i]], peptide_coord[:3] + chain.coord[[ca_i, c_i, n_i]], PEPTIDE_COORD[:3] ) - chain.coord[[o_i, h_i]] = transformation.apply(peptide_coord[3:]) + chain.coord[[o_i, h_i]] = transformation.apply(PEPTIDE_COORD[3:]) return chain diff --git a/doc/scraper.py b/doc/scraper.py index e21c52048..c9fd629ce 100644 --- a/doc/scraper.py +++ b/doc/scraper.py @@ -63,8 +63,8 @@ def pymol_scraper(block, block_vars, gallery_conf): ) try: - import ammolite - import pymol + import ammolite # noqa: F401 + import pymol # noqa: F401 except ImportError: # If Ammolite is not installed, fall back to the image file, # if already existing diff --git a/doc/switcher.py b/doc/switcher.py index e8fcb7208..095d30a85 100644 --- a/doc/switcher.py +++ b/doc/switcher.py @@ -3,7 +3,7 @@ # information. 
__author__ = "Patrick Kunzmann" -__all__ = ["create_api_doc", "skip_non_methods"] +__all__ = ["create_switcher_json"] import json import re diff --git a/setup_ccd.py b/setup_ccd.py index fba5d3ab2..a3351c205 100644 --- a/setup_ccd.py +++ b/setup_ccd.py @@ -2,12 +2,13 @@ import logging from dataclasses import dataclass from io import StringIO +from pathlib import Path import numpy as np import requests from biotite.structure.io.pdbx import * -class ComponentException(Exception): +class ComponentError(Exception): pass @@ -303,14 +304,14 @@ def check_presence(pdbx_file, category_name, column_names): is_present = column_names[0] in category for name in column_names: if (name in category) != is_present: - raise ComponentException("Only some column names are missing") + raise ComponentError("Only some column names are missing") if not is_present: return is_unmasked = category[column_names[0]].mask is None for name in column_names: if (category[name].mask is None) != is_unmasked: - raise ComponentException("Only some column names are masked") + raise ComponentError("Only some column names are masked") def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): @@ -337,7 +338,7 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): for comp_id, block in pdbx_file.items(): try: if category_name not in block: - raise ComponentException(f"Block has no category '{category_name}'") + raise ComponentError(f"Block has no category '{category_name}'") chunk = {} category = block[category_name] for col_name, info in column_infos.items(): @@ -348,17 +349,15 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): if info.alternative is not None: col = category[info.alternative] if col.mask is not None: - raise ComponentException( + raise ComponentError( f"Missing values in alternative " f"'{info.alternative}'" ) else: - raise ComponentException( - f"Missing values in column '{col_name}'" - ) + raise 
ComponentError(f"Missing values in column '{col_name}'") data_array = col.as_array(info.dtype, info.fill_value) chunk[col_name] = data_array - except ComponentException as e: + except ComponentError as e: logging.warning(f"Skipping '{comp_id}': {e}") # Append all columns in the chunk after the try-except block # to avoid appending incomplete chunks @@ -472,6 +471,4 @@ def setup_ccd(target_diriectory): compressed_file.write(target_diriectory / "components.bcif") -from pathlib import Path - setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") diff --git a/src/biotite/__init__.py b/src/biotite/__init__.py index 8b80e11ea..653caf8f5 100644 --- a/src/biotite/__init__.py +++ b/src/biotite/__init__.py @@ -14,5 +14,5 @@ from .copyable import * from .file import * -from .version import __version__, __version_tuple__ +from .version import __version__, __version_tuple__ # noqa: F401 from .visualize import * diff --git a/src/biotite/application/muscle/app3.py b/src/biotite/application/muscle/app3.py index 0bb05bc4b..60118966a 100644 --- a/src/biotite/application/muscle/app3.py +++ b/src/biotite/application/muscle/app3.py @@ -10,6 +10,7 @@ import re import subprocess import warnings +from collections.abc import Sequence from tempfile import NamedTemporaryFile from biotite.application.application import AppState, VersionError, requires_state from biotite.application.localapp import cleanup_tempfile @@ -136,7 +137,7 @@ def set_gap_penalty(self, gap_penalty): raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty self._gap_ext = gap_penalty - elif type(gap_penalty) == tuple: + elif isinstance(gap_penalty, Sequence): if gap_penalty[0] > 0 or gap_penalty[1] > 0: raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty[0] diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index f29ac0a50..d33e3d051 100644 --- a/src/biotite/sequence/align/alignment.py +++ 
b/src/biotite/sequence/align/alignment.py @@ -7,6 +7,7 @@ import numbers import textwrap +from collections.abc import Sequence import numpy as np from biotite.sequence.alphabet import LetterAlphabet @@ -519,10 +520,10 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True): score += matrix[code_i, code_j] # Sum gap penalties - if type(gap_penalty) == int: + if isinstance(gap_penalty, numbers.Real): gap_open = gap_penalty gap_ext = gap_penalty - elif type(gap_penalty) == tuple: + elif isinstance(gap_penalty, Sequence): gap_open = gap_penalty[0] gap_ext = gap_penalty[1] else: diff --git a/src/biotite/sequence/codon.py b/src/biotite/sequence/codon.py index 5e7f59343..13a5d64d8 100644 --- a/src/biotite/sequence/codon.py +++ b/src/biotite/sequence/codon.py @@ -397,11 +397,11 @@ def load(table_name): for line in lines: if not line: table_found = False - if type(table_name) == int and line.startswith("id"): + if isinstance(table_name, Integral) and line.startswith("id"): # remove identifier 'id' if table_name == int(line[2:]): table_found = True - elif type(table_name) == str and line.startswith("name"): + elif isinstance(table_name, str) and line.startswith("name"): # Get list of table names from lines # (separated with ';') # remove identifier 'name' diff --git a/src/biotite/sequence/graphics/plasmid.py b/src/biotite/sequence/graphics/plasmid.py index c1ebe020d..08972fce9 100644 --- a/src/biotite/sequence/graphics/plasmid.py +++ b/src/biotite/sequence/graphics/plasmid.py @@ -252,7 +252,7 @@ def __init__( bbox = Bbox.from_extents(0, 0, 0, 0) # Draw features as curved arrows (feature indicator) indicator = axes.add_artist( - Feature_Indicator( + FeatureIndicator( axes, self.zorder + 1, feature, @@ -340,7 +340,7 @@ def draw(self, renderer, *args, **kwargs): ) indicator.set_bbox(bbox) - class Feature_Indicator(Artist): + class FeatureIndicator(Artist): def __init__( self, axes, diff --git a/src/biotite/sequence/io/genbank/annotation.py 
b/src/biotite/sequence/io/genbank/annotation.py index 4300c41a4..223a67ddb 100644 --- a/src/biotite/sequence/io/genbank/annotation.py +++ b/src/biotite/sequence/io/genbank/annotation.py @@ -88,7 +88,7 @@ def get_annotation(gb_file, include_only=None): loc_string = qualifier_parts.pop(0).strip() try: locs = _parse_locs(loc_string) - except: + except Exception: warnings.warn( f"'{loc_string}' is an unsupported location identifier, " f"skipping feature" diff --git a/src/biotite/sequence/io/genbank/file.py b/src/biotite/sequence/io/genbank/file.py index d76a4b63d..0fdd99c63 100644 --- a/src/biotite/sequence/io/genbank/file.py +++ b/src/biotite/sequence/io/genbank/file.py @@ -299,11 +299,11 @@ def __getitem__(self, index): subfield_dict = OrderedDict() subfield_start = None first_subfield_start = None + header = None for i in range(start + 1, stop): line = self.lines[i] - # Check if line contains a new subfield - # (Header beginning from first column) if len(line) != 0 and line[:12].strip() != "": + # New header -> new subfield if first_subfield_start is None: first_subfield_start = i # Store previous subfield diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py index 6349a3d85..df9776324 100644 --- a/src/biotite/structure/__init__.py +++ b/src/biotite/structure/__init__.py @@ -129,4 +129,4 @@ from .sse import * from .superimpose import * from .transform import * -# util and resutil are used internally +# util and segments are used internally diff --git a/src/biotite/structure/chains.py b/src/biotite/structure/chains.py index f38ccc05a..c4bbd4996 100644 --- a/src/biotite/structure/chains.py +++ b/src/biotite/structure/chains.py @@ -23,7 +23,14 @@ ] import numpy as np -from biotite.structure.resutil import * +from biotite.structure.segments import ( + apply_segment_wise, + get_segment_masks, + get_segment_positions, + get_segment_starts_for, + segment_iter, + spread_segment_wise, +) def get_chain_starts(array, add_exclusive_stop=False): diff 
--git a/src/biotite/structure/filter.py b/src/biotite/structure/filter.py index dae30249f..c6e4aefd6 100644 --- a/src/biotite/structure/filter.py +++ b/src/biotite/structure/filter.py @@ -491,7 +491,7 @@ def filter_first_altloc(atoms, altloc_ids): # And filter all atoms for each residue with the first altloc ID residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) for start, stop in zip(residue_starts[:-1], residue_starts[1:]): - letter_altloc_ids = [l for l in altloc_ids[start:stop] if l.isalpha()] + letter_altloc_ids = [loc for loc in altloc_ids[start:stop] if loc.isalpha()] if len(letter_altloc_ids) > 0: first_id = letter_altloc_ids[0] altloc_filter[start:stop] |= altloc_ids[start:stop] == first_id @@ -572,7 +572,7 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies): occupancies_in_res = occupancies[start:stop] altloc_ids_in_res = altloc_ids[start:stop] - letter_altloc_ids = [l for l in altloc_ids_in_res if l.isalpha()] + letter_altloc_ids = [loc for loc in altloc_ids_in_res if loc.isalpha()] if len(letter_altloc_ids) > 0: highest = -1.0 diff --git a/src/biotite/structure/io/mol/sdf.py b/src/biotite/structure/io/mol/sdf.py index 53e4c3efb..2048a482a 100644 --- a/src/biotite/structure/io/mol/sdf.py +++ b/src/biotite/structure/io/mol/sdf.py @@ -392,7 +392,7 @@ def header(self): if isinstance(self._header, str): try: self._header = Header.deserialize(self._header) - except: + except Exception: raise DeserializationError("Failed to deserialize header") return self._header @@ -410,7 +410,7 @@ def metadata(self): if isinstance(self._metadata, str): try: self._metadata = Metadata.deserialize(self._metadata) - except: + except Exception: raise DeserializationError("Failed to deserialize metadata") return self._metadata @@ -780,7 +780,7 @@ def serialize(self): else: try: text_blocks.append(record.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize record '{record_name}'" ) @@ -839,7 +839,7 @@ def 
__getitem__(self, key): # -> must be deserialized first try: record = SDRecord.deserialize(record) - except: + except Exception: raise DeserializationError(f"Failed to deserialize record '{key}'") # Update with deserialized object self._records[key] = record diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index 33e4ba9cf..25cd91387 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -616,7 +616,7 @@ def serialize(self): try: category.name = category_name text_blocks.append(category.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize category '{category_name}'" ) @@ -639,7 +639,7 @@ def __getitem__(self, key): else: expect_whitespace = True category = CIFCategory.deserialize(category, expect_whitespace) - except: + except Exception: raise DeserializationError(f"Failed to deserialize category '{key}'") # Update with deserialized object self._categories[key] = category @@ -788,7 +788,7 @@ def serialize(self): else: try: text_blocks.append(block.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize block '{block_name}'" ) @@ -848,7 +848,7 @@ def __getitem__(self, key): # -> must be deserialized first try: block = CIFBlock.deserialize(block) - except: + except Exception: raise DeserializationError(f"Failed to deserialize block '{key}'") # Update with deserialized object self._blocks[key] = block diff --git a/src/biotite/structure/io/pdbx/component.py b/src/biotite/structure/io/pdbx/component.py index ce3219ce5..fb2f228ed 100644 --- a/src/biotite/structure/io/pdbx/component.py +++ b/src/biotite/structure/io/pdbx/component.py @@ -181,7 +181,7 @@ def _serialize_elements(self, store_key_in=None): if isinstance(element, self.subcomponent_class()): try: serialized_element = element.serialize() - except: + except Exception: raise SerializationError(f"Failed to serialize element '{key}'") else: # Element is already 
stored in serialized form @@ -198,7 +198,7 @@ def __getitem__(self, key): # -> must be deserialized first try: element = self.subcomponent_class().deserialize(element) - except: + except Exception: raise DeserializationError(f"Failed to deserialize element '{key}'") # Update container with deserialized object self._elements[key] = element @@ -216,7 +216,7 @@ def __setitem__(self, key, element): else: try: element = self.subcomponent_class().deserialize(element) - except: + except Exception: raise DeserializationError("Failed to deserialize given value") self._elements[key] = element diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 1a74ea1dc..d514ed87c 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -895,7 +895,7 @@ def _determine_entity_id(chain_id): for i in range(len(chain_id)): try: entity_id[i] = id_translation[chain_id[i]] - except: + except KeyError: # chain_id is not in dictionary -> new entry id_translation[chain_id[i]] = id entity_id[i] = id_translation[chain_id[i]] diff --git a/src/biotite/structure/residues.py b/src/biotite/structure/residues.py index 9820a0550..61ae1712a 100644 --- a/src/biotite/structure/residues.py +++ b/src/biotite/structure/residues.py @@ -22,7 +22,14 @@ ] import numpy as np -from biotite.structure.resutil import * +from biotite.structure.segments import ( + apply_segment_wise, + get_segment_masks, + get_segment_positions, + get_segment_starts_for, + segment_iter, + spread_segment_wise, +) def get_residue_starts(array, add_exclusive_stop=False): diff --git a/src/biotite/structure/resutil.py b/src/biotite/structure/segments.py similarity index 100% rename from src/biotite/structure/resutil.py rename to src/biotite/structure/segments.py diff --git a/src/biotite/visualize.py b/src/biotite/visualize.py index f45b7815f..eb7444c54 100644 --- a/src/biotite/visualize.py +++ b/src/biotite/visualize.py @@ -85,11 +85,11 @@ def 
__init__(self, text, width, height, mode): self._width = width self._height = height - def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): + def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): # noqa: N803 ax = self._text.axes try: renderer = ax.get_figure().canvas.get_renderer() - except: + except Exception: # Use cached renderer for backends, where # `get_renderer()` is not available # Based on the strategy from `Text.get_window_extent()` diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py index e817c82d2..b11b78ce5 100644 --- a/tests/structure/test_basepairs.py +++ b/tests/structure/test_basepairs.py @@ -100,8 +100,8 @@ def test_base_pairs_reverse(nuc_sample_array, basepairs, unique_bool): # Reverse sequence of residues in nuc_sample_array reversed_nuc_sample_array = struc.AtomArray(0) - for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)): - reversed_nuc_sample_array = reversed_nuc_sample_array + residue + for res in reversed_iterator(struc.residue_iter(nuc_sample_array)): + reversed_nuc_sample_array = reversed_nuc_sample_array + res computed_basepairs = struc.base_pairs(reversed_nuc_sample_array, unique=unique_bool) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) @@ -117,8 +117,8 @@ def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs): nuc_sample_array = nuc_sample_array[nuc_sample_array.element != "H"] # Reverse sequence of residues in nuc_sample_array reversed_nuc_sample_array = struc.AtomArray(0) - for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)): - reversed_nuc_sample_array = reversed_nuc_sample_array + residue + for res in reversed_iterator(struc.residue_iter(nuc_sample_array)): + reversed_nuc_sample_array = reversed_nuc_sample_array + res computed_basepairs = struc.base_pairs(reversed_nuc_sample_array) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) @@ -154,10 +154,10 @@ def 
test_base_pairs_reordered(nuc_sample_array, seed): nuc_sample_array_reordered = struc.AtomArray(0) np.random.seed(seed) - for residue in struc.residue_iter(nuc_sample_array): - bound = residue.array_length() + for res in struc.residue_iter(nuc_sample_array): + bound = res.array_length() indices = np.random.choice(np.arange(bound), bound, replace=False) - nuc_sample_array_reordered += residue[..., indices] + nuc_sample_array_reordered += res[..., indices] assert np.all( struc.base_pairs(nuc_sample_array) diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py index 2fbf8afe8..2d823aaf1 100644 --- a/tests/structure/test_info.py +++ b/tests/structure/test_info.py @@ -58,7 +58,7 @@ def test_mass(): assert np.all(np.round(multiple_of_h_masses, decimals=2) % 1 == 0) -def test_protOr_radii(): +def test_protor_radii(): """ Assert that ProtOr VdW radii (except hydrogen) can be calculated for all atoms in the given structure, since the structure (1GYA) diff --git a/tests/structure/test_mol.py b/tests/structure/test_mol.py index cd7efa729..ce4378e86 100644 --- a/tests/structure/test_mol.py +++ b/tests/structure/test_mol.py @@ -84,7 +84,11 @@ def test_header_conversion(): ), ) def test_structure_conversion( - FileClass, path, version, omit_charge, use_charge_property + FileClass, # noqa: N803 + path, + version, + omit_charge, + use_charge_property, ): """ After reading a file, writing the structure back to a new file diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py index ba3566b17..7a02960f5 100644 --- a/tests/structure/test_pdbx.py +++ b/tests/structure/test_pdbx.py @@ -495,7 +495,7 @@ def test_bcif_encoding(): test_msgpack = column.serialize() assert test_msgpack == ref_msgpack - except: + except Exception: raise Exception(f"Encoding failed for '{category_name}.{column_name}'") # Check if each encoding was used at least once @@ -503,7 +503,7 @@ def test_bcif_encoding(): for key, was_used in encodings_used.items(): try: assert was_used 
- except: + except Exception: raise Exception(f"Encoding {key} was not used") @@ -571,7 +571,7 @@ def test_bcif_cif_consistency(): assert cif_column.as_array(dtype).tolist() == pytest.approx( bcif_column.as_array(dtype).tolist() ) - except: + except Exception: raise Exception( f"Comparison failed for '{category_name}.{column_name}'" ) @@ -614,7 +614,7 @@ def test_serialization_consistency(format, create_new_encoding): try: for key in test_category.keys(): assert ref_category[key] == test_category[key] - except: + except Exception: raise Exception(f"Comparison failed for '{category_name}.{key}'") diff --git a/tests/structure/test_sasa.py b/tests/structure/test_sasa.py index ed6e0370b..a5f9d0171 100644 --- a/tests/structure/test_sasa.py +++ b/tests/structure/test_sasa.py @@ -24,10 +24,12 @@ def test_single(pdb_id): sasa = struc.sasa(array, vdw_radii="Single", point_number=5000) import mdtraj - from biotite.structure.info.radii import _SINGLE_RADII as radii + from biotite.structure.info.radii import _SINGLE_RADII as SINGLE_RADII # Use the same atom radii - radii = {element.capitalize(): radius / 10 for element, radius in radii.items()} + radii = { + element.capitalize(): radius / 10 for element, radius in SINGLE_RADII.items() + } traj = mdtraj.load(file_name) # Conversion from nm^2 to A^2 sasa_exp = (