diff --git a/.github/workflows/test_and_deploy.yml b/.github/workflows/test_and_deploy.yml index 7c2b97591..09f7c5718 100644 --- a/.github/workflows/test_and_deploy.yml +++ b/.github/workflows/test_and_deploy.yml @@ -31,6 +31,22 @@ env: jobs: + lint: + name: Check code style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install ruff + run: pip install ruff==0.5.2 + - name: Check code formatting + run: ruff format --diff + - name: Lint code base + run: ruff check + + generate-wheels-matrix: name: "Generate wheels matrix" runs-on: "ubuntu-latest" @@ -211,6 +227,10 @@ jobs: steps: - uses: actions/checkout@v4 + with: + # Make sure to fetch the latest tag, so 'switcher.py' works correctly + fetch-depth: 0 + fetch-tags: true - uses: conda-incubator/setup-miniconda@v2 with: environment-file: environment.yml @@ -245,6 +265,7 @@ jobs: permissions: contents: write needs: + - lint - test-and-build - make-sdist - test-interfaces diff --git a/.gitignore b/.gitignore index b76f59e63..037c3f92e 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,6 @@ htmlcov # Ignore all compiled python files (e.g. from running the unit tests) *.pyc *.pyo -*.py{} *.py-e # Ignore potential directory created during install diff --git a/README.rst b/README.rst index 57a2e818f..5079d12f8 100644 --- a/README.rst +++ b/README.rst @@ -21,7 +21,7 @@ This package bundles popular tasks in computational molecular biology into a uniform *Python* library. It can handle a major part of the typical workflow for sequence and biomolecular structure data: - + - Searching and fetching data from biological databases - Reading and writing popular sequence/structure file formats - Analyzing and editing sequence/structure data @@ -57,7 +57,6 @@ Installation Some functions require some extra packages: - - **mdtraj** - Required for trajetory file I/O operations. - **matplotlib** - Required for plotting purposes. *Biotite* can be installed via *Conda*... diff --git a/doc/apidoc.json b/doc/apidoc.json index a3309012c..b68573551 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -4,10 +4,6 @@ "File", "TextFile" ], - "Temporary files" : [ - "temp_dir", - "temp_file" - ], "Visualization utilities":[ "set_font_size_in_coord", "AdaptiveFancyArrow" @@ -260,8 +256,7 @@ "superimpose", "superimpose_homologs", "superimpose_without_outliers", - "AffineTransformation", - "superimpose_apply" + "AffineTransformation" ], "Filters" : [ "filter_canonical_nucleotides", @@ -269,7 +264,6 @@ "filter_canonical_amino_acids", "filter_amino_acids", "filter_carbohydrates", - "filter_backbone", "filter_peptide_backbone", "filter_phosphate_backbone", "filter_linear_bond_continuity", @@ -281,17 +275,13 @@ "filter_highest_occupancy_altloc" ], "Checks" : [ - "check_id_continuity", "check_atom_id_continuity", "check_res_id_continuity", "check_backbone_continuity", "check_duplicate_atoms", - "check_bond_continuity", "check_linear_continuity" ], "Repair" : [ - "renumber_atom_ids", - "renumber_res_ids", "create_continuous_res_ids", "infer_elements", "create_atom_names" diff --git a/doc/apidoc.py b/doc/apidoc.py index 5bc412333..8f10f0923 100644 --- a/doc/apidoc.py +++ b/doc/apidoc.py @@ -3,17 +3,16 @@ # information. __author__ = "Patrick Kunzmann" -__all__ = ["create_api_doc", "skip_non_methods"] +__all__ = ["create_api_doc", "skip_nonrelevant"] -from os.path import join, isdir -from os import listdir, makedirs -from importlib import import_module -import types -import json import enum -from textwrap import dedent +import json +import types from collections import OrderedDict - +from importlib import import_module +from os import listdir, makedirs +from os.path import isdir, join +from textwrap import dedent _INDENT = " " * 4 @@ -24,7 +23,6 @@ _pck_categories = json.load(file, object_pairs_hook=OrderedDict) - def create_api_doc(src_path, doc_path): """ Create *.rst files for API documentation. @@ -40,11 +38,7 @@ def create_api_doc(src_path, doc_path): # Create directory to store apidoc if not isdir(doc_path): makedirs(doc_path) - package_list = _create_package_doc( - "biotite", - join(src_path, "biotite"), - doc_path - ) + package_list = _create_package_doc("biotite", join(src_path, "biotite"), doc_path) _create_package_index(doc_path, package_list) @@ -67,19 +61,24 @@ def _create_package_doc(pck, src_path, doc_path): module = import_module(pck) attr_list = dir(module) # Classify attribute names into classes and functions - class_list = [attr for attr in attr_list - # Do not document private classes - if attr[0] != "_" - # Check if object is a class - and isinstance(getattr(module, attr), type)] - func_list = [attr for attr in attr_list - # Do not document private classes - if attr[0] != "_" - # All functions are callable... - and callable(getattr(module, attr)) - # ...but classes are also callable - and attr not in class_list - ] + class_list = [ + attr + for attr in attr_list + # Do not document private classes + if attr[0] != "_" + # Check if object is a class + and isinstance(getattr(module, attr), type) + ] + func_list = [ + attr + for attr in attr_list + # Do not document private classes + if attr[0] != "_" + # All functions are callable... + and callable(getattr(module, attr)) + # ...but classes are also callable + and attr not in class_list + ] # Create *.rst files _create_package_page(doc_path, pck, class_list, func_list, sub_pck) for class_name in class_list: @@ -87,11 +86,10 @@ def _create_package_doc(pck, src_path, doc_path): for function_name in func_list: _create_function_page(doc_path, pck, function_name) - return([pck] + sub_pck) + return [pck] + sub_pck -def _create_package_page(doc_path, package_name, - classes, functions, subpackages): +def _create_package_page(doc_path, package_name, classes, functions, subpackages): attributes = classes + functions # Get categories for this package @@ -114,7 +112,6 @@ def _create_package_page(doc_path, package_name, misc_category_name = "Miscellaneous" if categories else "Content" categories[misc_category_name] = misc_attributes - # String for categorized class and function enumeration category_strings = [] for category, attrs in categories.items(): @@ -135,12 +132,11 @@ def _create_package_page(doc_path, package_name, attributes_string = "\n".join(category_strings) # String for subpackage enumeration - subpackages_string = "\n".join( - [_INDENT + pck for pck in subpackages] - ) + subpackages_string = "\n".join([_INDENT + pck for pck in subpackages]) # Assemble page - file_content = dedent(f""" + file_content = ( + dedent(f""" ``{package_name}`` {"=" * (len(package_name) + 4)} @@ -150,16 +146,21 @@ def _create_package_page(doc_path, package_name, .. currentmodule:: {package_name} - """) + attributes_string + """) + + attributes_string + ) if len(subpackages) > 0: - file_content += dedent(f""" + file_content += ( + dedent(""" Subpackages ----------- .. autosummary:: - """) + subpackages_string + """) + + subpackages_string + ) with open(join(doc_path, f"{package_name}.rst"), "w") as f: f.write(file_content) @@ -201,18 +202,19 @@ def _create_function_page(doc_path, package_name, function_name): def _create_package_index(doc_path, package_list): # String for package enumeration - packages_string = "\n".join( - [_INDENT + pck for pck in sorted(package_list)] - ) + packages_string = "\n".join([_INDENT + pck for pck in sorted(package_list)]) - file_content = dedent(f""" + file_content = ( + dedent(""" API Reference ============= .. autosummary:: :toctree: - """) + packages_string + """) + + packages_string + ) with open(join(doc_path, "index.rst"), "w") as f: f.write(file_content) @@ -249,20 +251,21 @@ def _is_relevant_type(obj): # These are some special built-in Python methods return False return ( - # Functions - type(obj) in [ - types.FunctionType, types.BuiltinFunctionType, types.MethodType - ] - ) | ( - # Functions from C-extensions - type(obj).__name__ in [ - "cython_function_or_method", - "fused_cython_function" - ] - ) | ( - # Enum instance - isinstance(obj, enum.Enum) - ) | ( - # Inner class - isinstance(obj, type) - ) \ No newline at end of file + ( + # Functions + type(obj) + in [types.FunctionType, types.BuiltinFunctionType, types.MethodType] + ) + | ( + # Functions from C-extensions + type(obj).__name__ in ["cython_function_or_method", "fused_cython_function"] + ) + | ( + # Enum instance + isinstance(obj, enum.Enum) + ) + | ( + # Inner class + isinstance(obj, type) + ) + ) diff --git a/doc/bibliography.py b/doc/bibliography.py index 9c0bc4831..cf44587cc 100644 --- a/doc/bibliography.py +++ b/doc/bibliography.py @@ -5,14 +5,14 @@ __author__ = "Patrick Kunzmann" import warnings -from pybtex.richtext import Text, Tag, HRef +from pybtex.richtext import HRef, Tag, Text from pybtex.style.formatting import BaseStyle class IEEEStyle(BaseStyle): def format_article(self, param): entry = param["entry"] - + try: authors = [] for author in entry.persons["author"]: @@ -28,7 +28,7 @@ def format_article(self, param): text += " " text += " ".join([s for s in author.last_names]) authors.append(Text(text + ", ")) - + title = "" in_protected = False for char in entry.fields["title"]: @@ -46,34 +46,34 @@ def format_article(self, param): else: title += char.lower() title = Text('"', title, '," ') - + journal = Text(Tag("em", entry.fields["journal"]), ", ") - + if "volume" in entry.fields: volume = Text("vol. ", entry.fields["volume"], ", ") else: volume = Text() - + if "pages" in entry.fields: pages = Text("pp. ", entry.fields["pages"], ", ") else: pages = Text() - + date = entry.fields["year"] if "month" in entry.fields: date = entry.fields["month"] + " " + date date = Text(date, ". ") - - if "doi" in entry.fields: - doi = Text("doi: ", HRef( - "https://doi.org/" + entry.fields["doi"], - entry.fields["doi"] - )) + + if "doi" in entry.fields: + doi = Text( + "doi: ", + HRef("https://doi.org/" + entry.fields["doi"], entry.fields["doi"]), + ) else: doi = Text() - + return Text(*authors, title, journal, volume, pages, date, doi) - - except: + + except Exception: warnings.warn(f"Invalid BibTeX entry '{entry.key}'") - return Text(entry.key) \ No newline at end of file + return Text(entry.key) diff --git a/doc/conf.py b/doc/conf.py index d3f6e53c0..7f19bc67c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -5,24 +5,21 @@ __author__ = "Patrick Kunzmann" # Setup Cython for import of uncompiled *.pyx files -import pyximport import numpy as np +import pyximport + pyximport.install( - setup_args={'include_dirs': np.get_include()}, - build_in_temp=False, - language_level=3 + setup_args={"include_dirs": np.get_include()}, build_in_temp=False, language_level=3 ) -from os.path import realpath, dirname, join import sys import warnings -import pybtex -from sphinx_gallery.sorting import FileNameSortKey, ExplicitOrder +from os.path import dirname, join, realpath import matplotlib - +import pybtex +from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey import biotite - BIOTITE_DOMAIN = "www.biotite-python.org" DOC_PATH = dirname(realpath(__file__)) PACKAGE_PATH = join(dirname(DOC_PATH), "src") @@ -32,28 +29,21 @@ # in order to import modules for API doc generation etc. sys.path.insert(0, DOC_PATH) import apidoc -import viewcode -import scraper import bibliography import key +import scraper import switcher - +import viewcode # Reset matplotlib params matplotlib.rcdefaults() # Pregeneration of files apidoc.create_api_doc(PACKAGE_PATH, join(DOC_PATH, "apidoc")) -switcher.create_switcher_json( - join("static", "switcher.json"), - "v0.41.0", - n_versions=5 -) +switcher.create_switcher_json(join("static", "switcher.json"), "v0.41.0", n_versions=5) # Use custom citation style -pybtex.plugin.register_plugin( - "pybtex.style.formatting", "ieee", bibliography.IEEEStyle -) +pybtex.plugin.register_plugin("pybtex.style.formatting", "ieee", bibliography.IEEEStyle) #### Source code link ### @@ -61,14 +51,13 @@ #### General #### -import warnings # Removed standard matplotlib warning when generating gallery warnings.filterwarnings( "ignore", category=UserWarning, message="Matplotlib is currently using agg, which is a non-GUI backend, " - "so cannot show the figure." + "so cannot show the figure.", ) extensions = [ @@ -127,10 +116,7 @@ html_theme = "pydata_sphinx_theme" html_static_path = ["static"] -html_css_files = [ - "biotite.css", - "fonts.css" -] +html_css_files = ["biotite.css", "fonts.css"] html_title = "Biotite" html_logo = "static/assets/general/biotite_logo.svg" html_favicon = "static/assets/general/biotite_icon_32p.png" @@ -162,11 +148,11 @@ "url": "https://biotite.bsky.social", "icon": "fa-brands fa-bluesky", "type": "fontawesome", - } - ], - "use_edit_page_button": True, - "show_prev_next": False, - "show_toc_level": 2, + }, + ], + "use_edit_page_button": True, + "show_prev_next": False, + "show_toc_level": 2, } html_sidebars = { # No primary sidebar for these pages @@ -183,53 +169,49 @@ } sphinx_gallery_conf = { - "examples_dirs" : [ - "examples/scripts/sequence", - "examples/scripts/structure" - ], - "gallery_dirs" : [ - "examples/gallery/sequence", - "examples/gallery/structure" - ], - "subsection_order": ExplicitOrder([ - "examples/scripts/sequence/homology", - "examples/scripts/sequence/sequencing", - "examples/scripts/sequence/profile", - "examples/scripts/sequence/annotation", - "examples/scripts/sequence/misc", - "examples/scripts/structure/protein", - "examples/scripts/structure/nucleotide", - "examples/scripts/structure/molecule", - "examples/scripts/structure/contacts", - "examples/scripts/structure/modeling", - "examples/scripts/structure/misc", - ]), - "within_subsection_order" : FileNameSortKey, + "examples_dirs": ["examples/scripts/sequence", "examples/scripts/structure"], + "gallery_dirs": ["examples/gallery/sequence", "examples/gallery/structure"], + "subsection_order": ExplicitOrder( + [ + "examples/scripts/sequence/homology", + "examples/scripts/sequence/sequencing", + "examples/scripts/sequence/profile", + "examples/scripts/sequence/annotation", + "examples/scripts/sequence/misc", + "examples/scripts/structure/protein", + "examples/scripts/structure/nucleotide", + "examples/scripts/structure/molecule", + "examples/scripts/structure/contacts", + "examples/scripts/structure/modeling", + "examples/scripts/structure/misc", + ] + ), + "within_subsection_order": FileNameSortKey, # Do not run example scripts with a trailing '_noexec' - "filename_pattern" : "^((?!_noexec).)*$", - "ignore_pattern" : "(.*ignore\.py)|(.*pymol\.py)", - "backreferences_dir" : None, - "download_all_examples" : False, + "filename_pattern": "^((?!_noexec).)*$", + "ignore_pattern": r"(.*ignore\.py)|(.*pymol\.py)", + "download_all_examples": False, # Never report run time - "min_reported_time" : sys.maxsize, - "default_thumb_file" : join( + "min_reported_time": sys.maxsize, + "default_thumb_file": join( DOC_PATH, "static/assets/general/biotite_icon_thumb.png" ), - "image_scrapers" : ( + "image_scrapers": ( "matplotlib", scraper.static_image_scraper, - scraper.pymol_scraper + scraper.pymol_scraper, ), - "matplotlib_animations" : True, - "backreferences_dir" : "examples/backreferences", - "doc_module" : ("biotite",), + "matplotlib_animations": True, + "backreferences_dir": "examples/backreferences", + "doc_module": ("biotite",), # Set the NCBI API key - "reset_modules" : (key.set_ncbi_api_key_from_env,), - "remove_config_comments" : True, + "reset_modules": (key.set_ncbi_api_key_from_env,), + "remove_config_comments": True, } #### App setup #### + def setup(app): - app.connect("autodoc-skip-member", apidoc.skip_nonrelevant) \ No newline at end of file + app.connect("autodoc-skip-member", apidoc.skip_nonrelevant) diff --git a/doc/contribution/development.rst b/doc/contribution/development.rst index 9cee6fefa..49ac498a3 100644 --- a/doc/contribution/development.rst +++ b/doc/contribution/development.rst @@ -53,13 +53,19 @@ Official support for PyPy might be added someday. Code style ---------- -*Biotite* is in compliance with PEP 8. -The maximum line length is 79 for code lines and 72 for docstring and -comment lines. +*Biotite* is compliant with :pep:`8` and uses `Ruff `_ for +code formatting and linting. +The maximum line length is 88 characters. An exception is made for docstring lines, if it is not possible to use a -maximum of 72 characters (e.g. tables), and for -`doctest `_ lines, -where the actual code may take up to 79 characters. +maximum of 88 characters (e.g. tables and parameter type descriptions). +To make code changes ready for a pull request, simply run + +.. code-block:: console + + $ ruff format + $ ruff check --fix + +and fix the remaining linter complaints. Dependencies ------------ @@ -73,9 +79,8 @@ The import statement for the dependency should be located directly inside the function or class, rather than module level, to ensure that the package is not required for any other functionality or for building the API documentation. -An example for this approach is the support for trajectory files in -:mod:`biotite.structure.io`, that require `MDTraj `_. -The usage of these packages is not only allowed but even encouraged. +An example for this approach are the plotting functions in +:mod:`biotite.sequence.graphics`, that require *Matplotlib*. Code efficiency --------------- @@ -124,14 +129,14 @@ accessible, in a relative manner. Import statements should be the only statements in a ``__init__.py`` file. In case a module needs functionality from another subpackage of *Biotite*, -use a relative import. +use an absolute import as suggested by PEP 8. This import should target the module directly and not the package to avoid circular imports and thus an ``ImportError``. So import statements like the following are totally OK: .. code-block:: python - from ...package.subpackage.module import foo + from biotite.subpackage.module import foo In order to prevent namespace pollution, all modules must define the `__all__` variable with all publicly accessible attributes of the module. diff --git a/doc/examples/scripts/sequence/annotation/operon_map.py b/doc/examples/scripts/sequence/annotation/operon_map.py index dcd730ee7..37be67652 100644 --- a/doc/examples/scripts/sequence/annotation/operon_map.py +++ b/doc/examples/scripts/sequence/annotation/operon_map.py @@ -10,31 +10,39 @@ # License: BSD 3 clause import matplotlib.pyplot as plt -from biotite.sequence import Annotation, Feature, Location import biotite.sequence.graphics as graphics +from biotite.sequence import Annotation, Feature, Location strand = Location.Strand.FORWARD -prom = Feature("regulatory", [Location(10, 50, strand)], - {"regulatory_class" : "promoter", - "note" : "T7"}) -rbs1 = Feature("regulatory", [Location(60, 75, strand)], - {"regulatory_class" : "ribosome_binding_site", - "note" : "RBS1"}) -gene1 = Feature("gene", [Location(81, 380, strand)], - {"gene" : "gene1"}) -rbs2 = Feature("regulatory", [Location(400, 415, strand)], - {"regulatory_class" : "ribosome_binding_site", - "note" : "RBS2"}) -gene2 = Feature("gene", [Location(421, 1020, strand)], - {"gene" : "gene2"}) -term = Feature("regulatory", [Location(1050, 1080, strand)], - {"regulatory_class" : "terminator"}) +prom = Feature( + "regulatory", + [Location(10, 50, strand)], + {"regulatory_class": "promoter", "note": "T7"}, +) +rbs1 = Feature( + "regulatory", + [Location(60, 75, strand)], + {"regulatory_class": "ribosome_binding_site", "note": "RBS1"}, +) +gene1 = Feature("gene", [Location(81, 380, strand)], {"gene": "gene1"}) +rbs2 = Feature( + "regulatory", + [Location(400, 415, strand)], + {"regulatory_class": "ribosome_binding_site", "note": "RBS2"}, +) +gene2 = Feature("gene", [Location(421, 1020, strand)], {"gene": "gene2"}) +term = Feature( + "regulatory", [Location(1050, 1080, strand)], {"regulatory_class": "terminator"} +) annotation = Annotation([prom, rbs1, gene1, rbs2, gene2, term]) fig = plt.figure(figsize=(8.0, 0.8)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, multi_line=False, loc_range=(1, 1101), + ax, + annotation, + multi_line=False, + loc_range=(1, 1101), ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map.py b/doc/examples/scripts/sequence/annotation/plasmid_map.py index b25623bf4..bf4118352 100644 --- a/doc/examples/scripts/sequence/annotation/plasmid_map.py +++ b/doc/examples/scripts/sequence/annotation/plasmid_map.py @@ -18,26 +18,33 @@ # License: BSD 3 clause import io -import requests import matplotlib.pyplot as plt -import numpy as np +import requests import biotite -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb -PLASMID_URL = "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/"\ - "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/" \ - "addgene-plasmid-26092-sequence-12250.gbk" +PLASMID_URL = ( + "https://media.addgene.org/snapgene-media/v1.7.9-0-g88a3305/" + "sequences/12250/9998fdbe-051f-4dc6-ba0f-24e65127a0c5/" + "addgene-plasmid-26092-sequence-12250.gbk" +) response = requests.get(PLASMID_URL) gb_file = gb.GenBankFile.read(io.StringIO(response.text)) -annotation = gb.get_annotation(gb_file, include_only=[ - "promoter", "terminator", "protein_bind", - "RBS", "CDS", "rep_origin", "primer_bind" -]) +annotation = gb.get_annotation( + gb_file, + include_only=[ + "promoter", + "terminator", + "protein_bind", + "RBS", + "CDS", + "rep_origin", + "primer_bind", + ], +) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) # AddGene stores the plasmid name in the 'KEYWORDS' field # [0][0][0] -> @@ -69,8 +76,11 @@ def custom_feature_formatter(feature): fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="polar") graphics.plot_plasmid_map( - ax, annotation, plasmid_size=seq_length, - label=plasmid_name, feature_formatter=custom_feature_formatter + ax, + annotation, + plasmid_size=seq_length, + label=plasmid_name, + feature_formatter=custom_feature_formatter, ) fig.tight_layout() plt.show() diff --git a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py index 67fb87834..494b2d17e 100644 --- a/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py +++ b/doc/examples/scripts/sequence/annotation/plasmid_map_custom.py @@ -5,129 +5,100 @@ .. currentmodule:: biotite.sequence This script shows how :class:`Feature` objects are displayed in a -plasmid map by using a custom 'toy' :class:`Annotation`. +plasmid map by using a custom 'toy' :class:`Annotation`. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import matplotlib.pyplot as plt -import numpy as np import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - -annotation = seq.Annotation([ - seq.Feature( - "source", - [seq.Location(0, 1500)], - {"organism": "Escherichia coli"} - ), - - # Ori - seq.Feature( - "rep_origin", - [seq.Location(600, 700, seq.Location.Strand.REVERSE)], - {"regulatory_class": "promoter", "note": "MyProm"} - ), - - # Promoter - seq.Feature( - "regulatory", - [seq.Location(1000, 1060)], - {"regulatory_class": "promoter", "note": "MyProm"} - ), - seq.Feature( - "protein_bind", - [seq.Location(1025, 1045)], - {"note": "repr"} - ), - - # Gene A - seq.Feature( - "regulatory", - [seq.Location(1070, 1080)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1091, 1150)], - {"product": "geneA"} - ), - - # Gene B - seq.Feature( - "regulatory", - [seq.Location(1180, 1190)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1201, 1350)], - {"product": "geneB"} - ), - seq.Feature( - "regulatory", - [seq.Location(1220, 1230)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - [seq.Location(1240, 1350)], - {"product": "geneB2"} - ), - - # Gene C - seq.Feature( - "regulatory", - [seq.Location(1380, 1390)], - {"regulatory_class": "ribosome_binding_site"} - ), - seq.Feature( - "CDS", - # CDS extends over periodic boundary -> two locations - [seq.Location(1, 300), seq.Location(1402, 1500)], - {"product": "geneC"} - ), - - # Terminator - seq.Feature( - "regulatory", - [seq.Location(310, 350)], - {"regulatory_class": "terminator", "note": "MyTerm"} - ), - - # Primers - # The labels will be too long to be displayed on the map - # If you want to display them nevertheless, set the - # 'omit_oversized_labels' to False - seq.Feature( - "primer_bind", - [seq.Location(1385, 1405)], - {"note": "geneC"} - ), - seq.Feature( - "primer_bind", - [seq.Location(345, 365, seq.Location.Strand.REVERSE)], - {"note": "geneC_R"} - ), - - # Terminator - seq.Feature( - "regulatory", - [seq.Location(310, 350)], - {"regulatory_class": "terminator", "note": "MyTerm"} - ), -]) +annotation = seq.Annotation( + [ + seq.Feature( + "source", [seq.Location(0, 1500)], {"organism": "Escherichia coli"} + ), + # Ori + seq.Feature( + "rep_origin", + [seq.Location(600, 700, seq.Location.Strand.REVERSE)], + {"regulatory_class": "promoter", "note": "MyProm"}, + ), + # Promoter + seq.Feature( + "regulatory", + [seq.Location(1000, 1060)], + {"regulatory_class": "promoter", "note": "MyProm"}, + ), + seq.Feature("protein_bind", [seq.Location(1025, 1045)], {"note": "repr"}), + # Gene A + seq.Feature( + "regulatory", + [seq.Location(1070, 1080)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1091, 1150)], {"product": "geneA"}), + # Gene B + seq.Feature( + "regulatory", + [seq.Location(1180, 1190)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1201, 1350)], {"product": "geneB"}), + seq.Feature( + "regulatory", + [seq.Location(1220, 1230)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature("CDS", [seq.Location(1240, 1350)], {"product": "geneB2"}), + # Gene C + seq.Feature( + "regulatory", + [seq.Location(1380, 1390)], + {"regulatory_class": "ribosome_binding_site"}, + ), + seq.Feature( + "CDS", + # CDS extends over periodic boundary -> two locations + [seq.Location(1, 300), seq.Location(1402, 1500)], + {"product": "geneC"}, + ), + # Terminator + seq.Feature( + "regulatory", + [seq.Location(310, 350)], + {"regulatory_class": "terminator", "note": "MyTerm"}, + ), + # Primers + # The labels will be too long to be displayed on the map + # If you want to display them nevertheless, set the + # 'omit_oversized_labels' to False + seq.Feature("primer_bind", [seq.Location(1385, 1405)], {"note": "geneC"}), + seq.Feature( + "primer_bind", + [seq.Location(345, 365, seq.Location.Strand.REVERSE)], + {"note": "geneC_R"}, + ), + # Terminator + seq.Feature( + "regulatory", + [seq.Location(310, 350)], + {"regulatory_class": "terminator", "note": "MyTerm"}, + ), + ] +) fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="polar") graphics.plot_plasmid_map( - ax, annotation, plasmid_size=1500, label="My plasmid", - label_properties={"fontsize": 8} + ax, + annotation, + plasmid_size=1500, + label="My plasmid", + label_properties={"fontsize": 8}, ) ticks = ax.get_xticks() diff --git a/doc/examples/scripts/sequence/annotation/region_visualization.py b/doc/examples/scripts/sequence/annotation/region_visualization.py index 6bdd55455..09aa3b43c 100644 --- a/doc/examples/scripts/sequence/annotation/region_visualization.py +++ b/doc/examples/scripts/sequence/annotation/region_visualization.py @@ -9,16 +9,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq +import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb -import biotite.database.entrez as entrez -import numpy as np -import matplotlib.pyplot as plt # Download E. coli BL21 genome -file = entrez.fetch("CP001509", None, suffix="gb", - db_name="nuccore", ret_type="gb") +file = entrez.fetch("CP001509", None, suffix="gb", db_name="nuccore", ret_type="gb") gb_file = gb.GenBankFile.read(file) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) annotation = gb.get_annotation(gb_file, include_only=["gene"]) @@ -29,13 +26,15 @@ for loc in feature.locs: # Ignore if feature is only a pseudo-gene (e.g. gene fragment) # and check if feature is lacA gene (begin of lac operon) - if "gene" in feature.qual \ - and "pseudo" not in feature.qual \ - and feature.qual["gene"] == "lacA": - if min_loc > loc.first: - min_loc = loc.first - if max_loc < loc.last: - max_loc = loc.last + if ( + "gene" in feature.qual + and "pseudo" not in feature.qual + and feature.qual["gene"] == "lacA" + ): + if min_loc > loc.first: + min_loc = loc.first + if max_loc < loc.last: + max_loc = loc.last # Extend the location range by 1000 (arbitrary) in each direction min_loc -= 10000 max_loc += 10000 @@ -44,9 +43,13 @@ fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, loc_range=(min_loc, max_loc), symbols_per_line=2000, - show_numbers=True, show_line_position=True + ax, + annotation, + loc_range=(min_loc, max_loc), + symbols_per_line=2000, + show_numbers=True, + show_line_position=True, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/annotation/sigma_domains.py b/doc/examples/scripts/sequence/annotation/sigma_domains.py index 5d5a5ea36..d3c6b7365 100644 --- a/doc/examples/scripts/sequence/annotation/sigma_domains.py +++ b/doc/examples/scripts/sequence/annotation/sigma_domains.py @@ -11,37 +11,37 @@ import re from collections import OrderedDict -import numpy as np import matplotlib.pyplot as plt -from matplotlib.patches import Rectangle, FancyBboxPatch -import biotite.sequence as seq -import biotite.sequence.io.genbank as gb +import numpy as np +from matplotlib.patches import FancyBboxPatch, Rectangle import biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb # The names of the sigma factors and the corresponding genes -genes = OrderedDict({ - r"$\sigma^{70}$": "rpoD", - r"$\sigma^{24}$": "rpoE", - r"$\sigma^{28}$": "rpoF", - r"$\sigma^{32}$": "rpoH", - r"$\sigma^{38}$": "rpoS", -}) +genes = OrderedDict( + { + r"$\sigma^{70}$": "rpoD", + r"$\sigma^{24}$": "rpoE", + r"$\sigma^{28}$": "rpoF", + r"$\sigma^{32}$": "rpoH", + r"$\sigma^{38}$": "rpoS", + } +) # Find SwissProt entries for these genes in NCBI Entrez protein database uids = [] for name, gene in genes.items(): - query = entrez.SimpleQuery(gene, "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") \ - & entrez.SimpleQuery("Escherichia coli K-12", "Organism") + query = ( + entrez.SimpleQuery(gene, "Gene Name") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + & entrez.SimpleQuery("Escherichia coli K-12", "Organism") + ) ids = entrez.search(query, "protein") # Only one entry per gene in E. coli K-12 is expected assert len(ids) == 1 uids += ids # Download corresponding GenBank files as single, merged file -file = entrez.fetch_single_file( - uids, None, "protein", ret_type="gb" -) +file = entrez.fetch_single_file(uids, None, "protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position @@ -55,53 +55,66 @@ # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file): _, length, _, _, _, _ = gb.get_locus(gb_file) - seq_lengths[i] = length + seq_lengths[i] = length annotation = gb.get_annotation(gb_file) # Find features, that represent a sigma factor domain for feature in annotation: - if feature.key == "Region" and "note" in feature.qual \ - and "Sigma-70 factor domain" in feature.qual["note"]: - # Extract the domain number - # and decrement for 0-based indexing - # - # e.g. 'Sigma-70 factor domain-2.' => 1 - # ^ - domain_index = int(re.findall( - "(?<=Sigma-70 factor domain-)\d+", - feature.qual["note"] - )[0]) -1 - # Expect a single contiguous location of the domain - assert len(feature.locs) == 1 - loc = list(feature.locs)[0] - # Store first and last position of the domain - domain_pos[i, domain_index, :] = [loc.first, loc.last] + if ( + feature.key == "Region" + and "note" in feature.qual + and "Sigma-70 factor domain" in feature.qual["note"] + ): + # Extract the domain number + # and decrement for 0-based indexing + # + # e.g. 'Sigma-70 factor domain-2.' => 1 + # ^ + domain_index = ( + int( + re.findall( + r"(?<=Sigma-70 factor domain-)\d+", feature.qual["note"] + )[0] + ) + - 1 + ) + # Expect a single contiguous location of the domain + assert len(feature.locs) == 1 + loc = list(feature.locs)[0] + # Store first and last position of the domain + domain_pos[i, domain_index, :] = [loc.first, loc.last] fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.gca() # The color for each one of the four domains colors = ["firebrick", "forestgreen", "dodgerblue", "goldenrod"] # Draw each sequence -for i, (gene_name, domain_pos_for_gene, length) \ - in enumerate(zip(genes.keys(), domain_pos, seq_lengths)): - # Add base line representing the sequence itself - ax.add_patch(Rectangle( - (1, i-0.05), length, 0.1, color="gray" - )) - # Draw each domain - for j, ((first, last), color) \ - in enumerate(zip(domain_pos_for_gene, colors)): - if first != -1 and last != -1: - # FancyBboxPatch to get rounded corners in rectangle - ax.add_patch(FancyBboxPatch( - (first, i-0.4), last-first, 0.8, #color=color, - boxstyle="round,pad=0,rounding_size=10", - ec="black", fc=color, - mutation_aspect=0.02 - )) - ax.text( - x=(last+first)/2, y=i, s=fr"$\sigma_{j+1}$", - ha="center", va="center" - ) +for i, (gene_name, domain_pos_for_gene, length) in enumerate( + zip(genes.keys(), domain_pos, seq_lengths) +): + # Add base line representing the sequence itself + ax.add_patch(Rectangle((1, i - 0.05), length, 0.1, color="gray")) + # Draw each domain + for j, ((first, last), color) in enumerate(zip(domain_pos_for_gene, colors)): + if first != -1 and last != -1: + # FancyBboxPatch to get rounded corners in rectangle + ax.add_patch( + FancyBboxPatch( + (first, i - 0.4), + last - first, + 0.8, # color=color, + boxstyle="round,pad=0,rounding_size=10", + ec="black", + fc=color, + mutation_aspect=0.02, + ) + ) + ax.text( + x=(last + first) / 2, + y=i, + s=rf"$\sigma_{j+1}$", + ha="center", + va="center", + ) ax.set_xlim(0, max(seq_lengths)) ax.set_xlabel("Sequence position") # Inverted y-axis diff --git a/doc/examples/scripts/sequence/homology/avidin_alignment.py b/doc/examples/scripts/sequence/homology/avidin_alignment.py index 40b50083f..da67ff617 100644 --- a/doc/examples/scripts/sequence/homology/avidin_alignment.py +++ b/doc/examples/scripts/sequence/homology/avidin_alignment.py @@ -11,16 +11,16 @@ # License: BSD 3 clause import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.fasta as fasta -import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics +import biotite.sequence.io.fasta as fasta # Download and parse protein sequences of avidin and streptavidin -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - ["CAC34569", "ACL82594"], None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta") +) for name, sequence in fasta_file.items(): if "CAC34569" in name: avidin_seq = seq.ProteinSequence(sequence) @@ -31,16 +31,21 @@ matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized -alignments = align.align_optimal(avidin_seq, streptavidin_seq, matrix, - gap_penalty=(-10, -1), terminal_penalty=False) +alignments = align.align_optimal( + avidin_seq, streptavidin_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False +) # Draw first and only alignment # The color intensity indicates the similiarity fig = plt.figure(figsize=(8.0, 2.5)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignments[0], matrix=matrix, labels=["Avidin", "Streptavidin"], - show_numbers=True, show_line_position=True + ax, + alignments[0], + matrix=matrix, + labels=["Avidin", "Streptavidin"], + show_numbers=True, + show_line_position=True, ) fig.tight_layout() diff --git a/doc/examples/scripts/sequence/homology/bionigma_alignment.py b/doc/examples/scripts/sequence/homology/bionigma_alignment.py index 9a3b8d1b5..c2275b2fe 100644 --- a/doc/examples/scripts/sequence/homology/bionigma_alignment.py +++ b/doc/examples/scripts/sequence/homology/bionigma_alignment.py @@ -12,121 +12,132 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle from matplotlib.transforms import Bbox +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # The polygon coordinates for the different shapes -_hexagon_coord = np.array([ - (0.500, 0.000), - (0.937, 0.250), - (0.937, 0.750), - (0.500, 1.000), - (0.063, 0.750), - (0.063, 0.250) -]) - -_spiked_coord = np.array([ - (0.000, 0.000), - (0.500, 0.150), - (1.000, 0.000), - (0.850, 0.500), - (1.000, 1.000), - (0.500, 0.850), - (0.000, 1.000), - (0.150, 0.500), -]) - -_spiked_coord = np.array([ - (0.000, 0.000), - (0.500, 0.150), - (1.000, 0.000), - (0.850, 0.500), - (1.000, 1.000), - (0.500, 0.850), - (0.000, 1.000), - (0.150, 0.500), -]) - -_cross_coord = np.array([ - (0.220, 0.000), - (0.780, 0.000), - (0.780, 0.220), - (1.000, 0.220), - (1.000, 0.780), - (0.780, 0.780), - (0.780, 1.000), - (0.220, 1.000), - (0.220, 0.780), - (0.000, 0.780), - (0.000, 0.220), - (0.220, 0.220), -]) - -_star_coord = np.array([ - (0.500, 0.000), - (0.648, 0.150), - (0.852, 0.150), - (0.852, 0.352), - (1.000, 0.500), - (0.852, 0.648), - (0.852, 0.852), - (0.648, 0.852), - (0.500, 1.000), - (0.352, 0.852), - (0.148, 0.852), - (0.148, 0.648), - (0.000, 0.500), - (0.148, 0.352), - (0.148, 0.148), - (0.352, 0.148), -]) - -_hourglass_coord = np.array([ - (0.000, 0.000), - (1.000, 0.000), - (1.000, 0.220), - (0.740, 0.420), - (0.740, 0.580), - (1.000, 0.780), - (1.000, 1.000), - (0.000, 1.000), - (0.000, 0.780), - (0.260, 0.580), - (0.260, 0.420), - (0.000, 0.220), -]) +_hexagon_coord = np.array( + [ + (0.500, 0.000), + (0.937, 0.250), + (0.937, 0.750), + (0.500, 1.000), + (0.063, 0.750), + (0.063, 0.250), + ] +) + +_spiked_coord = np.array( + [ + (0.000, 0.000), + (0.500, 0.150), + (1.000, 0.000), + (0.850, 0.500), + (1.000, 1.000), + (0.500, 0.850), + (0.000, 1.000), + (0.150, 0.500), + ] +) + +_spiked_coord = np.array( + [ + (0.000, 0.000), + (0.500, 0.150), + (1.000, 0.000), + (0.850, 0.500), + (1.000, 1.000), + (0.500, 0.850), + (0.000, 1.000), + (0.150, 0.500), + ] +) + +_cross_coord = np.array( + [ + (0.220, 0.000), + (0.780, 0.000), + (0.780, 0.220), + (1.000, 0.220), + (1.000, 0.780), + (0.780, 0.780), + (0.780, 1.000), + (0.220, 1.000), + (0.220, 0.780), + (0.000, 0.780), + (0.000, 0.220), + (0.220, 0.220), + ] +) + +_star_coord = np.array( + [ + (0.500, 0.000), + (0.648, 0.150), + (0.852, 0.150), + (0.852, 0.352), + (1.000, 0.500), + (0.852, 0.648), + (0.852, 0.852), + (0.648, 0.852), + (0.500, 1.000), + (0.352, 0.852), + (0.148, 0.852), + (0.148, 0.648), + (0.000, 0.500), + (0.148, 0.352), + (0.148, 0.148), + (0.352, 0.148), + ] +) + +_hourglass_coord = np.array( + [ + (0.000, 0.000), + (1.000, 0.000), + (1.000, 0.220), + (0.740, 0.420), + (0.740, 0.580), + (1.000, 0.780), + (1.000, 1.000), + (0.000, 1.000), + (0.000, 0.780), + (0.260, 0.580), + (0.260, 0.420), + (0.000, 0.220), + ] +) # The shape color for each symbols _colors = { - "A" : "#1e67b6", - "C" : "#00a391", - "D" : "#ea42fc", - "E" : "#109c4b", - "F" : "#fed700", - "G" : "#8d4712", - "H" : "#ff8e00", - "I" : "#d82626", - "K" : "#109c4b", - "L" : "#d82626", - "M" : "#d82626", - "N" : "#ea42fc", - "P" : "#ffa9e3", - "Q" : "#109c4b", - "R" : "#109c4b", - "S" : "#1e67b6", - "T" : "#1e67b6", - "V" : "#d82626", - "W" : "#fed700", - "Y" : "#fed700" + "A": "#1e67b6", + "C": "#00a391", + "D": "#ea42fc", + "E": "#109c4b", + "F": "#fed700", + "G": "#8d4712", + "H": "#ff8e00", + "I": "#d82626", + "K": "#109c4b", + "L": "#d82626", + "M": "#d82626", + "N": "#ea42fc", + "P": "#ffa9e3", + "Q": "#109c4b", + "R": "#109c4b", + "S": "#1e67b6", + "T": "#1e67b6", + "V": "#d82626", + "W": "#fed700", + "Y": "#fed700", } @@ -134,31 +145,32 @@ class ShapePlotter(graphics.SymbolPlotter): """ A symbol plotter that depicts each symbol by color and shape. """ + def __init__(self, axes, font_size=None, font_param=None): super().__init__(axes) # The symbol to shape mapping self._draw_funcs = { - "A" : ShapePlotter._draw_circle, - "T" : ShapePlotter._draw_circle, - "S" : ShapePlotter._draw_circle, - "N" : ShapePlotter._draw_circle, - "D" : ShapePlotter._draw_rectangle, - "E" : ShapePlotter._draw_rectangle, - "Q" : ShapePlotter._draw_rectangle, - "K" : ShapePlotter._draw_rectangle, - "R" : ShapePlotter._draw_rectangle, - "I" : ShapePlotter._draw_hexagon, - "L" : ShapePlotter._draw_hexagon, - "V" : ShapePlotter._draw_hexagon, - "M" : ShapePlotter._draw_hexagon, - "F" : ShapePlotter._draw_spiked, - "W" : ShapePlotter._draw_spiked, - "Y" : ShapePlotter._draw_spiked, - "H" : ShapePlotter._draw_spiked, - "G" : ShapePlotter._draw_cross, - "P" : ShapePlotter._draw_star, - "C" : ShapePlotter._draw_hourglass + "A": ShapePlotter._draw_circle, + "T": ShapePlotter._draw_circle, + "S": ShapePlotter._draw_circle, + "N": ShapePlotter._draw_circle, + "D": ShapePlotter._draw_rectangle, + "E": ShapePlotter._draw_rectangle, + "Q": ShapePlotter._draw_rectangle, + "K": ShapePlotter._draw_rectangle, + "R": ShapePlotter._draw_rectangle, + "I": ShapePlotter._draw_hexagon, + "L": ShapePlotter._draw_hexagon, + "V": ShapePlotter._draw_hexagon, + "M": ShapePlotter._draw_hexagon, + "F": ShapePlotter._draw_spiked, + "W": ShapePlotter._draw_spiked, + "Y": ShapePlotter._draw_spiked, + "H": ShapePlotter._draw_spiked, + "G": ShapePlotter._draw_cross, + "P": ShapePlotter._draw_star, + "C": ShapePlotter._draw_hourglass, } self._font_size = font_size @@ -166,8 +178,8 @@ def __init__(self, axes, font_size=None, font_param=None): def plot_symbol(self, bbox, alignment, column_i, seq_i): trace = alignment.trace - if trace[column_i,seq_i] != -1: - symbol = alignment.sequences[seq_i][trace[column_i,seq_i]] + if trace[column_i, seq_i] != -1: + symbol = alignment.sequences[seq_i][trace[column_i, seq_i]] else: symbol = "" color = self._get_color(alignment, column_i, seq_i) @@ -178,16 +190,21 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): # Shrink Bbox slightly to get a small margin between shapes f = 0.04 shape_bbox = Bbox( - ((bbox.x0 + f*bbox.width, - bbox.y0 + f*bbox.height), - (bbox.x1 - f*bbox.width, - bbox.y1 - f*bbox.height)), + ( + (bbox.x0 + f * bbox.width, bbox.y0 + f * bbox.height), + (bbox.x1 - f * bbox.width, bbox.y1 - f * bbox.height), + ), ) draw_func(self, shape_bbox, color) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, ) text.set_clip_on(True) @@ -203,15 +220,17 @@ def _draw_circle(self, bbox, color): from matplotlib.patches import Circle circle = Circle( - (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2), bbox.width/2, - facecolor=color, edgecolor="None", fill=True + (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2), + bbox.width / 2, + facecolor=color, + edgecolor="None", + fill=True, ) self.axes.add_patch(circle) def _draw_rectangle(self, bbox, color): rectangle = Rectangle( - bbox.p0, bbox.width, bbox.height, - facecolor=color, edgecolor="None" + bbox.p0, bbox.width, bbox.height, facecolor=color, edgecolor="None" ) self.axes.add_patch(rectangle) @@ -241,45 +260,50 @@ def _draw_polygon(self, bbox, color, coord): self.axes.add_patch(polygon) -def plot_alignment_shapes(axes, alignment, symbols_per_line=30, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, color_symbols=False, - symbol_size=None, symbol_param=None): +def plot_alignment_shapes( + axes, + alignment, + symbols_per_line=30, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + symbol_size=None, + symbol_param=None, +): """ A thin wrapper around the 'ShapePlotter' and 'plot_alignment()' function. """ - alphabet = alignment.sequences[0].get_alphabet() - symbol_plotter = ShapePlotter( - axes, font_size=symbol_size, font_param=symbol_param - ) + symbol_plotter = ShapePlotter(axes, font_size=symbol_size, font_param=symbol_param) graphics.plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing + spacing=spacing, ) twin = axes.get_shared_x_axes().get_siblings(axes)[0] for ax in (axes, twin): - ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color":"white"}) + ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"}) axes.get_figure().patch.set_facecolor("#181818") - - # Using cyclotide sequences as example -query = ( - entrez.SimpleQuery("Cyclotide") & - entrez.SimpleQuery("cter") & - entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ - entrez.SimpleQuery("Precursor") +query = entrez.SimpleQuery("Cyclotide") & entrez.SimpleQuery( + "cter" +) & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties") ^ entrez.SimpleQuery( + "Precursor" ) uids = entrez.search(query, "protein") fasta_file = fasta.FastaFile.read( @@ -289,8 +313,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, # Currently there seems to b a bug in the NCBI search, # so that 'Precursor' results are still included # Solve this by filtering the sequence length -sequence_dict = {header: seq for header, seq in sequence_dict.items() - if len(seq) < 100} +sequence_dict = {header: seq for header, seq in sequence_dict.items() if len(seq) < 100} headers = list(sequence_dict.keys()) sequences = list(sequence_dict.values()) labels = [header[-1] for header in headers] @@ -306,8 +329,7 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.add_subplot(111) plot_alignment_shapes( - ax, alignment, labels=labels, symbols_per_line=len(alignment), - symbol_size=8 + ax, alignment, labels=labels, symbols_per_line=len(alignment), symbol_size=8 ) # The aspect ratio of the shapes should be preserved: # Squares should look like squares, circles should look like circles @@ -316,4 +338,4 @@ def plot_alignment_shapes(axes, alignment, symbols_per_line=30, ax.set_ylabel("Type", color="white") ax.set_title("Comparison of cyclotide sequences", color="white") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/genome_comparison.py b/doc/examples/scripts/sequence/homology/genome_comparison.py index fc360804d..066388ce5 100644 --- a/doc/examples/scripts/sequence/homology/genome_comparison.py +++ b/doc/examples/scripts/sequence/homology/genome_comparison.py @@ -31,28 +31,25 @@ # License: BSD 3 clause import tempfile -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle from matplotlib.ticker import MultipleLocator import biotite +import biotite.application.tantan as tantan +import biotite.database.entrez as entrez import biotite.sequence as seq +import biotite.sequence.align as align import biotite.sequence.io as seqio import biotite.sequence.io.genbank as gb -import biotite.sequence.align as align -import biotite.database.entrez as entrez -import biotite.application.tantan as tantan - fasta_file = entrez.fetch( - "NC_000932", tempfile.gettempdir(), "fasta", - db_name="Nucleotide", ret_type="fasta" + "NC_000932", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta" ) chloroplast_seq = seqio.load_sequence(fasta_file) fasta_file = entrez.fetch( - "NC_000911", tempfile.gettempdir(), "fasta", - db_name="Nucleotide", ret_type="fasta" + "NC_000911", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta" ) bacterium_seq = seqio.load_sequence(fasta_file) @@ -73,15 +70,13 @@ # one ``111∗1∗11∗1∗∗11∗111`` :footcite:`Choi2004` is used here. repeat_mask = tantan.TantanApp.mask_repeats(bacterium_seq) -bacterium_seqs = [ - bacterium_seq, bacterium_seq.reverse(copy=False).complement() -] +bacterium_seqs = [bacterium_seq, bacterium_seq.reverse(copy=False).complement()] table = align.KmerTable.from_sequences( - k = 12, - sequences = bacterium_seqs, - spacing = "111∗1∗11∗1∗∗11∗111", - ignore_masks = [repeat_mask, repeat_mask[::-1].copy()] + k=12, + sequences=bacterium_seqs, + spacing="111∗1∗11∗1∗∗11∗111", + ignore_masks=[repeat_mask, repeat_mask[::-1].copy()], ) ######################################################################## @@ -117,7 +112,7 @@ # Store the indices to the match array # for each combination of diagonal and strand on the bacterial genome matches_for_diagonals = {} -for i, (diag, strand) in enumerate(zip(diagonals, matches[:,1])): +for i, (diag, strand) in enumerate(zip(diagonals, matches[:, 1])): if (diag, strand) not in matches_for_diagonals: matches_for_diagonals[(diag, strand)] = [i] else: @@ -125,8 +120,9 @@ # If a diagonal has more than one match, # the first match on this diagonal is a double hit -double_hit_indices = [indices[0] for indices - in matches_for_diagonals.values() if len(indices) > 1] +double_hit_indices = [ + indices[0] for indices in matches_for_diagonals.values() if len(indices) > 1 +] double_hits = matches[double_hit_indices] print("Number of double hits:", len(double_hits)) @@ -148,13 +144,19 @@ ACCEPT_THRESHOLD = 100 matrix = align.SubstitutionMatrix.std_nucleotide_matrix() -ungapped_scores = np.array([ - align.align_local_ungapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), threshold=X_DROP, score_only=True - ) - for i, strand, j in double_hits -]) +ungapped_scores = np.array( + [ + align.align_local_ungapped( + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + threshold=X_DROP, + score_only=True, + ) + for i, strand, j in double_hits + ] +) accepted_hits = double_hits[ungapped_scores > ACCEPT_THRESHOLD] print("Number of accepted ungapped alignments:", len(accepted_hits)) @@ -190,19 +192,27 @@ estimator = align.EValueEstimator.from_samples( chloroplast_seq.alphabet, # The scoring scheme must be the same as used for the alignment - matrix, GAP_PENALTY, - background + matrix, + GAP_PENALTY, + background, ) # Compute similarity scores for each hit -gapped_scores = np.array([ - align.align_local_gapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP, score_only=True, - max_table_size=100_000_000 - ) - for i, strand, j in accepted_hits -]) +gapped_scores = np.array( + [ + align.align_local_gapped( + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + gap_penalty=GAP_PENALTY, + threshold=X_DROP, + score_only=True, + max_table_size=100_000_000, + ) + for i, strand, j in accepted_hits + ] +) # Calculate the E-values # For numeric stability reasons the method returns the common logarithm @@ -215,10 +225,14 @@ accepted_alignments = [ ( align.align_local_gapped( - chloroplast_seq, bacterium_seqs[strand], matrix, - seed=(i,j), gap_penalty=GAP_PENALTY, threshold=X_DROP, + chloroplast_seq, + bacterium_seqs[strand], + matrix, + seed=(i, j), + gap_penalty=GAP_PENALTY, + threshold=X_DROP, )[0], - log_evalue + log_evalue, ) for (i, strand, j), log_evalue in zip(accepted_hits, log_evalues) if log_evalue <= np.log10(EVALUE_THRESHOLD) @@ -248,11 +262,11 @@ stop = alignment.trace[-1, 0] # If this region was not covered by any other alignment before, # accept it and mark the region as covered - if not covered_range[start : stop].any(): + if not covered_range[start:stop].any(): unique_alignments.append((alignment, log_evalue)) - covered_range[start : stop] = True + covered_range[start:stop] = True -print("Number of unique alignments:", len(unique_alignments)) +print("Number of unique alignments:", len(unique_alignments)) ######################################################################## # To take a closer look on the found homologous regions, they are viewed @@ -269,9 +283,9 @@ MARGIN_SIZE = 250 COLORS = { - "CDS" : biotite.colors["dimgreen"], + "CDS": biotite.colors["dimgreen"], "tRNA": biotite.colors["orange"], - "rRNA": biotite.colors["orange"] + "rRNA": biotite.colors["orange"], } @@ -282,7 +296,6 @@ annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"]) - def draw_arrow(ax, feature, loc): x = loc.first dx = loc.last - loc.first + 1 @@ -294,18 +307,25 @@ def draw_arrow(ax, feature, loc): dx = loc.first - loc.last + 1 # Create head with 90 degrees tip -> head width/length ratio = 1/2 - ax.add_patch(biotite.AdaptiveFancyArrow( - x, 0.5, dx, 0, tail_width=0.4, head_width=0.7, head_ratio=0.5, - draw_head=True, color=COLORS[feature.key], linewidth=0 - )) + ax.add_patch( + biotite.AdaptiveFancyArrow( + x, + 0.5, + dx, + 0, + tail_width=0.4, + head_width=0.7, + head_ratio=0.5, + draw_head=True, + color=COLORS[feature.key], + linewidth=0, + ) + ) label = feature.qual.get("gene") if label is not None: - ax.text( - x + dx/2, 0.5, label, color="black", - ha="center", va="center", size=8 - ) + ax.text(x + dx / 2, 0.5, label, color="black", ha="center", va="center", size=8) # Fetch features of the chloroplast genome @@ -315,21 +335,15 @@ def draw_arrow(ax, feature, loc): annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"]) n_rows = int(np.ceil(len(unique_alignments) / N_COL)) -fig, axes = plt.subplots( - n_rows, N_COL, - figsize=(8.0, 24.0), - constrained_layout=True -) +fig, axes = plt.subplots(n_rows, N_COL, figsize=(8.0, 24.0), constrained_layout=True) -for (alignment, log_evalue), ax in zip( - unique_alignments, axes.flatten() -): +for (alignment, log_evalue), ax in zip(unique_alignments, axes.flatten()): # Transform 0-based sequence index to 1-based sequence position first = alignment.trace[0, 0] + 1 last = alignment.trace[-1, 0] + 1 center = (first + last) // 2 if last - first < EXCERPT_SIZE - MARGIN_SIZE * 2: - excerpt_loc = (center - EXCERPT_SIZE//2, center + EXCERPT_SIZE//2) + excerpt_loc = (center - EXCERPT_SIZE // 2, center + EXCERPT_SIZE // 2) else: # Exceed excerpt size to show entire alignment range excerpt_loc = (first - MARGIN_SIZE, last + MARGIN_SIZE) @@ -345,11 +359,18 @@ def draw_arrow(ax, feature, loc): for loc in feature.locs: draw_arrow(ax, feature, loc) # Draw rectangle representing homologuous region - ax.add_patch(Rectangle( - (first, 0.1), last - first + 1, 1 - 2*0.1, - facecolor="None", edgecolor="black", alpha=0.2, linewidth=1, - clip_on=False - )) + ax.add_patch( + Rectangle( + (first, 0.1), + last - first + 1, + 1 - 2 * 0.1, + facecolor="None", + edgecolor="black", + alpha=0.2, + linewidth=1, + clip_on=False, + ) + ) ax.xaxis.set_major_locator(MultipleLocator(1000)) ax.tick_params(labelsize=6) @@ -359,13 +380,13 @@ def draw_arrow(ax, feature, loc): ax.get_yaxis().set_tick_params(left=False, right=False, labelleft=False) exponent = int(np.floor(log_evalue)) - mantissa = 10**(log_evalue-exponent) + mantissa = 10 ** (log_evalue - exponent) homolog_excerpt = annotation[first : last + 1] if len(homolog_excerpt) > 0: # Select the longest feature in range for name display in title representative_feature = max( homolog_excerpt, - key=lambda feature: -np.subtract(*feature.get_location_range()) + key=lambda feature: -np.subtract(*feature.get_location_range()), ) feature_name = representative_feature.qual["product"] else: @@ -377,14 +398,15 @@ def draw_arrow(ax, feature, loc): ax.set_title( f"{feature_name}\n" - fr"E-Value: ${mantissa:.2f} \times 10^{{{exponent}}}$" + rf"E-Value: ${mantissa:.2f} \times 10^{{{exponent}}}$" f"\nIdentity: {align.get_sequence_identity(alignment) * 100:3.1f} %", - loc="left", size=8 + loc="left", + size=8, ) # Hide empty axes -for ax in axes.flatten()[len(unique_alignments):]: - ax.axis('off') +for ax in axes.flatten()[len(unique_alignments) :]: + ax.axis("off") fig.tight_layout(h_pad=3.0, w_pad=0.5) @@ -399,4 +421,4 @@ def draw_arrow(ax, feature, loc): # ---------- # # .. footbibliography:: -# \ No newline at end of file +# diff --git a/doc/examples/scripts/sequence/homology/genome_search.py b/doc/examples/scripts/sequence/homology/genome_search.py index 8e98bd446..6649704b8 100644 --- a/doc/examples/scripts/sequence/homology/genome_search.py +++ b/doc/examples/scripts/sequence/homology/genome_search.py @@ -15,30 +15,26 @@ and is similar to the method used by software like *BLAST*. At first the sequences for the *M1* coding gene and the *S. enterica* -genome are downloaded from *NCBI Entrez*. +genome are downloaded from *NCBI Entrez*. """ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.collections import LineCollection import biotite -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as seqgraphics -import biotite.sequence.align as align -import biotite.database.entrez as entrez -import biotite.structure.graphics as strucgraphics import biotite.application.viennarna as viennarna - +import biotite.database.entrez as entrez +import biotite.sequence.align as align +import biotite.sequence.graphics as seqgraphics +import biotite.sequence.io.genbank as gb # Download Escherichia coli BL21 and Salmonella enterica genome -gb_file = gb.MultiFile.read(entrez.fetch_single_file( - ["CP001509", "CP019649"], None, "nuccore", "gb" -)) +gb_file = gb.MultiFile.read( + entrez.fetch_single_file(["CP001509", "CP019649"], None, "nuccore", "gb") +) ec_file, se_file = tuple(gb_file) annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"]) @@ -83,24 +79,27 @@ trigger_matches = [] # 0 represents the original genome sequence, 1 the reverse complement for strand in (0, 1): - matches_for_strand = matches[matches[:,1] == strand] + matches_for_strand = matches[matches[:, 1] == strand] # Plot match positions - ax = fig.add_subplot(1, 2, strand+1) + ax = fig.add_subplot(1, 2, strand + 1) ax.scatter( - matches_for_strand[:,0], matches_for_strand[:,2] / 1e6, - s=4, marker="o", color=biotite.colors["dimorange"] + matches_for_strand[:, 0], + matches_for_strand[:, 2] / 1e6, + s=4, + marker="o", + color=biotite.colors["dimorange"], ) ax.set_xlim(0, len(m1_sequence)) ax.set_ylim(0, len(se_genome) / 1e6) ax.set_xlabel("E. coli M1 position (b)") if strand == 0: ax.set_ylabel("S. enterica genome position (Mb)") - else: # strand == 1 + else: # strand == 1 ax.set_ylabel("S. enterica genome position (Mb) (reverse complement)") - + # Check if there are two adjacent matches on the same diagonal - diagonals = matches_for_strand[:,2] - matches_for_strand[:,0] + diagonals = matches_for_strand[:, 2] - matches_for_strand[:, 0] unique_diag = np.unique(diagonals) trigger_diagonals = np.array([], dtype=int) for diag in unique_diag: @@ -116,7 +115,7 @@ # The other match on the same diagonal should not overlap # with this match and should be within a cutoff range if np.any((distances > K) & (distances < DISCARD_RANGE)): - trigger_matches.append((strand, pos, pos+diag)) + trigger_matches.append((strand, pos, pos + diag)) trigger_diagonals = np.append(trigger_diagonals, diag) # Only add one match per diagonal at maximum break @@ -142,11 +141,14 @@ genome = genomic_seqs[strand] diagonal = genome_pos - m1_pos alignment = align.align_banded( - m1_sequence, genome, matrix, - band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), max_number=1 + m1_sequence, + genome, + matrix, + band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), + max_number=1, )[0] alignments.append((strand, alignment)) - + strand, best_alignment = max( alignments, key=lambda strand_alignment: alignment[1].score ) @@ -159,15 +161,19 @@ # genomic sequence. # Reverse sequence numbering for second sequence (genome) in alignment -number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x] +number_funcs = [None, lambda x: len(best_alignment.sequences[1]) - x] # Visualize alignment, use custom color fig = plt.figure(figsize=(8.0, 4.0)) ax = fig.add_subplot(111) seqgraphics.plot_alignment_similarity_based( - ax, best_alignment, matrix=matrix, - labels=["E. coli M1 coding gene", "S. enterica genome"], show_numbers=True, - number_functions=number_funcs, show_line_position=True, - color=biotite.colors["brightorange"] + ax, + best_alignment, + matrix=matrix, + labels=["E. coli M1 coding gene", "S. enterica genome"], + show_numbers=True, + number_functions=number_funcs, + show_line_position=True, + color=biotite.colors["brightorange"], ) fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 @@ -199,22 +205,25 @@ # Plot base connections ax.plot(*plot_coord.T, color="black", linewidth=1, zorder=1) # Plot base pairings -ax.add_collection(LineCollection( - [(plot_coord[i], plot_coord[j]) for i, j in base_pairs], - color="silver", linewidth=1, zorder=1 -)) +ax.add_collection( + LineCollection( + [(plot_coord[i], plot_coord[j]) for i, j in base_pairs], + color="silver", + linewidth=1, + zorder=1, + ) +) # Plot base markers ax.scatter( *plot_coord.T, - s = 12, + s=12, # Render markers over lines - zorder = 2, - # Display base marker color based on the identity in the alignment - color = ["forestgreen" if identity else "firebrick" - for identity in identities] + zorder=2, + # Display base marker color based on the identity in the alignment + color=["forestgreen" if identity else "firebrick" for identity in identities], ) ax.set_aspect("equal") ax.axis("off") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/gpcr_evolution.py b/doc/examples/scripts/sequence/homology/gpcr_evolution.py index 5ac3d7ea1..7072601c2 100644 --- a/doc/examples/scripts/sequence/homology/gpcr_evolution.py +++ b/doc/examples/scripts/sequence/homology/gpcr_evolution.py @@ -20,22 +20,23 @@ import re import matplotlib.pyplot as plt import networkx as nx +import biotite.application.clustalo as clustalo +import biotite.database.uniprot as uniprot import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.phylo as phylo import biotite.sequence.io.fasta as fasta -import biotite.database.uniprot as uniprot -import biotite.application.clustalo as clustalo - +import biotite.sequence.phylo as phylo # The bovine GPCRs are investigated SPECIES = "Bovine" query = ( - uniprot.SimpleQuery("reviewed", "true") & + uniprot.SimpleQuery("reviewed", "true") + & # Bovine proteins - uniprot.SimpleQuery("organism_name", "Bos taurus") & + uniprot.SimpleQuery("organism_name", "Bos taurus") + & # Keyword ID for GPCRs uniprot.SimpleQuery("keyword", "KW-0297") ) @@ -62,13 +63,11 @@ # The distance measure required for the tree calculation is the # percentage of non-identical amino acids in the respective two # sequences -distances = 1 - align.get_pairwise_sequence_identity( - alignment, mode="shortest" -) +distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") # Create tree via neighbor joining tree = phylo.neighbor_joining(distances) # Convert to NetworkX graph -#For the graph visualization, the edge directions are unnecessary +# For the graph visualization, the edge directions are unnecessary graph = tree.as_graph().to_undirected() fig = plt.figure(figsize=(8.0, 8.0)) @@ -78,15 +77,17 @@ pos = nx.kamada_kawai_layout(graph) # Assign the gene names to the nodes that represent a reference index node_labels = {i: name for i, name in enumerate(genes)} -nx.draw_networkx_edges( - graph, pos, ax=ax -) +nx.draw_networkx_edges(graph, pos, ax=ax) nx.draw_networkx_labels( - graph, pos, ax=ax, labels=node_labels, font_size=7, + graph, + pos, + ax=ax, + labels=node_labels, + font_size=7, # Draw a white background behind the labeled nodes # for better readability - bbox=dict(pad=0, color="white") + bbox=dict(pad=0, color="white"), ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py index 637959a52..ff879afca 100644 --- a/doc/examples/scripts/sequence/homology/hcn_hydropathy.py +++ b/doc/examples/scripts/sequence/homology/hcn_hydropathy.py @@ -16,17 +16,17 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Patch import biotite +import biotite.application.mafft as mafft import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb -import biotite.application.mafft as mafft # Taken from # Kyte, J and Doolittle, RF. @@ -35,37 +35,39 @@ # Journal of Molecular Biology (2015). 157(1):105–32. # doi:10.1016/0022-2836(82)90515-0 hydropathy_dict = { - "I" : 4.5, - "V" : 4.2, - "L" : 3.8, - "F" : 2.8, - "C" : 2.5, - "M" : 1.9, - "A" : 1.8, - "G" : -0.4, - "T" : -0.7, - "S" : -0.8, - "W" : -0.9, - "Y" : -1.3, - "P" : -1.6, - "H" : -3.2, - "E" : -3.5, - "Q" : -3.5, - "D" : -3.5, - "N" : -3.5, - "K" : -3.9, - "R" : -4.5 + "I": 4.5, + "V": 4.2, + "L": 3.8, + "F": 2.8, + "C": 2.5, + "M": 1.9, + "A": 1.8, + "G": -0.4, + "T": -0.7, + "S": -0.8, + "W": -0.9, + "Y": -1.3, + "P": -1.6, + "H": -3.2, + "E": -3.5, + "Q": -3.5, + "D": -3.5, + "N": -3.5, + "K": -3.9, + "R": -4.5, } # Look for the Swiss-Prot entry contaning the human HCN1 channel -query = entrez.SimpleQuery("HCN1", "Gene Name") \ - & entrez.SimpleQuery("homo sapiens", "Organism") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = ( + entrez.SimpleQuery("HCN1", "Gene Name") + & entrez.SimpleQuery("homo sapiens", "Organism") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +) uids = entrez.search(query, db_name="protein") -gp_file = gb.GenBankFile.read(entrez.fetch( - uids[0], None, "gp", db_name="protein", ret_type="gp" -)) +gp_file = gb.GenBankFile.read( + entrez.fetch(uids[0], None, "gp", db_name="protein", ret_type="gp") +) hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp")) print(hcn1) @@ -75,13 +77,15 @@ hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1]) + def moving_average(data_set, window_size): - weights = np.full(window_size, 1/window_size) - return np.convolve(data_set, weights, mode='valid') + weights = np.full(window_size, 1 / window_size) + return np.convolve(data_set, weights, mode="valid") + # Apply moving average over 15 amino acids for clearer visualization ma_radius = 7 -hydropathies = moving_average(hydropathies, 2*ma_radius+1) +hydropathies = moving_average(hydropathies, 2 * ma_radius + 1) ######################################################################## # In order to assess the positional conservation, the sequences @@ -91,14 +95,16 @@ def moving_average(data_set, window_size): uids = [] for name in names: - query = entrez.SimpleQuery(name, "Gene Name") \ - & entrez.SimpleQuery("homo sapiens", "Organism") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + query = ( + entrez.SimpleQuery(name, "Gene Name") + & entrez.SimpleQuery("homo sapiens", "Organism") + & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") + ) uids += entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) for header in fasta_file: print(header) @@ -121,8 +127,8 @@ def moving_average(data_set, window_size): scores = np.zeros(len(hcn1)) for i in range(len(alignment)): # The column is also an alignment with length 1 - column = alignment[i:i+1] - hcn1_index = column.trace[0,0] + column = alignment[i : i + 1] + hcn1_index = column.trace[0, 0] if hcn1_index == -1: # Gap in HCN1 row # As similarity score should be analyzed in dependence of the @@ -131,7 +137,7 @@ def moving_average(data_set, window_size): continue scores[hcn1_index] = align.score(column, matrix, gap_penalty=-5) -scores = moving_average(scores, 2*ma_radius+1) +scores = moving_average(scores, 2 * ma_radius + 1) ######################################################################## # Now the hydropathy and the similarity score can be plotted. @@ -141,11 +147,12 @@ def moving_average(data_set, window_size): # Plot hydropathy ax.plot( - np.arange(1+ma_radius, len(hcn1)-ma_radius+1), hydropathies, - color=biotite.colors["dimorange"] + np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), + hydropathies, + color=biotite.colors["dimorange"], ) ax.axhline(0, color="gray", linewidth=0.5) -ax.set_xlim(1, len(hcn1)+1) +ax.set_xlim(1, len(hcn1) + 1) ax.set_xlabel("HCN1 sequence position") ax.set_ylabel("Hydropathy (15 residues moving average)") @@ -153,8 +160,11 @@ def moving_average(data_set, window_size): # with hydropathy plot annotation = gb.get_annotation(gp_file, include_only=["Region"]) transmembrane_annotation = seq.Annotation( - [feature for feature in annotation - if feature.qual["region_name"] == "Transmembrane region"] + [ + feature + for feature in annotation + if feature.qual["region_name"] == "Transmembrane region" + ] ) for feature in transmembrane_annotation: first, last = feature.get_location_range() @@ -163,17 +173,18 @@ def moving_average(data_set, window_size): # Plot similarity score as measure for conservation ax2 = ax.twinx() ax2.plot( - np.arange(1+ma_radius, len(hcn1)-ma_radius+1), scores, - color=biotite.colors["brightorange"] + np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), + scores, + color=biotite.colors["brightorange"], ) ax2.set_ylabel("Similarity score (15 residues moving average)") ax.legend( handles=[ - Patch(color=biotite.colors["dimorange"], label="Hydropathy"), - Patch(color=biotite.colors["brightorange"], label="Score" ) + Patch(color=biotite.colors["dimorange"], label="Hydropathy"), + Patch(color=biotite.colors["brightorange"], label="Score"), ], - fontsize=9 + fontsize=9, ) ######################################################################## @@ -190,17 +201,20 @@ def moving_average(data_set, window_size): # values as input. # Hydrophilic amino acids are depicted in blue, hydrophobic ones in red. + def hydropathy_to_color(hydropathy, colormap): # Normalize hydropathy to range between 0 and 1 # (orginally between -4.5 and 4.5) norm_hydropathy = (hydropathy - (-4.5)) / (4.5 - (-4.5)) return colormap(norm_hydropathy) + # Create a color scheme highlighting the hydropathy colormap = plt.get_cmap("coolwarm") colorscheme = [ hydropathy_to_color(hydropathy_dict[symbol], colormap) - if symbol in hydropathy_dict else None + if symbol in hydropathy_dict + else None for symbol in sequences[0].get_alphabet() ] @@ -210,8 +224,7 @@ def hydropathy_to_color(hydropathy, colormap): ax = fig.add_subplot(111) # Color the symbols instead of the background graphics.plot_alignment_type_based( - ax, alignment[:600], labels=names, show_numbers=True, - color_scheme=colorscheme + ax, alignment[:600], labels=names, show_numbers=True, color_scheme=colorscheme ) plt.show() diff --git a/doc/examples/scripts/sequence/homology/hcn_similarity.py b/doc/examples/scripts/sequence/homology/hcn_similarity.py index f81c55ee5..961abcd07 100644 --- a/doc/examples/scripts/sequence/homology/hcn_similarity.py +++ b/doc/examples/scripts/sequence/homology/hcn_similarity.py @@ -15,32 +15,31 @@ # Code source: Daniel Bauer # License: BSD 3 clause -import biotite.sequence.io.fasta as fasta +import matplotlib.pyplot as plt +import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.application.clustalo as clustalo import biotite.sequence.align as align -import biotite.sequence.phylo as phylo -import matplotlib.pyplot as plt import biotite.sequence.graphics as graphics - +import biotite.sequence.io.fasta as fasta +import biotite.sequence.phylo as phylo UNIPROT_IDS = dict( - hHCN1 = "O60741", - hHCN2 = "Q9UL51", - hHCN3 = "Q9P1Z3", - hHCN4 = "Q9Y3Q4", - spHCN = "O76977", - hEAG1 = "O95259", - hERG1 = "Q12809", - KAT1 = "Q39128", + hHCN1="O60741", + hHCN2="Q9UL51", + hHCN3="Q9P1Z3", + hHCN4="Q9Y3Q4", + spHCN="O76977", + hEAG1="O95259", + hERG1="Q12809", + KAT1="Q39128", ) ### fetch sequences for UniProt IDs from NCBI Entrez -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - list(UNIPROT_IDS.values()), None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(list(UNIPROT_IDS.values()), None, "protein", "fasta") +) sequences = { name: seq.ProteinSequence(seq_str) for name, seq_str in zip(UNIPROT_IDS.keys(), fasta_file.values()) @@ -50,42 +49,44 @@ # create MSA alignment = clustalo.ClustalOmegaApp.align(list(sequences.values())) # build simple tree based on deviation from sequence identity -distances = 1 - align.get_pairwise_sequence_identity( - alignment, mode="shortest" -) +distances = 1 - align.get_pairwise_sequence_identity(alignment, mode="shortest") tree = phylo.upgma(distances) ### plot the tree fig, ax = plt.subplots(1, 1, figsize=(8, 5)) graphics.plot_dendrogram( - ax, tree, orientation="left", labels=list(UNIPROT_IDS.keys()), - show_distance=False, linewidth=2 - ) + ax, + tree, + orientation="left", + labels=list(UNIPROT_IDS.keys()), + show_distance=False, + linewidth=2, +) ax.grid(False) ax.set_xticks([]) # distance indicator indicator_len = 0.1 indicator_start = ( - ax.get_xlim()[0] + ax.get_xlim()[1]*0.02, - ax.get_ylim()[1] - ax.get_ylim()[1]*0.15 -) -indicator_stop = ( - indicator_start[0] + indicator_len, - indicator_start[1] + ax.get_xlim()[0] + ax.get_xlim()[1] * 0.02, + ax.get_ylim()[1] - ax.get_ylim()[1] * 0.15, ) +indicator_stop = (indicator_start[0] + indicator_len, indicator_start[1]) indicator_center = ( - (indicator_start[0] + indicator_stop[0])/2, - (indicator_start[1] + 0.25) + (indicator_start[0] + indicator_stop[0]) / 2, + (indicator_start[1] + 0.25), ) ax.annotate( - "", xy=indicator_start, xytext=indicator_stop, xycoords="data", - textcoords="data", arrowprops={"arrowstyle": "|-|", "linewidth": 2} + "", + xy=indicator_start, + xytext=indicator_stop, + xycoords="data", + textcoords="data", + arrowprops={"arrowstyle": "|-|", "linewidth": 2}, ) ax.annotate( - f"{int(indicator_len * 100)} %", xy=indicator_center, - ha="center", va="center" + f"{int(indicator_len * 100)} %", xy=indicator_center, ha="center", va="center" ) ax.set_title("Sequence deviation of HCN to other CNG superfamily channels") diff --git a/doc/examples/scripts/sequence/homology/homolog_msa.py b/doc/examples/scripts/sequence/homology/homolog_msa.py index c186bd222..f3b91dd65 100644 --- a/doc/examples/scripts/sequence/homology/homolog_msa.py +++ b/doc/examples/scripts/sequence/homology/homolog_msa.py @@ -10,13 +10,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 cl from tempfile import gettempdir -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.graphics as graphics -import biotite.application.muscle as muscle +import matplotlib.pyplot as plt import biotite.application.blast as blast +import biotite.application.muscle as muscle import biotite.database.entrez as entrez -import matplotlib.pyplot as plt +import biotite.sequence.graphics as graphics +import biotite.sequence.io.fasta as fasta # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", gettempdir(), "fa", "protein", "fasta") @@ -49,7 +48,7 @@ print("MSA results:") gapped_seqs = alignment.get_gapped_sequences() for i in range(len(gapped_seqs)): - print(hits[i], " "*3, gapped_seqs[i]) + print(hits[i], " " * 3, gapped_seqs[i]) # Visualize the first 200 columns of the alignment # Reorder alignments to reflect sequence distance @@ -58,9 +57,11 @@ ax = fig.add_subplot(111) order = app.get_alignment_order() graphics.plot_alignment_type_based( - ax, alignment[:200, order.tolist()], labels=[hits[i] for i in order], - show_numbers=True + ax, + alignment[:200, order.tolist()], + labels=[hits[i] for i in order], + show_numbers=True, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/lexa_conservation.py b/doc/examples/scripts/sequence/homology/lexa_conservation.py index 104fe9fd4..957db2f6a 100644 --- a/doc/examples/scripts/sequence/homology/lexa_conservation.py +++ b/doc/examples/scripts/sequence/homology/lexa_conservation.py @@ -21,24 +21,22 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as graphics import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez +import biotite.sequence as seq +import biotite.sequence.graphics as graphics +import biotite.sequence.io.genbank as gb + # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database -query = entrez.SimpleQuery("lexA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("lexA", "Gene Name") & entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) # Search for the first 200 hits # More than 200 UIDs are not recommended for the EFetch service # for a single fetch uids = entrez.search(query, db_name="protein", number=200) -file = entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="gp" -) +file = entrez.fetch_single_file(uids, None, db_name="protein", ret_type="gp") # The file contains multiple concatenated GenPept files # -> Usage of MultiFile multi_file = gb.MultiFile.read(file) @@ -57,12 +55,14 @@ # on. Therefore, we write a function that creates a proper abbreviation # for a species name. + def abbreviate(species): # Remove possible brackets - species = species.replace("[","").replace("]","") - splitted_species= species.split() + species = species.replace("[", "").replace("]", "") + splitted_species = species.split() return "{:}. {:}".format(splitted_species[0][0], splitted_species[1]) + print("Sources:") all_sources = [abbreviate(gb.get_source(file)) for file in files] for source in all_sources[:20]: @@ -97,16 +97,16 @@ def abbreviate(species): # Ignore already listed species continue bind_feature = None - annot_seq = gb.get_annotated_sequence( - file, include_only=["Site"], format="gp" - ) + annot_seq = gb.get_annotated_sequence(file, include_only=["Site"], format="gp") # Find the feature for DNA-binding site for feature in annot_seq.annotation: # DNA binding site is a helix-turn-helix motif - if "site_type" in feature.qual \ - and feature.qual["site_type"] == "DNA binding" \ - and "H-T-H motif" in feature.qual["note"]: - bind_feature = feature + if ( + "site_type" in feature.qual + and feature.qual["site_type"] == "DNA binding" + and "H-T-H motif" in feature.qual["note"] + ): + bind_feature = feature if bind_feature is not None: # If the feature is found, # get the sequence slice that is defined by the feature... @@ -128,10 +128,10 @@ def abbreviate(species): fig = plt.figure(figsize=(4.5, 4.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment[:,:20], labels=sources[:20], symbols_per_line=len(alignment) + ax, alignment[:, :20], labels=sources[:20], symbols_per_line=len(alignment) ) # Source names in italic -ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"}) +ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) fig.tight_layout() ######################################################################## @@ -145,7 +145,7 @@ def abbreviate(species): fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_sequence_logo(ax, profile, scheme="flower") -ax.set_xticks([5,10,15,20]) +ax.set_xticks([5, 10, 15, 20]) ax.set_xlabel("Residue position") ax.set_ylabel("Bits") # Only show left and bottom spine @@ -154,4 +154,4 @@ def abbreviate(species): fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/luxa_comparison.py b/doc/examples/scripts/sequence/homology/luxa_comparison.py index 080531860..eda03243a 100644 --- a/doc/examples/scripts/sequence/homology/luxa_comparison.py +++ b/doc/examples/scripts/sequence/homology/luxa_comparison.py @@ -12,22 +12,21 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # Search for protein products of LexA gene in UniProtKB/Swiss-Prot database -query = entrez.SimpleQuery("luxA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("luxA", "Gene Name") & entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) uids = entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) ids = [] sequences = [] @@ -39,7 +38,7 @@ matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, tree, distances = align.align_multiple( - sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False + sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False ) # Order alignment according to the guide tree alignment = alignment[:, order] @@ -48,9 +47,8 @@ fig = plt.figure(figsize=(8.0, 20.0)) ax = fig.add_subplot(111) graphics.plot_alignment_type_based( - ax, alignment, labels=ids, show_numbers=True, spacing=2.0, - color_scheme="blossom" + ax, alignment, labels=ids, show_numbers=True, spacing=2.0, color_scheme="blossom" ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/pi3k_alignment.py b/doc/examples/scripts/sequence/homology/pi3k_alignment.py index e705566eb..4f745876b 100644 --- a/doc/examples/scripts/sequence/homology/pi3k_alignment.py +++ b/doc/examples/scripts/sequence/homology/pi3k_alignment.py @@ -16,23 +16,23 @@ # License: BSD 3 clause import warnings -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite +import biotite.application.clustalo as clustalo import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.sequence.io.fasta as fasta -import biotite.application.clustalo as clustalo -uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"] -names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"] +uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"] +names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"] sequences = [] -file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) for header, seq_str in file.items(): sequences.append(seq.ProteinSequence(seq_str)) @@ -47,25 +47,27 @@ # Like the :class:`LetterSimilarityPlotter` we will use the # *average normalized similarity* as measure. + def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): - code1 = trace_code[seq_i, pos_i] - if code1 == -1: - return np.nan - similarities = np.zeros(trace_code.shape[0]) - for i in range(trace_code.shape[0]): - code2 = trace_code[i, pos_i] - if code2 == -1: - similarities[i] = 0 - else: - sim = matrix[code1, code2] - # Normalize (range 0.0 - 1.0) - min_sim = np.min(matrix[code1]) - max_sim = np.max(matrix[code1]) - sim = (sim - min_sim) / (max_sim - min_sim) - similarities[i] = sim - # Delete self-similarity - similarities = np.delete(similarities, seq_i) - return np.average(similarities) + code1 = trace_code[seq_i, pos_i] + if code1 == -1: + return np.nan + similarities = np.zeros(trace_code.shape[0]) + for i in range(trace_code.shape[0]): + code2 = trace_code[i, pos_i] + if code2 == -1: + similarities[i] = 0 + else: + sim = matrix[code1, code2] + # Normalize (range 0.0 - 1.0) + min_sim = np.min(matrix[code1]) + max_sim = np.max(matrix[code1]) + sim = (sim - min_sim) / (max_sim - min_sim) + similarities[i] = sim + # Delete self-similarity + similarities = np.delete(similarities, seq_i) + return np.average(similarities) + matrix = align.SubstitutionMatrix.std_protein_matrix() # Get the alignment columns as symbols codes (-1 for gaps) @@ -73,15 +75,13 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): similarities = np.zeros(trace_code.shape) for i in range(similarities.shape[0]): for j in range(similarities.shape[1]): - similarities[i,j] = get_average_normalized_similarity( + similarities[i, j] = get_average_normalized_similarity( trace_code, matrix.score_matrix(), i, j ) figure = plt.figure(figsize=(8.0, 3.0)) ax = figure.add_subplot(111) -heatmap = ax.pcolor( - similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0 -) +heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0) cbar = figure.colorbar(heatmap) figure.tight_layout() @@ -93,16 +93,19 @@ def get_average_normalized_similarity(trace_code, matrix, seq_i, pos_i): # Hence, we create bins, that contain the mean similarity over a range of # columns. + def calculate_bins(similarities, bin_count): edges = np.linspace(0, similarities.shape[1], bin_count, dtype=int) edges = np.append(edges, similarities.shape[1]) binned_similarities = np.zeros(similarities.shape) for i in range(similarities.shape[0]): for j in range(len(edges) - 1): - binned_similarities[i, edges[j]:edges[j+1]] = \ - np.nanmean(similarities[i, edges[j]:edges[j+1]]) + binned_similarities[i, edges[j] : edges[j + 1]] = np.nanmean( + similarities[i, edges[j] : edges[j + 1]] + ) return binned_similarities + with warnings.catch_warnings(): # Catch warnings about empty slice for gap-only parts warnings.simplefilter("ignore") @@ -110,9 +113,7 @@ def calculate_bins(similarities, bin_count): figure = plt.figure(figsize=(8.0, 3.0)) ax = figure.add_subplot(111) -heatmap = ax.pcolor( - similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0 -) +heatmap = ax.pcolor(similarities, cmap="RdYlGn", vmin=0.0, vmax=1.0) cbar = figure.colorbar(heatmap) # Furthermore, add some labels to the figure cbar.set_label("Average normalized similarity") @@ -136,14 +137,14 @@ def calculate_bins(similarities, bin_count): # From beginning of the sequence... for i in range(len(trace)): # Check if all sequences have no gap at the given position - if trace[i,0] != -1: + if trace[i, 0] != -1: start_index = i break # ...and the end of the sequence -for i in range(len(trace)-1, -1, -1): +for i in range(len(trace) - 1, -1, -1): # Check if all sequences have no gap at the given position - if trace[i,0] != -1: - stop_index = i+1 + if trace[i, 0] != -1: + stop_index = i + 1 break # Truncate alignment to region where the 'PI3K' sequence exists @@ -155,11 +156,17 @@ def calculate_bins(similarities, bin_count): # The alignment is quite long # -> Reduce font size to reduce figure size graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, symbols_per_line=80, labels=names, + ax, + alignment, + matrix=matrix, + symbols_per_line=80, + labels=names, show_numbers=True, - label_size=10, number_size=10, symbol_size=6, - color=biotite.colors["orange"] + label_size=10, + number_size=10, + symbol_size=6, + color=biotite.colors["orange"], ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/plotepiscan.py b/doc/examples/scripts/sequence/homology/plotepiscan.py index 87a6e1c6d..140f078ca 100644 --- a/doc/examples/scripts/sequence/homology/plotepiscan.py +++ b/doc/examples/scripts/sequence/homology/plotepiscan.py @@ -3,25 +3,25 @@ ========================================================== Peptide arrays can be used as a high-throughput platform for screening -biological interactions. Typical screenings involve the immobilization -of diverse peptides on a solid surface to study their interactions with -various target molecules. Specifically, arrays of peptides with -overlapping sequences can be used to identify the epitope of antibodies +biological interactions. Typical screenings involve the immobilization +of diverse peptides on a solid surface to study their interactions with +various target molecules. Specifically, arrays of peptides with +overlapping sequences can be used to identify the epitope of antibodies on a protein antigen at amino acid level. General scannings for molecular recognition using peptide arrays -are particlularly useful for epitope identification on monoclonal -antibodies. This example visualizes the data from two epitope mapping +are particlularly useful for epitope identification on monoclonal +antibodies. This example visualizes the data from two epitope mapping studies, using a color coded sequence alignment representation -of the antigens screened. The scannings interrogated a monoclonal +of the antigens screened. The scannings interrogated a monoclonal antibody (MAb) against two arrays of overlaping peptides :footcite:`Iyamu2023`. The files containing peptide array data can be downloaded :download:`here ` -and +and :download:`here `. The antigens screened span the extracellular domain of VAR2CSA, a virulence factor of *Plasmodiun falciparum* for the strains FCR3 -(residues 1-2659) and NF54 (residues 1-2652). The sequence of +(residues 1-2659) and NF54 (residues 1-2652). The sequence of the two domains can be downloaded :download:`here `. @@ -54,53 +54,55 @@ # Get BLOSUM62 matrix matrix = align.SubstitutionMatrix.std_protein_matrix() # Perform pairwise sequence alignment -alignments = align.align_optimal(fcr3_seq, nf54_seq, matrix, - gap_penalty = (-10, -1), - terminal_penalty = False) +alignments = align.align_optimal( + fcr3_seq, nf54_seq, matrix, gap_penalty=(-10, -1), terminal_penalty=False +) alignment = alignments[0] print(alignment) ######################################################################## -# Epitope mapping data +# Epitope mapping data # -------------------- # # This study used arrays of overlaping peptides to achive high acurracy -# in mapping the epitope. Both FCR3 and NF54 arrays, consisted of +# in mapping the epitope. Both FCR3 and NF54 arrays, consisted of # 20-mer peptides with an overlap of 19 and 18 amino acids respectively. # Arbitrary units (AU) of fluorescence intensity quantified the antibody -# recognition for each peptide. -# Our goal is to decorate the aligment, with the fluorescence intensity -# scores of each peptide in the arrays. We used a -# color code from red to white for high to low intensity, respectively. -# The background color of the symbols on the aligment corresponds to the +# recognition for each peptide. +# Our goal is to decorate the aligment, with the fluorescence intensity +# scores of each peptide in the arrays. We used a +# color code from red to white for high to low intensity, respectively. +# The background color of the symbols on the aligment corresponds to the # score for the 20th amino acid at the end of the peptide. # -# Lets create a function that maps the peptide score to the 20th residue +# Lets create a function that maps the peptide score to the 20th residue # of the peptide: + def read_scan(filename, pep_len=20, score_res=20): - if not type(pep_len) is int: + if type(pep_len) is not int: raise TypeError("pep_len : only integers are allowed") - elif not type(score_res) is int: - raise TypeError("score_res : only integers are allowed") + elif type(score_res) is not int: + raise TypeError("score_res : only integers are allowed") elif pep_len < score_res: raise Exception("score_res can't be higher than pep_len") - - elif pep_len != 20 or score_res != 20: - s = (score_res) - pep_len -1 + + elif pep_len != 20 or score_res != 20: + s = (score_res) - pep_len - 1 else: - s =-1 + s = -1 - df= pd.read_csv(filename) - scor_res = df['Seq'].str[s] - df['s_res'] = scor_res + df = pd.read_csv(filename) + scor_res = df["Seq"].str[s] + df["s_res"] = scor_res return df + # Load epitope scan data -fcr3_file_path = "../../../download/FCR3_10ug.csv" -nf54_file_path = "../../../download/NF54_10ug.csv" +fcr3_file_path = "../../../download/FCR3_10ug.csv" +nf54_file_path = "../../../download/NF54_10ug.csv" # Define the score residues on the arrays files = [fcr3_file_path, nf54_file_path] @@ -114,66 +116,70 @@ def read_scan(filename, pep_len=20, score_res=20): ag1_scan.head(5) ######################################################################## -# The microarrays contained each peptide printed in duplicated spots. We -# need to combine the values of those experimental replicates into a +# The microarrays contained each peptide printed in duplicated spots. We +# need to combine the values of those experimental replicates into a # unique score for each peptide. Typically, this unique value could come # from the geometric mean between replicates that do not deviate wildly. -# If the average deviation between replicates is high, one can assumme +# If the average deviation between replicates is high, one can assumme # that experimental errors should result in a lower score at a given spot. -# It is easy to imagine that imperfections on the printing of the spot, -# will rather decrease and not increase, the antibody recognition, in -# which case the the peptide signal is better represented +# It is easy to imagine that imperfections on the printing of the spot, +# will rather decrease and not increase, the antibody recognition, in +# which case the the peptide signal is better represented # by the higher score replicate. # -# Now lets write a function to combine the scores adding the flexibility -# to choose cases for those criterias exposed above. We will flag with -# 0 or 1 every peptide entry on the arrays: 1 if the deviation between +# Now lets write a function to combine the scores adding the flexibility +# to choose cases for those criterias exposed above. We will flag with +# 0 or 1 every peptide entry on the arrays: 1 if the deviation between # replicates is higher than 40%, otherwise 0. -def combine_scores(dataframe, combine='max', flag_noisy=True): - df= dataframe + +def combine_scores(dataframe, combine="max", flag_noisy=True): + df = dataframe # mean - df['ave'] = df.iloc[:,[1,2]].mean(axis = 1) + df["ave"] = df.iloc[:, [1, 2]].mean(axis=1) # mean deviation - df['avedev'] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2 + df["avedev"] = ((df.r1 - df.ave).abs() + (df.r2 - df.ave).abs()) / 2 # percent deviation between replicates - df['dev_ratio'] = df.apply(lambda x:0 - if x.avedev==0 else x.avedev/x.ave, axis=1) - + df["dev_ratio"] = df.apply( + lambda x: 0 if x.avedev == 0 else x.avedev / x.ave, axis=1 + ) + # signal value: - if combine == 'max': - df['comb_signal'] = df.apply(lambda x:max(x.r1, x.r2) - if x.dev_ratio >=0.4 else x.ave, axis=1) - elif combine == 'mean': - df['comb_signal'] = df.apply(lambda x:x.ave - if x.dev_ratio <= 0.4 else 0, axis=1) - + if combine == "max": + df["comb_signal"] = df.apply( + lambda x: max(x.r1, x.r2) if x.dev_ratio >= 0.4 else x.ave, axis=1 + ) + elif combine == "mean": + df["comb_signal"] = df.apply( + lambda x: x.ave if x.dev_ratio <= 0.4 else 0, axis=1 + ) + if flag_noisy: - df['flag'] = df.apply(lambda x:0 - if x.dev_ratio <= 0.4 else 1, axis=1) + df["flag"] = df.apply(lambda x: 0 if x.dev_ratio <= 0.4 else 1, axis=1) return df -# Make the corresponding signal equal the replicate with the higest + +# Make the corresponding signal equal the replicate with the higest # score value. -dfa = combine_scores(ag1_scan, combine = 'max', flag_noisy = True) -dfb = combine_scores(ag2_scan, combine = 'max', flag_noisy = True) +dfa = combine_scores(ag1_scan, combine="max", flag_noisy=True) +dfb = combine_scores(ag2_scan, combine="max", flag_noisy=True) dfa.head(5) ######################################################################## -# Many molecular recognition screening campaings e.g. epitope mapping -# screenings follow a long-tailed data distribution. To properly +# Many molecular recognition screening campaings e.g. epitope mapping +# screenings follow a long-tailed data distribution. To properly # represent such distribution one can normalize the date using linear or # non-linear transformations on the combined score data. + def data_transform(dataframe, threshold=0): df = dataframe - # Option to set a "threshold" for the signal scores. + # Option to set a "threshold" for the signal scores. t = threshold - df['cubic'] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal-t)), - axis=1) - df['signal_plot'] = df.apply(lambda x: x.cubic/df['cubic'].max(), - axis=1) + df["cubic"] = df.apply(lambda x: np.cbrt(max(0, x.comb_signal - t)), axis=1) + df["signal_plot"] = df.apply(lambda x: x.cubic / df["cubic"].max(), axis=1) + # Normalize, using the power law with cubic exponent. No threshold data_transform(dfa, threshold=0) @@ -184,134 +190,136 @@ def data_transform(dataframe, threshold=0): # ------------------------------------------------------------------------------- # # So far, we have the peptide score data combined, normalized, and mapped -# to a residue for each peptide. +# to a residue for each peptide. # Next, using the alignment trace as a template, we will match the signal -# intensities associated to the score residues, to the position of each +# intensities associated to the score residues, to the position of each # symbol on the alignment, considering the gaps. -# Get the trace for each sequence on the alignment: +# Get the trace for each sequence on the alignment: trace_a = align.get_symbols(alignment)[0] trace_b = align.get_symbols(alignment)[1] + def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): """ - Generate a gapped sequence that relates peptide score data signal with a - template alignment trace. The function returns a list of tuples representing - the gapped sequence, where each tuple consists of a residue and its associated - signal value. + Generate a gapped sequence that relates peptide score data signal with a + template alignment trace. The function returns a list of tuples representing + the gapped sequence, where each tuple consists of a residue and its associated + signal value. Parameters ---------- - dataframe : DataFrame - A *Pandas* dataframe containing columns for each peptide score data, + dataframe : DataFrame + A *Pandas* dataframe containing columns for each peptide score data, and its designated score residue. - seq_trace : list + seq_trace : list The sequence trace obtained from the alignment. - p_len : int + p_len : int The length of each overlapping peptide. overlap_step : int, optional The step size for overlapping peptides.Default is 1. Note: ----- - The 'gapped' sequence may be shorter than the aligment trace if the alignment results - in gaps at either end. Any remaining elements in the trace with 'None' values are + The 'gapped' sequence may be shorter than the aligment trace if the alignment results + in gaps at either end. Any remaining elements in the trace with 'None' values are filled with tuples: ('None', 0). """ template = seq_trace df = dataframe - step = overlap_step - gapped = list(zip(df.s_res , df.signal_plot)) - lk1 = df["s_res"].values.tolist() - plen = p_len # peptide length - + step = overlap_step + gapped = list(zip(df.s_res, df.signal_plot)) + lk1 = df["s_res"].values.tolist() + plen = p_len # peptide length + if step == 1: x, b = 0, 0 - c = 0 # cyclic counter up to the peptide length :20 - p = 0 # peptide counter + c = 0 # cyclic counter up to the peptide length :20 + p = 0 # peptide counter for b in range(len(lk1)): for a in template[x:]: - if c < plen-1 : - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if c < plen - 1: + if a is None: + gapped.insert(x, (template[x], 0)) + x = x + 1 elif a != lk1[b]: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif p==0: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif p == 0: + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 else: - x=x+1 - c=c+1 + x = x + 1 + c = c + 1 break else: - c = 0 # reset the counter - p=p+1 - x=x+1 + c = 0 # reset the counter + p = p + 1 + x = x + 1 break elif step == 2: x, b = 0, 0 - c=0 - p=0 + c = 0 + p = 0 for b in range(len(lk1)): for a in template[x:]: - if c < plen-1 and p==0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if c < plen - 1 and p == 0: + if a is None: + gapped.insert(x, (template[x], 0)) + x = x + 1 else: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif p==0 : - c = 0 # reset the counter - p=p+1 - x=x+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif p == 0: + c = 0 # reset the counter + p = p + 1 + x = x + 1 break - if p!=0: - if a==None and c == 0: - gapped.insert(x,(template[x],0)) - x=x+1 - elif c % 2 == 0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + if p != 0: + if a is None and c == 0: + gapped.insert(x, (template[x], 0)) + x = x + 1 + elif c % 2 == 0: + if a is None: + gapped.insert(x, (template[x], 0)) + x = x + 1 else: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - elif c % 2 != 0: - if a==None: - gapped.insert(x,(template[x],0)) - x=x+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + elif c % 2 != 0: + if a is None: + gapped.insert(x, (template[x], 0)) + x = x + 1 elif a != lk1[b]: - gapped.insert(x,(template[x],0)) - x=x+1 - c=c+1 - else: - x=x+1 - c=c+1 + gapped.insert(x, (template[x], 0)) + x = x + 1 + c = c + 1 + else: + x = x + 1 + c = c + 1 break # For terminal gaps - if len(gapped) < len(template) and template[len(gapped)+1]== None: - gapped_tail=[] - for n in range(len(template)-len(gapped)): - gapped_tail.append(('None', 0)) + if len(gapped) < len(template) and template[len(gapped) + 1] is None: + gapped_tail = [] + for n in range(len(template) - len(gapped)): + gapped_tail.append(("None", 0)) gapped = gapped + gapped_tail - + return gapped + # Let's use gapped_seq() to build the gapped sequences # FCR3 array, overlap_step: 1 (pep = 20-mer with 19 overlap) gapd_s1 = gapped_seq(dfa, trace_a, 20, 1) # NF54 array, overlap_step: 2 (pep = 20-mer with 18 overlap) -gapd_s2 = gapped_seq(dfb, trace_b, 20, 2) +gapd_s2 = gapped_seq(dfb, trace_b, 20, 2) # Checkpoint. Both gapped sequences must have the same length. len(gapd_s1) == len(gapd_s2) @@ -320,116 +328,133 @@ def gapped_seq(dataframe, seq_trace, p_len, overlap_step=1): # Create a signal map # ------------------- # -# Now we will generate an object mapping the signal scores from two gapped +# Now we will generate an object mapping the signal scores from two gapped # sequences. -def signal_map(gapped_seq1, gapped_seq2,): + +def signal_map( + gapped_seq1, + gapped_seq2, +): """ Generate a mapping of signal scores from two gapped sequences. This function takes two gapped sequences, `gapped_seq1` and - `gapped_seq2`. Each sequence is represented as a list of tuples, - with the first element being an amino acid symbol and the second - element being a signal score. It extracts the signal scores from - each sequence and creates a 2D array with two columns, where the - first column contains signal scores from `gapped_seq1` and the + `gapped_seq2`. Each sequence is represented as a list of tuples, + with the first element being an amino acid symbol and the second + element being a signal score. It extracts the signal scores from + each sequence and creates a 2D array with two columns, where the + first column contains signal scores from `gapped_seq1` and the second column contains signal scores from `gapped_seq2`. Parameters: ----------- gapped_seq1: list The first gapped sequence. - gapped_seq2: list + gapped_seq2: list The second gapped sequence. Returns: -------- - numpy.ndarray: A 2D numpy array with two columns containing signal + numpy.ndarray: A 2D numpy array with two columns containing signal scores extracted from `gapped_seq1` and `gapped_seq2` respectively. """ gapd_s1 = gapped_seq1 gapd_s2 = gapped_seq2 - fl_score = np.zeros((len(gapd_s1),2)) - + fl_score = np.zeros((len(gapd_s1), 2)) + for v1 in range(len(gapd_s1)): - fl_score[v1,0] = gapd_s1[v1][1] - fl_score[v1,1] = gapd_s2[v1][1] - + fl_score[v1, 0] = gapd_s1[v1][1] + fl_score[v1, 1] = gapd_s2[v1][1] + return fl_score + score = signal_map(gapd_s1, gapd_s2) ######################################################################## -# Sequence alignment decorated with MAb recognition regions +# Sequence alignment decorated with MAb recognition regions # --------------------------------------------------------- # -# Now we can plot the sequence alignment using an :class:`ArrayPlotter` -# instance that higlights sequence recognition regions at the positions +# Now we can plot the sequence alignment using an :class:`ArrayPlotter` +# instance that higlights sequence recognition regions at the positions # of the respective score residue per alignment column. -# To easily interpret the intensity-decorated alignment we will add a -# colorbar scaled accordingly. The scale matches the transformation +# To easily interpret the intensity-decorated alignment we will add a +# colorbar scaled accordingly. The scale matches the transformation # applied to the recognition signal recorded on the score ndarray. # -# Let's build a function to create a custom colorbar object. We will -# specify the dataframes corresponding to the two antigens screened in -# this example, the colormap, and the transformation to be +# Let's build a function to create a custom colorbar object. We will +# specify the dataframes corresponding to the two antigens screened in +# this example, the colormap, and the transformation to be # represented with the colorbar. fig = plt.figure(figsize=(8.0, 15)) ax = fig.add_subplot(111) graphics.plot_alignment_array( - ax, alignments[0], fl_score=score, labels=["FCR3", "NF54"], - show_numbers=True, symbols_per_line=80, - show_line_position=True, label_size=10, - number_size=10, symbol_size=6) + ax, + alignments[0], + fl_score=score, + labels=["FCR3", "NF54"], + show_numbers=True, + symbols_per_line=80, + show_line_position=True, + label_size=10, + number_size=10, + symbol_size=6, +) # Add the axes where the colorbar will reside: -ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01]) +ax2 = fig.add_axes([0.13, 0.07, 0.8, 0.01]) ax2.set_frame_on(False) -# Access the colormap of the relevant instace of ArrayPlotter: +# Access the colormap of the relevant instace of ArrayPlotter: colormap = graphics.ArrayPlotter(ax2, score).get_cmap() -def draw_colorbar(axes, array1, array2, colormap, - orient=None, title=None): + +def draw_colorbar(axes, array1, array2, colormap, orient=None, title=None): df1 = array1 df2 = array2 cmp = colormap ax = axes orientation = orient label = title - + # custom Formtatter for tick labels on the colorbar def fmt(x, pos): - a, b = '{:.1e}'.format(x).split('e') + a, b = "{:.1e}".format(x).split("e") b = int(b) - return r'${}\cdot10^{{{}}}$'.format(a, b) - - vmiA = df1['comb_signal'].min() - vmiB = df2['comb_signal'].min() - vmxA = df1['comb_signal'].max() - vmxB = df2['comb_signal'].max() - - # The normalization of this colormap needs to be consistent with the - # data trasnformtion used earlier on this example. The "cubic" law: - norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA,vmiB), - vmax=max(vmxA,vmxB)) - - fig = mpl.pyplot.figure() - return fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmp), - cax=ax, orientation=orientation, label=label, - format=mpl.ticker.FuncFormatter(fmt)) - -# Draw the colorbar -cbar = draw_colorbar(ax2, dfa, dfb, colormap, orient='horizontal', - title='Fluorescence Intensity [AU]') + return r"${}\cdot10^{{{}}}$".format(a, b) + + vmiA = df1["comb_signal"].min() + vmiB = df2["comb_signal"].min() + vmxA = df1["comb_signal"].max() + vmxB = df2["comb_signal"].max() + + # The normalization of this colormap needs to be consistent with the + # data trasnformtion used earlier on this example. The "cubic" law: + norm = mpl.colors.PowerNorm(gamma=0.33, vmin=min(vmiA, vmiB), vmax=max(vmxA, vmxB)) + + fig = mpl.pyplot.figure() + return fig.colorbar( + mpl.cm.ScalarMappable(norm=norm, cmap=cmp), + cax=ax, + orientation=orientation, + label=label, + format=mpl.ticker.FuncFormatter(fmt), + ) + + +# Draw the colorbar +cbar = draw_colorbar( + ax2, dfa, dfb, colormap, orient="horizontal", title="Fluorescence Intensity [AU]" +) # To improve readability we tilt the ticklabels on the colorbar labels = cbar.ax.get_xticklabels() -plt.setp(labels, rotation=45, horizontalalignment='center') +plt.setp(labels, rotation=45, horizontalalignment="center") plt.show() ######################################################################## # References # ---------- # -# .. footbibliography:: \ No newline at end of file +# .. footbibliography:: diff --git a/doc/examples/scripts/sequence/homology/residue_coevolution.py b/doc/examples/scripts/sequence/homology/residue_coevolution.py index e1f2f7329..e84b59f2f 100644 --- a/doc/examples/scripts/sequence/homology/residue_coevolution.py +++ b/doc/examples/scripts/sequence/homology/residue_coevolution.py @@ -43,22 +43,21 @@ # License: BSD 3 clause import warnings -import numpy as np -import matplotlib.pyplot as plt import matplotlib.colors as colors +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx -import biotite.sequence.align as align -import biotite.sequence.graphics as graphics import biotite.application.blast as blast import biotite.application.clustalo as clustalo import biotite.database.rcsb as rcsb - +import biotite.sequence.align as align +import biotite.sequence.graphics as graphics +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx # Get structure and sequence pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif")) -sequence = pdbx.get_sequence(pdbx_file)['A'] +sequence = pdbx.get_sequence(pdbx_file)["A"] # 'use_author_fields' is set to false, # to ensure that values in the 'res_id' annotation point to the sequence structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) @@ -88,16 +87,24 @@ # Plot MSA number_functions = [] for start in hit_starts: + def some_func(x, start=start): return x + start + number_functions.append(some_func) fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.gca() graphics.plot_alignment_type_based( - ax, alignment, symbols_per_line=len(alignment), labels=hit_ids, - symbol_size=8, number_size=8, label_size=8, - show_numbers=True, number_functions=number_functions, - color_scheme="flower" + ax, + alignment, + symbols_per_line=len(alignment), + labels=hit_ids, + symbol_size=8, + number_size=8, + label_size=8, + show_numbers=True, + number_functions=number_functions, + color_scheme="flower", ) ax.set_title("C-Myb R1-like sequences") fig.tight_layout() @@ -111,6 +118,7 @@ def some_func(x, start=start): # High values indicate that the residues at the respective two # positions have coevolved. + def mutual_information_zscore(alignment, n_shuffle=100): codes = align.get_codes(alignment).T alph = alignment.sequences[0].alphabet @@ -127,6 +135,7 @@ def mutual_information_zscore(alignment, n_shuffle=100): z_score = (mi - mean) / std return z_score + def _shuffle(codes): shuffled_codes = codes.copy() # Shuffle each alignment column @@ -134,6 +143,7 @@ def _shuffle(codes): np.random.shuffle(shuffled_codes[i]) return shuffled_codes + def _mutual_information(codes, alph): mi = np.zeros((len(alignment), len(alignment))) # Iterate over all columns to choose first column @@ -147,10 +157,10 @@ def _mutual_information(codes, alph): # Iterate over all symbols in both columns for k in range(codes.shape[1]): # Skip rows where either column has a gap - if codes[i,k] != -1 and codes[j,k] != -1: - marginal_counts_i[codes[i,k]] += 1 - marginal_counts_j[codes[j,k]] += 1 - combined_counts[codes[i,k], codes[j,k]] += 1 + if codes[i, k] != -1 and codes[j, k] != -1: + marginal_counts_i[codes[i, k]] += 1 + marginal_counts_j[codes[j, k]] += 1 + combined_counts[codes[i, k], codes[j, k]] += 1 nrows += 1 marginal_probs_i = marginal_counts_i / nrows marginal_probs_j = marginal_counts_j / nrows @@ -159,27 +169,31 @@ def _mutual_information(codes, alph): with warnings.catch_warnings(): warnings.simplefilter("ignore") mi_before_sum = ( - combined_probs * np.log2( - combined_probs / ( - marginal_probs_i[:, np.newaxis] * - marginal_probs_j[np.newaxis, :] + combined_probs + * np.log2( + combined_probs + / ( + marginal_probs_i[:, np.newaxis] + * marginal_probs_j[np.newaxis, :] ) ) ).flatten() - mi[i,j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)]) + mi[i, j] = np.sum(mi_before_sum[~np.isnan(mi_before_sum)]) return mi # Remove alignment columns that have a gap in the C-Myb sequence -alignment = alignment[alignment.trace[:,0] != -1] +alignment = alignment[alignment.trace[:, 0] != -1] mi = mutual_information_zscore(alignment) # Create the color map for the plot color = colors.to_rgb(biotite.colors["dimorange"]) cmap_val = np.stack( - [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) + for i in range(len(color)) + ] ).transpose() cmap = colors.ListedColormap(cmap_val) @@ -196,4 +210,4 @@ def _mutual_information(codes, alph): fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py index a88d24284..b8c8ad276 100644 --- a/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py +++ b/doc/examples/scripts/sequence/homology/thca_synthase_polymorphism.py @@ -23,29 +23,28 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.colors import LinearSegmentedColormap +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.io.genbank as gb -import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.clustalo as clustalo - +import biotite.sequence.io.genbank as gb # Search for DNA sequences that belong to the cited article -query = entrez.SimpleQuery("Forensic Sci. Int.", "Journal") \ - & entrez.SimpleQuery("159", "Volume") \ - & entrez.SimpleQuery("132-140", "Page Number") +query = ( + entrez.SimpleQuery("Forensic Sci. Int.", "Journal") + & entrez.SimpleQuery("159", "Volume") + & entrez.SimpleQuery("132-140", "Page Number") +) uids = entrez.search(query, db_name="nuccore") # Download and read file containing the Genbank records for the THCA # synthase genes -multi_file = gb.MultiFile.read(entrez.fetch_single_file( - uids, file_name=None, db_name="nuccore", ret_type="gb" -)) +multi_file = gb.MultiFile.read( + entrez.fetch_single_file(uids, file_name=None, db_name="nuccore", ret_type="gb") +) # This dictionary maps the strain ID to the protein sequence @@ -81,6 +80,7 @@ for sequence in sequences.values(): assert len(sequence) == seq_len + # Create consensus sequences for the drug-type and fiber-type cannabis # strains def create_consensus(sequences): @@ -89,9 +89,7 @@ def create_consensus(sequences): for seq_pos in range(seq_len): # Count the number of occurrences of each amino acid # at the given sequence position - counts = np.bincount( - [sequence.code[seq_pos] for sequence in sequences] - ) + counts = np.bincount([sequence.code[seq_pos] for sequence in sequences]) # The consensus amino acid is the most frequent amino acid consensus_code[seq_pos] = np.argmax(counts) # Create empty ProteinSequence object... @@ -101,6 +99,7 @@ def create_consensus(sequences): consensus_sequence.code = consensus_code return consensus_sequence + drug_type_consensus = create_consensus( [sequences[strain] for strain in (1, 10, 13, 20, 53, 54)] ) @@ -120,7 +119,8 @@ def create_consensus(sequences): # At low similarity the symbols are colored red, # at high similarity the symbols are colored white cmap = LinearSegmentedColormap.from_list( - "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)] + "custom", + colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)], # ^ reddish ^ white ) @@ -128,11 +128,16 @@ def create_consensus(sequences): ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, symbols_per_line=50, + ax, + alignment, + matrix=matrix, + symbols_per_line=50, labels=["Drug-type", "Fiber-type"], - show_numbers=True, cmap=cmap, symbol_size=8 + show_numbers=True, + cmap=cmap, + symbol_size=8, ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py index 64d67f2f7..400497ef4 100644 --- a/doc/examples/scripts/sequence/misc/blosum_dendrogram.py +++ b/doc/examples/scripts/sequence/misc/blosum_dendrogram.py @@ -10,12 +10,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.sequence as seq import biotite.sequence.align as align -import biotite.sequence.phylo as phylo import biotite.sequence.graphics as graphics +import biotite.sequence.phylo as phylo # Obtain BLOSUM62 matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -31,11 +31,12 @@ matrix = align.SubstitutionMatrix( seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]), seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]), - matrix.score_matrix()[:-4, :-4] + matrix.score_matrix()[:-4, :-4], ) similarities = matrix.score_matrix() print(matrix) + ######################################################################## # Now a function must be defined, that converts the similarity depicted # by a substitution matrix into a distance required by the UPGMA method. @@ -45,25 +46,26 @@ # # Finally the obtained (phylogenetic) tree is plotted as dendrogram. def get_distance(similarities, i, j): - s_max = (similarities[i,i] + similarities[j,j]) / 2 - return s_max - similarities[i,j] + s_max = (similarities[i, i] + similarities[j, j]) / 2 + return s_max - similarities[i, j] + distances = np.zeros(similarities.shape) for i in range(distances.shape[0]): for j in range(distances.shape[1]): - distances[i,j] = get_distance(similarities, i, j) + distances[i, j] = get_distance(similarities, i, j) tree = phylo.upgma(distances) fig = plt.figure(figsize=(8.0, 5.0)) ax = fig.add_subplot(111) # Use the 3-letter amino acid code aa label -labels = [seq.ProteinSequence.convert_letter_1to3(letter).capitalize() - for letter in matrix.get_alphabet1()] -graphics.plot_dendrogram( - ax, tree, orientation="top", labels=labels -) +labels = [ + seq.ProteinSequence.convert_letter_1to3(letter).capitalize() + for letter in matrix.get_alphabet1() +] +graphics.plot_dendrogram(ax, tree, orientation="top", labels=labels) ax.set_ylabel("Distance") # Add grid for clearer distance perception ax.yaxis.grid(color="lightgray") -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/codon_usage.py b/doc/examples/scripts/sequence/misc/codon_usage.py index dd6963c24..e6d7b888d 100644 --- a/doc/examples/scripts/sequence/misc/codon_usage.py +++ b/doc/examples/scripts/sequence/misc/codon_usage.py @@ -35,14 +35,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import tempfile import itertools +import tempfile import numpy as np +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta -import biotite.database.entrez as entrez - +import biotite.sequence.io.genbank as gb # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( @@ -56,8 +55,8 @@ # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { - codon: 0 for codon - in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3) ) + codon: 0 + for codon in itertools.product(*([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) @@ -82,7 +81,7 @@ # Iterate over the sequence in non-overlapping frames of 3 # and count the occurence of each codon for i in range(0, len(cds_seq), 3): - codon_code = tuple(cds_seq.code[i:i+3]) + codon_code = tuple(cds_seq.code[i : i + 3]) codon_counter[codon_code] += 1 # Convert the total frequencies into relative frequencies @@ -165,4 +164,4 @@ # Print the contents of the created FASTA file print(fasta_file) # In a real application it would be written onto the hard drive via -# fasta_file.write("some_file.fasta") \ No newline at end of file +# fasta_file.write("some_file.fasta") diff --git a/doc/examples/scripts/sequence/misc/color_schemes.py b/doc/examples/scripts/sequence/misc/color_schemes.py index de2dd80ad..b84542932 100644 --- a/doc/examples/scripts/sequence/misc/color_schemes.py +++ b/doc/examples/scripts/sequence/misc/color_schemes.py @@ -8,57 +8,65 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq -import biotite.sequence.graphics as graphics -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.gridspec import GridSpec from matplotlib.patches import Rectangle +import biotite.sequence as seq +import biotite.sequence.graphics as graphics + def plot_colors(ax, alphabet): - x_space=0.1 - y_space=0.3 + x_space = 0.1 + y_space = 0.3 scheme_names = sorted(graphics.list_color_scheme_names(alphabet)) scheme_names.reverse() - schemes = [graphics.get_color_scheme(name, alphabet) - for name in scheme_names] + schemes = [graphics.get_color_scheme(name, alphabet) for name in scheme_names] for i, scheme in enumerate(schemes): for j, color in enumerate(scheme): - box = Rectangle((j - 0.5 + x_space/2, i - 0.5 + y_space/2), - 1 - x_space, 1 - y_space, color=color, - linewidth=0) + box = Rectangle( + (j - 0.5 + x_space / 2, i - 0.5 + y_space / 2), + 1 - x_space, + 1 - y_space, + color=color, + linewidth=0, + ) ax.add_patch(box) ax.set_xticks(np.arange(len(alphabet))) ax.set_yticks(np.arange(len(schemes))) ax.set_xticklabels([symbol for symbol in alphabet]) ax.set_yticklabels(scheme_names) - ax.set_xlim(-0.5, len(alphabet)-0.5) - ax.set_ylim(-0.5, len(schemes)-0.5) + ax.set_xlim(-0.5, len(alphabet) - 0.5) + ax.set_ylim(-0.5, len(schemes) - 0.5) ax.spines["left"].set_visible(False) ax.spines["right"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["top"].set_visible(False) - ax.xaxis.set_ticks_position("none") + ax.xaxis.set_ticks_position("none") ax.yaxis.set_ticks_position("none") + nuc_alphabet = seq.NucleotideSequence.alphabet_amb prot_alphabet = seq.ProteinSequence.alphabet pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") figure = plt.figure(figsize=(8.0, 5.0)) gs = GridSpec( - 3, 1, - height_ratios=[len(graphics.list_color_scheme_names(alphabet)) - for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)], + 3, + 1, + height_ratios=[ + len(graphics.list_color_scheme_names(alphabet)) + for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet) + ], ) -ax = figure.add_subplot(gs[0,0]) +ax = figure.add_subplot(gs[0, 0]) ax.set_title("Nucleotide color schemes") plot_colors(ax, nuc_alphabet) -ax = figure.add_subplot(gs[1,0]) +ax = figure.add_subplot(gs[1, 0]) ax.set_title("Protein color schemes") plot_colors(ax, prot_alphabet) -ax = figure.add_subplot(gs[2,0]) +ax = figure.add_subplot(gs[2, 0]) ax.set_title("Protein block color schemes") plot_colors(ax, pb_alphabet) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/color_schemes_protein.py b/doc/examples/scripts/sequence/misc/color_schemes_protein.py index dd7978aa5..a747c2c74 100644 --- a/doc/examples/scripts/sequence/misc/color_schemes_protein.py +++ b/doc/examples/scripts/sequence/misc/color_schemes_protein.py @@ -9,7 +9,7 @@ - **clustalx** - Default color scheme of the *ClustalX* software - Color schemes generated with the software *Gecos* :footcite:`Kunzmann2020`: - + - **flower** - Light color scheme, based on *BLOSUM62* - **blossom** - Light color scheme with high contrast, based on *BLOSUM62*, depicts symbol similarity worse than *flower* @@ -21,12 +21,12 @@ scheme, based on *BLOSUM62* - **ocean** - Blue shifted, light color scheme, based on *BLOSUM62* - + - Color schemes adapted from *JalView* :footcite:`Clamp2004`: - + - **zappo** - Color scheme that depicts physicochemical properties - **taylor** - Color scheme invented by Willie Taylor - - **buried** - Color scheme depicting the *buried index* + - **buried** - Color scheme depicting the *buried index* - **hydrophobicity** - Color scheme depicting hydrophobicity - **prophelix** - Color scheme depicting secondary structure propensities @@ -42,24 +42,23 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt from matplotlib.gridspec import GridSpec +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez - +import biotite.sequence.io.fasta as fasta # Generate example alignment # (the same as in the bacterial luciferase example) -query = entrez.SimpleQuery("luxA", "Gene Name") \ - & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") +query = entrez.SimpleQuery("luxA", "Gene Name") & entrez.SimpleQuery( + "srcdb_swiss-prot", "Properties" +) uids = entrez.search(query, db_name="protein") -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - uids, None, db_name="protein", ret_type="fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(uids, None, db_name="protein", ret_type="fasta") +) sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] matrix = align.SubstitutionMatrix.std_protein_matrix() alignment, order, _, _ = align.align_multiple(sequences, matrix) @@ -70,10 +69,22 @@ # Get color scheme names alphabet = seq.ProteinSequence.alphabet schemes = [ - "rainbow", "clustalx", - "flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean", - "zappo", "taylor", "buried", "hydrophobicity", - "prophelix", "propstrand", "propturn" + "flower", + "blossom", + "spring", + "wither", + "autumn", + "sunset", + "ocean", + "rainbow", + "clustalx", + "zappo", + "taylor", + "buried", + "hydrophobicity", + "prophelix", + "propstrand", + "propturn", ] count = len(schemes) # Assert that this example displays all available amino acid color schemes @@ -82,20 +93,24 @@ # Visualize each scheme using the example alignment -fig = plt.figure(figsize=(8.0, count*2.0)) +fig = plt.figure(figsize=(8.0, count * 2.0)) gridspec = GridSpec(2, count) for i, name in enumerate(schemes): for j, color_symbols in enumerate([False, True]): - ax = fig.add_subplot(count, 2, 2*i + j + 1) + ax = fig.add_subplot(count, 2, 2 * i + j + 1) if j == 0: ax.set_ylabel(name) alignment_part = alignment[:40] else: alignment_part = alignment[40:] graphics.plot_alignment_type_based( - ax, alignment_part, symbols_per_line=len(alignment_part), - color_scheme=name, color_symbols=color_symbols, symbol_size=8 + ax, + alignment_part, + symbols_per_line=len(alignment_part), + color_scheme=name, + color_symbols=color_symbols, + symbol_size=8, ) fig.tight_layout() fig.subplots_adjust(wspace=0) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py index 88d0eb4ad..aa3fd533e 100644 --- a/doc/examples/scripts/sequence/misc/local_alignment_statistics.py +++ b/doc/examples/scripts/sequence/misc/local_alignment_statistics.py @@ -22,25 +22,23 @@ # License: BSD 3 clause import matplotlib.pyplot as plt -from matplotlib.lines import Line2D import numpy as np +from matplotlib.lines import Line2D from scipy.stats import linregress import biotite +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align -from biotite.sequence.align.alignment import score -import biotite.sequence.io.fasta as fasta -import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics - +import biotite.sequence.io.fasta as fasta GAP_PENALTY = (-12, -1) # Download and parse protein sequences of avidin and streptavidin -fasta_file = fasta.FastaFile.read(entrez.fetch_single_file( - ["CAC34569", "ACL82594"], None, "protein", "fasta" -)) +fasta_file = fasta.FastaFile.read( + entrez.fetch_single_file(["CAC34569", "ACL82594"], None, "protein", "fasta") +) for name, sequence in fasta_file.items(): if "CAC34569" in name: query_seq = seq.ProteinSequence(sequence) @@ -54,8 +52,7 @@ # Perform pairwise sequence alignment with affine gap penalty # Terminal gaps are not penalized alignment = align.align_optimal( - query_seq, hit_seq, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + query_seq, hit_seq, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] @@ -64,8 +61,12 @@ fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=matrix, labels=["Avidin (query)", "Database hit"], - show_numbers=True, show_line_position=True + ax, + alignment, + matrix=matrix, + labels=["Avidin (query)", "Database hit"], + show_numbers=True, + show_line_position=True, ) fig.tight_layout() @@ -103,10 +104,12 @@ # # f(x) = \lambda t(x) e^{-t(x)} + # The probability density function of the extreme value distribution -def pdf(x, l, u): - t = np.exp(-l * (x - u)) - return l * t * np.exp(-t) +def pdf(x, lam, u): + t = np.exp(-lam * (x - u)) + return lam * t * np.exp(-t) + x = np.linspace(-5, 10, 1000) y = pdf(x, 1, 0) @@ -124,7 +127,7 @@ def pdf(x, l, u): # .. math:: # # u = \frac{\ln Kmn}{\lambda}, -# +# # where :math:`m` and :math:`n` are the lengths of the aligned # sequences. # :math:`K` and :math:`\lambda` can be calculated from the substitution @@ -166,32 +169,39 @@ def pdf(x, l, u): SAMPLE_SIZE = 10000 SEQ_LENGTH = 300 -BACKGROUND = np.array(list({ - "A": 35155, - "C": 8669, - "D": 24161, - "E": 28354, - "F": 17367, - "G": 33229, - "H": 9906, - "I": 23161, - "K": 25872, - "L": 40625, - "M": 10101, - "N": 20212, - "P": 23435, - "Q": 19208, - "R": 23105, - "S": 32070, - "T": 26311, - "V": 29012, - "W": 5990, - "Y": 14488, - "B": 0, - "Z": 0, - "X": 0, - "*": 0, -}.values())) / 450431 +BACKGROUND = ( + np.array( + list( + { + "A": 35155, + "C": 8669, + "D": 24161, + "E": 28354, + "F": 17367, + "G": 33229, + "H": 9906, + "I": 23161, + "K": 25872, + "L": 40625, + "M": 10101, + "N": 20212, + "P": 23435, + "Q": 19208, + "R": 23105, + "S": 32070, + "T": 26311, + "V": 29012, + "W": 5990, + "Y": 14488, + "B": 0, + "Z": 0, + "X": 0, + "*": 0, + }.values() + ) + ) + / 450431 +) # Generate the sequence code for random sequences @@ -199,7 +209,7 @@ def pdf(x, l, u): random_sequence_code = np.random.choice( np.arange(len(seq.ProteinSequence.alphabet)), size=(SAMPLE_SIZE, 2, SEQ_LENGTH), - p=BACKGROUND + p=BACKGROUND, ) # Sample alignment scores @@ -207,11 +217,10 @@ def pdf(x, l, u): for i in range(SAMPLE_SIZE): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_alignment = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] sample_scores[i] = sample_alignment.score @@ -231,25 +240,24 @@ def pdf(x, l, u): # respectively. # Use method of moments to estimate distribution parameters -l = np.pi / np.sqrt(6 * np.var(sample_scores)) -u = np.mean(sample_scores) - np.euler_gamma / l +lam = np.pi / np.sqrt(6 * np.var(sample_scores)) +u = np.mean(sample_scores) - np.euler_gamma / lam # Score frequencies for the histogram freqs = np.bincount(sample_scores) / SAMPLE_SIZE # Coordinates for the fit -x = np.linspace(0, len(freqs)-1, 1000) -y = pdf(x, l, u) +x = np.linspace(0, len(freqs) - 1, 1000) +y = pdf(x, lam, u) fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.scatter( - np.arange(len(freqs)), freqs, color=biotite.colors["dimorange"], - label="Sample", s=8 + np.arange(len(freqs)), freqs, color=biotite.colors["dimorange"], label="Sample", s=8 ) ax.plot(x, y, color="gray", linestyle="--", label="Fit") ax.set_xlabel("Similarity score") ax.set_ylabel("Probability") -ax.set_xlim(0, len(freqs)-1) +ax.set_xlim(0, len(freqs) - 1) ax.legend(loc="upper left") fig.tight_layout() @@ -281,35 +289,33 @@ def pdf(x, l, u): SAMPLE_SIZE_PER_LENGTH = 1000 # The sequence lengths to be sampled -length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE) \ - .astype(int) +length_samples = np.logspace(*np.log10(LENGTH_RANGE), LENGTH_SAMPLE_SIZE).astype(int) u_series = np.zeros(LENGTH_SAMPLE_SIZE) -l_series = np.zeros(LENGTH_SAMPLE_SIZE) +lam_series = np.zeros(LENGTH_SAMPLE_SIZE) for i, length in enumerate(length_samples): # The same procedure from above random_sequence_code = np.random.choice( np.arange(len(seq.ProteinSequence.alphabet)), size=(SAMPLE_SIZE_PER_LENGTH, 2, length), - p=BACKGROUND + p=BACKGROUND, ) scores = np.zeros(SAMPLE_SIZE_PER_LENGTH, dtype=int) for j in range(SAMPLE_SIZE_PER_LENGTH): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[j,0] - seq2.code = random_sequence_code[j,1] + seq1.code = random_sequence_code[j, 0] + seq2.code = random_sequence_code[j, 1] sample_alignment = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0] scores[j] = sample_alignment.score - l_series[i] = np.pi / np.sqrt(6 * np.var(scores)) - u_series[i] = np.mean(scores) - np.euler_gamma / l_series[i] + lam_series[i] = np.pi / np.sqrt(6 * np.var(scores)) + u_series[i] = np.mean(scores) - np.euler_gamma / lam_series[i] ######################################################################## -# Now we use a linear fit of :math:`u` to check if there is a linear +# Now we use a linear fit of :math:`u` to check if there is a linear # relation. # Furthermore, if this is true, the slope and intercept of # the fit should give us a more precise estimation of :math:`\lambda` @@ -319,39 +325,37 @@ def pdf(x, l, u): slope, intercept, r, _, _ = linregress(ln_mn, u_series) # More precise parameter estimation from fit -l = 1/slope -k = np.exp(intercept * l) +lam = 1 / slope +k = np.exp(intercept * lam) # Coordinates for fit x_fit = np.linspace(0, 16, 100) y_fit = slope * x_fit + intercept fig, ax = plt.subplots(figsize=(8.0, 4.0)) -arrowprops = dict( - facecolor='black', shrink=0.1, width=3, headwidth=10, headlength=10 -) +arrowprops = dict(facecolor="black", shrink=0.1, width=3, headwidth=10, headlength=10) ax.scatter(ln_mn, u_series, color=biotite.colors["dimorange"], s=8) ax.plot(x_fit, y_fit, color=biotite.colors["darkorange"], linestyle="--") x_annot = 12 ax.annotate( f"R² = {r**2:.3f}\nK = {k:.3f}", - xy = (x_annot, slope * x_annot + intercept), - xytext = (-100, 50), - textcoords = "offset pixels", - arrowprops = arrowprops, + xy=(x_annot, slope * x_annot + intercept), + xytext=(-100, 50), + textcoords="offset pixels", + arrowprops=arrowprops, ) ax2 = ax.twinx() -ax2.scatter(ln_mn, l_series, color=biotite.colors["lightgreen"], s=8) -ax2.axhline(l, color=biotite.colors["darkgreen"], linestyle=":") +ax2.scatter(ln_mn, lam_series, color=biotite.colors["lightgreen"], s=8) +ax2.axhline(lam, color=biotite.colors["darkgreen"], linestyle=":") x_annot = 2 ax2.annotate( - f"λ = {l:.3f}", - xy = (x_annot, l), - xytext = (0, -50), - textcoords = "offset pixels", - arrowprops = arrowprops, + f"λ = {lam:.3f}", + xy=(x_annot, lam), + xytext=(0, -50), + textcoords="offset pixels", + arrowprops=arrowprops, ) ax.set_xlabel("ln(mn)") @@ -361,17 +365,25 @@ def pdf(x, l, u): ax.set_ylim(0, 50) ax2.set_ylim(0, 0.6) ax.legend( - handles = [ + handles=[ Line2D( - [0], [0], color=biotite.colors["dimorange"], label='u', - marker='o', linestyle="None" + [0], + [0], + color=biotite.colors["dimorange"], + label="u", + marker="o", + linestyle="None", ), Line2D( - [0], [0], color=biotite.colors["lightgreen"], label='λ', - marker='o', linestyle="None" - ) + [0], + [0], + color=biotite.colors["lightgreen"], + label="λ", + marker="o", + linestyle="None", + ), ], - loc = "upper left" + loc="upper left", ) fig.tight_layout() @@ -398,17 +410,17 @@ def pdf(x, l, u): # E-value calculation # ------------------- # -# Finally, we can use the estimated parameters to calculate the E-value +# Finally, we can use the estimated parameters to calculate the E-value # of the alignment of interest. # In this case we use :math:`K` and :math:`\lambda` from the linear fit, # but as already indicated we could alternatively use the parameters # from sampling alignments of sequences at a single length :math:`n`. # While :math:`\lambda` is a direct result of the method of moments as -# shown above, :math:`K` is calculated as +# shown above, :math:`K` is calculated as # # .. math:: # -# K = \frac{e^{\lambda u}}{n^2} +# K = \frac{e^{\lambda u}}{n^2} # # where :math:`n` is the length of both sequences in each sample. # @@ -425,12 +437,12 @@ def pdf(x, l, u): DATABASE_SIZE = 1_000_000 -def e_value(score, length1, length2, k, l): - return k * length1 * length2 * np.exp(-l * score) -e = e_value( - alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, l -) +def e_value(score, length1, length2, k, lam): + return k * length1 * length2 * np.exp(-lam * score) + + +e = e_value(alignment.score, len(query_seq), len(hit_seq) * DATABASE_SIZE, k, lam) print(f"E-value = {e:.2e}") ######################################################################## diff --git a/doc/examples/scripts/sequence/misc/orf_identification.py b/doc/examples/scripts/sequence/misc/orf_identification.py index 6c7d87abd..695b30af8 100644 --- a/doc/examples/scripts/sequence/misc/orf_identification.py +++ b/doc/examples/scripts/sequence/misc/orf_identification.py @@ -16,10 +16,8 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez -import matplotlib.pyplot as plt +import biotite.sequence.io.fasta as fasta # Download Porcine circovirus genome file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta") @@ -29,13 +27,19 @@ proteins, positions = genome.translate() print("Forward strand:") for i in range(len(proteins)): - print("{:4d} - {:4d}: {:}" - .format(positions[i][0], positions[i][1], str(proteins[i]))) + print( + "{:4d} - {:4d}: {:}".format( + positions[i][0], positions[i][1], str(proteins[i]) + ) + ) print("\n") # Perform translation for complementary strand genome_rev = genome.reverse().complement() proteins, positions = genome_rev.translate() print("Reverse strand:") for i in range(len(proteins)): - print("{:5d} - {:5d}: {:}" - .format(positions[i][0], positions[i][1], str(proteins[i]))) \ No newline at end of file + print( + "{:5d} - {:5d}: {:}".format( + positions[i][0], positions[i][1], str(proteins[i]) + ) + ) diff --git a/doc/examples/scripts/sequence/profile/anderson_logo.py b/doc/examples/scripts/sequence/profile/anderson_logo.py index 50b195f56..e516d3b0a 100644 --- a/doc/examples/scripts/sequence/profile/anderson_logo.py +++ b/doc/examples/scripts/sequence/profile/anderson_logo.py @@ -9,33 +9,35 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics # The list of Anderson promoters -seqs = [seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"), - seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"), - seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"), - seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"), - seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"), - seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"), - seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"), - seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"), - seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"), - seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"), - seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"), - seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")] +seqs = [ + seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("tttacagctagctcagtcctaggtattatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtactgtgctagc"), + seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctaggtattgtgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"), + seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"), + seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"), + seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"), + seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"), + seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"), + seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"), + seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"), + seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"), + seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"), + seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc"), +] # Sequences do not need to be aligned # -> Create alignment with trivial trace # [[0 0 0 ...] @@ -43,18 +45,18 @@ # [2 2 2 ...] # ... ] alignment = align.Alignment( - sequences = seqs, - trace = np.tile(np.arange(len(seqs[0])), len(seqs)) \ - .reshape(len(seqs), len(seqs[0])) \ - .transpose(), - score = 0 + sequences=seqs, + trace=np.tile(np.arange(len(seqs[0])), len(seqs)) + .reshape(len(seqs), len(seqs[0])) + .transpose(), + score=0, ) # Create sequence logo from alignment fig = plt.figure(figsize=(8.0, 1.5)) ax = fig.add_subplot(111) profile = seq.SequenceProfile.from_alignment(alignment) -graphics.plot_sequence_logo(ax, profile) +graphics.plot_sequence_logo(ax, profile, scheme="rainbow") # Remove the entire frame ax.axis("off") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/profile/rbs_identification.py b/doc/examples/scripts/sequence/profile/rbs_identification.py index acb30ecc1..fd58280bb 100644 --- a/doc/examples/scripts/sequence/profile/rbs_identification.py +++ b/doc/examples/scripts/sequence/profile/rbs_identification.py @@ -16,17 +16,15 @@ # License: BSD 3 clause import tempfile -import numpy as np import matplotlib.pyplot as plt -from matplotlib.patches import Patch import matplotlib.ticker as ticker +import numpy as np +from matplotlib.patches import Patch import biotite +import biotite.database.entrez as entrez import biotite.sequence as seq -import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.muscle as muscle - +import biotite.sequence.io.genbank as gb UTR_LENGTH = 20 @@ -58,18 +56,15 @@ # CDS is on if loc.strand == seq.Location.Strand.FORWARD: utr_start = loc.first - UTR_LENGTH - utr_stop = loc.first + utr_stop = loc.first # Include the start codon (3 bases) in the UTRs for later # visualization - utrs.append( - bl21_genome[utr_start : utr_stop + 3].sequence - ) + utrs.append(bl21_genome[utr_start : utr_stop + 3].sequence) else: utr_start = loc.last + 1 - utr_stop = loc.last + 1 + UTR_LENGTH + utr_stop = loc.last + 1 + UTR_LENGTH utrs.append( - bl21_genome[utr_start - 3 : utr_stop].sequence \ - .reverse().complement() + bl21_genome[utr_start - 3 : utr_stop].sequence.reverse().complement() ) @@ -82,14 +77,15 @@ frequencies[np.arange(len(utr)), utr.code] += 1 profile = seq.SequenceProfile( - symbols = frequencies, - gaps = np.zeros(len(frequencies)), - alphabet = bl21_genome.sequence.alphabet + symbols=frequencies, + gaps=np.zeros(len(frequencies)), + alphabet=bl21_genome.sequence.alphabet, ) ### Visualize the profile + # Spend extra effort for correct sequence postion labels def normalize_seq_pos(x): """ @@ -103,15 +99,17 @@ def normalize_seq_pos(x): x -= 1 return x + @ticker.FuncFormatter def sequence_loc_formatter(x, pos): x = normalize_seq_pos(x) return f"{x:+}" + COLOR_SCHEME = [ - biotite.colors["lightgreen"], # A - biotite.colors["orange"], # C - biotite.colors["dimgreen"], # G + biotite.colors["lightgreen"], # A + biotite.colors["orange"], # C + biotite.colors["dimgreen"], # G biotite.colors["brightorange"], # T ] @@ -127,11 +125,14 @@ def sequence_loc_formatter(x, pos): ax.set_ylabel("Conservation (Bits)") ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) -ax.legend(loc="upper left", handles=[ - Patch(color=biotite.colors["green"], label="Purine"), - Patch(color=biotite.colors["lightorange"], label="Pyrimidine"), -]) +ax.legend( + loc="upper left", + handles=[ + Patch(color=biotite.colors["green"], label="Purine"), + Patch(color=biotite.colors["lightorange"], label="Pyrimidine"), + ], +) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/sequence/sequencing/gene_counts.py b/doc/examples/scripts/sequence/sequencing/gene_counts.py index 6dd4bed65..6fa73b0c1 100644 --- a/doc/examples/scripts/sequence/sequencing/gene_counts.py +++ b/doc/examples/scripts/sequence/sequencing/gene_counts.py @@ -19,21 +19,20 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from io import StringIO import functools -import multiprocessing import gzip -import numpy as np +import multiprocessing +from io import StringIO import matplotlib.pyplot as plt +import numpy as np import pandas as pd import requests import biotite +import biotite.application.sra as sra import biotite.sequence as seq +import biotite.sequence.align as align import biotite.sequence.io.fasta as fasta import biotite.sequence.io.fastq as fastq -import biotite.sequence.align as align -import biotite.application.sra as sra - # The number of processes for read mapping N_PROCESS = 2 @@ -93,6 +92,7 @@ # extracts the gene symbols, i.e. the 'names' of the genes, and the # corresponding cDNA sequences. + def get_gene_symbol(header): fields = header.split() for field in fields: @@ -103,6 +103,7 @@ def get_gene_symbol(header): # No gene symbol for this cDNA (e.g. non-coding) return None + response = requests.get(CDNA_URL) fasta_content = gzip.decompress(response.content).decode("UTF-8") @@ -123,9 +124,7 @@ def get_gene_symbol(header): # The k-mer code in restricted to int64, so a larger number # of base alphabet codes decreases the *k* that fits into # the integer type - sequences.append( - seq.NucleotideSequence(seq_string, ambiguous=False) - ) + sequences.append(seq.NucleotideSequence(seq_string, ambiguous=False)) except seq.AlphabetError: # For the simplicity of this example just ignore sequences # with unambiguous symbols @@ -172,13 +171,10 @@ def get_gene_symbol(header): base_alph = seq.NucleotideSequence.alphabet_unamb kmer_alph = align.KmerAlphabet(base_alph, K) -min_selector = align.MinimizerSelector( - kmer_alph, WINDOW, align.RandomPermutation() -) +min_selector = align.MinimizerSelector(kmer_alph, WINDOW, align.RandomPermutation()) kmer_table = align.BucketKmerTable.from_kmer_selection( - kmer_alph, - *zip(*[min_selector.select(sequence) for sequence in sequences]) + kmer_alph, *zip(*[min_selector.select(sequence) for sequence in sequences]) ) ######################################################################## @@ -202,6 +198,7 @@ def get_gene_symbol(header): # After all alignments have been collected, simply the highest-scoring # one is chosen as the *correct* one. + def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): try: read = seq.NucleotideSequence(read_string, ambiguous=False) @@ -226,10 +223,13 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): ( gene_i, align.align_banded( - read, gene_sequences[gene_i], substitution_matrix, + read, + gene_sequences[gene_i], + substitution_matrix, band=(diagonal - BAND_WIDTH, diagonal + BAND_WIDTH), - gap_penalty= -10, max_number=1 - )[0] + gap_penalty=-10, + max_number=1, + )[0], ) for gene_i, diagonal in zip(matched_gene_indices, matched_diagonals) ] @@ -243,9 +243,9 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): substitution_matrix = align.SubstitutionMatrix.std_nucleotide_matrix() -for i, (_, (seq_string, q)) in enumerate(fastq.FastqFile.read_iter( - fastq_path, offset="Sanger" -)): +for i, (_, (seq_string, q)) in enumerate( + fastq.FastqFile.read_iter(fastq_path, offset="Sanger") +): # For demonstration only a single clean read is mapped if i == 3: read_string = seq_string @@ -266,10 +266,11 @@ def map_read(read_string, kmer_table, gene_sequences, substitution_matrix): # However, for the large number of reads which can be then processed in # parallel, it is still worth it. + def read_iter(fastq_path): - for i, (_, (read_string, quality)) in enumerate(fastq.FastqFile.read_iter( - fastq_path, offset="Sanger" - )): + for i, (_, (read_string, quality)) in enumerate( + fastq.FastqFile.read_iter(fastq_path, offset="Sanger") + ): # For the purpose of this example only a faction of the reads # are processed to save computation time if i >= EXCERPT: @@ -279,21 +280,24 @@ def read_iter(fastq_path): continue yield read_string + with multiprocessing.Pool(processes=N_PROCESS) as p: # Use multiprocessing to map reads to genes # and remove non-mappable reads (None values) afterwards - mapping_results = list(filter( - lambda mapping: mapping is not None, - p.map( - functools.partial( - map_read, - kmer_table=kmer_table, - gene_sequences=sequences, - substitution_matrix=substitution_matrix + mapping_results = list( + filter( + lambda mapping: mapping is not None, + p.map( + functools.partial( + map_read, + kmer_table=kmer_table, + gene_sequences=sequences, + substitution_matrix=substitution_matrix, + ), + read_iter(fastq_path), ), - read_iter(fastq_path) ) - )) + ) ######################################################################## # Now the genes are counted: @@ -324,7 +328,7 @@ def read_iter(fastq_path): # Put into dataframe for prettier printing counts = pd.DataFrame( {"gene_symbol": ranked_gene_symbols, "count": ranked_counts}, - index = np.arange(1, len(ranked_counts) + 1) + index=np.arange(1, len(ranked_counts) + 1), ) # Show Top N @@ -335,10 +339,7 @@ def read_iter(fastq_path): # Finally the top expressed genes are plotted. figure, ax = plt.subplots(figsize=(8.0, 6.0), constrained_layout=True) -ax.barh( - top_counts["gene_symbol"], top_counts["count"], - color=biotite.colors["orange"] -) +ax.barh(top_counts["gene_symbol"], top_counts["count"], color=biotite.colors["orange"]) ax.invert_yaxis() ax.set_title(f"Top {N_TOP_LIST} expressed genes", weight="semibold") ax.set_xlabel("Counts") @@ -348,4 +349,4 @@ def read_iter(fastq_path): # References # ---------- # -# .. footbibliography:: \ No newline at end of file +# .. footbibliography:: diff --git a/doc/examples/scripts/sequence/sequencing/genome_assembly.py b/doc/examples/scripts/sequence/sequencing/genome_assembly.py index bf5474fd9..29ab83656 100644 --- a/doc/examples/scripts/sequence/sequencing/genome_assembly.py +++ b/doc/examples/scripts/sequence/sequencing/genome_assembly.py @@ -1,4 +1,4 @@ -""" +r""" Comparative genome assembly =========================== @@ -48,21 +48,20 @@ import itertools import tempfile from concurrent.futures import ProcessPoolExecutor -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D +import numpy as np from matplotlib.colors import LinearSegmentedColormap +from matplotlib.lines import Line2D import biotite +import biotite.application.sra as sra +import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.align as align +import biotite.sequence.graphics as graphics import biotite.sequence.io as seqio import biotite.sequence.io.fasta as fasta import biotite.sequence.io.fastq as fastq import biotite.sequence.io.genbank as gb -import biotite.sequence.graphics as graphics -import biotite.database.entrez as entrez -import biotite.application.sra as sra - # Download the sequencing data app = sra.FastqDumpApp("SRR13453793") @@ -73,8 +72,9 @@ # There is only one read per spot file_path = app.get_file_paths()[0] fastq_file = fastq.FastqFile.read(file_path, offset="Sanger") -reads = [seq.NucleotideSequence(seq_str) - for seq_str, score_array in fastq_file.values()] +reads = [ + seq.NucleotideSequence(seq_str) for seq_str, score_array in fastq_file.values() +] score_arrays = [score_array for seq_str, score_array in fastq_file.values()] print(f"Number of reads: {len(reads)}") @@ -93,7 +93,8 @@ length_ax.hist( [len(score_array) for score_array in score_arrays], - bins=np.logspace(1, 5, N_BINS), color="gray" + bins=np.logspace(1, 5, N_BINS), + color="gray", ) length_ax.set_xlabel("Read length") length_ax.set_ylabel("Number of reads") @@ -102,7 +103,8 @@ score_ax.hist( [np.mean(score_array) for score_array in score_arrays], - bins=N_BINS, color="gray", + bins=N_BINS, + color="gray", ) score_ax.set_xlim(0, 30) score_ax.set_xlabel("Phred score") @@ -134,8 +136,10 @@ fig, ax = plt.subplots(figsize=(8.0, 4.0)) ax.fill_between( # Value in megabases -> 1e-6 - np.arange(len(score_histogram)), score_histogram * 1e-6, - linewidth=0, color="gray" + np.arange(len(score_histogram)), + score_histogram * 1e-6, + linewidth=0, + color="gray", ) ax.set_xlim( np.min(np.where(score_histogram > 0)[0]), @@ -166,15 +170,14 @@ # Download and read the reference SARS-CoV-2 genome orig_genome_file = entrez.fetch( - "NC_045512", tempfile.gettempdir(), "gb", - db_name="Nucleotide", ret_type="gb" + "NC_045512", tempfile.gettempdir(), "gb", db_name="Nucleotide", ret_type="gb" ) orig_genome = seqio.load_sequence(orig_genome_file) # Create complementary reads -compl_reads = list(itertools.chain( - *[(read, read.reverse(False).complement()) for read in reads] -)) +compl_reads = list( + itertools.chain(*[(read, read.reverse(False).complement()) for read in reads]) +) ######################################################################## # To map the reads to their corresponding positions in the reference @@ -239,19 +242,27 @@ read_length = len(compl_reads[INDEX]) # Find the correct diagonal for the example read -diagonals = matches[:,2] - matches[:,0] +diagonals = matches[:, 2] - matches[:, 0] diag, counts = np.unique(diagonals, return_counts=True) correct_diagonal = diag[np.argmax(counts)] # Visualize the matches and the correct diagonal fig, ax = plt.subplots(figsize=(8.0, 8.0)) ax.scatter( - matches[:,0], matches[:,2], - s=4, marker="o", color=biotite.colors["dimorange"], label="Match" + matches[:, 0], + matches[:, 2], + s=4, + marker="o", + color=biotite.colors["dimorange"], + label="Match", ) ax.plot( - [0, read_length], [correct_diagonal, read_length+correct_diagonal], - linestyle=":", linewidth=1.0, color="black", label="Correct diagonal" + [0, read_length], + [correct_diagonal, read_length + correct_diagonal], + linestyle=":", + linewidth=1.0, + color="black", + label="Correct diagonal", ) ax.set_xlim(0, read_length) ax.set_xlabel("Read position") @@ -263,7 +274,7 @@ # Find the correct diagonal for all reads correct_diagonals = [None] * len(all_matches) for i, matches in enumerate(all_matches): - diagonals = matches[:,2] - matches[:,0] + diagonals = matches[:, 2] - matches[:, 0] unqiue_diag, counts = np.unique(diagonals, return_counts=True) if len(unqiue_diag) == 0: # If no match is found for this sequence, ignore this sequence @@ -325,23 +336,28 @@ matrix = align.SubstitutionMatrix.std_nucleotide_matrix() + def map_sequence(read, diag): deviation = int(3 * np.sqrt(len(read) * P_INDEL)) if diag is None: return None else: return align.align_banded( - read, orig_genome, matrix, gap_penalty=-10, - band = (diag - deviation, diag + deviation), - max_number = 1 + read, + orig_genome, + matrix, + gap_penalty=-10, + band=(diag - deviation, diag + deviation), + max_number=1, )[0] + # Each process can be quite memory consuming # -> Cap to two processes to make it work on low-RAM commodity hardware with ProcessPoolExecutor(max_workers=2) as executor: - alignments = list(executor.map( - map_sequence, compl_reads, correct_diagonals, chunksize=1000 - )) + alignments = list( + executor.map(map_sequence, compl_reads, correct_diagonals, chunksize=1000) + ) ######################################################################## # Now we have to select for each read, whether the original or @@ -351,18 +367,25 @@ def map_sequence(read, diag): for_alignments = [alignments[i] for i in range(0, len(alignments), 2)] rev_alignments = [alignments[i] for i in range(1, len(alignments), 2)] -scores = np.stack(( - [ali.score if ali is not None else 0 for ali in for_alignments], - [ali.score if ali is not None else 0 for ali in rev_alignments] -),axis=-1) +scores = np.stack( + ( + [ali.score if ali is not None else 0 for ali in for_alignments], + [ali.score if ali is not None else 0 for ali in rev_alignments], + ), + axis=-1, +) correct_sense = np.argmax(scores, axis=-1) -correct_alignments = [for_a if sense == 0 else rev_a for for_a, rev_a, sense - in zip(for_alignments, rev_alignments, correct_sense)] +correct_alignments = [ + for_a if sense == 0 else rev_a + for for_a, rev_a, sense in zip(for_alignments, rev_alignments, correct_sense) +] # If we use a reverse complementary read, # we also need to reverse the Phred score arrays -correct_score_arrays = [score if sense == 0 else score[::-1] for score, sense - in zip(score_arrays, correct_sense)] +correct_score_arrays = [ + score if sense == 0 else score[::-1] + for score, sense in zip(score_arrays, correct_sense) +] ######################################################################## # Now we know for each read where its corresponding position on the @@ -371,12 +394,8 @@ def map_sequence(read, diag): # Eventually, we visualize the mapping. # Find genome positions for the starts and ends of all reads -starts = np.array( - [ali.trace[ 0, 1] for ali in correct_alignments if ali is not None] -) -stops = np.array( - [ali.trace[-1, 1] for ali in correct_alignments if ali is not None] -) +starts = np.array([ali.trace[0, 1] for ali in correct_alignments if ali is not None]) +stops = np.array([ali.trace[-1, 1] for ali in correct_alignments if ali is not None]) # For a nicer plot sort these by their start position order = np.argsort(starts) starts = starts[order] @@ -384,13 +403,17 @@ def map_sequence(read, diag): fig, ax = plt.subplots(figsize=(8.0, 12.0)) ax.barh( - np.arange(len(starts)), left=starts, width=stops-starts, height=1, - color=biotite.colors["dimgreen"], linewidth=0 + np.arange(len(starts)), + left=starts, + width=stops - starts, + height=1, + color=biotite.colors["dimgreen"], + linewidth=0, ) -ax.set_ylim(0, len(starts)+1) -ax.spines['top'].set_visible(False) -ax.spines['right'].set_visible(False) -ax.spines['left'].set_visible(False) +ax.set_ylim(0, len(starts) + 1) +ax.spines["top"].set_visible(False) +ax.spines["right"].set_visible(False) +ax.spines["left"].set_visible(False) ax.tick_params(left=False, labelleft=False) ax.set_xlabel("Sequence position") ax.set_title("Read mappings to reference genome") @@ -479,24 +502,21 @@ def map_sequence(read, diag): if alignment is not None: trace = alignment.trace - no_gap_trace = trace[(trace[:,0] != -1) & (trace[:,1] != -1)] + no_gap_trace = trace[(trace[:, 0] != -1) & (trace[:, 1] != -1)] # Get the sequence code for the aligned read symbols - seq_code = alignment.sequences[0].code[no_gap_trace[:,0]] + seq_code = alignment.sequences[0].code[no_gap_trace[:, 0]] # The sequence code contains the integers 0 - 3; # one for each possible base # Hence, we can use these integers directly to index the second # dimension of the Pred score sum # The index for the first dimension contains simply the genome # positions taken from the alignment trace - phred_sum[no_gap_trace[:,1], seq_code] \ - += score_array[no_gap_trace[:,0]] + phred_sum[no_gap_trace[:, 1], seq_code] += score_array[no_gap_trace[:, 0]] - sequencing_depth[ - trace[0,1] : trace[-1,1] - ] += 1 + sequencing_depth[trace[0, 1] : trace[-1, 1]] += 1 - read_gap_trace = trace[trace[:,0] == -1] - deletion_number[read_gap_trace[:,1]] += 1 + read_gap_trace = trace[trace[:, 0] == -1] + deletion_number[read_gap_trace[:, 1]] += 1 # Call the most probable base for each genome position according to the # formula above @@ -504,23 +524,21 @@ def map_sequence(read, diag): # Visualize the sequencing depth and score sum over the genome -max_phred_sum = phred_sum[ - np.arange(len(phred_sum)), most_probable_symbol_codes -] +max_phred_sum = phred_sum[np.arange(len(phred_sum)), most_probable_symbol_codes] + def moving_average(data_set, window_size): - weights = np.full(window_size, 1/window_size) - return np.convolve(data_set, weights, mode='valid') + weights = np.full(window_size, 1 / window_size) + return np.convolve(data_set, weights, mode="valid") + fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot( - moving_average(max_phred_sum, 100), - color="lightgray", linewidth=1.0 -) +ax.plot(moving_average(max_phred_sum, 100), color="lightgray", linewidth=1.0) ax2 = ax.twinx() ax2.plot( moving_average(sequencing_depth, 100), - color=biotite.colors["dimorange"], linewidth=1.0 + color=biotite.colors["dimorange"], + linewidth=1.0, ) ax.axhline(0, color="silver", linewidth=0.5) ax.set_xlim(0, len(orig_genome)) @@ -528,10 +546,9 @@ def moving_average(data_set, window_size): ax.set_ylabel("Phred score sum") ax2.set_ylabel("Sequencing depth") ax.legend( - [Line2D([0], [0], color=c) - for c in ("lightgray", biotite.colors["dimorange"])], + [Line2D([0], [0], color=c) for c in ("lightgray", biotite.colors["dimorange"])], ["Phred score sum", "Sequencing depth"], - loc="upper left" + loc="upper left", ) fig.tight_layout() @@ -551,14 +568,13 @@ def moving_average(data_set, window_size): var_genome.code = most_probable_symbol_codes # A deletion is called, if either enough reads include this deletion # or the sequence position is not covered by any read at all -deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \ - | (sequencing_depth == 0) +deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) | ( + sequencing_depth == 0 +) var_genome = var_genome[~deletion_mask] # Write the assembled genome into a FASTA file out_file = fasta.FastaFile() -fasta.set_sequence( - out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True -) +fasta.set_sequence(out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True) out_file.write(tempfile.NamedTemporaryFile("w")) ######################################################################## @@ -578,10 +594,13 @@ def moving_average(data_set, window_size): BAND_WIDTH = 1000 genome_alignment = align.align_banded( - var_genome, orig_genome, matrix, - band=(-BAND_WIDTH//2, BAND_WIDTH//2), max_number=1 + var_genome, + orig_genome, + matrix, + band=(-BAND_WIDTH // 2, BAND_WIDTH // 2), + max_number=1, )[0] -identity = align.get_sequence_identity(genome_alignment, 'all') +identity = align.get_sequence_identity(genome_alignment, "all") print(f"Sequence identity: {identity * 100:.2f} %") ######################################################################## @@ -599,9 +618,9 @@ def moving_average(data_set, window_size): # Calculate the sequence identity within each bin bin_identities = np.zeros(N_BINS) -edges = np.linspace(0, len(orig_genome), N_BINS+1) +edges = np.linspace(0, len(orig_genome), N_BINS + 1) for i, (bin_start, bin_stop) in enumerate(zip(edges[:-1], edges[1:])): - orig_genome_trace = genome_alignment.trace[:,1] + orig_genome_trace = genome_alignment.trace[:, 1] excerpt = genome_alignment[ (orig_genome_trace >= bin_start) & (orig_genome_trace < bin_stop) ] @@ -612,9 +631,11 @@ def moving_average(data_set, window_size): # Plot the deviation = 1 - sequence identity deviation_ax.bar( - edges[:-1], width=(edges[1:]-edges[:-1]), + edges[:-1], + width=(edges[1:] - edges[:-1]), height=(1 - bin_identities), - color=biotite.colors["dimorange"], align="edge" + color=biotite.colors["dimorange"], + align="edge", ) deviation_ax.set_xlim(0, len(orig_genome)) deviation_ax.set_ylabel("1 - Sequence identity") @@ -623,20 +644,24 @@ def moving_average(data_set, window_size): deviation_ax.set_ylim(1e-3, 1e-1) # Plot genmic coordinates of the genes -for i, feature in enumerate(sorted( - annot_seq.annotation, - key=lambda feature: min([loc.first for loc in feature.locs]) -)): +for i, feature in enumerate( + sorted( + annot_seq.annotation, + key=lambda feature: min([loc.first for loc in feature.locs]), + ) +): for loc in feature.locs: feature_ax.barh( - left=loc.first, width=loc.last-loc.first, y=i, height=1, - color=biotite.colors["dimgreen"] + left=loc.first, + width=loc.last - loc.first, + y=i, + height=1, + color=biotite.colors["dimgreen"], ) feature_ax.text( - loc.last + 100, i, feature.qual["gene"], - fontsize=8, ha="left", va="center" + loc.last + 100, i, feature.qual["gene"], fontsize=8, ha="left", va="center" ) -feature_ax.set_ylim(i+0.5, -0.5) +feature_ax.set_ylim(i + 0.5, -0.5) feature_ax.set_xlim(0, len(orig_genome)) feature_ax.xaxis.set_visible(False) feature_ax.yaxis.set_visible(False) @@ -671,17 +696,17 @@ def moving_average(data_set, window_size): # The locations of some notable spike protein regions FEATURES = { # Signal peptide - "SP": ( 1, 12), + "SP": (1, 12), # N-terminal domain - "NTD": ( 14, 303), + "NTD": (14, 303), # Receptor binding domain - "RBD": ( 319, 541), + "RBD": (319, 541), # Fusion peptide - "FP": ( 788, 806), + "FP": (788, 806), # Transmembrane domain - "TM": (1214, 1234), + "TM": (1214, 1234), # Cytoplasmatic tail - "CT": (1269, 1273), + "CT": (1269, 1273), } # Get RNA sequence coding for spike protein from the reference genome @@ -694,11 +719,11 @@ def moving_average(data_set, window_size): alignment = align.align_optimal( var_genome, orig_spike_seq, matrix, local=True, max_number=1 )[0] -var_spike_seq = var_genome[alignment.trace[alignment.trace[:,0] != -1, 0]] +var_spike_seq = var_genome[alignment.trace[alignment.trace[:, 0] != -1, 0]] # Obtain protein sequences from RNA sequences orig_spike_prot_seq = orig_spike_seq.translate(complete=True).remove_stops() -var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops() +var_spike_prot_seq = var_spike_seq.translate(complete=True).remove_stops() # Align both protein sequences with each other for later comparison blosum_matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -712,47 +737,50 @@ def moving_average(data_set, window_size): # Plot alignment cmap = LinearSegmentedColormap.from_list( - "custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)] + "custom", + colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)], # ^ reddish ^ white ) graphics.plot_alignment_similarity_based( - ax, alignment, matrix=blosum_matrix, symbols_per_line=SYMBOLS_PER_LINE, - labels=["B.1.1.7", "Reference"], show_numbers=True, label_size=9, - number_size=9, symbol_size=7, spacing=SPACING, cmap=cmap + ax, + alignment, + matrix=blosum_matrix, + symbols_per_line=SYMBOLS_PER_LINE, + labels=["B.1.1.7", "Reference"], + show_numbers=True, + label_size=9, + number_size=9, + symbol_size=7, + spacing=SPACING, + cmap=cmap, ) ## Add indicator for features to the alignment for row in range(1 + len(alignment) // SYMBOLS_PER_LINE): col_start = SYMBOLS_PER_LINE * row - col_stop = SYMBOLS_PER_LINE * (row + 1) + col_stop = SYMBOLS_PER_LINE * (row + 1) if col_stop > len(alignment): # This happens in the last line col_stop = len(alignment) seq_start = alignment.trace[col_start, 1] - seq_stop = alignment.trace[col_stop-1, 1] + 1 + seq_stop = alignment.trace[col_stop - 1, 1] + 1 n_sequences = len(alignment.sequences) y_base = (n_sequences + SPACING) * row + n_sequences for feature_name, (first, last) in FEATURES.items(): # Zero based sequence indexing - start = first-1 + start = first - 1 # Exclusive stop stop = last if start < seq_stop and stop > seq_start: # The feature is found in this line x_begin = np.clip(start - seq_start, 0, SYMBOLS_PER_LINE) - x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE) + x_end = np.clip(stop - seq_start, 0, SYMBOLS_PER_LINE) x_mean = (x_begin + x_end) / 2 y_line = y_base + 0.3 y_text = y_base + 0.6 - ax.plot( - [x_begin, x_end], [y_line, y_line], - color="black", linewidth=2 - ) - ax.text( - x_mean, y_text, feature_name, - fontsize=8, va="top", ha="center" - ) + ax.plot([x_begin, x_end], [y_line, y_line], color="black", linewidth=2) + ax.text(x_mean, y_text, feature_name, fontsize=8, va="top", ha="center") # Increase y-limit to include the feature indicators in the last line ax.set_ylim(y_text, 0) fig.tight_layout() diff --git a/doc/examples/scripts/sequence/sequencing/quality_control.py b/doc/examples/scripts/sequence/sequencing/quality_control.py index b488f1aeb..f7b769071 100644 --- a/doc/examples/scripts/sequence/sequencing/quality_control.py +++ b/doc/examples/scripts/sequence/sequencing/quality_control.py @@ -20,14 +20,13 @@ # License: BSD 3 clause # sphinx_gallery_thumbnail_number = 2 -import numpy as np -from scipy.stats import binom import matplotlib.pyplot as plt import matplotlib.ticker as ticker +import numpy as np +from scipy.stats import binom import biotite -import biotite.sequence as seq import biotite.application.sra as sra - +import biotite.sequence as seq FIG_SIZE = (8.0, 6.0) @@ -38,12 +37,10 @@ # Each run can have multiple reads per spot # by selecting index 0 we take only the first read for every spot sequences_and_scores = app.get_sequences_and_scores()[0] -sequence_codes = np.stack([ - sequence.code for sequence, _ in sequences_and_scores.values() -]) -scores = np.stack([ - scores for _, scores in sequences_and_scores.values() -]) +sequence_codes = np.stack( + [sequence.code for sequence, _ in sequences_and_scores.values()] +) +scores = np.stack([scores for _, scores in sequences_and_scores.values()]) seq_count = scores.shape[0] seq_length = scores.shape[1] positions = np.arange(1, seq_length + 1) @@ -56,20 +53,18 @@ # For the plot we need the first, second (the median) and third # quartile for each position. -first_quartile, median, third_quartile = np.quantile( - scores, (0.25, 0.5, 0.75), axis=0 -) +first_quartile, median, third_quartile = np.quantile(scores, (0.25, 0.5, 0.75), axis=0) fig, ax = plt.subplots(figsize=FIG_SIZE) ax.bar( positions, - bottom=first_quartile, height=third_quartile-first_quartile, width=1.0, - facecolor=biotite.colors["brightorange"], label="Lower/upper quartile" -) -ax.plot( - positions, median, - color=biotite.colors["dimorange"], label="Median" + bottom=first_quartile, + height=third_quartile - first_quartile, + width=1.0, + facecolor=biotite.colors["brightorange"], + label="Lower/upper quartile", ) +ax.plot(positions, median, color=biotite.colors["dimorange"], label="Median") ax.set_xlim(positions[0], positions[-1]) ax.set_xlabel("Position in read") ax.set_ylabel("Phred score") @@ -92,15 +87,13 @@ fig, ax = plt.subplots(figsize=FIG_SIZE) ax.hist( # Definition range of Sanger Phred scores is 0 to 40 - mean_scores, bins=np.linspace(0, 40, BIN_NUMBER), - color=biotite.colors["lightorange"] + mean_scores, + bins=np.linspace(0, 40, BIN_NUMBER), + color=biotite.colors["lightorange"], ) ax.set_xlabel("Mean Phred score of sequence") ax.set_ylabel("Sequence count") -ax.set_xlim( - np.floor(np.min(mean_scores)), - np.ceil( np.max(mean_scores)) -) +ax.set_xlim(np.floor(np.min(mean_scores)), np.ceil(np.max(mean_scores))) fig.tight_layout() ######################################################################## @@ -115,10 +108,9 @@ # as ambiguous bases might occur in some sequencing datasets alphabet = seq.NucleotideSequence.alphabet_amb -counts = np.stack([ - np.bincount(codes, minlength=len(alphabet)) - for codes in sequence_codes.T -], axis=-1) +counts = np.stack( + [np.bincount(codes, minlength=len(alphabet)) for codes in sequence_codes.T], axis=-1 +) frequencies = counts / seq_count * 100 fig, ax = plt.subplots(figsize=FIG_SIZE) @@ -141,38 +133,30 @@ # distribution. gc_count = np.count_nonzero( - (sequence_codes == alphabet.encode("G")) | - (sequence_codes == alphabet.encode("C")), - axis=1 + (sequence_codes == alphabet.encode("G")) | (sequence_codes == alphabet.encode("C")), + axis=1, ) at_count = np.count_nonzero( - (sequence_codes == alphabet.encode("A")) | - (sequence_codes == alphabet.encode("T")), - axis=1 + (sequence_codes == alphabet.encode("A")) | (sequence_codes == alphabet.encode("T")), + axis=1, ) gc_content = gc_count / (gc_count + at_count) # Exclusive range -> 0 to seq_length inclusive -number_of_gc = np.arange(seq_length+1) -exp_gc_content = binom.pmf( - k=number_of_gc, - n=seq_length, - p=np.mean(gc_content) -) +number_of_gc = np.arange(seq_length + 1) +exp_gc_content = binom.pmf(k=number_of_gc, n=seq_length, p=np.mean(gc_content)) fig, ax = plt.subplots(figsize=FIG_SIZE) # Due to finite sequence length, the distribution is discrete # -> use bar() instead of hist() values, counts = np.unique(gc_content, return_counts=True) bin_width = 100 / seq_length -ax.bar( - values * 100, counts, width=bin_width, - color=biotite.colors["brightorange"] -) +ax.bar(values * 100, counts, width=bin_width, color=biotite.colors["brightorange"]) ax.plot( number_of_gc / seq_length * 100, exp_gc_content * seq_count, - color=biotite.colors["dimorange"], linestyle=":" + color=biotite.colors["dimorange"], + linestyle=":", ) ax.set_xlim(0, 100) ax.set_xlabel("Sequence GC content (%)") @@ -201,11 +185,9 @@ duplications[code] = 1 duplication_level_count = np.bincount(list(duplications.values())) duplication_level_freq = ( - duplication_level_count - * np.arange(len(duplication_level_count)) - / seq_count * 100 + duplication_level_count * np.arange(len(duplication_level_count)) / seq_count * 100 ) -max_duplication = len(duplication_level_count)-1 +max_duplication = len(duplication_level_count) - 1 print("Maximum duplication number:", max_duplication) fig, ax = plt.subplots(figsize=FIG_SIZE) @@ -213,7 +195,7 @@ np.arange(0, len(duplication_level_freq)), duplication_level_freq, width=0.6, - color=biotite.colors["dimorange"] + color=biotite.colors["dimorange"], ) ax.set_xlim(0.5, len(duplication_level_freq) + 0.5) ax.xaxis.set_major_locator(ticker.MaxNLocator(10)) @@ -228,4 +210,4 @@ # Usually one would expect, that most sequences occur only once and the # following duplication numbers become decreasingly likely. # However, in this case we have another peak at around 60 duplications. -# And one read is even repeated astonishing 161 times! \ No newline at end of file +# And one read is even repeated astonishing 161 times! diff --git a/doc/examples/scripts/sequence/sequencing/read_quality.py b/doc/examples/scripts/sequence/sequencing/read_quality.py index 2cc12d492..47b89ddc4 100644 --- a/doc/examples/scripts/sequence/sequencing/read_quality.py +++ b/doc/examples/scripts/sequence/sequencing/read_quality.py @@ -10,13 +10,11 @@ # License: BSD 3 clause from io import StringIO -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.sequence as seq import biotite.sequence.io.fastq as fastq - # Sample FASTQ file from https://en.wikipedia.org/wiki/FASTQ_format fastq_content = StringIO(""" @SEQ_ID @@ -30,8 +28,12 @@ sequence, scores = fastq.get_sequence(fastq_file, "SEQ_ID") figure, ax = plt.subplots(figsize=(8.0, 2.0)) ax.bar( - x=np.arange(len(sequence)), height=scores, color=biotite.colors["orange"], - width=1.0, linewidth=1, edgecolor="white" + x=np.arange(len(sequence)), + height=scores, + color=biotite.colors["orange"], + width=1.0, + linewidth=1, + edgecolor="white", ) # -1 to put space between Y-axis and sequence ax.set_xlim(-1, len(sequence)) @@ -44,6 +46,6 @@ # Show sequence as X-axis ticks ax.set_xticks(np.arange(len(sequence))) ax.set_xticklabels(sequence.symbols) -ax.xaxis.set_ticks_position("none") +ax.xaxis.set_ticks_position("none") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/adjacency_matrix.py b/doc/examples/scripts/structure/contacts/adjacency_matrix.py index 2f5b9594a..d51a423bb 100644 --- a/doc/examples/scripts/structure/contacts/adjacency_matrix.py +++ b/doc/examples/scripts/structure/contacts/adjacency_matrix.py @@ -12,13 +12,12 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap import biotite +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb -import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap - file_name = rcsb.fetch("1aki", "bcif", gettempdir()) array = strucio.load_structure(file_name) @@ -41,4 +40,4 @@ ax.set_ylabel("Residue number") ax.set_title("Adjacency matrix of the lysozyme crystal structure") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/contact_sites.py b/doc/examples/scripts/structure/contacts/contact_sites.py index e7673983c..d4723426b 100644 --- a/doc/examples/scripts/structure/contacts/contact_sites.py +++ b/doc/examples/scripts/structure/contacts/contact_sites.py @@ -14,10 +14,9 @@ # License: BSD 3 clause import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # The maximum distance between an atom in the repressor and an atom in # the DNA for them to be considered 'in contact' @@ -30,15 +29,9 @@ # Separate structure into the DNA and the two identical protein chains -dna = structure[ - np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False) -] -protein_l = structure[ - (structure.chain_id == "L") & (structure.hetero == False) -] -protein_r = structure[ - (structure.chain_id == "R") & (structure.hetero == False) -] +dna = structure[np.isin(structure.chain_id, ["A", "B"]) & ~structure.hetero] +protein_l = structure[(structure.chain_id == "L") & ~structure.hetero] +protein_r = structure[(structure.chain_id == "R") & ~structure.hetero] # Quick check if the two protein chains are really identical assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r)) diff --git a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py index 5e51c9a80..a4eb3622e 100644 --- a/doc/examples/scripts/structure/contacts/contact_sites_pymol.py +++ b/doc/examples/scripts/structure/contacts/contact_sites_pymol.py @@ -1,9 +1,8 @@ +import ammolite import numpy as np from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 550) @@ -15,10 +14,7 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Add bonds to structure and convert to PyMOL structure = structure[~struc.filter_solvent(structure)] @@ -31,32 +27,38 @@ pymol_obj.color("biotite_lightgreen", structure.chain_id == "R") # Set view -ammolite.cmd.set_view(( - -0.044524662, 0.767611504, 0.639355302, - 0.998693943, 0.018437184, 0.047413416, - 0.024606399, 0.640637815, -0.767439663, - 0.000000000, 0.000000000, -115.614288330, - 56.031833649, 23.317802429, 3.761308193, - 73.517341614, 157.711288452, -20.000000000 -)) +ammolite.cmd.set_view( + ( + -0.044524662, + 0.767611504, + 0.639355302, + 0.998693943, + 0.018437184, + 0.047413416, + 0.024606399, + 0.640637815, + -0.767439663, + 0.000000000, + 0.000000000, + -115.614288330, + 56.031833649, + 23.317802429, + 3.761308193, + 73.517341614, + 157.711288452, + -20.000000000, + ) +) # Highlight contacts residue_mask = np.isin(structure.res_id, common_ids) -pymol_obj.show( - "sticks", - np.isin(structure.chain_id, ["L", "R"]) & residue_mask -) -for chain, color in zip( - ("L", "R"), - ("biotite_dimorange","biotite_darkgreen") -): +pymol_obj.show("sticks", np.isin(structure.chain_id, ["L", "R"]) & residue_mask) +for chain, color in zip(("L", "R"), ("biotite_dimorange", "biotite_darkgreen")): pymol_obj.color( color, - (structure.chain_id == chain) & - (structure.atom_name != "CA") & - residue_mask + (structure.chain_id == chain) & (structure.atom_name != "CA") & residue_mask, ) # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/contacts/disulfide_bonds.py b/doc/examples/scripts/structure/contacts/disulfide_bonds.py index 8d99a675d..e87e33647 100644 --- a/doc/examples/scripts/structure/contacts/disulfide_bonds.py +++ b/doc/examples/scripts/structure/contacts/disulfide_bonds.py @@ -19,28 +19,26 @@ import io from tempfile import gettempdir -import numpy as np -import matplotlib.pyplot as plt import matplotlib.patches as patches +import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb -def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, - dihedral=90, dihedral_tol=10): +def detect_disulfide_bonds( + structure, distance=2.05, distance_tol=0.05, dihedral=90, dihedral_tol=10 +): # Array where detected disulfide bonds are stored disulfide_bonds = [] # A mask that selects only S-gamma atoms of cysteins - sulfide_mask = (structure.res_name == "CYS") & \ - (structure.atom_name == "SG") + sulfide_mask = (structure.res_name == "CYS") & (structure.atom_name == "SG") # sulfides in adjacency to other sulfides are detected in an # efficient manner via a cell list cell_list = struc.CellList( - structure, - cell_size=distance+distance_tol, - selection=sulfide_mask + structure, cell_size=distance + distance_tol, selection=sulfide_mask ) # Iterate over every index corresponding to an S-gamma atom for sulfide_i in np.where(sulfide_mask)[0]: @@ -65,31 +63,34 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, # For dihedral angle measurement the corresponding # C-beta atoms are required, too cb1 = structure[ - (structure.chain_id == sg1.chain_id) & - (structure.res_id == sg1.res_id) & - (structure.atom_name == "CB") + (structure.chain_id == sg1.chain_id) + & (structure.res_id == sg1.res_id) + & (structure.atom_name == "CB") ] cb2 = structure[ - (structure.chain_id == sg2.chain_id) & - (structure.res_id == sg2.res_id) & - (structure.atom_name == "CB") + (structure.chain_id == sg2.chain_id) + & (structure.res_id == sg2.res_id) + & (structure.atom_name == "CB") ] # Measure distance and dihedral angle and check criteria bond_dist = struc.distance(sg1, sg2) bond_dihed = np.abs(np.rad2deg(struc.dihedral(cb1, sg1, sg2, cb2))) - if bond_dist > distance - distance_tol and \ - bond_dist < distance + distance_tol and \ - bond_dihed > dihedral - dihedral_tol and \ - bond_dihed < dihedral + dihedral_tol: - # Atom meet criteria -> we found a disulfide bond - # -> the indices of the bond S-gamma atoms - # are put into a tuple with the lower index first - bond_tuple = sorted((sulfide_i, sulfide_j)) - # Add bond to list of bonds, but each bond only once - if bond_tuple not in disulfide_bonds: - disulfide_bonds.append(bond_tuple) + if ( + bond_dist > distance - distance_tol + and bond_dist < distance + distance_tol + and bond_dihed > dihedral - dihedral_tol + and bond_dihed < dihedral + dihedral_tol + ): + # Atom meet criteria -> we found a disulfide bond + # -> the indices of the bond S-gamma atoms + # are put into a tuple with the lower index first + bond_tuple = sorted((sulfide_i, sulfide_j)) + # Add bond to list of bonds, but each bond only once + if bond_tuple not in disulfide_bonds: + disulfide_bonds.append(bond_tuple) return np.array(disulfide_bonds, dtype=int) + ######################################################################## # As test case a structure of a *cysteine knot* protein is used, # specifically the squash trypsin inhibitor *EETI-II* @@ -104,19 +105,15 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, # For later verification that the implemented function works correctly, # the disulfide bonds, that are removed, are printed out. -pdbx_file = pdbx.BinaryCIFFile.read( - rcsb.fetch("2IT7", "bcif", gettempdir()) -) +pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2IT7", "bcif", gettempdir())) knottin = pdbx.get_structure(pdbx_file, include_bonds=True, model=1) -sulfide_indices = np.where( - (knottin.res_name == "CYS") & (knottin.atom_name == "SG") -)[0] +sulfide_indices = np.where((knottin.res_name == "CYS") & (knottin.atom_name == "SG"))[0] for i, j, _ in knottin.bonds.as_array(): if i in sulfide_indices and j in sulfide_indices: print(knottin[i]) print(knottin[j]) print() - knottin.bonds.remove_bond(i,j) + knottin.bonds.remove_bond(i, j) ######################################################################## # Now the sanitized structure is put into the disulfide detection @@ -143,13 +140,11 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, figure = plt.figure(figsize=(4.0, 1.0)) ax = figure.gca() MARGIN = 0.2 -ax.set_xlim(1-MARGIN, len(sequence)+MARGIN) -ax.set_ylim(0, 1+MARGIN) -ax.set_xticks(np.arange(1, len(sequence)+1)) +ax.set_xlim(1 - MARGIN, len(sequence) + MARGIN) +ax.set_ylim(0, 1 + MARGIN) +ax.set_xticks(np.arange(1, len(sequence) + 1)) ax.set_xticklabels(str(sequence)) -ax.yaxis.set_tick_params( - left=False, right=False, labelleft=False, labelright=False -) +ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False) ax.xaxis.set_tick_params( bottom=True, top=False, labelbottom=True, labeltop=False, width=0 ) @@ -161,10 +156,16 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, ellipse_width = sg2_res_id - sg1_res_id # Height is 2 instead of 1, # because only the upper half of the ellipse is visible - ax.add_patch(patches.Ellipse( - xy=(ellipse_center, 0), width=ellipse_width, height=2, - facecolor="None", edgecolor="gold", linewidth=2 - )) + ax.add_patch( + patches.Ellipse( + xy=(ellipse_center, 0), + width=ellipse_width, + height=2, + facecolor="None", + edgecolor="gold", + linewidth=2, + ) + ) figure.tight_layout() ######################################################################## @@ -180,4 +181,4 @@ def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05, pdbx.set_structure(out_file, knottin) out_file.write(io.BytesIO()) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/domain_hbonds.py b/doc/examples/scripts/structure/contacts/domain_hbonds.py index 03bbb75d7..93550583a 100644 --- a/doc/examples/scripts/structure/contacts/domain_hbonds.py +++ b/doc/examples/scripts/structure/contacts/domain_hbonds.py @@ -15,10 +15,9 @@ from tempfile import gettempdir import matplotlib.pyplot as plt import biotite +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb - file_name = rcsb.fetch("2KB1", "bcif", gettempdir()) stack = strucio.load_structure(file_name) @@ -35,19 +34,22 @@ # Create names of bonds label = "{d_resid}{d_resnm}-{d_a} -- {a_resid}{a_resnm}-{a_a}" -names = [label.format( - d_resid=chain_a.res_id[donor], - d_resnm=chain_a.res_name[donor], - d_a=chain_a.atom_name[donor], - a_resid=chain_a.res_id[acceptor], - a_resnm=chain_a.res_name[acceptor], - a_a=chain_a.atom_name[acceptor] - ) for donor, _, acceptor in triplets] - -plt.subplots(figsize=(11,4.5)) +names = [ + label.format( + d_resid=chain_a.res_id[donor], + d_resnm=chain_a.res_name[donor], + d_a=chain_a.atom_name[donor], + a_resid=chain_a.res_id[acceptor], + a_resnm=chain_a.res_name[acceptor], + a_a=chain_a.atom_name[acceptor], + ) + for donor, _, acceptor in triplets +] + +plt.subplots(figsize=(11, 4.5)) plt.bar(names, freq, color=biotite.colors["orange"]) plt.xlabel("Hydrogen bond") plt.ylabel("Hydrogen bond frequency") plt.xticks(rotation=45) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/contacts/leaflet.py b/doc/examples/scripts/structure/contacts/leaflet.py index 41c184e69..1a8655ecd 100644 --- a/doc/examples/scripts/structure/contacts/leaflet.py +++ b/doc/examples/scripts/structure/contacts/leaflet.py @@ -21,10 +21,10 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from tempfile import NamedTemporaryFile import warnings -import numpy as np +from tempfile import NamedTemporaryFile import networkx as nx +import numpy as np import biotite.structure as struc import biotite.structure.io as strucio @@ -33,8 +33,7 @@ PDB_FILE_PATH = "../../../download/dppc_n128.pdb" -def find_leaflets(structure, head_atom_mask, - cutoff_distance=15.0, periodic=False): +def find_leaflets(structure, head_atom_mask, cutoff_distance=15.0, periodic=False): """ Identify which lipids molecules belong to the same lipid bilayer leaflet. @@ -64,28 +63,29 @@ def find_leaflets(structure, head_atom_mask, """ cell_list = struc.CellList( - structure, cell_size=cutoff_distance, selection=head_atom_mask, - periodic=periodic + structure, + cell_size=cutoff_distance, + selection=head_atom_mask, + periodic=periodic, ) adjacency_matrix = cell_list.create_adjacency_matrix(cutoff_distance) graph = nx.Graph(adjacency_matrix) - head_leaflets = [sorted(c) for c in nx.connected_components(graph) - # A leaflet cannot consist of a single lipid - # This also removes all entries - # for atoms not in 'head_atom_mask' - if len(c) > 1] + head_leaflets = [ + sorted(c) + for c in nx.connected_components(graph) + # A leaflet cannot consist of a single lipid + # This also removes all entries + # for atoms not in 'head_atom_mask' + if len(c) > 1 + ] # 'leaflets' contains indices to head atoms # Broadcast each head atom index to all atoms in its corresponding # residue - leaflet_masks = np.empty( - (len(head_leaflets), structure.array_length()), - dtype=bool - ) + leaflet_masks = np.empty((len(head_leaflets), structure.array_length()), dtype=bool) for i, head_leaflet in enumerate(head_leaflets): - leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet) \ - .any(axis=0) + leaflet_masks[i] = struc.get_residue_masks(structure, head_leaflet).any(axis=0) return leaflet_masks @@ -100,7 +100,7 @@ def find_leaflets(structure, head_atom_mask, # periodicity should not matter leaflets = find_leaflets( structure, - head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P") + head_atom_mask=(structure.res_name == "DPP") & (structure.atom_name == "P"), ) # Bilayer -> Expect two leaflets assert len(leaflets) == 2 diff --git a/doc/examples/scripts/structure/contacts/leaflet_pymol.py b/doc/examples/scripts/structure/contacts/leaflet_pymol.py index 7678e53d5..59b2e98a0 100644 --- a/doc/examples/scripts/structure/contacts/leaflet_pymol.py +++ b/doc/examples/scripts/structure/contacts/leaflet_pymol.py @@ -1,9 +1,8 @@ +import ammolite import numpy as np from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 700) @@ -14,15 +13,10 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Remove hydrogen and water and convert to PyMOL -structure = structure[ - (structure.element != "H") & (structure.res_name != "TIP") -] +structure = structure[(structure.element != "H") & (structure.res_name != "TIP")] structure.bonds = struc.connect_via_distances(structure) pymol_obj = ammolite.PyMOLObject.from_structure(structure) @@ -33,16 +27,13 @@ # Configure lipid heads pymol_obj.color( - "biotite_darkgreen", - (structure.chain_id == "A") & (structure.atom_name == "P") + "biotite_darkgreen", (structure.chain_id == "A") & (structure.atom_name == "P") ) pymol_obj.color( - "biotite_dimorange", - (structure.chain_id == "B") & (structure.atom_name == "P") + "biotite_dimorange", (structure.chain_id == "B") & (structure.atom_name == "P") ) pymol_obj.show( - "spheres", - np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P") + "spheres", np.isin(structure.chain_id, ("A", "B")) & (structure.atom_name == "P") ) # Adjust camera @@ -52,4 +43,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/biological_assembly.py b/doc/examples/scripts/structure/misc/biological_assembly.py index ee9fe79ab..83586dcca 100644 --- a/doc/examples/scripts/structure/misc/biological_assembly.py +++ b/doc/examples/scripts/structure/misc/biological_assembly.py @@ -38,11 +38,10 @@ # License: BSD 3 clause from tempfile import NamedTemporaryFile +import biotite.database.rcsb as rcsb import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("3J31", "bcif")) @@ -77,4 +76,4 @@ # Visualization with PyMOL... # sphinx_gallery_ammolite_script = "biological_assembly_pymol.py" -temp.close() \ No newline at end of file +temp.close() diff --git a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py index 377143fbd..3175b4eb9 100644 --- a/doc/examples/scripts/structure/misc/biological_assembly_pymol.py +++ b/doc/examples/scripts/structure/misc/biological_assembly_pymol.py @@ -1,8 +1,6 @@ -import numpy as np -import matplotlib.pyplot as plt -import biotite.structure as struc import ammolite - +import matplotlib.pyplot as plt +import numpy as np PNG_SIZE = (1000, 1000) @@ -21,4 +19,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/diameter.py b/doc/examples/scripts/structure/misc/diameter.py index 2f7154cd8..e428ccbd3 100644 --- a/doc/examples/scripts/structure/misc/diameter.py +++ b/doc/examples/scripts/structure/misc/diameter.py @@ -11,9 +11,10 @@ from tempfile import gettempdir import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb + def get_diameter(pdb_id): file_name = rcsb.fetch(pdb_id, "bcif", gettempdir()) @@ -24,10 +25,11 @@ def get_diameter(pdb_id): # Calculate all pairwise difference vectors diff = coord[:, np.newaxis, :] - coord[np.newaxis, :, :] # Calculate absolute of difference vectors -> square distances - sq_dist = np.sum(diff*diff, axis=-1) + sq_dist = np.sum(diff * diff, axis=-1) # Maximum distance is diameter diameter = np.sqrt(np.max(sq_dist)) return diameter + # Example application -print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom") \ No newline at end of file +print("Diameter of 1QAW:", get_diameter("1QAW"), "Angstrom") diff --git a/doc/examples/scripts/structure/misc/gap_bars.py b/doc/examples/scripts/structure/misc/gap_bars.py index 1ec9eb343..55024fd4e 100644 --- a/doc/examples/scripts/structure/misc/gap_bars.py +++ b/doc/examples/scripts/structure/misc/gap_bars.py @@ -16,11 +16,12 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb import matplotlib.pyplot as plt -from matplotlib.patches import Rectangle import numpy as np +from matplotlib.patches import Rectangle +import biotite.database.rcsb as rcsb +import biotite.structure.io as strucio + def plot_gaps(pdb_id, chain_id, ax): # Download and parse structure file @@ -32,7 +33,7 @@ def plot_gaps(pdb_id, chain_id, ax): states = np.zeros(atom_array.res_id[-1], dtype=int) for i in range(len(states)): # Get array for only one residue ID - residue = atom_array[atom_array.res_id == i+1] + residue = atom_array[atom_array.res_id == i + 1] if len(residue) == 0: # not existing states[i] = 0 @@ -52,7 +53,7 @@ def plot_gaps(pdb_id, chain_id, ax): curr_start = i curr_state = states[i] else: - if states[i] != states[i-1]: + if states[i] != states[i - 1]: state_intervals.append((curr_start, i, curr_state)) curr_start = i curr_state = states[i] @@ -69,8 +70,11 @@ def plot_gaps(pdb_id, chain_id, ax): color = "gold" elif state == 2: color = "forestgreen" - ax.add_patch(Rectangle((start+1-0.5, 0), stop-start, 1, - edgecolor="None", facecolor=color)) + ax.add_patch( + Rectangle( + (start + 1 - 0.5, 0), stop - start, 1, edgecolor="None", facecolor=color + ) + ) # Some other visual stuff ax.spines["left"].set_visible(False) ax.spines["bottom"].set_visible(False) @@ -88,6 +92,6 @@ def plot_gaps(pdb_id, chain_id, ax): ax = fig.add_subplot(212) ax.set_title("5w1r", loc="left") plot_gaps("5w1r", "A", ax) -ax.set_xlabel("$Residue \ number$") +ax.set_xlabel(r"$Residue \ number$") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/misc/glycan_visualization.py b/doc/examples/scripts/structure/misc/glycan_visualization.py index 43bcccaf0..bd55ca1f7 100644 --- a/doc/examples/scripts/structure/misc/glycan_visualization.py +++ b/doc/examples/scripts/structure/misc/glycan_visualization.py @@ -18,21 +18,21 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D import networkx as nx +import numpy as np +from matplotlib.lines import Line2D from networkx.drawing.nx_pydot import graphviz_layout +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # Adapted from "Mol*" Software # The dictionary maps residue names of saccharides to their common names SACCHARIDE_NAMES = { - res_name : common_name for common_name, res_names in [ + res_name: common_name + for common_name, res_names in [ ("Glc", ["GLC", "BGC", "Z8T", "TRE", "MLR"]), ("Man", ["MAN", "BMA"]), ("Gal", ["GLA", "GAL", "GZL", "GXL", "GIV"]), @@ -112,62 +112,51 @@ "All": ("o", "purple"), "Tal": ("o", "lightsteelblue"), "Ido": ("o", "chocolate"), - "GlcNAc": ("s", "royalblue"), "ManNAc": ("s", "forestgreen"), "GalNAc": ("s", "gold"), "GulNAc": ("s", "darkorange"), "AllNAc": ("s", "purple"), "IdoNAc": ("s", "chocolate"), - "GlcN": ("1", "royalblue"), "ManN": ("1", "forestgreen"), "GalN": ("1", "gold"), - "GlcA": ("v", "royalblue"), "ManA": ("v", "forestgreen"), "GalA": ("v", "gold"), "GulA": ("v", "darkorange"), "TalA": ("v", "lightsteelblue"), "IdoA": ("v", "chocolate"), - "Qui": ("^", "royalblue"), "Rha": ("^", "forestgreen"), "6dGul": ("^", "darkorange"), "Fuc": ("^", "crimson"), - "QuiNAc": ("P", "royalblue"), "FucNAc": ("P", "crimson"), - "Oli": ("X", "royalblue"), "Tyv": ("X", "forestgreen"), "Abe": ("X", "darkorange"), "Par": ("X", "pink"), "Dig": ("X", "purple"), - "Ara": ("*", "forestgreen"), "Lyx": ("*", "gold"), "Xyl": ("*", "darkorange"), "Rib": ("*", "pink"), - "Kdn": ("D", "forestgreen"), "Neu5Ac": ("D", "mediumvioletred"), "Neu5Gc": ("D", "turquoise"), - "LDManHep": ("H", "forestgreen"), "Kdo": ("H", "gold"), "DDManHep": ("H", "pink"), "MurNAc": ("H", "purple"), "Mur": ("H", "chocolate"), - "Api": ("p", "royalblue"), "Fru": ("p", "forestgreen"), "Tag": ("p", "gold"), "Sor": ("p", "darkorange"), "Psi": ("p", "pink"), - # Default representation - None: ("h", "black") + None: ("h", "black"), } ######################################################################### @@ -222,19 +211,22 @@ bonds = structure.bonds.as_array()[:, :2] # Convert indices pointing to connected atoms to indices pointing to the # starting atom of the respective residue -connected = struc.get_residue_starts_for( - structure, bonds.flatten() -).reshape(bonds.shape) +connected = struc.get_residue_starts_for(structure, bonds.flatten()).reshape( + bonds.shape +) # Omit bonds within the same residue -connected = connected[connected[:,0] != connected[:,1]] +connected = connected[connected[:, 0] != connected[:, 1]] # Add the residue connections to the graph graph.add_edges_from(connected) fig, ax = plt.subplots(figsize=(8.0, 8.0)) nx.draw( - graph, ax=ax, node_size=10, - node_color=["crimson" if is_glycan[atom_i] else "royalblue" - for atom_i in graph.nodes()] + graph, + ax=ax, + node_size=10, + node_color=[ + "crimson" if is_glycan[atom_i] else "royalblue" for atom_i in graph.nodes() + ], ) ######################################################################## @@ -260,7 +252,8 @@ # Get connected subgraphs containing glycans # -> any subgraph with more than one node glycan_graphs = [ - graph.subgraph(nodes).copy() for nodes in nx.connected_components(graph) + graph.subgraph(nodes).copy() + for nodes in nx.connected_components(graph) if len(nodes) > 1 ] @@ -297,14 +290,14 @@ # almost always an atom index that is lower than the saccharides # attached to it glycan_graph = nx.DiGraph( - [(min(atom_i, atom_j), max(atom_i, atom_j)) - for atom_i, atom_j in glycan_graph.edges()] + [ + (min(atom_i, atom_j), max(atom_i, atom_j)) + for atom_i, atom_j in glycan_graph.edges() + ] ) # The 'root' is the amino acid - root = [ - atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i] - ] + root = [atom_i for atom_i in glycan_graph.nodes() if is_amino_acid[atom_i]] if len(root) == 0: # Saccharide is not attached to an amino acid -> Ignore glycan continue @@ -331,22 +324,20 @@ # Position the root at coordinate origin pos_array -= pos_array[nodes.index(root)] # Set vertical distances between nodes to 1 - pos_array[:,1] /= ( - pos_array[nodes.index(root_neighbor), 1] - - pos_array[nodes.index(root), 1] + pos_array[:, 1] /= ( + pos_array[nodes.index(root_neighbor), 1] - pos_array[nodes.index(root), 1] ) # Set minimum horizontal distances between nodes to 1 - non_zero_dist = np.abs(pos_array[(pos_array[:,0] != 0), 0]) + non_zero_dist = np.abs(pos_array[(pos_array[:, 0] != 0), 0]) if len(non_zero_dist) != 0: - pos_array[:,0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist) + pos_array[:, 0] *= HORIZONTAL_NODE_DISTANCE / np.min(non_zero_dist) # Move graph to residue ID position on x-axis - pos_array[:,0] += structure.res_id[root] + pos_array[:, 0] += structure.res_id[root] # Convert array back to dictionary pos = {node: tuple(coord) for node, coord in zip(nodes, pos_array)} nx.draw_networkx_edges( - glycan_graph, pos, ax=ax, - arrows=False, node_size=0, width=LINE_WIDTH + glycan_graph, pos, ax=ax, arrows=False, node_size=0, width=LINE_WIDTH ) # Draw each node individually @@ -359,14 +350,23 @@ common_name = SACCHARIDE_NAMES.get(structure.res_name[atom_i]) shape, color = SACCHARIDE_REPRESENTATION[common_name] ax.scatter( - pos[atom_i][0], pos[atom_i][1], - s=NODE_SIZE, marker=shape, facecolor=color, - edgecolor="black", linewidths=LINE_WIDTH + pos[atom_i][0], + pos[atom_i][1], + s=NODE_SIZE, + marker=shape, + facecolor=color, + edgecolor="black", + linewidths=LINE_WIDTH, ) legend_elements[common_name] = Line2D( - [0], [0], label=common_name, linestyle="None", - marker=shape, markerfacecolor=color, - markeredgecolor="black", markeredgewidth=LINE_WIDTH + [0], + [0], + label=common_name, + linestyle="None", + marker=shape, + markerfacecolor=color, + markeredgecolor="black", + markeredgewidth=LINE_WIDTH, ) @@ -381,9 +381,13 @@ ax.tick_params(axis="y", left=False, labelleft=False) ax.set_xticks(glycosylated_residue_ids) ax.set_xticklabels( - [symbol + str(res_id) for symbol, res_id - in zip(glycosylated_residue_symbols, glycosylated_residue_ids)], - rotation=45 + [ + symbol + str(res_id) + for symbol, res_id in zip( + glycosylated_residue_symbols, glycosylated_residue_ids + ) + ], + rotation=45, ) # Set the end of the axis to the last amino acid @@ -393,4 +397,4 @@ fig.tight_layout() # sphinx_gallery_thumbnail_number = 2 -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition.py b/doc/examples/scripts/structure/misc/homolog_superimposition.py index 2e0e03558..4db689581 100644 --- a/doc/examples/scripts/structure/misc/homolog_superimposition.py +++ b/doc/examples/scripts/structure/misc/homolog_superimposition.py @@ -13,20 +13,19 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause - +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb + def _extract_monomer(complex): complex = complex[struc.filter_amino_acids(complex)] # Get the monomer that belongs to the first atom in the structure return complex[struc.get_chain_masks(complex, [0])[0]] + avidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1vyo", "bcif")) -avidin = _extract_monomer( - pdbx.get_structure(avidin_file, model=1, include_bonds=True) -) +avidin = _extract_monomer(pdbx.get_structure(avidin_file, model=1, include_bonds=True)) streptavidin_file = pdbx.BinaryCIFFile.read(rcsb.fetch("6j6j", "bcif")) streptavidin = _extract_monomer( pdbx.get_structure(streptavidin_file, model=1, include_bonds=True) @@ -34,4 +33,4 @@ def _extract_monomer(complex): streptavidin, _, _, _ = struc.superimpose_homologs(avidin, streptavidin) # Visualization with PyMOL... -# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py" \ No newline at end of file +# sphinx_gallery_ammolite_script = "homolog_superimposition_pymol.py" diff --git a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py index f9c204788..1760d527e 100644 --- a/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py +++ b/doc/examples/scripts/structure/misc/homolog_superimposition_pymol.py @@ -1,9 +1,6 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite -import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 750) @@ -13,10 +10,7 @@ # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL pymol_avidin = ammolite.PyMOLObject.from_structure(avidin) @@ -33,4 +27,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/misc/pdb_statistics.py b/doc/examples/scripts/structure/misc/pdb_statistics.py index eaf7a3e05..ed8680eb8 100644 --- a/doc/examples/scripts/structure/misc/pdb_statistics.py +++ b/doc/examples/scripts/structure/misc/pdb_statistics.py @@ -10,12 +10,11 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np +from datetime import datetime, time import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.database.rcsb as rcsb -from datetime import datetime, time - +import biotite.database.rcsb as rcsb years = np.arange(1990, datetime.today().year + 1) xray_count = np.zeros(len(years), dtype=int) @@ -28,20 +27,14 @@ # A query that comprises one year date_query = rcsb.FieldQuery( "rcsb_accession_info.initial_release_date", - range_closed = ( - datetime.combine(datetime(year, 1, 1), time.min), - datetime.combine(datetime(year, 12, 31), time.max) - ) - ) - xray_query = rcsb.FieldQuery( - "exptl.method", exact_match="X-RAY DIFFRACTION" - ) - nmr_query = rcsb.FieldQuery( - "exptl.method", exact_match="SOLUTION NMR" - ) - em_query = rcsb.FieldQuery( - "exptl.method", exact_match="ELECTRON MICROSCOPY" + range_closed=( + datetime.combine(datetime(year, 1, 1), time.min), + datetime.combine(datetime(year, 12, 31), time.max), + ), ) + xray_query = rcsb.FieldQuery("exptl.method", exact_match="X-RAY DIFFRACTION") + nmr_query = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR") + em_query = rcsb.FieldQuery("exptl.method", exact_match="ELECTRON MICROSCOPY") # Get the amount of structures, that were released in that year # AND were elucidated with the respective method xray_count[i], nmr_count[i], em_count[i] = [ @@ -53,27 +46,32 @@ fig, ax = plt.subplots(figsize=(8.0, 5.0)) ax.set_title("PDB release statistics") -ax.set_xlim(years[0]-1, years[-1]+1) +ax.set_xlim(years[0] - 1, years[-1] + 1) ax.set_xticks(years) ax.set_xticklabels([str(y) for y in years], rotation=45) ax.set_xlabel("Year") ax.set_ylabel("Released structures per year") +ax.bar(years, xray_count, color=biotite.colors["darkorange"], label="X-Ray") ax.bar( - years, xray_count, - color=biotite.colors["darkorange"], label="X-Ray" -) -ax.bar( - years, nmr_count, bottom=xray_count, - color=biotite.colors["orange"], label="Solution NMR" + years, + nmr_count, + bottom=xray_count, + color=biotite.colors["orange"], + label="Solution NMR", ) ax.bar( - years, em_count, bottom=xray_count + nmr_count, - color=biotite.colors["brightorange"], label="Electron Microscopy" + years, + em_count, + bottom=xray_count + nmr_count, + color=biotite.colors["brightorange"], + label="Electron Microscopy", ) ax.bar( - years, tot_count - xray_count - nmr_count - em_count, + years, + tot_count - xray_count - nmr_count - em_count, bottom=xray_count + nmr_count + em_count, - color="gray", label="Miscellaneous" + color="gray", + label="Miscellaneous", ) ax.legend(loc="upper left") fig.tight_layout() diff --git a/doc/examples/scripts/structure/modeling/docking.py b/doc/examples/scripts/structure/modeling/docking.py index 06492c242..eb9a3fcfa 100644 --- a/doc/examples/scripts/structure/modeling/docking.py +++ b/doc/examples/scripts/structure/modeling/docking.py @@ -28,22 +28,24 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from scipy.stats import spearmanr +import biotite.application.autodock as autodock +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.info as info import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb -import biotite.application.autodock as autodock - # Get the receptor structure # and the original 'correct' conformation of the ligand pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("2RTG", "bcif")) structure = pdbx.get_structure( # Include formal charge for accurate partial charge calculation - pdbx_file, model=1, include_bonds=True, extra_fields=["charge"] + pdbx_file, + model=1, + include_bonds=True, + extra_fields=["charge"], ) # The asymmetric unit describes a streptavidin homodimer # However, we are only interested in a single monomer @@ -79,9 +81,7 @@ docked_ligand = struc.from_template(ligand, docked_coord) # As Vina discards all nonpolar hydrogen atoms, their respective # coordinates are NaN -> remove these atoms -docked_ligand = docked_ligand[ - ..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1) -] +docked_ligand = docked_ligand[..., ~np.isnan(docked_ligand.coord[0]).any(axis=-1)] # For comparison of the docked pose with the experimentally determined @@ -142,9 +142,9 @@ # Vina only keeps polar hydrogens in the modeled structure # For consistency, remove all hydrogen atoms in the reference and # modelled structure -ref_ligand = ref_ligand[ref_ligand.element!= "H"] -docked_ligand = docked_ligand[docked_ligand.element!= "H"] +ref_ligand = ref_ligand[ref_ligand.element != "H"] +docked_ligand = docked_ligand[docked_ligand.element != "H"] # Visualization with PyMOL... # sphinx_gallery_thumbnail_number = 2 -# sphinx_gallery_ammolite_script = "docking_pymol.py" \ No newline at end of file +# sphinx_gallery_ammolite_script = "docking_pymol.py" diff --git a/doc/examples/scripts/structure/modeling/docking_pymol.py b/doc/examples/scripts/structure/modeling/docking_pymol.py index 349f93b39..8f9adc263 100644 --- a/doc/examples/scripts/structure/modeling/docking_pymol.py +++ b/doc/examples/scripts/structure/modeling/docking_pymol.py @@ -1,23 +1,17 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite -import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 400) # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL -pymol_receptor = ammolite.PyMOLObject.from_structure(receptor) -pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand) +pymol_receptor = ammolite.PyMOLObject.from_structure(receptor) +pymol_ref_ligand = ammolite.PyMOLObject.from_structure(ref_ligand) pymol_docked_ligand = ammolite.PyMOLObject.from_structure(docked_ligand) # Visualize receptor as surface @@ -53,4 +47,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/modeling/md_analysis.py b/doc/examples/scripts/structure/modeling/md_analysis.py index 3e36779b3..dfdbf573b 100644 --- a/doc/examples/scripts/structure/modeling/md_analysis.py +++ b/doc/examples/scripts/structure/modeling/md_analysis.py @@ -22,16 +22,16 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import matplotlib.pyplot as plt +import numpy as np import biotite import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.xtc as xtc -import numpy as np -import matplotlib.pyplot as plt # Put here the path of the downloaded files templ_file_path = "../../../download/lysozyme_md.pdb" -traj_file_path = "../../../download/lysozyme_md.xtc" +traj_file_path = "../../../download/lysozyme_md.xtc" # Gromacs does not set the element symbol in its PDB files, # but Biotite guesses the element names from the atom names, @@ -76,7 +76,7 @@ trajectory, _ = struc.superimpose(trajectory[0], trajectory) rmsd = struc.rmsd(trajectory[0], trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) ax.plot(time, rmsd, color=biotite.colors["dimorange"]) ax.set_xlim(time[0], time[-1]) @@ -97,7 +97,7 @@ radius = struc.gyration_radius(trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) ax.plot(time, radius, color=biotite.colors["dimorange"]) ax.set_xlim(time[0], time[-1]) @@ -129,10 +129,10 @@ ca_trajectory = trajectory[:, trajectory.atom_name == "CA"] rmsf = struc.rmsf(struc.average(ca_trajectory), ca_trajectory) -figure = plt.figure(figsize=(6,3)) +figure = plt.figure(figsize=(6, 3)) ax = figure.add_subplot(111) res_count = struc.get_residue_count(trajectory) -ax.plot(np.arange(1, res_count+1), rmsf, color=biotite.colors["dimorange"]) +ax.plot(np.arange(1, res_count + 1), rmsf, color=biotite.colors["dimorange"]) ax.set_xlim(1, res_count) ax.set_ylim(0, 1.5) ax.set_xlabel("Residue") @@ -140,4 +140,4 @@ figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py index 4bc706de8..cf4d8612c 100644 --- a/doc/examples/scripts/structure/modeling/mmtf_trajectory.py +++ b/doc/examples/scripts/structure/modeling/mmtf_trajectory.py @@ -25,13 +25,13 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import os.path from tempfile import NamedTemporaryFile +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.xtc as xtc import biotite.structure.io.pdbx as pdbx -import numpy as np -import matplotlib.pyplot as plt -import os.path +import biotite.structure.io.xtc as xtc # Put here the path of the downloaded trajectory file xtc_file_path = "../../../download/lysozyme_md.xtc" @@ -53,14 +53,14 @@ ) for i, dim in enumerate(("x", "y", "z")): columns[f"coord_{dim}"] = pdbx.BinaryCIFData( - coord[:,:,i].flatten(), + coord[:, :, i].flatten(), encoding=[ pdbx.FixedPointEncoding(factor=100, src_type=np.float32), pdbx.DeltaEncoding(), # Encode the difference into two bytes pdbx.IntegerPackingEncoding(byte_count=2, is_unsigned=False), pdbx.ByteArrayEncoding(), - ] + ], ) category = pdbx.BinaryCIFCategory(columns) bcif_file = pdbx.BinaryCIFFile( @@ -77,15 +77,17 @@ figure = plt.figure() ax = figure.add_subplot(111) ax.bar( - [1,2], [xtc_size/1e+6, bcif_size/1e+6], width=0.3, + [1, 2], + [xtc_size / 1e6, bcif_size / 1e6], + width=0.3, color=[biotite.colors["dimgreen"], biotite.colors["dimorange"]], - linewidth=0 + linewidth=0, ) -ax.set_xticks([1,2]) +ax.set_xticks([1, 2]) ax.set_xticklabels(["XTC", "BinaryCIF"]) ax.set_xlim(0.5, 2.5) ax.set_ylim(0, 40) ax.yaxis.grid(True) ax.set_ylabel("File size (MB)") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/normal_modes.py b/doc/examples/scripts/structure/modeling/normal_modes.py index 13c7eca3a..ac760c459 100644 --- a/doc/examples/scripts/structure/modeling/normal_modes.py +++ b/doc/examples/scripts/structure/modeling/normal_modes.py @@ -36,11 +36,10 @@ from tempfile import NamedTemporaryFile import numpy as np from numpy import newaxis +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # A CSV file containing the eigenvectors for the CA atoms VECTOR_FILE = "../../../download/glycosylase_anm_vectors.csv" @@ -64,8 +63,7 @@ # Filter first peptide chain protein_chain = structure[ - struc.filter_amino_acids(structure) - & (structure.chain_id == structure.chain_id[0]) + struc.filter_amino_acids(structure) & (structure.chain_id == structure.chain_id[0]) ] # Filter CA atoms ca = protein_chain[protein_chain.atom_name == "CA"] @@ -88,7 +86,7 @@ # Stepwise application of eigenvectors as smooth sine oscillation -time = np.linspace(0, 2*np.pi, FRAMES, endpoint=False) +time = np.linspace(0, 2 * np.pi, FRAMES, endpoint=False) deviation = np.sin(time)[:, newaxis, newaxis] * mode_vectors # Apply oscillation of CA atom to all atoms in the corresponding residue @@ -97,13 +95,14 @@ protein_chain, # The last array element will be the length of the atom array, # i.e. no valid index - add_exclusive_stop=True + add_exclusive_stop=True, ) -for i in range(len(residue_starts) -1): +for i in range(len(residue_starts) - 1): res_start = residue_starts[i] - res_stop = residue_starts[i+1] - oscillation[:, res_start:res_stop, :] \ - = protein_chain.coord[res_start:res_stop, :] + deviation[:, i:i+1, :] + res_stop = residue_starts[i + 1] + oscillation[:, res_start:res_stop, :] = ( + protein_chain.coord[res_start:res_stop, :] + deviation[:, i : i + 1, :] + ) # An atom array stack containing all frames oscillating_structure = struc.from_template(protein_chain, oscillation) @@ -112,4 +111,4 @@ strucio.save_structure(temp.name, oscillating_structure) # sphinx_gallery_static_image = "normal_modes.gif" -temp.close() \ No newline at end of file +temp.close() diff --git a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py index 5165510e9..1c0ad0e2c 100644 --- a/doc/examples/scripts/structure/modeling/normal_modes_pymol.py +++ b/doc/examples/scripts/structure/modeling/normal_modes_pymol.py @@ -1,6 +1,5 @@ +from os.path import isdir, join from pymol import cmd -from os.path import join, isdir - INPUT_STRUCTURE = "normal_modes.pdb" OUTPUT_DIR = "normal_modes" @@ -13,20 +12,34 @@ cmd.dss() # Define colors -cmd.set_color("biotite_lightgreen", [111/255, 222/255, 76/255]) +cmd.set_color("biotite_lightgreen", [111 / 255, 222 / 255, 76 / 255]) # Set overall colors cmd.color("biotite_lightgreen", "chain A") # Set view -cmd.set_view(( - 0.605540633, 0.363677770, -0.707855821, - -0.416691631, 0.902691007, 0.107316799, - 0.678002179, 0.229972601, 0.698157668, - 0.000000000, 0.000000000, -115.912551880, - 32.098876953, 31.005725861, 78.377349854, - 89.280677795, 142.544403076, -20.000000000 -)) +cmd.set_view( + ( + 0.605540633, + 0.363677770, + -0.707855821, + -0.416691631, + 0.902691007, + 0.107316799, + 0.678002179, + 0.229972601, + 0.698157668, + 0.000000000, + 0.000000000, + -115.912551880, + 32.098876953, + 31.005725861, + 78.377349854, + 89.280677795, + 142.544403076, + -20.000000000, + ) +) # Prepare output video frames cmd.mset() diff --git a/doc/examples/scripts/structure/modeling/rotamer_library.py b/doc/examples/scripts/structure/modeling/rotamer_library.py index 2087a08de..fa828eb1d 100644 --- a/doc/examples/scripts/structure/modeling/rotamer_library.py +++ b/doc/examples/scripts/structure/modeling/rotamer_library.py @@ -13,14 +13,11 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np -import networkx as nx import matplotlib.pyplot as plt +import numpy as np import biotite.structure as struc -import biotite.structure.io as strucio -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # 'CA' is not in backbone, # as we want to include the rotation between 'CA' and 'CB' @@ -73,14 +70,12 @@ bond_list_without_axis.remove_bond(atom_i, atom_j) # ...and these atoms are found by identifying the atoms that # are still connected to one of the two atoms involved - rotated_atom_indices = struc.find_connected( - bond_list_without_axis, root=atom_i - ) + rotated_atom_indices = struc.find_connected(bond_list_without_axis, root=atom_i) accepted = False while not accepted: # A random angle between 0 and 360 degrees - angle = np.random.rand() * 2*np.pi + angle = np.random.rand() * 2 * np.pi # Rotate coord[rotated_atom_indices] = struc.rotate_about_axis( coord[rotated_atom_indices], axis, angle, support @@ -91,9 +86,7 @@ # than the sum of their VdW radii, if they are not bonded to # each other accepted = True - distances = struc.distance( - coord[:, np.newaxis], coord[np.newaxis, :] - ) + distances = struc.distance(coord[:, np.newaxis], coord[np.newaxis, :]) clashed = distances < vdw_radii_mean for clash_atom1, clash_atom2 in zip(*np.where(clashed)): if clash_atom1 == clash_atom2: @@ -115,23 +108,28 @@ ### Visualize rotamers ### colors = np.zeros((residue.array_length(), 3)) -colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green -colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red +colors[residue.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[residue.element == "C"] = (0.0, 0.8, 0.0) # green +colors[residue.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[residue.element == "O"] = (0.8, 0.0, 0.0) # red # For consistency, each subplot has the same box size coord = rotamers.coord -size = np.array( - [coord[:, :, 0].max() - coord[:, :, 0].min(), - coord[:, :, 1].max() - coord[:, :, 1].min(), - coord[:, :, 2].max() - coord[:, :, 2].min()] -).max() * 0.5 +size = ( + np.array( + [ + coord[:, :, 0].max() - coord[:, :, 0].min(), + coord[:, :, 1].max() - coord[:, :, 1].min(), + coord[:, :, 2].max() - coord[:, :, 2].min(), + ] + ).max() + * 0.5 +) fig = plt.figure(figsize=(8.0, 8.0)) fig.suptitle("Rotamers of tyrosine", fontsize=20, weight="bold") for i, rotamer in enumerate(rotamers): - ax = fig.add_subplot(3, 3, i+1, projection="3d") + ax = fig.add_subplot(3, 3, i + 1, projection="3d") graphics.plot_atoms(ax, rotamer, colors, line_width=3, size=size, zoom=0.9) fig.tight_layout() @@ -139,4 +137,4 @@ ### Write rotamers to structure file ### -#strucio.save_structure("rotamers.pdb", rotamers) \ No newline at end of file +# strucio.save_structure("rotamers.pdb", rotamers) diff --git a/doc/examples/scripts/structure/modeling/solvation_shells.py b/doc/examples/scripts/structure/modeling/solvation_shells.py index cdd00be28..dcba8894d 100644 --- a/doc/examples/scripts/structure/modeling/solvation_shells.py +++ b/doc/examples/scripts/structure/modeling/solvation_shells.py @@ -25,16 +25,16 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause +import matplotlib.pyplot as plt import numpy as np import scipy.signal as signal -import matplotlib.pyplot as plt import biotite import biotite.structure as struc import biotite.structure.io as strucio # Put here the path of the downloaded files templ_file_path = "../../../download/waterbox_md.pdb" -traj_file_path = "../../../download/waterbox_md.xtc" +traj_file_path = "../../../download/waterbox_md.xtc" # Load the trajectory traj = strucio.load_structure(traj_file_path, template=templ_file_path) @@ -53,27 +53,19 @@ # Calculate the RDF of water molecules # centered on sodium or chloride ions, respectively N_BINS = 200 -bins, rdf_na = struc.rdf( - center=na, atoms=solvent, periodic=True, bins=N_BINS -) -bins, rdf_cl = struc.rdf( - center=cl, atoms=solvent, periodic=True, bins=N_BINS -) +bins, rdf_na = struc.rdf(center=na, atoms=solvent, periodic=True, bins=N_BINS) +bins, rdf_cl = struc.rdf(center=cl, atoms=solvent, periodic=True, bins=N_BINS) # Find peaks # This requires a bit trial and error on the parameters # The 'x' in '[x * N_BINS/10]' is the expected peak width in Å, # that is transformed into a peak width in amount of values -peak_indices_na = signal.find_peaks_cwt( - rdf_na, widths=[0.2 * N_BINS/10] -) -peak_indices_cl = signal.find_peaks_cwt( - rdf_cl, widths=[0.3 * N_BINS/10] -) +peak_indices_na = signal.find_peaks_cwt(rdf_na, widths=[0.2 * N_BINS / 10]) +peak_indices_cl = signal.find_peaks_cwt(rdf_cl, widths=[0.3 * N_BINS / 10]) peak_indices_na, peak_indices_cl = peak_indices_na[:3], peak_indices_cl[:3] # Create plots -fig, ax = plt.subplots(figsize=(8.0,3.0)) +fig, ax = plt.subplots(figsize=(8.0, 3.0)) # Plot average density in box ax.axhline(1, color="lightgray", linestyle="--") # Plot both RDFs @@ -81,19 +73,25 @@ ax.plot(bins, rdf_cl, color=biotite.colors["dimorange"], label="Cl") # The peak positions are shown as vertical lines ax.vlines( - bins[peak_indices_na], ymin=0, ymax=3, - color=biotite.colors["darkgreen"], linestyle=":" + bins[peak_indices_na], + ymin=0, + ymax=3, + color=biotite.colors["darkgreen"], + linestyle=":", ) ax.vlines( - bins[peak_indices_cl], ymin=0, ymax=3, - color=biotite.colors["dimorange"], linestyle=":" + bins[peak_indices_cl], + ymin=0, + ymax=3, + color=biotite.colors["dimorange"], + linestyle=":", ) ax.set_xticks(np.arange(0, 10.5, 0.5)) -ax.set_xlim(0,10) -ax.set_ylim(0,2.7) +ax.set_xlim(0, 10) +ax.set_ylim(0, 2.7) ax.set_xlabel("Radius (Å)") ax.set_ylabel("Relative density") ax.legend() fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/trajectory_sse.py b/doc/examples/scripts/structure/modeling/trajectory_sse.py index 5b33d2156..a0acf219c 100644 --- a/doc/examples/scripts/structure/modeling/trajectory_sse.py +++ b/doc/examples/scripts/structure/modeling/trajectory_sse.py @@ -14,20 +14,18 @@ # Code source: Daniel Bauer, Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -from matplotlib.lines import Line2D +import numpy as np from matplotlib import colors -import matplotlib as mpl +from matplotlib.lines import Line2D import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.xtc as xtc from biotite.application.dssp import DsspApp - # Put here the path of the downloaded files templ_file_path = "../../../download/lysozyme_md.pdb" -traj_file_path = "../../../download/lysozyme_md.xtc" +traj_file_path = "../../../download/lysozyme_md.xtc" xtc_file = xtc.XTCFile.read(traj_file_path) @@ -36,25 +34,28 @@ traj = traj[:, struc.filter_amino_acids(traj)] # DSSP does not assign an SSE to the last residue -> -1 -sse = np.empty((traj.shape[0], struc.get_residue_count(traj)-1), dtype='U1') +sse = np.empty((traj.shape[0], struc.get_residue_count(traj) - 1), dtype="U1") for idx, frame in enumerate(traj): app = DsspApp(traj[idx]) app.start() app.join() sse[idx] = app.get_sse() + # Matplotlib needs numbers to assign colors correctly def sse_to_num(sse): num = np.empty(sse.shape, dtype=int) - num[sse == 'C'] = 0 - num[sse == 'E'] = 1 - num[sse == 'B'] = 2 - num[sse == 'S'] = 3 - num[sse == 'T'] = 4 - num[sse == 'H'] = 5 - num[sse == 'G'] = 6 - num[sse == 'I'] = 7 + num[sse == "C"] = 0 + num[sse == "E"] = 1 + num[sse == "B"] = 2 + num[sse == "S"] = 3 + num[sse == "T"] = 4 + num[sse == "H"] = 5 + num[sse == "G"] = 6 + num[sse == "I"] = 7 return num + + sse = sse_to_num(sse) @@ -68,24 +69,26 @@ def sse_to_num(sse): r"turn": "yellow", r"$\alpha$-helix": "blue", r"$3_{10}$-helix": "gray", - r"$\pi$-helix": "purple", + r"$\pi$-helix": "purple", } cmap = colors.ListedColormap(color_assign.values()) plt.figure(figsize=(8.0, 6.0)) -plt.imshow(sse.T, cmap=cmap, origin='lower') +plt.imshow(sse.T, cmap=cmap, origin="lower") plt.xlabel("Time / ps") plt.ylabel("Residue") ticks = np.arange(0, len(traj), 10) plt.xticks(ticks, time[ticks].astype(int)) # Custom legend below the DSSP plot -custom_lines = [ - Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign)) -] +custom_lines = [Line2D([0], [0], color=cmap(i), lw=4) for i in range(len(color_assign))] plt.legend( - custom_lines, color_assign.keys(), loc="upper center", - bbox_to_anchor=(0.5, -0.15), ncol=len(color_assign), fontsize=8 + custom_lines, + color_assign.keys(), + loc="upper center", + bbox_to_anchor=(0.5, -0.15), + ncol=len(color_assign), + fontsize=8, ) plt.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py index 31a92644a..f62ca18ee 100644 --- a/doc/examples/scripts/structure/modeling/water_exchange_noexec.py +++ b/doc/examples/scripts/structure/modeling/water_exchange_noexec.py @@ -27,10 +27,11 @@ import matplotlib.pyplot as plt import numpy as np +from pylab import polyfit import biotite +import biotite.structure as struct import biotite.structure.io.gro as gro import biotite.structure.io.xtc as xtc -import biotite.structure as struct def water_in_prox(atoms, sele, cutoff): @@ -38,27 +39,28 @@ def water_in_prox(atoms, sele, cutoff): Get the atom indices of water oxygen atoms that are in vicinity of the selected atoms. """ - cell_list = struct.CellList(atoms, cell_size=5, - selection=atoms.atom_name == "OW") + cell_list = struct.CellList(atoms, cell_size=5, selection=atoms.atom_name == "OW") adjacent_atoms = cell_list.get_atoms(atoms[sele].coord, cutoff) adjacent_atoms = np.unique(adjacent_atoms.flatten()) adjacent_atoms = adjacent_atoms[adjacent_atoms > 0] return adjacent_atoms + def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)): """ Calculate the cumulative number of water molecules visiting the pore. """ - protein_sele = np.isin(traj.res_id, key_residues) \ - & ~np.isin(traj.atom_name, ["N", "O", "CA", "C"]) + protein_sele = np.isin(traj.res_id, key_residues) & ~np.isin( + traj.atom_name, ["N", "O", "CA", "C"] + ) water_count = np.zeros(traj.shape[0]) prev_counted_indices = [] for idx, frame in enumerate(traj): indices = water_in_prox(frame, protein_sele, cutoff) count = (~np.isin(indices, prev_counted_indices)).sum() if idx != 0: - count += water_count[idx-1] + count += water_count[idx - 1] water_count[idx] = count prev_counted_indices = indices return water_count @@ -82,36 +84,38 @@ def cum_water_in_pore(traj, cutoff=6, key_residues=(507, 511)): # Linear fitting -from pylab import polyfit open_fit = polyfit(time, counts[0], 1) closed_fit = polyfit(time, counts[1], 1) - fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot(time, counts[0], - label="open pore", color=biotite.colors["dimgreen"]) -ax.plot(time, open_fit[0]*time+open_fit[1], - linestyle="--", color="black", zorder=-1) -ax.plot(time, counts[1], - label="closed pore", color=biotite.colors["lightorange"]) -ax.plot(time, closed_fit[0]*time+closed_fit[1], - linestyle="--", color="black", zorder=-1) +ax.plot(time, counts[0], label="open pore", color=biotite.colors["dimgreen"]) +ax.plot( + time, open_fit[0] * time + open_fit[1], linestyle="--", color="black", zorder=-1 +) +ax.plot(time, counts[1], label="closed pore", color=biotite.colors["lightorange"]) +ax.plot( + time, closed_fit[0] * time + closed_fit[1], linestyle="--", color="black", zorder=-1 +) ax.set( - xlabel = "Time / ns", - ylabel = "Count", - title = "Cumulative count\nof individual water molecules visiting the pore" + xlabel="Time / ns", + ylabel="Count", + title="Cumulative count\nof individual water molecules visiting the pore", ) ax.legend() -ax.annotate(f"{open_fit[0]:.1f} per ns", - xy=(20, 20*open_fit[0]+open_fit[1]+100), - xytext=(20-5, 20*open_fit[0]+open_fit[1]+1300), - arrowprops=dict(facecolor=biotite.colors["darkgreen"]), - va="center") -ax.annotate(f"{closed_fit[0]:.1f} per ns", - xy=(30, 20*closed_fit[0]+closed_fit[1]+100), - xytext=(30+2, 20*closed_fit[0]+closed_fit[1]+1300), - arrowprops=dict(facecolor=biotite.colors["orange"]), - va="center") +ax.annotate( + f"{open_fit[0]:.1f} per ns", + xy=(20, 20 * open_fit[0] + open_fit[1] + 100), + xytext=(20 - 5, 20 * open_fit[0] + open_fit[1] + 1300), + arrowprops=dict(facecolor=biotite.colors["darkgreen"]), + va="center", +) +ax.annotate( + f"{closed_fit[0]:.1f} per ns", + xy=(30, 20 * closed_fit[0] + closed_fit[1] + 100), + xytext=(30 + 2, 20 * closed_fit[0] + closed_fit[1] + 1300), + arrowprops=dict(facecolor=biotite.colors["orange"]), + va="center", +) fig.savefig("water_exchange.png", bbox_inches="tight") -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/molecule/alkane_isomers.py b/doc/examples/scripts/structure/molecule/alkane_isomers.py index ed479a52f..c9ea8a265 100644 --- a/doc/examples/scripts/structure/molecule/alkane_isomers.py +++ b/doc/examples/scripts/structure/molecule/alkane_isomers.py @@ -24,12 +24,11 @@ opposed to one request per carbon number. """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np import biotite.database.pubchem as pubchem -import biotite.structure.io.mol as mol import biotite.structure as struc - +import biotite.structure.io.mol as mol MAX_CARBON_COUNT = 12 PLOT_MAX_CARBON_COUNT = 6 @@ -37,13 +36,13 @@ carbon_numbers = [] alkane_cids = [] -for n_carbon in range(1, MAX_CARBON_COUNT+1): +for n_carbon in range(1, MAX_CARBON_COUNT + 1): formula = f"C{n_carbon}H{2 * n_carbon + 2}" print(formula) cids = np.array(pubchem.search(pubchem.FormulaQuery(formula))) carbon_numbers.extend([n_carbon] * len(cids)) alkane_cids.extend(cids) -carbon_numbers = np.array(carbon_numbers) +carbon_numbers = np.array(carbon_numbers) alkane_cids = np.array(alkane_cids) ######################################################################## @@ -58,15 +57,13 @@ # appropriate data type and used for filtering. # Finally, also the IUPAC name for each remaining compound is retrieved # to review the results. - + # Filter natural isotopes... n_isotopes = np.array( pubchem.fetch_property(alkane_cids, "IsotopeAtomCount"), dtype=int ) # ...and neutral compounds -charge = np.array( - pubchem.fetch_property(alkane_cids, "Charge"), dtype=int -) +charge = np.array(pubchem.fetch_property(alkane_cids, "Charge"), dtype=int) # Apply filter mask = (n_isotopes == 0) & (charge == 0) carbon_numbers = carbon_numbers[mask] @@ -85,7 +82,7 @@ # Remove compounds containing multiple molecules # (indicated by the ';' as separator between molecule names) -single_molecule_mask = np.array([not ";" in name for name in iupac_names]) +single_molecule_mask = np.array([";" not in name for name in iupac_names]) # Some compounds containing multiple molecules have no name at all single_molecule_mask &= np.array([len(name) != 0 for name in iupac_names]) carbon_numbers = carbon_numbers[single_molecule_mask] @@ -109,10 +106,7 @@ # for alkanes with zero carbon atoms, which does not make sense isomer_numbers = np.bincount(carbon_numbers)[1:] fig, ax = plt.subplots(figsize=(8.0, 4.0)) -ax.plot( - np.arange(1, MAX_CARBON_COUNT+1), isomer_numbers, - marker="o", color="gray" -) +ax.plot(np.arange(1, MAX_CARBON_COUNT + 1), isomer_numbers, marker="o", color="gray") ax.set_xlim(left=0) ax.set_ylim(bottom=0) ax.set_xlabel("Number of carbon atoms") @@ -127,18 +121,18 @@ # xy-coordinates are plotted as skeletal formula. files = pubchem.fetch( - alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT], - as_structural_formula=True + alkane_cids[carbon_numbers <= PLOT_MAX_CARBON_COUNT], as_structural_formula=True ) fig, axes = plt.subplots( nrows=np.max(isomer_numbers[:PLOT_MAX_CARBON_COUNT]), ncols=PLOT_MAX_CARBON_COUNT, figsize=(8.0, 6.0), - sharex=True, sharey=True + sharex=True, + sharey=True, ) fig.suptitle("Number of carbon atoms", fontsize=16) -for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT+1)): +for i, n_carbon in enumerate(range(1, PLOT_MAX_CARBON_COUNT + 1)): axes[0, i].set_title(n_carbon, fontsize=12) indices_for_n_carbon = np.where(carbon_numbers == n_carbon)[0] for j, file_index in enumerate(indices_for_n_carbon): @@ -149,17 +143,13 @@ # Center atoms in origin atoms.coord -= struc.centroid(atoms) # Structural formula is 0 in z-dimension - coord = atoms.coord[:,:2] + coord = atoms.coord[:, :2] ax = axes[j, i] - ax.plot( - coord[:, 0], coord[:, 1], - color="black", linestyle="None", marker="o" - ) + ax.plot(coord[:, 0], coord[:, 1], color="black", linestyle="None", marker="o") for bond_i, bond_j, _ in atoms.bonds.as_array(): ax.plot( - coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1], - color="black" + coord[[bond_i, bond_j], 0], coord[[bond_i, bond_j], 1], color="black" ) for ax in axes.flatten(): @@ -171,4 +161,4 @@ plt.show() -# sphinx_gallery_thumbnail_number = 2 \ No newline at end of file +# sphinx_gallery_thumbnail_number = 2 diff --git a/doc/examples/scripts/structure/molecule/molecular_visualization.py b/doc/examples/scripts/structure/molecule/molecular_visualization.py index 70d77d837..883785167 100644 --- a/doc/examples/scripts/structure/molecule/molecular_visualization.py +++ b/doc/examples/scripts/structure/molecule/molecular_visualization.py @@ -16,13 +16,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.animation import FuncAnimation import biotite.structure as struc -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # Get an atom array for caffeine # Caffeine has the PDB reside name 'CFF' @@ -35,34 +34,37 @@ # Normal vector of ring plane normal = np.cross(n1.coord - n3.coord, n1.coord - n7.coord) # Align ring plane normal to z-axis -caffeine = struc.align_vectors(caffeine, normal, np.array([0,0,1])) +caffeine = struc.align_vectors(caffeine, normal, np.array([0, 0, 1])) # Caffeine should be colored by element colors = np.zeros((caffeine.array_length(), 3)) -colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green -colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red +colors[caffeine.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[caffeine.element == "C"] = (0.0, 0.8, 0.0) # green +colors[caffeine.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[caffeine.element == "O"] = (0.8, 0.0, 0.0) # red fig = plt.figure(figsize=(8.0, 8.0)) ax = fig.add_subplot(111, projection="3d") graphics.plot_atoms( - ax, caffeine, colors, line_width=5, background_color="white", - zoom=1.5 + ax, caffeine, colors, line_width=5, background_color="white", zoom=1.5 ) fig.tight_layout() + # Create an animation that rotates the molecule about the x-axis def update(angle): ax.elev = angle + FPS = 50 DURATION = 4 angles = np.linspace(-180, 180, DURATION * FPS) # Start at 90 degrees -angles = np.concatenate([ - np.linspace( 90, 180, int(DURATION * FPS * 1/4)), - np.linspace(-180, 90, int(DURATION * FPS * 3/4)) -]) -animation = FuncAnimation(fig, update, angles, interval=int(1000/FPS)) -plt.show() \ No newline at end of file +angles = np.concatenate( + [ + np.linspace(90, 180, int(DURATION * FPS * 1 / 4)), + np.linspace(-180, 90, int(DURATION * FPS * 3 / 4)), + ] +) +animation = FuncAnimation(fig, update, angles, interval=int(1000 / FPS)) +plt.show() diff --git a/doc/examples/scripts/structure/molecule/peoe_visualization.py b/doc/examples/scripts/structure/molecule/peoe_visualization.py index d2dbaf66e..c38e51d98 100644 --- a/doc/examples/scripts/structure/molecule/peoe_visualization.py +++ b/doc/examples/scripts/structure/molecule/peoe_visualization.py @@ -13,15 +13,14 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np -from sklearn.decomposition import PCA import matplotlib.pyplot as plt -from matplotlib.colors import Normalize +import numpy as np from matplotlib.cm import ScalarMappable +from matplotlib.colors import Normalize +from sklearn.decomposition import PCA import biotite.structure as struc -import biotite.structure.info as info import biotite.structure.graphics as graphics - +import biotite.structure.info as info # Acetylsalicylic acid MOLECULE_NAME = "AIN" @@ -42,7 +41,6 @@ CMAP_NAME = "bwr_r" - # Get an atom array for the selected molecule molecule = info.residue(MOLECULE_NAME) @@ -71,17 +69,19 @@ colors = color_map(normalized_charges) # Ball size should be proportional to VdW radius of the respective atom -ball_sizes = np.array( - [info.vdw_radius_single(e) for e in molecule.element] -) * BALL_SCALE +ball_sizes = ( + np.array([info.vdw_radius_single(e) for e in molecule.element]) * BALL_SCALE +) # Gradient of ray strength # The ray size is proportional to the absolute charge value ray_full_sizes = ball_sizes + np.abs(charges) * RAY_SCALE -ray_sizes = np.array([ - np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False) - for i in range(molecule.array_length()) -]).T +ray_sizes = np.array( + [ + np.linspace(ray_full_sizes[i], ball_sizes[i], N_RAY_STEPS, endpoint=False) + for i in range(molecule.array_length()) + ] +).T # The plotting begins here @@ -92,32 +92,38 @@ # As 'axes.scatter()' uses sizes in points**2, # the VdW-radii as also squared graphics.plot_ball_and_stick_model( - ax, molecule, colors, ball_size=ball_sizes**2, line_width=3, - line_color=color_map(0.5), background_color=(.05, .05, .05), zoom=1.5 + ax, + molecule, + colors, + ball_size=ball_sizes**2, + line_width=3, + line_color=color_map(0.5), + background_color=(0.05, 0.05, 0.05), + zoom=1.5, ) # Plot the element labels for atom in molecule: ax.text( - *atom.coord, atom.element, - fontsize=ELEMENT_FONT_SIZE, color="black", - ha="center", va="center", zorder=100 + *atom.coord, + atom.element, + fontsize=ELEMENT_FONT_SIZE, + color="black", + ha="center", + va="center", + zorder=100, ) # Plot the rays for i in range(N_RAY_STEPS): ax.scatter( - *molecule.coord.T, s=ray_sizes[i]**2, c=colors, - linewidth=0, alpha=RAY_ALPHA + *molecule.coord.T, s=ray_sizes[i] ** 2, c=colors, linewidth=0, alpha=RAY_ALPHA ) # Plot the colorbar color_bar = fig.colorbar( - ScalarMappable( - norm=Normalize(vmin=-max_charge, vmax=max_charge), - cmap=color_map - ), - ax=ax + ScalarMappable(norm=Normalize(vmin=-max_charge, vmax=max_charge), cmap=color_map), + ax=ax, ) color_bar.set_label("Partial charge (e)", color="white") color_bar.ax.yaxis.set_tick_params(color="white") @@ -126,4 +132,4 @@ label.set_color("white") fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/nucleotide/base_pairs.py b/doc/examples/scripts/structure/nucleotide/base_pairs.py index cde0b9b21..28681901e 100644 --- a/doc/examples/scripts/structure/nucleotide/base_pairs.py +++ b/doc/examples/scripts/structure/nucleotide/base_pairs.py @@ -10,15 +10,14 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite -import biotite.structure.io.pdb as pdb -import biotite.database.rcsb as rcsb -import biotite.structure as struc -import biotite.sequence.graphics as graphics import matplotlib.pyplot as plt import matplotlib.ticker as ticker -from matplotlib.patches import Arc import numpy as np +from matplotlib.patches import Arc +import biotite +import biotite.database.rcsb as rcsb +import biotite.structure as struc +import biotite.structure.io.pdb as pdb # Download the PDB file and read the structure pdb_file_path = rcsb.fetch("4p5j", "pdb", gettempdir()) @@ -44,10 +43,10 @@ # Setup the axis ax.set_xlim(0.5, len(residue_ids) + 0.5) -ax.set_ylim(0, len(residue_ids)/2 + 0.5) +ax.set_ylim(0, len(residue_ids) / 2 + 0.5) ax.set_aspect("equal") ax.xaxis.set_major_locator(ticker.MultipleLocator(3)) -ax.tick_params(axis='both', which='major', labelsize=8) +ax.tick_params(axis="both", which="major", labelsize=8) ax.set_yticks([]) # Remove the frame @@ -55,7 +54,7 @@ # Plot the residue names in order for residue_name, residue_id in zip(residue_names, residue_ids): - ax.text(residue_id, 0, residue_name, ha='center', fontsize=8) + ax.text(residue_id, 0, residue_name, ha="center", fontsize=8) # Compute the basepairs and pseudknot order (first result) base_pairs = struc.base_pairs(nucleotides) @@ -63,9 +62,7 @@ # Draw the arcs between base pairs for (base1, base2), order in zip(base_pairs, pseudoknot_order): - arc_center = ( - np.mean((nucleotides.res_id[base1],nucleotides.res_id[base2])), 1.5 - ) + arc_center = (np.mean((nucleotides.res_id[base1], nucleotides.res_id[base2])), 1.5) arc_diameter = abs(nucleotides.res_id[base2] - nucleotides.res_id[base1]) name1 = nucleotides.res_name[base1] name2 = nucleotides.res_name[base2] @@ -80,10 +77,16 @@ else: linestyle = ":" arc = Arc( - arc_center, arc_diameter, arc_diameter, theta1=0, theta2=180, - color=color, linewidth=1.5, linestyle=linestyle + arc_center, + arc_diameter, + arc_diameter, + theta1=0, + theta2=180, + color=color, + linewidth=1.5, + linestyle=linestyle, ) ax.add_patch(arc) # Display the plot -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py index a8a436f97..460dd573e 100644 --- a/doc/examples/scripts/structure/nucleotide/leontis_westhof.py +++ b/doc/examples/scripts/structure/nucleotide/leontis_westhof.py @@ -2,7 +2,7 @@ Leontis-Westhof Nomenclature ============================ -In this example we plot a secondary structure diagram annotated with +In this example we plot a secondary structure diagram annotated with Leontis-Westhof nomenclature :footcite:`Leontis2001` of the sarcin-ricin loop from E. coli (PDB ID: 6ZYB). """ @@ -11,14 +11,13 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.pdb as pdb import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.graphics as graphics -import matplotlib.pyplot as plt -import numpy as np - +import biotite.structure.io.pdb as pdb # Download the PDB file and read the structure pdb_file_path = rcsb.fetch("6ZYB", "pdb", gettempdir()) @@ -30,9 +29,9 @@ base_pairs = struc.base_pairs(nucleotides) glycosidic_bonds = struc.base_pairs_glycosidic_bond(nucleotides, base_pairs) edges = struc.base_pairs_edge(nucleotides, base_pairs) -base_pairs = struc.get_residue_positions( - nucleotides, base_pairs.flatten() -).reshape(base_pairs.shape) +base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape( + base_pairs.shape +) # Get the one-letter-codes of the bases base_labels = [] @@ -41,7 +40,7 @@ # Color canonical Watson-Crick base pairs with a darker orange and # non-canonical base pairs with a lighter orange -colors = np.full(base_pairs.shape[0], biotite.colors['brightorange']) +colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"]) for i, (base1, base2) in enumerate(base_pairs): name1 = base_labels[base1] name2 = base_labels[base2] @@ -68,34 +67,33 @@ # Plot the secondary structure graphics.plot_nucleotide_secondary_structure( - ax, base_labels, base_pairs, struc.get_residue_count(nucleotides), - bond_color=colors + ax, base_labels, base_pairs, struc.get_residue_count(nucleotides), bond_color=colors ) # Display the plot plt.show() ######################################################################## -# The sarcin-ricin loop is part of the 23s rRNA and is considered +# The sarcin-ricin loop is part of the 23s rRNA and is considered # crucial to the ribosome‘s activity. The incorporation of the -# Leontis-Westhof nomenclature into the 2D-plot shows how the individual -# base pairs are oriented and how their glycosidic bonds are oriented +# Leontis-Westhof nomenclature into the 2D-plot shows how the individual +# base pairs are oriented and how their glycosidic bonds are oriented # relative to each other. # -# This visualization enables one to see a pattern that cannot be -# communicated through the 2D structure alone. The upper part of the -# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds. -# All bases interact through their Watson-Crick edge (W). On the other -# hand, the lower part of the sarcin ricin loop looks strikingly -# different. The glycosidic bonds are oriented in cis (c) and trans (t) -# orientation. The bases interact through all three edges: Watson-Crick +# This visualization enables one to see a pattern that cannot be +# communicated through the 2D structure alone. The upper part of the +# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds. +# All bases interact through their Watson-Crick edge (W). On the other +# hand, the lower part of the sarcin ricin loop looks strikingly +# different. The glycosidic bonds are oriented in cis (c) and trans (t) +# orientation. The bases interact through all three edges: Watson-Crick # (W), Hoogsteen (H), and Sugar (S). -# -# Thus, it can be concluded that the upper part of the sarcin ricin loop -# represents a highly organized helix, while the lower part of the loop +# +# Thus, it can be concluded that the upper part of the sarcin ricin loop +# represents a highly organized helix, while the lower part of the loop # is comparatively unorganized. # # References # ---------- -# -# .. footbibliography:: \ No newline at end of file +# +# .. footbibliography:: diff --git a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py index 23823b3d9..5d238b346 100644 --- a/doc/examples/scripts/structure/nucleotide/transfer_rnas.py +++ b/doc/examples/scripts/structure/nucleotide/transfer_rnas.py @@ -2,7 +2,7 @@ Comparison of a tRNA-like-structure with a tRNA =============================================== -In this example we plot a secondary-structure diagram of a tRNA mimic +In this example we plot a secondary-structure diagram of a tRNA mimic (PDB ID: 4P5J) from the *turnip yellow mosaic virus* (TYMV) and compare it to a PHE-tRNA (PDB ID: 1EHZ). """ @@ -11,15 +11,16 @@ # License: BSD 3 clause from tempfile import gettempdir +import matplotlib.pyplot as plt +import numpy as np import biotite -import biotite.structure.io.pdb as pdb import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.graphics as graphics -import matplotlib.pyplot as plt -import numpy as np +import biotite.structure.io.pdb as pdb + -# Create a function to get the structures and compute information for +# Create a function to get the structures and compute information for # the plots. def plot_rna(pdb_id, axes): # Download the PDB file and read the structure @@ -30,15 +31,15 @@ def plot_rna(pdb_id, axes): # Compute the base pairs and their pseudoknot order base_pairs = struc.base_pairs(nucleotides) - base_pairs = struc.get_residue_positions( - nucleotides, base_pairs.flatten() - ).reshape(base_pairs.shape) + base_pairs = struc.get_residue_positions(nucleotides, base_pairs.flatten()).reshape( + base_pairs.shape + ) pseudoknot_order = struc.pseudoknots(base_pairs)[0] # Set the linestyle according to the pseudoknot order - linestyles = np.full(base_pairs.shape[0], '-', dtype=object) - linestyles[pseudoknot_order == 1] = '--' - linestyles[pseudoknot_order == 2] = ':' + linestyles = np.full(base_pairs.shape[0], "-", dtype=object) + linestyles[pseudoknot_order == 1] = "--" + linestyles[pseudoknot_order == 2] = ":" # Indicate canonical nucleotides with an upper case one-letter-code # and non-canonical nucleotides with a lower case one-letter-code @@ -52,7 +53,7 @@ def plot_rna(pdb_id, axes): # Color canonical Watson-Crick base pairs with a darker orange and # non-canonical base pairs with a lighter orange - colors = np.full(base_pairs.shape[0], biotite.colors['brightorange']) + colors = np.full(base_pairs.shape[0], biotite.colors["brightorange"]) for i, (base1, base2) in enumerate(base_pairs): name1 = base_labels[base1] name2 = base_labels[base2] @@ -61,37 +62,45 @@ def plot_rna(pdb_id, axes): # Plot the secondary structure graphics.plot_nucleotide_secondary_structure( - axes, base_labels, base_pairs, struc.get_residue_count(nucleotides), - pseudoknot_order=pseudoknot_order, bond_linestyle=linestyles, + axes, + base_labels, + base_pairs, + struc.get_residue_count(nucleotides), + pseudoknot_order=pseudoknot_order, + bond_linestyle=linestyles, bond_color=colors, # Margin to compensate for reduced axis limits in shared axis - border=0.13 + border=0.13, ) # Use the PDB ID to label each plot axes.set_title(pdb_id, loc="left") + # Create a matplotlib pyplot fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(8.0, 16.0), + 2, + 1, + figsize=(8.0, 16.0), # Share both axes to ensure eqaul scaling of bath secondary structures - sharex=True, sharey=True + sharex=True, + sharey=True, ) # Plot the secondary structures -plot_rna('1EHZ', ax1) -plot_rna('4P5J', ax2) +plot_rna("1EHZ", ax1) +plot_rna("4P5J", ax2) fig.tight_layout() plt.show() ######################################################################## -# The generated plots show that both structures consist of four hairpin -# loops. Two of those loops, which are opposite to each other, interact -# through two pseudoknotted base pairs in the otherwise unpaired loop of -# the respective hairpin structures. The fact that this interaction was -# mimicked indicates functional importance. -# -# A third hairpin loop is folded towards the centre of the tRNA mimic. -# This is not the case for the phenylalanine tRNA and thus signifies a -# major difference between the structures. \ No newline at end of file +# The generated plots show that both structures consist of four hairpin +# loops. Two of those loops, which are opposite to each other, interact +# through two pseudoknotted base pairs in the otherwise unpaired loop of +# the respective hairpin structures. The fact that this interaction was +# mimicked indicates functional importance. +# +# A third hairpin loop is folded towards the centre of the tRNA mimic. +# This is not the case for the phenylalanine tRNA and thus signifies a +# major difference between the structures. diff --git a/doc/examples/scripts/structure/nucleotide/watson_crick.py b/doc/examples/scripts/structure/nucleotide/watson_crick.py index 5ac45ae82..00fbfd33c 100644 --- a/doc/examples/scripts/structure/nucleotide/watson_crick.py +++ b/doc/examples/scripts/structure/nucleotide/watson_crick.py @@ -9,13 +9,12 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx import biotite.structure.graphics as graphics -import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx # Structure of a DNA double helix pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch("1qxb", "bcif")) @@ -26,13 +25,15 @@ base_pairs = struc.base_pairs(nucleotides) for i, j in base_pairs: if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DG", "DC"): - guanine, cytosine = [nucleotides[mask] for mask - in struc.get_residue_masks(nucleotides, [i, j])] + guanine, cytosine = [ + nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j]) + ] break for i, j in base_pairs: if (nucleotides.res_name[i], nucleotides.res_name[j]) == ("DA", "DT"): - adenine, thymine = [nucleotides[mask] for mask - in struc.get_residue_masks(nucleotides, [i, j])] + adenine, thymine = [ + nucleotides[mask] for mask in struc.get_residue_masks(nucleotides, [i, j]) + ] break pairs = [(guanine, cytosine), (adenine, thymine)] @@ -41,19 +42,18 @@ # Arrange bases for i, (purine, pyrimidine) in enumerate(pairs): - n1, n3, c5, c6 = [pyrimidine[pyrimidine.atom_name == name][0] - for name in ("N1", "N3", "C5", "C6")] + n1, n3, c5, c6 = [ + pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C5", "C6") + ] # Pyrimidine N3-C6 axis is aligned to x-axis purine, pyrimidine = [ - struc.align_vectors( - base, - n3.coord - c6.coord, - np.array([1, 0, 0]) - ) for base in (purine, pyrimidine) + struc.align_vectors(base, n3.coord - c6.coord, np.array([1, 0, 0])) + for base in (purine, pyrimidine) ] # Coords are changed -> update 'Atom' objects - n1, n3, c4, c5 = [pyrimidine[pyrimidine.atom_name == name][0] - for name in ("N1", "N3", "C4", "C5")] + n1, n3, c4, c5 = [ + pyrimidine[pyrimidine.atom_name == name][0] for name in ("N1", "N3", "C4", "C5") + ] # Pyrimidine base plane normal vector is aligned to z-axis # Furthermore, distance between bases is set purine, pyrimidine = [ @@ -61,10 +61,11 @@ base, np.cross(n3.coord - n1.coord, c5.coord - n1.coord), np.array([0, 0, 1]), - origin_position = struc.centroid(purine + pyrimidine), + origin_position=struc.centroid(purine + pyrimidine), # 10 Å separation between pairs - target_position = np.array([0, 10*i, 0]) - ) for base in (purine, pyrimidine) + target_position=np.array([0, 10 * i, 0]), + ) + for base in (purine, pyrimidine) ] pairs[i] = (purine, pyrimidine) @@ -73,14 +74,12 @@ atoms = pairs[0][0] + pairs[0][1] + pairs[1][0] + pairs[1][1] # Color by element colors = np.zeros((atoms.array_length(), 3)) -colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray -colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray -colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue -colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red -colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green -graphics.plot_atoms( - ax, atoms, colors, line_width=3, background_color="white", zoom=1.5 -) +colors[atoms.element == "H"] = (0.8, 0.8, 0.8) # gray +colors[atoms.element == "C"] = (0.2, 0.2, 0.2) # darkgray +colors[atoms.element == "N"] = (0.0, 0.0, 0.8) # blue +colors[atoms.element == "O"] = (0.8, 0.0, 0.0) # red +colors[atoms.element == "P"] = (0.0, 0.6, 0.0) # green +graphics.plot_atoms(ax, atoms, colors, line_width=3, background_color="white", zoom=1.5) # Plot hydrogen bonds for purine, pyrimidine in pairs: @@ -102,14 +101,13 @@ for pair in pairs: for base in pair: label = base.res_name[0][1] - ring_center = struc.centroid(base[ - np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"]) - ]) + ring_center = struc.centroid( + base[np.isin(base.atom_name, ["N1", "C2", "N3", "C4", "C5", "C6"])] + ) x, y, z = ring_center ax.text( - x, y, z, label, - fontsize=20, fontweight="bold", va="center", ha="center" + x, y, z, label, fontsize=20, fontweight="bold", va="center", ha="center" ) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/pb_alignment.py b/doc/examples/scripts/structure/protein/pb_alignment.py index 5b6dc3818..9a3396ecf 100644 --- a/doc/examples/scripts/structure/protein/pb_alignment.py +++ b/doc/examples/scripts/structure/protein/pb_alignment.py @@ -27,15 +27,14 @@ # License: BSD 3 clause from tempfile import gettempdir -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.database.rcsb as rcsb - # PB alphabet pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") @@ -79,13 +78,12 @@ [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], -]) +]) # fmt: skip # Fetch animal lysoyzme structures lyso_files = rcsb.fetch( - ["1REX", "1AKI", "1DKJ", "1GD6"], - format="bcif", target_path=gettempdir() + ["1REX", "1AKI", "1DKJ", "1GD6"], format="bcif", target_path=gettempdir() ) organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"] @@ -106,25 +104,21 @@ # centered on the amino acid to calculate the PB for # Hence, the PBs are not defined for the two amino acids # at each terminus - pb_angles = np.full((len(phi)-4, 8), np.nan) - pb_angles[:, 0] = psi[ : -4] - pb_angles[:, 1] = phi[1 : -3] - pb_angles[:, 2] = psi[1 : -3] - pb_angles[:, 3] = phi[2 : -2] - pb_angles[:, 4] = psi[2 : -2] - pb_angles[:, 5] = phi[3 : -1] - pb_angles[:, 6] = psi[3 : -1] - pb_angles[:, 7] = phi[4 : ] + pb_angles = np.full((len(phi) - 4, 8), np.nan) + pb_angles[:, 0] = psi[:-4] + pb_angles[:, 1] = phi[1:-3] + pb_angles[:, 2] = psi[1:-3] + pb_angles[:, 3] = phi[2:-2] + pb_angles[:, 4] = psi[2:-2] + pb_angles[:, 5] = phi[3:-1] + pb_angles[:, 6] = psi[3:-1] + pb_angles[:, 7] = phi[4:] pb_angles = np.rad2deg(pb_angles) # Angle RMSD of all reference angles with all actual angles rmsda = np.sum( - ( - ( - ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180 - ) % 360 - 180 - )**2, - axis=-1 + ((ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180) % 360 - 180) ** 2, + axis=-1, ) # Chose PB, where the RMSDA to the reference angle is lowest # Due to the definition of Biotite symbol codes @@ -139,7 +133,7 @@ matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) alignment, order, _, _ = align.align_multiple( - pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False + pb_seqs, matrix, gap_penalty=(-500, -100), terminal_penalty=False ) # Visualize the alignment @@ -150,10 +144,15 @@ ax = fig.add_subplot(111) # The color scheme was generated with the 'Gecos' software graphics.plot_alignment_type_based( - ax, alignment, labels=labels, symbols_per_line=45, spacing=2, - show_numbers=True, color_scheme="flower" + ax, + alignment, + labels=labels, + symbols_per_line=45, + spacing=2, + show_numbers=True, + color_scheme="flower", ) # Organism names in italic -ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"}) +ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle": "italic"}) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/peptide_assembly.py b/doc/examples/scripts/structure/protein/peptide_assembly.py index 4c07451ad..de9f24704 100644 --- a/doc/examples/scripts/structure/protein/peptide_assembly.py +++ b/doc/examples/scripts/structure/protein/peptide_assembly.py @@ -21,19 +21,18 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -from tempfile import NamedTemporaryFile import itertools +from tempfile import NamedTemporaryFile import numpy as np from numpy.linalg import norm import biotite.sequence as seq import biotite.structure as struc -import biotite.structure.io as strucio import biotite.structure.info as info +import biotite.structure.io as strucio - -C_N_LENGTH = 1.34 -N_CA_LENGTH = 1.46 -CA_C_LENGTH = 1.54 +C_N_LENGTH = 1.34 +N_CA_LENGTH = 1.46 +CA_C_LENGTH = 1.54 CA_C_N_ANGLE = 114 C_N_CA_ANGLE = 123 @@ -41,96 +40,15 @@ # Reference peptide bond atom coordinates taken from 1l2y: # CA, C, N, O, H -peptide_coord = np.array([ - [-8.608, 3.135, -1.618], - [-7.117, 2.964, -1.897], - [-6.379, 4.031, -2.228], - [-6.634, 1.849, -1.758], - [-6.821, 4.923, -2.394] -]) - - -def create_raw_backbone_coord(number_of_res): - """ - Create coordinates for straight peptide chain in z-plane. - The peptide bonds are in trans configuration. - """ - coord = np.zeros((number_of_res * 3, 3)) - for i, angle, angle_direction, length in zip( - range(len(coord)), - itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]), - itertools.cycle([1, -1]), - itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]) - ): - if i == 0: - coord[i] = [0, 0, 0] - elif i == 1: - coord[i] = [0, length, 0] - else: - # Rotate about z-axis -> backbone lies in z-plane - rot_axis = [0, 0, angle_direction] - # Calculate the coordinates of a new atoms by rotating the previous - # bond by the given angle - new_coord = struc.rotate_about_axis( - coord[i-2], - axis = rot_axis, - angle = np.deg2rad(angle), - support = coord[i-1] - ) - # Scale bond to correct bond length - bond_vector = new_coord - coord[i-1] - coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector) - return coord - - -def append_residue(chain, residue): - """ - Append a residue to an existing chain. - Modify annotation arrays and remove atoms as necessary. - The atom coordinates are not altered. - """ - if chain.array_length() == 0: - # Chain is empty - residue.res_id[:] = 1 - return residue - - last_res_id = chain.res_id[-1] - - # Remove atoms removed by peptide bond - chain = chain[ - (chain.res_id != last_res_id) | - ~np.isin( - chain.atom_name, - ["OXT", "HXT"] - ) - ] - residue = residue[ - ~np.isin( - residue.atom_name, - ["H2", "H3"] - ) +PEPTIDE_COORD = np.array( + [ + [-8.608, 3.135, -1.618], + [-7.117, 2.964, -1.897], + [-6.379, 4.031, -2.228], + [-6.634, 1.849, -1.758], + [-6.821, 4.923, -2.394], ] - - # Increment residue ID for attached residue - residue.res_id[:] = last_res_id + 1 - -C_N_LENGTH = 1.34 -N_CA_LENGTH = 1.46 -CA_C_LENGTH = 1.54 - -CA_C_N_ANGLE = 114 -C_N_CA_ANGLE = 123 -N_CA_C_ANGLE = 110 - -# Reference peptide bond atom coordinates taken from 1l2y: -# CA, C, N, O, H -peptide_coord = np.array([ - [-8.608, 3.135, -1.618], - [-7.117, 2.964, -1.897], - [-6.379, 4.031, -2.228], - [-6.634, 1.849, -1.758], - [-6.821, 4.923, -2.394] -]) +) def create_raw_backbone_coord(number_of_res): @@ -143,7 +61,7 @@ def create_raw_backbone_coord(number_of_res): range(len(coord)), itertools.cycle([CA_C_N_ANGLE, C_N_CA_ANGLE, N_CA_C_ANGLE]), itertools.cycle([1, -1]), - itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]) + itertools.cycle([C_N_LENGTH, N_CA_LENGTH, CA_C_LENGTH]), ): if i == 0: coord[i] = [0, 0, 0] @@ -155,14 +73,14 @@ def create_raw_backbone_coord(number_of_res): # Calculate the coordinates of a new atoms by rotating the # previous bond by the given angle new_coord = struc.rotate_about_axis( - coord[i-2], - axis = rot_axis, - angle = np.deg2rad(angle), - support = coord[i-1] + coord[i - 2], + axis=rot_axis, + angle=np.deg2rad(angle), + support=coord[i - 1], ) # Scale bond to correct bond length - bond_vector = new_coord - coord[i-1] - coord[i] = coord[i-1] + bond_vector * length / norm(bond_vector) + bond_vector = new_coord - coord[i - 1] + coord[i] = coord[i - 1] + bond_vector * length / norm(bond_vector) return coord @@ -181,18 +99,9 @@ def append_residue(chain, residue): # Remove atoms removed by peptide bond chain = chain[ - (chain.res_id != last_res_id) | - ~np.isin( - chain.atom_name, - ["OXT", "HXT"] - ) - ] - residue = residue[ - ~np.isin( - residue.atom_name, - ["H2", "H3"] - ) + (chain.res_id != last_res_id) | ~np.isin(chain.atom_name, ["OXT", "HXT"]) ] + residue = residue[~np.isin(residue.atom_name, ["H2", "H3"])] # Increment residue ID for attached residue residue.res_id[:] = last_res_id + 1 @@ -203,9 +112,7 @@ def append_residue(chain, residue): # Add peptide bond index_prev_c = np.where(chain.atom_name == "C")[0][-2] index_curr_n = np.where(chain.atom_name == "N")[0][-1] - chain.bonds.add_bond( - index_prev_c, index_curr_n, struc.BondType.SINGLE - ) + chain.bonds.add_bond(index_prev_c, index_curr_n, struc.BondType.SINGLE) return chain @@ -213,15 +120,14 @@ def assemble_peptide(sequence): res_names = [seq.ProteinSequence.convert_letter_1to3(r) for r in sequence] backbone_coord = create_raw_backbone_coord(len(sequence)) - chain = struc.AtomArray(0) for i, res_name in enumerate(res_names): residue = info.residue(res_name) # Superimpose residue to corresponding backbone coordinates _, transformation = struc.superimpose( - backbone_coord[3*i : 3*i + 3], - residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])] + backbone_coord[3 * i : 3 * i + 3], + residue.coord[np.isin(residue.atom_name, ["N", "CA", "C"])], ) residue = transformation.apply(residue) @@ -238,10 +144,9 @@ def assemble_peptide(sequence): for atom_name in ["N", "H"] ] _, transformation = struc.superimpose( - chain.coord[[ca_i, c_i, n_i]], - peptide_coord[:3] + chain.coord[[ca_i, c_i, n_i]], PEPTIDE_COORD[:3] ) - chain.coord[[o_i, h_i]] = transformation.apply(peptide_coord[3:]) + chain.coord[[o_i, h_i]] = transformation.apply(PEPTIDE_COORD[3:]) return chain diff --git a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py index 9ef7d7b2f..7afdc6a06 100644 --- a/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py +++ b/doc/examples/scripts/structure/protein/peptide_assembly_pymol.py @@ -1,19 +1,14 @@ -import numpy as np +import ammolite from matplotlib.colors import to_rgb import biotite import biotite.structure as struc -import ammolite - PNG_SIZE = (1000, 400) # Define colors for color_name, color_value in biotite.colors.items(): - ammolite.cmd.set_color( - "biotite_" + color_name, - to_rgb(color_value) - ) + ammolite.cmd.set_color("biotite_" + color_name, to_rgb(color_value)) # Convert to PyMOL chain.bonds = struc.connect_via_distances(chain) @@ -21,14 +16,8 @@ # Visualize as stick model pymol_obj.show_as("sticks") -pymol_obj.color( - "biotite_lightgreen", - (chain.res_id % 2 == 0) & (chain.element == "C") -) -pymol_obj.color( - "biotite_dimgreen", - (chain.res_id % 2 != 0) & (chain.element == "C") -) +pymol_obj.color("biotite_lightgreen", (chain.res_id % 2 == 0) & (chain.element == "C")) +pymol_obj.color("biotite_dimgreen", (chain.res_id % 2 != 0) & (chain.element == "C")) ammolite.cmd.set("depth_cue", 0) # Adjust camera @@ -37,4 +26,4 @@ # Save image ammolite.cmd.ray(*PNG_SIZE) -ammolite.cmd.png(__image_destination__) \ No newline at end of file +ammolite.cmd.png(__image_destination__) diff --git a/doc/examples/scripts/structure/protein/ramachandran.py b/doc/examples/scripts/structure/protein/ramachandran.py index 021349d36..806ac283f 100644 --- a/doc/examples/scripts/structure/protein/ramachandran.py +++ b/doc/examples/scripts/structure/protein/ramachandran.py @@ -12,34 +12,29 @@ # License: BSD 3 clause from tempfile import gettempdir -import biotite.structure as struc -import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb import matplotlib.pyplot as plt import numpy as np -from matplotlib import colors -import scipy.stats as sts +import biotite.database.rcsb as rcsb +import biotite.structure as struc +import biotite.structure.io as strucio # Download and parse file file = rcsb.fetch("3vkh", "cif", gettempdir()) atom_array = strucio.load_structure(file) # Calculate backbone dihedral angles # from one of the two identical chains in the asymmetric unit -phi, psi, omega = struc.dihedral_backbone( - atom_array[atom_array.chain_id == "A"] -) +phi, psi, omega = struc.dihedral_backbone(atom_array[atom_array.chain_id == "A"]) # Conversion from radians into degree -phi *= 180/np.pi -psi *= 180/np.pi +phi *= 180 / np.pi +psi *= 180 / np.pi # Remove invalid values (NaN) at first and last position -phi= phi[1:-1] -psi= psi[1:-1] +phi = phi[1:-1] +psi = psi[1:-1] # Plot density figure = plt.figure() ax = figure.add_subplot(111) -h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200), - cmap="RdYlGn_r", cmin=1) +h, xed, yed, image = ax.hist2d(phi, psi, bins=(200, 200), cmap="RdYlGn_r", cmin=1) cbar = figure.colorbar(image, orientation="vertical") cbar.set_label("Count") ax.set_aspect("equal") @@ -49,4 +44,4 @@ ax.set_ylabel(r"$\psi$") ax.set_title("Ramachandran plot of dynein motor domain") figure.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/residue_chirality.py b/doc/examples/scripts/structure/protein/residue_chirality.py index 9d6d94061..92dd15b87 100644 --- a/doc/examples/scripts/structure/protein/residue_chirality.py +++ b/doc/examples/scripts/structure/protein/residue_chirality.py @@ -18,9 +18,9 @@ from tempfile import gettempdir import numpy as np +import biotite.database.rcsb as rcsb import biotite.structure as struc import biotite.structure.io as strucio -import biotite.database.rcsb as rcsb def get_enantiomer(n, ca, c, cb): @@ -29,16 +29,15 @@ def get_enantiomer(n, ca, c, cb): # the enantiomer: # L = 1 # D = -1 - n = np.cross(ca-n, ca-c) + n = np.cross(ca - n, ca - c) sign = np.sign(np.dot(cb - ca, n)) return sign + def analyze_chirality(array): # Filter backbone + CB array = array[struc.filter_amino_acids(array)] - array = array[ - (array.atom_name == "CB") | (struc.filter_peptide_backbone(array)) - ] + array = array[(array.atom_name == "CB") | (struc.filter_peptide_backbone(array))] # Iterate over each residue ids, names = struc.get_residues(array) enantiomers = np.zeros(len(ids), dtype=int) @@ -48,10 +47,10 @@ def analyze_chirality(array): # Glyine -> no chirality enantiomers[i] = 0 else: - enantiomers[i] = get_enantiomer(coord[0], coord[1], - coord[2], coord[3]) + enantiomers[i] = get_enantiomer(coord[0], coord[1], coord[2], coord[3]) return enantiomers + # Fetch and parse structure file file = rcsb.fetch("1l2y", "bcif", gettempdir()) stack = strucio.load_structure(file) @@ -62,5 +61,5 @@ def analyze_chirality(array): # Reflected structures have opposite enantiomers # Test via reflection at x-y-plane, z -> -z array_reflect = array.copy() -array_reflect.coord[:,2] *= -1 -print("1l2y (reflected)", analyze_chirality(array_reflect)) \ No newline at end of file +array_reflect.coord[:, 2] *= -1 +print("1l2y (reflected)", analyze_chirality(array_reflect)) diff --git a/doc/examples/scripts/structure/protein/sheet_arrangement.py b/doc/examples/scripts/structure/protein/sheet_arrangement.py index aea9f4e60..930c6a0e3 100644 --- a/doc/examples/scripts/structure/protein/sheet_arrangement.py +++ b/doc/examples/scripts/structure/protein/sheet_arrangement.py @@ -17,42 +17,45 @@ # Code source: Patrick Kunzmann # License: BSD 3 clause -import numpy as np -import networkx as nx import matplotlib.pyplot as plt +import networkx as nx +import numpy as np from matplotlib.patches import FancyArrow import biotite -import biotite.structure.io.pdbx as pdbx import biotite.database.rcsb as rcsb - +import biotite.structure.io.pdbx as pdbx ##### OPTIONS ##### PDB_ID = "3AKO" SHEETS = ["A"] -FIG_SIZE = (8.0, 4.0) # Figure size in inches -Y_LIMIT = 2.0 # Vertical plot limits -SHEET_DISTANCE = 3.0 # Separation of strands in different sheets -ARROW_TAIL_WITH = 0.4 # Width of the arrow tails -ARROW_HEAD_WITH = 0.7 # Width of the arrow heads -ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads -ARROW_LINE_WIDTH = 1 # Width of the arrow edges -ARROW_COLORS = [ # Each chain is colored differently +FIG_SIZE = (8.0, 4.0) # Figure size in inches +Y_LIMIT = 2.0 # Vertical plot limits +SHEET_DISTANCE = 3.0 # Separation of strands in different sheets +ARROW_TAIL_WITH = 0.4 # Width of the arrow tails +ARROW_HEAD_WITH = 0.7 # Width of the arrow heads +ARROW_HEAD_LENGTH = 0.25 # Length of the arrow heads +ARROW_LINE_WIDTH = 1 # Width of the arrow edges +ARROW_COLORS = [ # Each chain is colored differently biotite.colors["darkgreen"], biotite.colors["dimorange"], biotite.colors["lightgreen"], biotite.colors["brightorange"], ] -CONNECTION_COLOR = "black" # Color of the connection lines -CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines -CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines -CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines -RES_ID_HEIGHT = -0.2 # The vertical distance of the residue ID labels from the arrow ends -RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels -RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels -ADAPTIVE_ARROW_LENGTHS = True # If true, the arrow length is proportional to the number of its residues -SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot -SHEET_NAME_FONT_SIZE = 14 # The font size of the sheet labels +CONNECTION_COLOR = "black" # Color of the connection lines +CONNECTION_LINE_WIDTH = 1.5 # Width of the connection lines +CONNECTION_HEIGHT = 0.1 # Minimum height of the connection lines +CONNECTION_SEPARATION = 0.1 # Minimum vertical distance between the connection lines +RES_ID_HEIGHT = ( + -0.2 +) # The vertical distance of the residue ID labels from the arrow ends +RES_ID_FONT_SIZE = 8 # The font size of the residue ID labels +RES_ID_FONT_WEIGHT = "bold" # The font weight of the residue ID labels +ADAPTIVE_ARROW_LENGTHS = ( + True # If true, the arrow length is proportional to the number of its residues +) +SHOW_SHEET_NAMES = False # If true, the sheets are labeled below the plot +SHEET_NAME_FONT_SIZE = 14 # The font size of the sheet labels ##### SNOITPO ##### ######################################################################## @@ -73,19 +76,20 @@ if SHEETS is None: sele = np.full(sheet_order.row_count, True) else: - sele = np.array([ - sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array() - ]) + sele = np.array([sheet in SHEETS for sheet in sheet_order["sheet_id"].as_array()]) sheet_ids = sheet_order["sheet_id"].as_array()[sele] is_parallel_list = sheet_order["sense"].as_array()[sele] == "parallel" -adjacent_strands = np.array([ - (strand_i, strand_j) for strand_i, strand_j in zip( - sheet_order["range_id_1"].as_array()[sele], - sheet_order["range_id_2"].as_array()[sele] - ) -]) +adjacent_strands = np.array( + [ + (strand_i, strand_j) + for strand_i, strand_j in zip( + sheet_order["range_id_1"].as_array()[sele], + sheet_order["range_id_2"].as_array()[sele], + ) + ] +) print("Adjacent strands (sheet ID, strand ID):") for sheet_id, (strand_i, strand_j) in zip(sheet_ids, adjacent_strands): @@ -105,9 +109,7 @@ sheet_range = bcif_file.block["struct_sheet_range"] # Again, create a boolean mask that covers the selected sheets -sele = np.array([ - sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array() -]) +sele = np.array([sheet in sheet_ids for sheet in sheet_range["sheet_id"].as_array()]) strand_chain_ids = sheet_range["beg_auth_asym_id"].as_array()[sele] strand_res_id_begs = sheet_range["beg_auth_seq_id"].as_array(int)[sele] strand_res_id_ends = sheet_range["end_auth_seq_id"].as_array(int)[sele] @@ -127,19 +129,21 @@ # i.e. entries with the same chain ID and residue ID # Duplicate entries appear e.g. in beta-barrel structure files # Draw one of each duplicate as orphan -> no connections -non_duplicate_mask = (np.diff(strand_res_id_begs[order], prepend=[-1]) != 0) +non_duplicate_mask = np.diff(strand_res_id_begs[order], prepend=[-1]) != 0 connections = [] -non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask] +non_duplicate_indices = np.arange(len(sorted_strand_ids))[non_duplicate_mask] for i in range(len(non_duplicate_indices) - 1): current_i = non_duplicate_indices[i] - next_i = non_duplicate_indices[i+1] + next_i = non_duplicate_indices[i + 1] if sorted_chain_ids[current_i] != sorted_chain_ids[next_i]: # No connection between separate chains continue - connections.append(( - (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]), - (sorted_sheet_ids[next_i], sorted_strand_ids[next_i] ) - )) + connections.append( + ( + (sorted_sheet_ids[current_i], sorted_strand_ids[current_i]), + (sorted_sheet_ids[next_i], sorted_strand_ids[next_i]), + ) + ) print("Connected strands (sheet ID, strand ID):") for strand_i, strand_j in connections: @@ -148,18 +152,17 @@ # Save the start and end residue IDs for each strand for labeling ranges = { (sheet_id, strand_id): (begin, end) - for sheet_id, strand_id, begin, end - in zip( - sorted_sheet_ids, sorted_strand_ids, - sorted_res_id_begs, sorted_res_id_ends + for sheet_id, strand_id, begin, end in zip( + sorted_sheet_ids, sorted_strand_ids, sorted_res_id_begs, sorted_res_id_ends ) } # Save the chains ID for each strand for coloring chain_ids = { (sheet_id, strand_id): chain_id - for sheet_id, strand_id, chain_id - in zip(sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids) + for sheet_id, strand_id, chain_id in zip( + sorted_sheet_ids, sorted_strand_ids, sorted_chain_ids + ) } unique_chain_ids = np.unique(sorted_chain_ids) @@ -176,14 +179,15 @@ sheet_graphs = {} for sheet_id in np.unique(sheet_ids): # Select only strands from the current sheet - sheet_mask = (sheet_ids == sheet_id) - sheet_graphs[sheet_id] = nx.Graph([ - (strand_i, strand_j, {"is_parallel": is_parallel}) - for (strand_i, strand_j), is_parallel in zip( - adjacent_strands[sheet_mask], - is_parallel_list[sheet_mask] - ) - ]) + sheet_mask = sheet_ids == sheet_id + sheet_graphs[sheet_id] = nx.Graph( + [ + (strand_i, strand_j, {"is_parallel": is_parallel}) + for (strand_i, strand_j), is_parallel in zip( + adjacent_strands[sheet_mask], is_parallel_list[sheet_mask] + ) + ] + ) ######################################################################## # Another missing information is the direction of the plotted arrows, @@ -199,7 +203,7 @@ # The calculated arrow direction is stored as node attribute. for graph in sheet_graphs.values(): - initial_strand = adjacent_strands[0,0] + initial_strand = adjacent_strands[0, 0] graph.nodes[initial_strand]["is_upwards"] = True for strand in graph.nodes: if strand == initial_strand: @@ -212,21 +216,15 @@ # yet determined continue is_parallel = graph.edges[(strand, adj_strand)]["is_parallel"] - this_strand_is_upwards.append( - is_upwards ^ ~is_parallel - ) + this_strand_is_upwards.append(is_upwards ^ ~is_parallel) if len(this_strand_is_upwards) == 0: - raise ValueError( - "Cannot determine arrow direction from adjacent strands" - ) + raise ValueError("Cannot determine arrow direction from adjacent strands") elif all(this_strand_is_upwards): graph.nodes[strand]["is_upwards"] = True elif not any(this_strand_is_upwards): graph.nodes[strand]["is_upwards"] = False else: - raise ValueError( - "Conflicting arrow directions from adjacent strands" - ) + raise ValueError("Conflicting arrow directions from adjacent strands") ######################################################################## # No we have got all positioning information we need to start plotting. @@ -234,7 +232,7 @@ fig, ax = plt.subplots(figsize=FIG_SIZE) ### Plot arrows -MAX_ARROW_LENGTH = 2 # from y=-1 to y=1 +MAX_ARROW_LENGTH = 2 # from y=-1 to y=1 arrow_length_per_seq_length = MAX_ARROW_LENGTH / np.max( [end - beg + 1 for beg, end in ranges.values()] ) @@ -280,14 +278,17 @@ dy = -arrow_length ax.add_patch( FancyArrow( - x=pos, y=y, dx=0, dy=dy, + x=pos, + y=y, + dx=0, + dy=dy, length_includes_head=True, - width = ARROW_TAIL_WITH, - head_width = ARROW_HEAD_WITH, - head_length = ARROW_HEAD_LENGTH, - facecolor = ARROW_COLORS[color_index % len(ARROW_COLORS)], - edgecolor = CONNECTION_COLOR, - linewidth = ARROW_LINE_WIDTH, + width=ARROW_TAIL_WITH, + head_width=ARROW_HEAD_WITH, + head_length=ARROW_HEAD_LENGTH, + facecolor=ARROW_COLORS[color_index % len(ARROW_COLORS)], + edgecolor=CONNECTION_COLOR, + linewidth=ARROW_LINE_WIDTH, ) ) # Start and end coordinates of the respective arrow @@ -299,10 +300,12 @@ # Plot the short connections at low height # to decrease line intersections # -> sort connections by length of connection -order = np.argsort([ - np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0]) - for strand_i, strand_j in connections -]) +order = np.argsort( + [ + np.abs(coord_dict[strand_i][0][0] - coord_dict[strand_j][0][0]) + for strand_i, strand_j in connections + ] +) connections = [connections[i] for i in order] for i, (strand_i, strand_j) in enumerate(connections): horizontal_line_height = 1 + CONNECTION_HEIGHT + i * CONNECTION_SEPARATION @@ -311,17 +314,12 @@ if np.sign(coord_i_end[1]) == np.sign(coord_j_beg[1]): # Start and end are on the same side of the arrows - x = ( - coord_i_end[0], - coord_i_end[0], - coord_j_beg[0], - coord_j_beg[0] - ) + x = (coord_i_end[0], coord_i_end[0], coord_j_beg[0], coord_j_beg[0]) y = ( coord_i_end[1], np.sign(coord_i_end[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, - coord_j_beg[1] + coord_j_beg[1], ) else: # Start and end are on different sides @@ -332,7 +330,7 @@ coord_i_end[0] + offset, coord_i_end[0] + offset, coord_j_beg[0], - coord_j_beg[0] + coord_j_beg[0], ) y = ( coord_i_end[1], @@ -340,14 +338,15 @@ np.sign(coord_i_end[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, np.sign(coord_j_beg[1]) * horizontal_line_height, - coord_j_beg[1] + coord_j_beg[1], ) ax.plot( - x, y, - color = CONNECTION_COLOR, - linewidth = CONNECTION_LINE_WIDTH, + x, + y, + color=CONNECTION_COLOR, + linewidth=CONNECTION_LINE_WIDTH, # Avoid intersection of the line's end with the arrow - solid_capstyle = "butt" + solid_capstyle="butt", ) ### Plot residue ID labels @@ -358,16 +357,16 @@ coord[0], np.sign(coord[1]) * (np.abs(coord[1]) + RES_ID_HEIGHT), str(res_id), - ha="center", va="center", - fontsize=RES_ID_FONT_SIZE, weight=RES_ID_FONT_WEIGHT + ha="center", + va="center", + fontsize=RES_ID_FONT_SIZE, + weight=RES_ID_FONT_WEIGHT, ) ### Plot sheet names as x-axis ticks if SHOW_SHEET_NAMES: tick_pos = [ - np.mean([ - coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id - ]) + np.mean([coord_dict[key][0][0] for key in coord_dict if key[0] == sheet_id]) for sheet_id in sheet_ids ] ax.set_xticks(tick_pos) @@ -375,8 +374,11 @@ ax.set_frame_on(False) ax.yaxis.set_visible(False) ax.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=True, labeltop=False, - labelsize=SHEET_NAME_FONT_SIZE + bottom=False, + top=False, + labelbottom=True, + labeltop=False, + labelsize=SHEET_NAME_FONT_SIZE, ) else: ax.axis("off") @@ -385,4 +387,4 @@ ax.set_xlim(-1, current_position - SHEET_DISTANCE + 1) ax.set_ylim(-Y_LIMIT, Y_LIMIT) fig.tight_layout() -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/examples/scripts/structure/protein/transketolase_sse.py b/doc/examples/scripts/structure/protein/transketolase_sse.py index 78f3ba546..7697d62d8 100644 --- a/doc/examples/scripts/structure/protein/transketolase_sse.py +++ b/doc/examples/scripts/structure/protein/transketolase_sse.py @@ -14,25 +14,24 @@ # License: BSD 3 clause from tempfile import gettempdir -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.patches import Rectangle import biotite -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx +import biotite.application.dssp as dssp +import biotite.database.entrez as entrez +import biotite.database.rcsb as rcsb import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb -import biotite.database.rcsb as rcsb -import biotite.database.entrez as entrez -import biotite.application.dssp as dssp - +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx # Create 'FeaturePlotter' subclasses # for drawing the scondary structure features -class HelixPlotter(graphics.FeaturePlotter): +class HelixPlotter(graphics.FeaturePlotter): def __init__(self): pass @@ -48,12 +47,12 @@ def matches(self, feature): def draw(self, axes, feature, bbox, loc, style_param): # Approx. 1 turn per 3.6 residues to resemble natural helix n_turns = np.ceil((loc.last - loc.first + 1) / 3.6) - x_val = np.linspace(0, n_turns * 2*np.pi, 100) + x_val = np.linspace(0, n_turns * 2 * np.pi, 100) # Curve ranges from 0.3 to 0.7 - y_val = (-0.4*np.sin(x_val) + 1) / 2 + y_val = (-0.4 * np.sin(x_val) + 1) / 2 # Transform values for correct location in feature map - x_val *= bbox.width / (n_turns * 2*np.pi) + x_val *= bbox.width / (n_turns * 2 * np.pi) x_val += bbox.x0 y_val *= bbox.height y_val += bbox.y0 @@ -63,18 +62,14 @@ def draw(self, axes, feature, bbox, loc, style_param): bbox.p0, bbox.width, bbox.height, color="white", linewidth=0 ) axes.add_patch(background) - axes.plot( - x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"] - ) + axes.plot(x_val, y_val, linewidth=2, color=biotite.colors["dimgreen"]) class SheetPlotter(graphics.FeaturePlotter): - def __init__(self, head_width=0.8, tail_width=0.5): self._head_width = head_width self._tail_width = tail_width - def matches(self, feature): if feature.key == "SecStr": if "sec_str_type" in feature.qual: @@ -84,39 +79,52 @@ def matches(self, feature): def draw(self, axes, feature, bbox, loc, style_param): x = bbox.x0 - y = bbox.y0 + bbox.height/2 + y = bbox.y0 + bbox.height / 2 dx = bbox.width dy = 0 - if loc.defect & seq.Location.Defect.MISS_RIGHT: + if loc.defect & seq.Location.Defect.MISS_RIGHT: # If the feature extends into the prevoius or next line # do not draw an arrow head draw_head = False else: draw_head = True - axes.add_patch(biotite.AdaptiveFancyArrow( - x, y, dx, dy, - self._tail_width*bbox.height, self._head_width*bbox.height, - # Create head with 90 degrees tip - # -> head width/length ratio = 1/2 - head_ratio=0.5, draw_head=draw_head, - color=biotite.colors["orange"], linewidth=0 - )) + axes.add_patch( + biotite.AdaptiveFancyArrow( + x, + y, + dx, + dy, + self._tail_width * bbox.height, + self._head_width * bbox.height, + # Create head with 90 degrees tip + # -> head width/length ratio = 1/2 + head_ratio=0.5, + draw_head=draw_head, + color=biotite.colors["orange"], + linewidth=0, + ) + ) # Test our drawing functions with example annotation -annotation = seq.Annotation([ - seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type" : "helix"}), - seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type" : "sheet"}), -]) +annotation = seq.Annotation( + [ + seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type": "helix"}), + seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type": "sheet"}), + ] +) fig = plt.figure(figsize=(8.0, 0.8)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, multi_line=False, loc_range=(1,100), + ax, + annotation, + multi_line=False, + loc_range=(1, 100), # Register our drawing functions - feature_plotters=[HelixPlotter(), SheetPlotter()] + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() @@ -138,11 +146,14 @@ def draw(self, axes, feature, bbox, loc, style_param): fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, symbols_per_line=150, - show_numbers=True, show_line_position=True, + ax, + annotation, + symbols_per_line=150, + show_numbers=True, + show_line_position=True, # 'loc_range' takes exclusive stop -> length+1 is required - loc_range=(1,length+1), - feature_plotters=[HelixPlotter(), SheetPlotter()] + loc_range=(1, length + 1), + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() @@ -152,14 +163,17 @@ def draw(self, axes, feature, bbox, loc, style_param): # Converter for the DSSP secondary structure elements # to the classical ones -dssp_to_abc = {"I" : "c", - "S" : "c", - "H" : "a", - "E" : "b", - "G" : "c", - "B" : "b", - "T" : "c", - "C" : "c"} +dssp_to_abc = { + "I": "c", + "S": "c", + "H": "a", + "E": "b", + "G": "c", + "B": "b", + "T": "c", + "C": "c", +} + def visualize_secondary_structure(sse, first_id): """ @@ -176,7 +190,7 @@ def _add_sec_str(annotation, first, last, str_type): # coil return feature = seq.Feature( - "SecStr", [seq.Location(first, last)], {"sec_str_type" : str_type} + "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type} ) annotation.add_feature(feature) @@ -190,25 +204,29 @@ def _add_sec_str(annotation, first, last, str_type): curr_start = i curr_sse = sse[i] else: - if sse[i] != sse[i-1]: + if sse[i] != sse[i - 1]: _add_sec_str( - annotation, curr_start+first_id, i-1+first_id, curr_sse + annotation, curr_start + first_id, i - 1 + first_id, curr_sse ) curr_start = i curr_sse = sse[i] # Add last secondary structure element to annotation - _add_sec_str(annotation, curr_start+first_id, i-1+first_id, curr_sse) + _add_sec_str(annotation, curr_start + first_id, i - 1 + first_id, curr_sse) fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( - ax, annotation, symbols_per_line=150, - loc_range=(first_id, first_id+len(sse)), - show_numbers=True, show_line_position=True, - feature_plotters=[HelixPlotter(), SheetPlotter()] + ax, + annotation, + symbols_per_line=150, + loc_range=(first_id, first_id + len(sse)), + show_numbers=True, + show_line_position=True, + feature_plotters=[HelixPlotter(), SheetPlotter()], ) fig.tight_layout() + # Fetch and load structure file_name = rcsb.fetch("1QGD", "bcif", gettempdir()) pdbx_file = pdbx.BinaryCIFFile.read(file_name) @@ -227,7 +245,8 @@ def _add_sec_str(annotation, first, last, str_type): # Last but not least we calculate the secondary structure using # *Biotite*'s built-in method, based on the P-SEA algorithm. -sse = struc.annotate_sse(array, chain_id="A") +array = array[array.chain_id == "A"] +sse = struc.annotate_sse(array) visualize_secondary_structure(sse, tk_mono.res_id[0]) -plt.show() \ No newline at end of file +plt.show() diff --git a/doc/index.rst b/doc/index.rst index 535c69c09..9a09fbc89 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -32,6 +32,10 @@ display: block; } } + /* Limit the size of the sponsor logos */ + .sponsors { + width: 25%; + } @@ -203,7 +207,7 @@ Biotite documentation .. raw:: html - + Interested in contributing to the project? @@ -231,6 +235,20 @@ Biotite documentation Kunzmann2023 +.. raw:: html + +

Sponsors

+ +.. grid:: + :gutter: 5 + :padding: 5 + :class-container: sponsors + + .. grid-item-card:: + :img-background: /static/assets/sponsors/vantai_logo.jpg + :link: https://www.vant.ai/ + + .. toctree:: :maxdepth: 1 :hidden: diff --git a/doc/install.rst b/doc/install.rst index 1b5d7ae1e..2dcb83028 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -20,7 +20,6 @@ dependencies if not already present. Some functionalities require extra packages: - - ``mdtraj`` - Required for trajectory file I/O operations. - ``matplotlib`` - Required for plotting purposes. diff --git a/doc/key.py b/doc/key.py index b517f37fb..b7a2b4334 100644 --- a/doc/key.py +++ b/doc/key.py @@ -8,4 +8,4 @@ def set_ncbi_api_key_from_env(*args, **kwargs): ncbi_api_key = os.environ.get("NCBI_API_KEY") if ncbi_api_key is not None and ncbi_api_key != "": - entrez.set_api_key(ncbi_api_key) \ No newline at end of file + entrez.set_api_key(ncbi_api_key) diff --git a/doc/scraper.py b/doc/scraper.py index ac60188e6..c9fd629ce 100644 --- a/doc/scraper.py +++ b/doc/scraper.py @@ -1,12 +1,11 @@ -import shutil import copy -import sys import os -from os.path import splitext, join, dirname, isfile -from sphinx_gallery.scrapers import figure_rst -from sphinx_gallery.py_source_parser import extract_file_config +import shutil +import sys +from os.path import dirname, isfile, join, splitext from sphinx.errors import ExtensionError - +from sphinx_gallery.py_source_parser import extract_file_config +from sphinx_gallery.scrapers import figure_rst STATIC_IMAGE_COMMAND = "static_image" PYMOL_IMAGE_COMMAND = "ammolite_script" @@ -19,7 +18,7 @@ def static_image_scraper(block, block_vars, gallery_conf): # Search for `sphinx_gallery_static_image` commands block_conf = extract_file_config(code) if STATIC_IMAGE_COMMAND not in block_conf: - return figure_rst([], gallery_conf['src_dir']) + return figure_rst([], gallery_conf["src_dir"]) image_sources = [ join(script_dir, image_name.strip()) @@ -29,7 +28,7 @@ def static_image_scraper(block, block_vars, gallery_conf): # Copy the images into the 'gallery' directory under a canonical # sphinx-gallery name image_destinations = [] - image_path_iterator = block_vars['image_path_iterator'] + image_path_iterator = block_vars["image_path_iterator"] for image in image_sources: suffix = splitext(image)[1] image_destination = image_path_iterator.next() @@ -40,7 +39,7 @@ def static_image_scraper(block, block_vars, gallery_conf): shutil.copy(image, image_destination) # Generate rST for detected image files - return figure_rst(image_destinations, gallery_conf['src_dir']) + return figure_rst(image_destinations, gallery_conf["src_dir"]) def pymol_scraper(block, block_vars, gallery_conf): @@ -48,7 +47,7 @@ def pymol_scraper(block, block_vars, gallery_conf): block_conf = extract_file_config(code) # Search for a `sphinx_gallery_ammolite_script` command if PYMOL_IMAGE_COMMAND not in block_conf: - return figure_rst([], gallery_conf['src_dir']) + return figure_rst([], gallery_conf["src_dir"]) script_dir = dirname(block_vars["src_file"]) pymol_script_path = join(script_dir, block_conf[PYMOL_IMAGE_COMMAND]) @@ -56,7 +55,7 @@ def pymol_scraper(block, block_vars, gallery_conf): # the example script # -> the image will be included in version control # -> Rendering with PyMOL is not necessary for building the docs - pymol_image_path = splitext(block_vars["src_file"])[0] + ".png" + pymol_image_path = splitext(block_vars["src_file"])[0] + ".png" if not isfile(pymol_script_path): raise ExtensionError( f"'{block_vars['src_file']}' has no corresponding " @@ -64,8 +63,8 @@ def pymol_scraper(block, block_vars, gallery_conf): ) try: - import pymol - import ammolite + import ammolite # noqa: F401 + import pymol # noqa: F401 except ImportError: # If Ammolite is not installed, fall back to the image file, # if already existing @@ -82,7 +81,7 @@ def pymol_scraper(block, block_vars, gallery_conf): # to STDOUT or STDERR # -> Save original STDOUT/STDERR and point them # temporarily to DEVNULL - dev_null = open(os.devnull, 'w') + dev_null = open(os.devnull, "w") orig_stdout = sys.stdout orig_stderr = sys.stderr sys.stdout = dev_null @@ -100,13 +99,12 @@ def pymol_scraper(block, block_vars, gallery_conf): dev_null.close() if not isfile(pymol_image_path): raise ExtensionError( - "PyMOL script did not create an image " - "(at expected location)" + "PyMOL script did not create an image " "(at expected location)" ) # Copy the images into the 'gallery' directory under a canonical # sphinx-gallery name - image_path_iterator = block_vars['image_path_iterator'] + image_path_iterator = block_vars["image_path_iterator"] image_destination = image_path_iterator.next() shutil.copy(pymol_image_path, image_destination) - return figure_rst([image_destination], gallery_conf['src_dir']) + return figure_rst([image_destination], gallery_conf["src_dir"]) diff --git a/doc/static/assets/sponsors/LICENSE.rst b/doc/static/assets/sponsors/LICENSE.rst new file mode 100644 index 000000000..93642627e --- /dev/null +++ b/doc/static/assets/sponsors/LICENSE.rst @@ -0,0 +1,3 @@ +The files contained in this directory, depicting sponsor logos, may only be used for the +purpose of displaying the 'Sponsors' section in the documentation. +Any usage beyond this scope is prohibited. diff --git a/doc/static/assets/sponsors/vantai_logo.jpg b/doc/static/assets/sponsors/vantai_logo.jpg new file mode 100644 index 000000000..15166fe99 Binary files /dev/null and b/doc/static/assets/sponsors/vantai_logo.jpg differ diff --git a/doc/switcher.py b/doc/switcher.py index b34cad867..c89518fea 100644 --- a/doc/switcher.py +++ b/doc/switcher.py @@ -3,18 +3,17 @@ # information. __author__ = "Patrick Kunzmann" -__all__ = ["create_api_doc", "skip_non_methods"] +__all__ = ["create_switcher_json"] -from dataclasses import dataclass -from pathlib import Path import json import re +from dataclasses import dataclass import requests import biotite -RELEASE_REQUEST = f"https://api.github.com/repos/biotite-dev/biotite/releases" +RELEASE_REQUEST = "https://api.github.com/repos/biotite-dev/biotite/releases" BIOTITE_URL = "https://www.biotite-python.org" -SEMVER_TAG_REGEX = r"^v(\d+)\.(\d+)\.(\d+)" +SEMVER_TAG_REGEX = r"^v?(\d+)\.(\d+)\.(\d+)" @dataclass(frozen=True) @@ -35,18 +34,17 @@ def __str__(self): return f"{self.major}.{self.minor}.{self.patch}" def __ge__(self, other): - return ( - (self.major, self.minor, self.patch) - >= (other.major, other.minor, other.patch) + return (self.major, self.minor, self.patch) >= ( + other.major, + other.minor, + other.patch, ) def _get_previous_versions(min_tag, n_versions): response = requests.get(RELEASE_REQUEST, params={"per_page": n_versions}) release_data = json.loads(response.text) - versions = [ - Version.from_tag(release["tag_name"]) for release in release_data - ] + versions = [Version.from_tag(release["tag_name"]) for release in release_data] return [version for version in versions if version >= Version.from_tag(min_tag)] @@ -72,17 +70,21 @@ def create_switcher_json(file_path, min_tag, n_versions): if version.patch != 0: # Documentation is not uploaded for patch versions continue - version_config.append({ - "name": f"{version.major}.{version.minor}", - "version": str(version), - "url": f"{BIOTITE_URL}/{version}/", - }) + version_config.append( + { + "name": f"{version.major}.{version.minor}", + "version": str(version), + "url": f"{BIOTITE_URL}/{version}/", + } + ) current_version = _get_current_version() - version_config.append({ - "name": f"{current_version.major}.{current_version.minor}", - "version": str(current_version), - "url": f"{BIOTITE_URL}/{current_version}/", - "preferred": True - }) + version_config.append( + { + "name": f"{current_version.major}.{current_version.minor}", + "version": str(current_version), + "url": f"{BIOTITE_URL}/{current_version}/", + "preferred": True, + } + ) with open(file_path, "w") as file: json.dump(version_config, file, indent=4) diff --git a/doc/tutorial/structure/index.rst b/doc/tutorial/structure/index.rst index 46a2307f4..32b550994 100644 --- a/doc/tutorial/structure/index.rst +++ b/doc/tutorial/structure/index.rst @@ -54,4 +54,3 @@ contains functions for structure analysis and manipulation. measurement segments nucleotide - trajectories diff --git a/doc/tutorial/structure/measurement.rst b/doc/tutorial/structure/measurement.rst index 3b531a81c..a5bd94d6f 100644 --- a/doc/tutorial/structure/measurement.rst +++ b/doc/tutorial/structure/measurement.rst @@ -152,8 +152,9 @@ An ``'a'`` means alpha-helix, ``'b'`` beta-sheet, and ``'c'`` means coil. .. jupyter-execute:: array = pdbx.get_structure(pdbx_file, model=1) + array = array[array.chain_id == 'A'] # Estimate secondary structure - sse = struc.annotate_sse(array, chain_id="A") + sse = struc.annotate_sse(array) # Pretty print print("".join(sse)) diff --git a/doc/tutorial/structure/trajectories.rst b/doc/tutorial/structure/trajectories.rst index d716b406f..ffb4125db 100644 --- a/doc/tutorial/structure/trajectories.rst +++ b/doc/tutorial/structure/trajectories.rst @@ -10,11 +10,6 @@ If you like, you can even use the seamless interaction between *Biotite* and the `OpenMM `_ MD simulation toolkit. -.. note:: - - Reading/writing trajectory files currently requires the - `MDtraj `_ package. - Reading trajectory files ------------------------ diff --git a/doc/viewcode.py b/doc/viewcode.py index d828f960f..ec0b28974 100644 --- a/doc/viewcode.py +++ b/doc/viewcode.py @@ -10,10 +10,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["linkcode_resolve"] +import inspect from importlib import import_module -from os.path import dirname, join, isdir, splitext from os import listdir -import inspect +from os.path import dirname, isdir, join, splitext import biotite @@ -66,10 +66,13 @@ def _index_attributes(package_name, src_path): # Import all modules in directory and index attributes source_files = [ - file_name for file_name in directory_content - if file_name != "__init__.py" and ( + file_name + for file_name in directory_content + if file_name != "__init__.py" + and ( # Standard Python modules - file_name.endswith(".py") or + file_name.endswith(".py") + or # Extension modules file_name.endswith(".pyx") ) @@ -83,9 +86,7 @@ def _index_attributes(package_name, src_path): module = import_module(module_name) if not hasattr(module, "__all__"): - raise AttributeError( - f"Module {module_name} has not attribute '__all__'" - ) + raise AttributeError(f"Module {module_name} has not attribute '__all__'") # Only index attributes from modules that are available # via respective Biotite (sub-)package # If a the attribute is available, the module was imported in @@ -98,8 +99,7 @@ def _index_attributes(package_name, src_path): is_cython = source_file.endswith(".pyx") for attribute in module.__all__: - attribute_index[(package_name, attribute)] \ - = (module_name, is_cython) + attribute_index[(package_name, attribute)] = (module_name, is_cython) if is_cython: with open(join(src_path, source_file), "r") as cython_file: lines = cython_file.read().splitlines() @@ -144,16 +144,14 @@ def _index_cython_code(code_lines): continue if line.startswith(("def")): - attr_type = "def" # Get name of the function: # Remove 'def' from line... cropped_line = stripped_line[3:].strip() # ...and determine the end of the name by finding the # subsequent '(' - cropped_line = cropped_line[:cropped_line.index("(")].strip() + cropped_line = cropped_line[: cropped_line.index("(")].strip() attr_name = cropped_line elif line.startswith(("class", "cdef class")): - attr_type = "class" cropped_line = stripped_line # Get name of the class: # Remove potential 'cdef' from line... @@ -163,8 +161,11 @@ def _index_cython_code(code_lines): cropped_line = cropped_line[5:].strip() # ...and determine the end of the name by finding the # subsequent '(' or ':' - index = cropped_line.index("(") if "(" in cropped_line \ - else cropped_line.index(":") + index = ( + cropped_line.index("(") + if "(" in cropped_line + else cropped_line.index(":") + ) cropped_line = cropped_line[:index].strip() attr_name = cropped_line else: @@ -172,8 +173,8 @@ def _index_cython_code(code_lines): continue attr_line_start = i - attr_line_stop = i+1 - for j in range(i+1, len(code_lines)): + attr_line_stop = i + 1 + for j in range(i + 1, len(code_lines)): attr_line = code_lines[j] if len(attr_line.strip()) == 0 or attr_line.strip()[0] == "#": continue @@ -189,7 +190,7 @@ def _index_cython_code(code_lines): # 'One' based indexing attr_line_start + 1, # 'One' based indexing and inclusive stop - attr_line_stop + attr_line_stop, ) return line_index @@ -203,7 +204,7 @@ def _is_package(path): _attribute_index, _cython_line_index = _index_attributes( "biotite", # Directory to src/biotite - join(dirname(dirname(__file__)), "src", "biotite") + join(dirname(dirname(__file__)), "src", "biotite"), ) @@ -226,17 +227,11 @@ def linkcode_resolve(domain, info): if is_cython: if (package_name, attr_name) in _cython_line_index: first, last = _cython_line_index[(package_name, attr_name)] - return ( - base_url + - f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}" - ) + return base_url + f"{module_name.replace('.', '/')}.pyx#L{first}-L{last}" else: # In case the attribute is not found # by the Cython code analyzer - return ( - base_url + - f"{module_name.replace('.', '/')}.pyx" - ) + return base_url + f"{module_name.replace('.', '/')}.pyx" else: module = import_module(module_name) @@ -255,7 +250,4 @@ def linkcode_resolve(domain, info): source_lines, first = inspect.getsourcelines(obj) last = first + len(source_lines) - 1 - return ( - base_url + - f"{module_name.replace('.', '/')}.py#L{first}-L{last}" - ) \ No newline at end of file + return base_url + f"{module_name.replace('.', '/')}.py#L{first}-L{last}" diff --git a/environment.yml b/environment.yml index ba7cc77ec..b83db7616 100644 --- a/environment.yml +++ b/environment.yml @@ -10,20 +10,23 @@ channels: - bioconda dependencies: - - python =3.10 + - python =3.11 # Package building - cython >=3.0 - pip >=10.0 - - setuptools >=30.0 + - hatchling + - hatch-vcs == 0.4 - wheel >=0.30 # Biotite dependencies + - biotraj >=1.0,<2.0 - msgpack-python >=0.5.6 - networkx >=2.0 - - numpy >=1.15, <2.0 + - numpy >=2.0 - requests >=2.12 # Testing - - mdtraj >=1.9.3, <1.10 - pytest >=7.0 + # Code style + - ruff =0.5.2 # Interfaced software in biotite.application (can also be installed separately) - autodock-vina - clustalo diff --git a/pyproject.toml b/pyproject.toml index f1fa2e07b..fded7a18f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.rst" authors = [{name = "The Biotite contributors"}] license = {"file" = "LICENSE.rst"} classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: BSD License", @@ -18,15 +18,16 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Topic :: Scientific/Engineering :: Bio-Informatics", ] -# Based on https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg -# When updating our minimum supported python version follow minimums set in this setup.cfg -# as of 2022-01 for 3.7 "numpy >= 1.14.5", for 3.8 "numpy >= 1.17.3", for 3.9 "numpy >= 1.19.3" -# this should be manually updated as the minimum python version increases + dependencies = [ + # Wheels compiled with NumPy 2.0 are backward compatible with NumPy 1.x + # https://numpy.org/devdocs/dev/depending_on_numpy.html#numpy-2-0-specific-advice + "numpy >= 1.25", + "biotraj >= 1.0, < 2.0", "requests >= 2.12", - "numpy >= 1.14.5, < 2.0", "msgpack >= 0.5.6", "networkx >= 2.0", + "requests >= 2.12", ] dynamic = ["version"] @@ -40,6 +41,49 @@ homepage = "https://www.biotite-python.org" repository = "https://github.com/biotite-dev/biotite" documentation = "https://www.biotite-python.org" +[tool.ruff.lint] +# pyflakes, pycodestyle isort and varibale naming +select = ["F", "E", "W", "I", "TID", "N"] +ignore = [ + # In docstrings long lines are often intentional + # Most other ocassions are caught by the ruff formatter + "E501", + # Due to constants and class placeholders defined in functions + "N806", +] + +[tool.ruff.lint.per-file-ignores] +# Due to `* import` of BCIF encoding +"setup_ccd.py" = ["F405", "F403"] +# Due to imports after the PATH has been adjusted +"doc/conf.py" = ["E402"] +# Due to `from .module import *` imports in `__init__.py` modules +"__init__.py" = ["F403", "TID252"] +# Due to pymol scripts that are evaluated in other example scripts +"doc/examples/**/*_pymol.py" = ["F821"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.isort] +# No separator lines between import sections +no-lines-before = [ + "future", + "standard-library", + "third-party", + "first-party", + "local-folder", +] +order-by-type = true +known-first-party = ["biotite"] + +[tool.pytest.ini_options] +filterwarnings = [ + # Appears in loading NetCDF trajectory files + "ignore:The 'netCDF4' Python package is not installed.", + "ignore:Input structure has no associated 'BondList'", +] + [tool.hatch.build.targets.sdist] exclude = [ "tests", @@ -62,13 +106,16 @@ dependencies = ["hatch-cython"] [tool.hatch.build.targets.wheel.hooks.cython.options] include_numpy = true compile_py = false +define_macros = [ + ["NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION"], +] [build-system] requires = [ "hatchling", "hatch-vcs == 0.4", "hatch-cython == 0.5", - "oldest-supported-numpy", + "numpy >= 2.0", "cython >= 3.0", ] build-backend = "hatchling.build" diff --git a/setup_ccd.py b/setup_ccd.py index 07218964d..a3351c205 100644 --- a/setup_ccd.py +++ b/setup_ccd.py @@ -1,13 +1,14 @@ import gzip import logging -from io import StringIO from dataclasses import dataclass +from io import StringIO +from pathlib import Path import numpy as np import requests from biotite.structure.io.pdbx import * -class ComponentException(Exception): +class ComponentError(Exception): pass @@ -28,6 +29,7 @@ class ColumnInfo: The name of an alternative column to use, if the original column contains masked values and no `fill_value` is given. """ + dtype: ... encoding: ... fill_value: ... = None @@ -37,67 +39,75 @@ class ColumnInfo: MAIN_COLUMNS = { "id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "name": ColumnInfo( str, - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT32)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "type": ColumnInfo( str, - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "formula_weight": ColumnInfo( "f8", [ FixedPointEncoding(factor=1000, src_type=TypeCode.FLOAT64), - ByteArrayEncoding() + ByteArrayEncoding(), ], - fill_value=0 + fill_value=0, ), "one_letter_code": ColumnInfo( "U1", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )], - fill_value="" + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], + fill_value="", ), } @@ -105,148 +115,160 @@ class ColumnInfo: ATOM_COLUMNS = { "comp_id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "type_symbol": ColumnInfo( "U2", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] - ), - "charge": ColumnInfo( - "i1", - [ByteArrayEncoding(type=TypeCode.INT8)], - fill_value=0 + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT8)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), + "charge": ColumnInfo("i1", [ByteArrayEncoding(type=TypeCode.INT8)], fill_value=0), "pdbx_model_Cartn_x_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_x" + alternative="model_Cartn_x", ), "pdbx_model_Cartn_y_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_y" + alternative="model_Cartn_y", ), "pdbx_model_Cartn_z_ideal": ColumnInfo( "f4", [ FixedPointEncoding(factor=100), IntegerPackingEncoding(byte_count=2, is_unsigned=False), - ByteArrayEncoding() + ByteArrayEncoding(), ], - alternative="model_Cartn_z" + alternative="model_Cartn_z", ), } BOND_COLUMNS = { "comp_id": ColumnInfo( "U5", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=2, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=2, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id_1": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "atom_id_2": ColumnInfo( "U6", - [StringArrayEncoding( - # The unique strings in the column are sorted - # -> Indices do not follow distinct pattern - data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], - offset_encoding=[ - DeltaEncoding(src_type=TypeCode.INT32), - RunLengthEncoding(), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ] - )] + [ + StringArrayEncoding( + # The unique strings in the column are sorted + # -> Indices do not follow distinct pattern + data_encoding=[ByteArrayEncoding(type=TypeCode.INT16)], + offset_encoding=[ + DeltaEncoding(src_type=TypeCode.INT32), + RunLengthEncoding(), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + ) + ], ), "value_order": ColumnInfo( "U4", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], + ) + ], ), "pdbx_aromatic_flag": ColumnInfo( "U1", - [StringArrayEncoding( - data_encoding=[ - RunLengthEncoding(src_type=TypeCode.INT32), - IntegerPackingEncoding(byte_count=1, is_unsigned=True), - ByteArrayEncoding() - ], - offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)] - )] + [ + StringArrayEncoding( + data_encoding=[ + RunLengthEncoding(src_type=TypeCode.INT32), + IntegerPackingEncoding(byte_count=1, is_unsigned=True), + ByteArrayEncoding(), + ], + offset_encoding=[ByteArrayEncoding(type=TypeCode.UINT8)], + ) + ], ), } @@ -282,18 +304,14 @@ def check_presence(pdbx_file, category_name, column_names): is_present = column_names[0] in category for name in column_names: if (name in category) != is_present: - raise ComponentException( - "Only some column names are missing" - ) + raise ComponentError("Only some column names are missing") if not is_present: return is_unmasked = category[column_names[0]].mask is None for name in column_names: if (category[name].mask is None) != is_unmasked: - raise ComponentException( - "Only some column names are masked" - ) + raise ComponentError("Only some column names are masked") def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): @@ -320,46 +338,40 @@ def concatenate_blocks_into_category(pdbx_file, category_name, column_infos): for comp_id, block in pdbx_file.items(): try: if category_name not in block: - raise ComponentException( - f"Block has no category '{category_name}'" - ) + raise ComponentError(f"Block has no category '{category_name}'") chunk = {} category = block[category_name] for col_name, info in column_infos.items(): col = category.get(col_name) - if ( - col is None - or (col.mask is not None and info.fill_value is None) - ): + if col is None or (col.mask is not None and info.fill_value is None): # Some/all values are missing and there is no default # -> Try alternative if info.alternative is not None: col = category[info.alternative] if col.mask is not None: - raise ComponentException( + raise ComponentError( f"Missing values in alternative " f"'{info.alternative}'" ) else: - raise ComponentException( - f"Missing values in column '{col_name}'" - ) + raise ComponentError(f"Missing values in column '{col_name}'") data_array = col.as_array(info.dtype, info.fill_value) chunk[col_name] = data_array - except ComponentException as e: + except ComponentError as e: logging.warning(f"Skipping '{comp_id}': {e}") # Append all columns in the chunk after the try-except block # to avoid appending incomplete chunks else: for col_name, data_array in chunk.items(): column_chunks[col_name].append(data_array) - return BinaryCIFCategory({ - col_name: BinaryCIFData( - array=np.concatenate(col_data), - encoding=column_infos[col_name].encoding - ) - for col_name, col_data in column_chunks.items() - }) + return BinaryCIFCategory( + { + col_name: BinaryCIFData( + array=np.concatenate(col_data), encoding=column_infos[col_name].encoding + ) + for col_name, col_data in column_chunks.items() + } + ) def extract_component_groups(type_dict, include, exclude, file_name): @@ -393,8 +405,8 @@ def extract_component_groups(type_dict, include, exclude, file_name): del type_dict[comp_id] # Write extracted components into output file logging.info( - f"Using the following types for '{file_name.name}':\n" + - ", ".join(types_for_group) + f"Using the following types for '{file_name.name}':\n" + + ", ".join(types_for_group) ) with open(file_name, "w") as file: for comp_id in comp_ids_for_group: @@ -412,12 +424,12 @@ def setup_ccd(target_diriectory): logging.info("Checking for consistent coordinates...") check_presence( - ccd_file, "chem_comp_atom", - ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"] + ccd_file, "chem_comp_atom", ["model_Cartn_x", "model_Cartn_y", "model_Cartn_z"] ) check_presence( - ccd_file, "chem_comp_atom", - ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"] + ccd_file, + "chem_comp_atom", + ["model_Cartn_x_ideal", "model_Cartn_y_ideal", "model_Cartn_z_ideal"], ) logging.info("Extracting component groups...") @@ -426,26 +438,25 @@ def setup_ccd(target_diriectory): for comp_id, block in ccd_file.items() } extract_component_groups( - type_dict, ["peptide", "amino"], ["peptide-like"], - target_diriectory / "amino_acids.txt" + type_dict, + ["peptide", "amino"], + ["peptide-like"], + target_diriectory / "amino_acids.txt", ) extract_component_groups( - type_dict, ["rna", "dna"], [], - target_diriectory / "nucleotides.txt" + type_dict, ["rna", "dna"], [], target_diriectory / "nucleotides.txt" ) extract_component_groups( - type_dict, ["saccharide"], [], - target_diriectory / "carbohydrates.txt" + type_dict, ["saccharide"], [], target_diriectory / "carbohydrates.txt" ) remaining_types = set(type_dict.values()) logging.info( - "The following types are not used in any group:\n" + - ", ".join(remaining_types) + "The following types are not used in any group:\n" + ", ".join(remaining_types) ) compressed_block = BinaryCIFBlock() for category_name, column_infos in [ - ("chem_comp", MAIN_COLUMNS), + ("chem_comp", MAIN_COLUMNS), ("chem_comp_atom", ATOM_COLUMNS), ("chem_comp_bond", BOND_COLUMNS), ]: @@ -459,5 +470,5 @@ def setup_ccd(target_diriectory): compressed_file["components"] = compressed_block compressed_file.write(target_diriectory / "components.bcif") -from pathlib import Path -setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") \ No newline at end of file + +setup_ccd(Path(__file__).parent / "src" / "biotite" / "structure" / "info" / "ccd") diff --git a/src/biotite/__init__.py b/src/biotite/__init__.py index b2b83467d..653caf8f5 100644 --- a/src/biotite/__init__.py +++ b/src/biotite/__init__.py @@ -12,8 +12,7 @@ __name__ = "biotite" __author__ = "Patrick Kunzmann" -from .file import * -from .temp import * from .copyable import * +from .file import * +from .version import __version__, __version_tuple__ # noqa: F401 from .visualize import * -from .version import __version__, __version_tuple__ diff --git a/src/biotite/application/__init__.py b/src/biotite/application/__init__.py index 72ca3f96c..de09a3dbf 100644 --- a/src/biotite/application/__init__.py +++ b/src/biotite/application/__init__.py @@ -65,5 +65,5 @@ from .application import * from .localapp import * +from .msaapp import * from .webapp import * -from .msaapp import * \ No newline at end of file diff --git a/src/biotite/application/application.py b/src/biotite/application/application.py index 858658175..fb5d2c037 100644 --- a/src/biotite/application/application.py +++ b/src/biotite/application/application.py @@ -4,19 +4,26 @@ __name__ = "biotite.application" __author__ = "Patrick Kunzmann" -__all__ = ["Application", "AppStateError", "TimeoutError", "VersionError", - "AppState", "requires_state"] +__all__ = [ + "Application", + "AppStateError", + "TimeoutError", + "VersionError", + "AppState", + "requires_state", +] import abc import time -from functools import wraps from enum import Flag, auto +from functools import wraps class AppState(Flag): """ This enum type represents the app states of an application. """ + CREATED = auto() RUNNING = auto() FINISHED = auto() @@ -45,6 +52,7 @@ def requires_state(app_state): ... def function(self): ... pass """ + def decorator(func): @wraps(func) def wrapper(*args, **kwargs): @@ -52,16 +60,16 @@ def wrapper(*args, **kwargs): try: instance = args[0] except IndexError: - raise TypeError( - "This method must be called from a class instance" - ) + raise TypeError("This method must be called from a class instance") if not instance._state & app_state: raise AppStateError( f"The application is in {instance.get_app_state()} state, " f"but {app_state} state is required" ) return func(*args, **kwargs) + return wrapper + return decorator @@ -146,11 +154,10 @@ def join(self, timeout=None): """ time.sleep(self.wait_interval()) while self.get_app_state() != AppState.FINISHED: - if timeout is not None and time.time()-self._start_time > timeout: + if timeout is not None and time.time() - self._start_time > timeout: self.cancel() raise TimeoutError( - f"The application expired its timeout " - f"({timeout:.1f} s)" + f"The application expired its timeout " f"({timeout:.1f} s)" ) else: time.sleep(self.wait_interval()) @@ -249,6 +256,7 @@ class AppStateError(Exception): """ Indicate that the application lifecycle was violated. """ + pass @@ -256,6 +264,7 @@ class TimeoutError(Exception): """ Indicate that the application's timeout expired. """ + pass @@ -263,4 +272,5 @@ class VersionError(Exception): """ Indicate that the application's version is invalid. """ - pass \ No newline at end of file + + pass diff --git a/src/biotite/application/autodock/__init__.py b/src/biotite/application/autodock/__init__.py index 9d8aabe1e..756b6648c 100644 --- a/src/biotite/application/autodock/__init__.py +++ b/src/biotite/application/autodock/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.autodock" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/autodock/app.py b/src/biotite/application/autodock/app.py index c93cd3cc8..babd860ae 100644 --- a/src/biotite/application/autodock/app.py +++ b/src/biotite/application/autodock/app.py @@ -9,12 +9,12 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...structure.io.pdbqt import PDBQTFile -from ...structure.residues import get_residue_starts_for, get_residue_masks -from ...structure.bonds import find_connected -from ...structure.error import BadStructureError +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.bonds import find_connected +from biotite.structure.error import BadStructureError +from biotite.structure.io.pdbqt import PDBQTFile +from biotite.structure.residues import get_residue_masks, get_residue_starts_for class VinaApp(LocalApp): @@ -62,8 +62,8 @@ class VinaApp(LocalApp): ... flexible=(receptor.res_id == 2) | (receptor.res_id == 5) ... ) """ - def __init__(self, ligand, receptor, center, size, flexible=None, - bin_path="vina"): + + def __init__(self, ligand, receptor, center, size, flexible=None, bin_path="vina"): super().__init__(bin_path) if ligand.bonds is None: @@ -83,23 +83,17 @@ def __init__(self, ligand, receptor, center, size, flexible=None, if self._is_flexible: flexible_indices = np.where(flexible)[0] - self._flex_res_starts = np.unique(get_residue_starts_for( - receptor, flexible_indices - )) - - self._ligand_file = NamedTemporaryFile( - "w", suffix=".pdbqt", delete=False - ) - self._receptor_file = NamedTemporaryFile( - "w", suffix=".pdbqt", delete=False - ) - self._receptor_flex_file = NamedTemporaryFile( + self._flex_res_starts = np.unique( + get_residue_starts_for(receptor, flexible_indices) + ) + + self._ligand_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False) + self._receptor_file = NamedTemporaryFile("w", suffix=".pdbqt", delete=False) + self._receptor_flex_file = NamedTemporaryFile( "w", suffix=".pdbqt", delete=False ) - self._out_file = NamedTemporaryFile( - "r", suffix=".pdbqt", delete=False - ) - + self._out_file = NamedTemporaryFile("r", suffix=".pdbqt", delete=False) + @requires_state(AppState.CREATED) def set_seed(self, seed): """ @@ -114,7 +108,7 @@ def set_seed(self, seed): The seed for the random number generator. """ self._seed = seed - + @requires_state(AppState.CREATED) def set_exhaustiveness(self, exhaustiveness): """ @@ -131,7 +125,7 @@ def set_exhaustiveness(self, exhaustiveness): Must be greater than 0. """ self._exhaustiveness = exhaustiveness - + @requires_state(AppState.CREATED) def set_max_number_of_models(self, number): """ @@ -147,7 +141,7 @@ def set_max_number_of_models(self, number): The maximum number of generated modes/models. """ self._number = number - + @requires_state(AppState.CREATED) def set_energy_range(self, energy_range): """ @@ -168,34 +162,31 @@ def run(self): # Use different atom ID ranges for atoms in ligand and receptor # for unambiguous assignment, if the receptor contains flexible # residues - self._ligand.set_annotation("atom_id", np.arange( - 1, - self._ligand.array_length() + 1 - )) - self._receptor.set_annotation("atom_id", np.arange( - self._ligand.array_length() + 1, - self._ligand.array_length() + self._receptor.array_length() + 1 - )) + self._ligand.set_annotation( + "atom_id", np.arange(1, self._ligand.array_length() + 1) + ) + self._receptor.set_annotation( + "atom_id", + np.arange( + self._ligand.array_length() + 1, + self._ligand.array_length() + self._receptor.array_length() + 1, + ), + ) ligand_file = PDBQTFile() - # Contains 'true' entries for all atoms that have not been + # Contains 'true' entries for all atoms that have not been # removed from ligand self._ligand_mask = ligand_file.set_structure( - self._ligand, - rotatable_bonds="all" + self._ligand, rotatable_bonds="all" ) ligand_file.write(self._ligand_file) self._ligand_file.flush() - + if self._is_flexible: - self._rigid_mask = np.ones( - self._receptor.array_length(), dtype=bool - ) - # Contains 'true' entries for all atoms that have not been + self._rigid_mask = np.ones(self._receptor.array_length(), dtype=bool) + # Contains 'true' entries for all atoms that have not been # removed from receptor in flexible side chains - self._receptor_mask = np.zeros( - self._receptor.array_length(), dtype=bool - ) + self._receptor_mask = np.zeros(self._receptor.array_length(), dtype=bool) for i, start in enumerate(self._flex_res_starts): flex_mask, rigid_mask, root = self._get_flexible_residue(start) self._rigid_mask &= rigid_mask @@ -207,7 +198,7 @@ def run(self): self._receptor[flex_mask], rotatable_bonds="all", root=root_in_flex_residue, - include_torsdof=False + include_torsdof=False, ) # Enclose each flexible residue # with BEGIN_RES and END_RES @@ -220,7 +211,7 @@ def run(self): receptor_file.set_structure( self._receptor[self._rigid_mask], rotatable_bonds=None, - include_torsdof=False + include_torsdof=False, ) receptor_file.write(self._receptor_file) self._receptor_file.flush() @@ -228,23 +219,30 @@ def run(self): else: receptor_file = PDBQTFile() receptor_file.set_structure( - self._receptor, - rotatable_bonds=None, - include_torsdof=False + self._receptor, rotatable_bonds=None, include_torsdof=False ) receptor_file.write(self._receptor_file) self._receptor_file.flush() arguments = [ - "--ligand", self._ligand_file.name, - "--receptor", self._receptor_file.name, - "--out", self._out_file.name, - "--center_x", f"{self._center[0]:.3f}", - "--center_y", f"{self._center[1]:.3f}", - "--center_z", f"{self._center[2]:.3f}", - "--size_x", f"{self._size[0]:.3f}", - "--size_y", f"{self._size[1]:.3f}", - "--size_z", f"{self._size[2]:.3f}", + "--ligand", + self._ligand_file.name, + "--receptor", + self._receptor_file.name, + "--out", + self._out_file.name, + "--center_x", + f"{self._center[0]:.3f}", + "--center_y", + f"{self._center[1]:.3f}", + "--center_z", + f"{self._center[2]:.3f}", + "--size_x", + f"{self._size[0]:.3f}", + "--size_y", + f"{self._size[1]:.3f}", + "--size_z", + f"{self._size[2]:.3f}", ] if self._seed is not None: arguments.extend(["--seed", str(self._seed)]) @@ -259,32 +257,32 @@ def run(self): self.set_arguments(arguments) super().run() - + def evaluate(self): super().evaluate() out_file = PDBQTFile.read(self._out_file) - + models = out_file.get_structure() n_ligand_atoms = np.count_nonzero(self._ligand_mask) self._ligand_models = models[..., :n_ligand_atoms] self._flex_models = models[..., n_ligand_atoms:] self._n_models = models.stack_depth() - + remarks = out_file.get_remarks() self._energies = np.array( # VINA RESULT: -5.8 0.000 0.000 # ^ [float(remark[12:].split()[0]) for remark in remarks] ) - + def clean_up(self): super().clean_up() cleanup_tempfile(self._ligand_file) cleanup_tempfile(self._receptor_file) cleanup_tempfile(self._receptor_flex_file) cleanup_tempfile(self._out_file) - + @requires_state(AppState.JOINED) def get_energies(self): """ @@ -302,7 +300,7 @@ def get_energies(self): @requires_state(AppState.JOINED) def get_ligand_models(self): """ - Get the ligand structure with the conformations for each + Get the ligand structure with the conformations for each generated binding mode. Returns @@ -312,7 +310,7 @@ def get_ligand_models(self): Each model corresponds to one binding mode. The models are sorted from best to worst predicted binding affinity. - + Notes ----- The returned structure may contain less atoms than the input @@ -338,12 +336,11 @@ def get_ligand_coord(self): atoms are set to *NaN*. """ coord = np.full( - (self._n_models, self._ligand.array_length(), 3), - np.nan, dtype=np.float32 + (self._n_models, self._ligand.array_length(), 3), np.nan, dtype=np.float32 ) coord[:, self._ligand_mask] = self._ligand_models.coord return coord - + @requires_state(AppState.JOINED) def get_flexible_residue_models(self): """ @@ -360,7 +357,7 @@ def get_flexible_residue_models(self): Each model corresponds to one binding mode. The models are sorted from best to worst predicted binding affinity. - + Notes ----- The returned structure may contain less atoms than the input @@ -385,7 +382,7 @@ def get_receptor_coord(self): affinity. Missing coordinates due to the removed nonpolar hydrogen atoms from flexible side chains are set to *NaN*. - + Notes ----- The output is only meaningful, if flexible side chains were @@ -394,8 +391,7 @@ def get_receptor_coord(self): of the input receptor coordinates. """ coord = np.repeat( - self._receptor.coord[np.newaxis, ...], - repeats=self._n_models, axis=0 + self._receptor.coord[np.newaxis, ...], repeats=self._n_models, axis=0 ) if self._is_flexible: # Replace original coordinates with modeled coordinates @@ -424,16 +420,16 @@ def _get_flexible_residue(self, residue_start): root_connect_indices, _ = self._receptor.bonds.get_bonds(root_index) connected_index = None try: - connected_index = root_connect_indices[np.isin( - self._receptor.atom_name[root_connect_indices], ("CB",) - )][0] + connected_index = root_connect_indices[ + np.isin(self._receptor.atom_name[root_connect_indices], ("CB",)) + ][0] except IndexError: # Residue has no appropriate connection (e.g. in glycine) # -> There is no atom in the flexible side chain flex_mask = np.zeros(self._receptor.array_length(), dtype=bool) rigid_mask = np.ones(self._receptor.array_length(), dtype=bool) return flex_mask, rigid_mask, root_index - + # Remove the root bond from the bond list # to find the atoms involved in the flexible part bonds = self._receptor.bonds.copy() @@ -442,7 +438,7 @@ def _get_flexible_residue(self, residue_start): if root_index in flexible_indices: raise BadStructureError( "There are multiple connections between the flexible and " - "rigid part, maybe a cyclic residue like proline was selected" + "rigid part, maybe a cyclic residue like proline was selected" ) flex_mask = np.zeros(self._receptor.array_length(), dtype=bool) @@ -452,7 +448,6 @@ def _get_flexible_residue(self, residue_start): flex_mask[root_index] = True return flex_mask, rigid_mask, root_index - @staticmethod def dock(ligand, receptor, center, size, flexible=None, bin_path="vina"): diff --git a/src/biotite/application/blast/__init__.py b/src/biotite/application/blast/__init__.py index 77caf3e64..65857b2b4 100644 --- a/src/biotite/application/blast/__init__.py +++ b/src/biotite/application/blast/__init__.py @@ -10,5 +10,5 @@ __name__ = "biotite.application.blast" __author__ = "Patrick Kunzmann" +from .alignment import * from .webapp import * -from .alignment import * \ No newline at end of file diff --git a/src/biotite/application/blast/alignment.py b/src/biotite/application/blast/alignment.py index dc5b31784..85890df66 100644 --- a/src/biotite/application/blast/alignment.py +++ b/src/biotite/application/blast/alignment.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["BlastAlignment"] -from ...sequence.align.alignment import Alignment +from biotite.sequence.align.alignment import Alignment class BlastAlignment(Alignment): @@ -14,10 +14,10 @@ class BlastAlignment(Alignment): A specialized :class:`Alignment` class for alignments using the BLAST application. It stores additional data, like the E-value, the HSP position and a description of the hit sequence. - + Like its superclass, all attributes of a :class:`BlastAlignment` are public. The attributes are the same as the constructor parameters. - + Parameters ---------- sequences : list @@ -44,16 +44,25 @@ class BlastAlignment(Alignment): hit_definition : str The name of the hit sequence. """ - - def __init__(self, sequences, trace, score, e_value, - query_interval, hit_interval, hit_id, hit_definition): + + def __init__( + self, + sequences, + trace, + score, + e_value, + query_interval, + hit_interval, + hit_id, + hit_definition, + ): super().__init__(sequences, trace, score) self.e_value = e_value self.query_interval = query_interval self.hit_interval = hit_interval self.hit_id = hit_id self.hit_definition = hit_definition - + def __eq__(self, item): if not isinstance(item, BlastAlignment): return False @@ -68,7 +77,7 @@ def __eq__(self, item): if self.hit_definition != item.hit_definition: return False return super().__eq__(item) - + def __getitem__(self, index): super_alignment = super().__getitem__(index) return BlastAlignment( @@ -79,5 +88,5 @@ def __getitem__(self, index): self.query_interval, self.hit_interval, self.hit_id, - self.hit_definition - ) \ No newline at end of file + self.hit_definition, + ) diff --git a/src/biotite/application/blast/webapp.py b/src/biotite/application/blast/webapp.py index cf358ac23..f8d6b09d1 100644 --- a/src/biotite/application/blast/webapp.py +++ b/src/biotite/application/blast/webapp.py @@ -6,26 +6,26 @@ __author__ = "Patrick Kunzmann" __all__ = ["BlastWebApp"] -from .alignment import BlastAlignment -from ..application import Application, requires_state, AppState -from ..webapp import WebApp, RuleViolationError -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.io.fasta.convert import get_sequence -from ...sequence.align.alignment import Alignment import time -import requests from xml.etree import ElementTree - +import requests +from biotite.application.application import AppState, requires_state +from biotite.application.blast.alignment import BlastAlignment +from biotite.application.webapp import WebApp +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.io.fasta.convert import get_sequence +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence +from biotite.sequence.sequence import Sequence _ncbi_url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" + class BlastWebApp(WebApp): """ Perform a local alignment against a large sequence database using using the web-based BLAST application (by default NCBI BLAST). - + Parameters ---------- program : str @@ -35,7 +35,7 @@ class BlastWebApp(WebApp): The query sequence. If a string is provided, it is interpreted as path to a FASTA file, if the string contains a valid FASTA file extension, otherwise it is interpreted as a single letter - string representation of a sequence. + string representation of a sequence. database : str, optional The NCBI sequence database to blast against. By default it contains all sequences (`database`='nr'`). @@ -52,68 +52,71 @@ class BlastWebApp(WebApp): HTTP request. This allows the NCBI to contact you in case your application sends too many requests. """ - + _last_contact = 0 _last_request = 0 _contact_delay = 3 _request_delay = 60 - - def __init__(self, program, query, database="nr", - app_url=_ncbi_url, obey_rules=True, - mail="padix.key@gmail.com"): + + def __init__( + self, + program, + query, + database="nr", + app_url=_ncbi_url, + obey_rules=True, + mail="padix.key@gmail.com", + ): super().__init__(app_url, obey_rules) - + # 'megablast' is somehow not working # When entering the corresponding HTTPS request into a browser # you are redirected onto the blast mainpage - if program not in ["blastn", "blastp", - "blastx", "tblastn", "tblastx"]: + if program not in ["blastn", "blastp", "blastx", "tblastn", "tblastx"]: raise ValueError(f"'{program}' is not a valid BLAST program") self._program = program - - requires_protein = (program in ["blastp", "tblastn"]) - if isinstance(query, str) and query.endswith((".fa",".fst",".fasta")): + + requires_protein = program in ["blastp", "tblastn"] + if isinstance(query, str) and query.endswith((".fa", ".fst", ".fasta")): # If string has a file extension, it is interpreted as # FASTA file from which the sequence is taken file = FastaFile.read(query) # Get first entry in file and take the sequence - # (rather than header) + # (rather than header) self._query = str(get_sequence(file)) elif isinstance(query, Sequence): self._query = str(query) else: self._query = query - + # Check for unsuitable symbols in query string if requires_protein: ref_alphabet = ProteinSequence.alphabet else: ref_alphabet = NucleotideSequence.alphabet_amb for symbol in self._query: - if not symbol.upper() in ref_alphabet: - raise ValueError( - f"Query sequence contains unsuitable symbol {symbol}" - ) - + if symbol.upper() not in ref_alphabet: + raise ValueError(f"Query sequence contains unsuitable symbol {symbol}") + self._database = database - + self._gap_openining = None self._gap_extension = None self._word_size = None - + self._expect_value = None self._max_results = None self._entrez_query = None - + self._reward = None self._penalty = None - + self._matrix = None self._threshold = None - - self._mail=mail + + self._mail = mail self._rid = None - + @requires_state(AppState.CREATED) def set_entrez_query(self, query): """ @@ -126,7 +129,7 @@ def set_entrez_query(self, query): An NCBI Entrez query. """ self._entrez_query = str(query) - + @requires_state(AppState.CREATED) def set_max_results(self, number): """ @@ -138,30 +141,30 @@ def set_max_results(self, number): The maximum number of results. """ self._max_results = number - + @requires_state(AppState.CREATED) def set_max_expect_value(self, value): """ Set the threshold expectation value (E-value). No alignments with an E-value above this threshold will be considered. - + The E-Value is the expectation value for the number of random sequences of a similar sized database getting an equal or higher score by change when aligned with the query sequence. - + Parameters ---------- value : float The threshold E-value. """ self._expect_value = value - + @requires_state(AppState.CREATED) def set_gap_penalty(self, opening, extension): """ Set the affine gap penalty for the alignment. - + Parameters ---------- opening : float @@ -171,75 +174,75 @@ def set_gap_penalty(self, opening, extension): """ self._gap_openining = opening self._gap_extension = extension - + @requires_state(AppState.CREATED) def set_word_size(self, size): """ Set the word size for alignment seeds. - + Parameters ---------- size : int Word size. """ self._word_size = size - + @requires_state(AppState.CREATED) def set_match_reward(self, reward): """ Set the score of a symbol match in the alignment. - + Used only in 'blastn' and 'megablast'. - + Parameters ---------- reward : int Match reward. Must be positive. """ self._reward = reward - + @requires_state(AppState.CREATED) def set_mismatch_penalty(self, penalty): """ Set the penalty of a symbol mismatch in the alignment. - + Used only in 'blastn' and 'megablast'. - + Parameters ---------- penalty : int Mismatch penalty. Must be negative. """ self._penalty = penalty - + @requires_state(AppState.CREATED) def set_substitution_matrix(self, matrix_name): """ Set the penalty of a symbol mismatch in the alignment. - + Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'. - + Parameters ---------- matrix_name : str Name of the substitution matrix. Default is 'BLOSUM62'. """ self._matrix = matrix_name.upper() - + @requires_state(AppState.CREATED) def set_threshold(self, threshold): """ Set the threshold neighboring score for initial words. - + Used only in 'blastp', "blastx', 'tblastn' and 'tblastx'. - + Parameters ---------- threshold : int Threshold value. Must be positve. """ self._threshold = threshold - + def run(self): param_dict = {} param_dict["tool"] = "Biotite" @@ -255,23 +258,24 @@ def run(self): if self._expect_value is not None: param_dict["EXPECT"] = self._expect_value if self._gap_openining is not None and self._gap_extension is not None: - param_dict["GAPCOSTS"] = "{:d} {:d}".format(self._gap_openining, - self._gap_extension) + param_dict["GAPCOSTS"] = "{:d} {:d}".format( + self._gap_openining, self._gap_extension + ) if self._word_size is not None: param_dict["WORD_SIZE"] = self._word_size - + if self._program in ["blastn", "megablast"]: if self._reward is not None: param_dict["NUCL_REWARD"] = self._reward if self._penalty is not None: param_dict["NUCL_PENALTY"] = self._penalty - + if self._program in ["blastp", "blastx", "tblastn", "tblastx"]: if self._matrix is not None: param_dict["MATRIX"] = self._matrix if self._threshold is not None: param_dict["THRESHOLD"] = self._threshold - + request = requests.get(self.app_url(), params=param_dict) if "Submitted URI too large" in request.text: raise ValueError("The URI is too large, try a shorter sequence") @@ -279,11 +283,9 @@ def run(self): self._request() info_dict = BlastWebApp._get_info(request.text) self._rid = info_dict["RID"] - + def is_finished(self): - data_dict = {"FORMAT_OBJECT" : "SearchInfo", - "RID" : self._rid, - "CMD" : "Get"} + data_dict = {"FORMAT_OBJECT": "SearchInfo", "RID": self._rid, "CMD": "Get"} request = requests.get(self.app_url(), params=data_dict) self._contact() info_dict = BlastWebApp._get_info(request.text) @@ -294,17 +296,17 @@ def is_finished(self): "(Server responsed status 'UNKNOWN')" ) return info_dict["Status"] == "READY" - + def wait_interval(self): # NCBI requires a 3 second delay between server contacts return BlastWebApp._contact_delay - + def clean_up(self): param_dict = {} param_dict["CMD"] = "Delete" param_dict["RID"] = self._rid - request = requests.get(self.app_url(), params=param_dict) - + requests.get(self.app_url(), params=param_dict) + def evaluate(self): param_dict = {} param_dict["tool"] = "BiotiteClient" @@ -316,7 +318,7 @@ def evaluate(self): param_dict["NCBI_GI"] = "T" request = requests.get(self.app_url(), params=param_dict) self._contact() - + self._alignments = [] self._xml_response = request.text root = ElementTree.fromstring(self._xml_response) @@ -333,15 +335,14 @@ def evaluate(self): query_end = int(hsp.find("Hsp_query-to").text) hit_begin = int(hsp.find("Hsp_hit-from").text) hit_end = int(hsp.find("Hsp_hit-to").text) - + seq1_str = hsp.find("Hsp_qseq").text seq2_str = hsp.find("Hsp_hseq").text if self._program in ["blastn", "megablast"]: # NucleotideSequence/ProteinSequence do ignore gaps # Gaps are represented by the trace seq1, seq2 = [ - NucleotideSequence(s.replace("-", "")) - for s in (seq1_str, seq2_str) + NucleotideSequence(s.replace("-", "")) for s in (seq1_str, seq2_str) ] else: seq1, seq2 = [ @@ -349,18 +350,24 @@ def evaluate(self): for s in (seq1_str, seq2_str) ] trace = Alignment.trace_from_strings([seq1_str, seq2_str]) - - alignment = BlastAlignment( [seq1 ,seq2], trace, score, e_value, - (query_begin, query_end), - (hit_begin, hit_end), - hit_id, hit_definition ) + + alignment = BlastAlignment( + [seq1, seq2], + trace, + score, + e_value, + (query_begin, query_end), + (hit_begin, hit_end), + hit_id, + hit_definition, + ) self._alignments.append(alignment) @requires_state(AppState.JOINED) def get_xml_response(self): """ Get the raw XML response. - + Returns ------- response : str @@ -372,14 +379,14 @@ def get_xml_response(self): def get_alignments(self): """ Get the resulting local sequence alignments. - + Returns ------- alignment : list of BlastAlignment The local sequence alignments. """ return self._alignments - + @staticmethod def _get_info(text): """ @@ -399,7 +406,7 @@ def _get_info(text): pair = line.split("=") info_dict[pair[0].strip()] = pair[1].strip() return info_dict - + def _contact(self): """ Resets the time since the last server contact. Used for @@ -409,7 +416,7 @@ def _contact(self): if (contact - BlastWebApp._last_contact) < BlastWebApp._contact_delay: self.violate_rule("The server was contacted too often") BlastWebApp._last_contact = contact - + def _request(self): """ Resets the time since the last new alignment request. Used for diff --git a/src/biotite/application/clustalo/__init__.py b/src/biotite/application/clustalo/__init__.py index 1f3afebac..ba0f44704 100644 --- a/src/biotite/application/clustalo/__init__.py +++ b/src/biotite/application/clustalo/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.clustalo" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/clustalo/app.py b/src/biotite/application/clustalo/app.py index 778c613d8..228300984 100644 --- a/src/biotite/application/clustalo/app.py +++ b/src/biotite/application/clustalo/app.py @@ -8,20 +8,16 @@ from tempfile import NamedTemporaryFile import numpy as np -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp -from ..application import AppState, requires_state +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import cleanup_tempfile +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree class ClustalOmegaApp(MSAApp): """ Perform a multiple sequence alignment using Clustal-Omega. - + Parameters ---------- sequences : list of ProteinSequence or NucleotideSequence @@ -30,7 +26,7 @@ class ClustalOmegaApp(MSAApp): Path of the Custal-Omega binary. matrix : None This parameter is used for compatibility reasons and is ignored. - + Examples -------- @@ -48,34 +44,30 @@ class ClustalOmegaApp(MSAApp): -BISMITE --IQLITE """ - + def __init__(self, sequences, bin_path="clustalo", matrix=None): super().__init__(sequences, bin_path, None) self._seq_count = len(sequences) self._mbed = True self._dist_matrix = None self._tree = None - self._in_dist_matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._in_dist_matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) self._out_dist_matrix_file = NamedTemporaryFile( "r", suffix=".mat", delete=False ) - self._in_tree_file = NamedTemporaryFile( - "w", suffix=".tree", delete=False - ) - self._out_tree_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - + self._in_tree_file = NamedTemporaryFile("w", suffix=".tree", delete=False) + self._out_tree_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + def run(self): args = [ - "--in", self.get_input_file_path(), - "--out", self.get_output_file_path(), + "--in", + self.get_input_file_path(), + "--out", + self.get_output_file_path(), # The temporary files are already created # -> tell Clustal to overwrite these empty files "--force", - # Tree order for get_alignment_order() to work properly + # Tree order for get_alignment_order() to work properly "--output-order=tree-order", ] if self.get_seqtype() == "protein": @@ -87,28 +79,24 @@ def run(self): # as input and output# # -> Only request tree output when not tree is input args += [ - "--guidetree-out", self._out_tree_file.name, + "--guidetree-out", + self._out_tree_file.name, ] if not self._mbed: - args += [ - "--full", - "--distmat-out", self._out_dist_matrix_file.name - ] + args += ["--full", "--distmat-out", self._out_dist_matrix_file.name] if self._dist_matrix is not None: # Add the sequence names (0, 1, 2, 3 ...) as first column dist_matrix_with_index = np.concatenate( - ( - np.arange(self._seq_count)[:, np.newaxis], - self._dist_matrix - ), axis=1 + (np.arange(self._seq_count)[:, np.newaxis], self._dist_matrix), axis=1 ) np.savetxt( - self._in_dist_matrix_file.name, dist_matrix_with_index, + self._in_dist_matrix_file.name, + dist_matrix_with_index, # The first line contains the amount of sequences - comments = "", - header = str(self._seq_count), + comments="", + header=str(self._seq_count), # The sequence indices are integers, the rest are floats - fmt = ["%d"] + ["%.5f"] * self._seq_count + fmt=["%d"] + ["%.5f"] * self._seq_count, ) args += ["--distmat-in", self._in_dist_matrix_file.name] if self._tree is not None: @@ -117,15 +105,15 @@ def run(self): args += ["--guidetree-in", self._in_tree_file.name] self.set_arguments(args) super().run() - + def evaluate(self): super().evaluate() if not self._mbed: self._dist_matrix = np.loadtxt( self._out_dist_matrix_file.name, # The first row only contains the number of sequences - skiprows = 1, - dtype = float + skiprows=1, + dtype=float, ) # The first column contains only the name of the # sequences, in this case 0, 1, 2, 3 ... @@ -133,17 +121,15 @@ def evaluate(self): self._dist_matrix = self._dist_matrix[:, 1:] # Only read output tree if no tree was input if self._tree is None: - self._tree = Tree.from_newick( - self._out_tree_file.read().replace("\n", "") - ) - + self._tree = Tree.from_newick(self._out_tree_file.read().replace("\n", "")) + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_dist_matrix_file) cleanup_tempfile(self._out_dist_matrix_file) cleanup_tempfile(self._in_tree_file) cleanup_tempfile(self._out_tree_file) - + @requires_state(AppState.CREATED) def full_matrix_calculation(self): """ @@ -154,13 +140,13 @@ def full_matrix_calculation(self): default *mBed* heuristic. """ self._mbed = False - + @requires_state(AppState.CREATED) def set_distance_matrix(self, matrix): """ Set the pairwise sequence distances, the program should use to - calculate the guide tree. - + calculate the guide tree. + Parameters ---------- matrix : ndarray, shape=(n,n), dtype=float @@ -172,13 +158,13 @@ def set_distance_matrix(self, matrix): f"{self._seq_count} sequences" ) self._dist_matrix = matrix.astype(float, copy=False) - + @requires_state(AppState.JOINED) def get_distance_matrix(self): """ Get the pairwise sequence distances the program used to - calculate the guide tree. - + calculate the guide tree. + Returns ------- matrix : ndarray, shape=(n,n), dtype=float @@ -186,17 +172,16 @@ def get_distance_matrix(self): """ if self._mbed: raise ValueError( - "Getting the distance matrix requires " - "'full_matrix_calculation()'" + "Getting the distance matrix requires " "'full_matrix_calculation()'" ) return self._dist_matrix - + @requires_state(AppState.CREATED) def set_guide_tree(self, tree): """ Set the guide tree, the program should use for the progressive alignment. - + Parameters ---------- tree : Tree @@ -208,31 +193,31 @@ def set_guide_tree(self, tree): "{self._seq_count} sequences, must be equal" ) self._tree = tree - + @requires_state(AppState.JOINED) def get_guide_tree(self): """ Get the guide tree created for the progressive alignment. - + Returns ------- tree : Tree The guide tree. """ return self._tree - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return False diff --git a/src/biotite/application/dssp/__init__.py b/src/biotite/application/dssp/__init__.py index 93f8f17e2..b1d43758c 100644 --- a/src/biotite/application/dssp/__init__.py +++ b/src/biotite/application/dssp/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.dssp" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/dssp/app.py b/src/biotite/application/dssp/app.py index eb0974460..57e4ac0f3 100644 --- a/src/biotite/application/dssp/app.py +++ b/src/biotite/application/dssp/app.py @@ -7,11 +7,11 @@ __all__ = ["DsspApp"] from tempfile import NamedTemporaryFile -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...structure.io.pdbx.cif import CIFFile -from ...structure.io.pdbx.convert import set_structure import numpy as np +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.io.pdbx.cif import CIFFile +from biotite.structure.io.pdbx.convert import set_structure class DsspApp(LocalApp): @@ -73,7 +73,7 @@ def __init__(self, atom_array, bin_path="mkdssp"): "occupancy", np.ones(self._array.array_length(), dtype=float) ) - self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False) self._out_file = NamedTemporaryFile("r", suffix=".dssp", delete=False) def run(self): @@ -81,9 +81,7 @@ def run(self): set_structure(in_file, self._array) in_file.write(self._in_file) self._in_file.flush() - self.set_arguments( - ["-i", self._in_file.name, "-o", self._out_file.name] - ) + self.set_arguments(["-i", self._in_file.name, "-o", self._out_file.name]) super().run() def evaluate(self): @@ -93,13 +91,12 @@ def evaluate(self): sse_start = None for i, line in enumerate(lines): if line.startswith(" # RESIDUE AA STRUCTURE"): - sse_start = i+1 + sse_start = i + 1 if sse_start is None: raise ValueError("DSSP file does not contain SSE records") # Remove "!" for missing residues lines = [ - line for line in lines[sse_start:] - if len(line) != 0 and line[13] != "!" + line for line in lines[sse_start:] if len(line) != 0 and line[13] != "!" ] self._sse = np.zeros(len(lines), dtype="U1") # Parse file for SSE letters diff --git a/src/biotite/application/localapp.py b/src/biotite/application/localapp.py index acfd1bd8b..ca351e940 100644 --- a/src/biotite/application/localapp.py +++ b/src/biotite/application/localapp.py @@ -9,23 +9,29 @@ import abc import copy from os import chdir, getcwd, remove -from .application import Application, AppState, AppStateError, requires_state -from subprocess import Popen, PIPE, SubprocessError, TimeoutExpired +from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired +from biotite.application.application import ( + Application, + AppState, + AppStateError, + requires_state, +) + class LocalApp(Application, metaclass=abc.ABCMeta): """ The base class for all locally installed applications, that are used via the command line. - + Internally this creates a :class:`Popen` instance, which handles the execution. - + Parameters ---------- bin_path : str Path of the application represented by this class. """ - + def __init__(self, bin_path): super().__init__() self._bin_path = bin_path @@ -35,28 +41,28 @@ def __init__(self, bin_path): self._process = None self._command = None self._stdin_file = None - + @requires_state(AppState.CREATED) def set_arguments(self, arguments): """ Set command line arguments for the application run. - + PROTECTED: Do not call from outside. - + Parameters ---------- arguments : list of str A list of strings representing the command line options. """ self._arguments = copy.copy(arguments) - + @requires_state(AppState.CREATED) def set_stdin(self, file): """ Set a file as standard input for the application run. - + PROTECTED: Do not call from outside. - + Parameters ---------- file : file object @@ -65,7 +71,7 @@ def set_stdin(self, file): such as `StringIO` are invalid. """ self._stdin_file = file - + @requires_state(AppState.CREATED) def add_additional_options(self, options): """ @@ -81,12 +87,12 @@ def add_additional_options(self, options): It is recommended to use this method only, when the respective :class:`LocalApp` subclass does not provide a method to set the desired option. - + Parameters ---------- options : list of str A list of strings representing the command line options. - + Notes ----- In order to see which options the command line execution used, @@ -114,27 +120,24 @@ def add_additional_options(self, options): clustalo --full --in ...fa --out ...fa --force --output-order=tree-order --seqtype Protein --guidetree-out ...tree """ self._options += options - + @requires_state( - AppState.RUNNING | \ - AppState.CANCELLED | \ - AppState.FINISHED | \ - AppState.JOINED + AppState.RUNNING | AppState.CANCELLED | AppState.FINISHED | AppState.JOINED ) def get_command(self): """ Get the executed command. Cannot be called until the application has been started. - + Returns ------- command : str The executed command. - + Examples -------- - + >>> seq1 = ProteinSequence("BIQTITE") >>> seq2 = ProteinSequence("TITANITE") >>> seq3 = ProteinSequence("BISMITE") @@ -146,72 +149,71 @@ def get_command(self): """ return " ".join(self._command) - @requires_state(AppState.CREATED) def set_exec_dir(self, exec_dir): """ Set the directory where the application should be executed. If not set, it will be executed in the working directory at the - time the application was created. - + time the application was created. + PROTECTED: Do not call from outside. - + Parameters ---------- exec_dir : str The execution directory. """ self._exec_dir = exec_dir - + @requires_state(AppState.RUNNING | AppState.FINISHED) def get_process(self): """ Get the `Popen` instance. - + PROTECTED: Do not call from outside. - + Returns ------- process : Popen The `Popen` instance """ return self._process - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_exit_code(self): """ Get the exit code of the process. - + PROTECTED: Do not call from outside. - + Returns ------- code : int The exit code. """ return self._process.returncode - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_stdout(self): """ Get the STDOUT pipe content of the process. - + PROTECTED: Do not call from outside. - + Returns ------- stdout : str The standard output. """ return self._stdout - + @requires_state(AppState.FINISHED | AppState.JOINED) def get_stderr(self): """ Get the STDERR pipe content of the process. - + PROTECTED: Do not call from outside. - + Returns ------- stdout : str @@ -221,38 +223,37 @@ def get_stderr(self): def run(self): cwd = getcwd() - chdir(self._exec_dir) + chdir(self._exec_dir) self._command = [self._bin_path] + self._options + self._arguments self._process = Popen( - self._command, stdin=self._stdin_file, stdout=PIPE, stderr=PIPE, - encoding="UTF-8" + self._command, + stdin=self._stdin_file, + stdout=PIPE, + stderr=PIPE, + encoding="UTF-8", ) chdir(cwd) - + def is_finished(self): code = self._process.poll() - if code == None: + if code is None: return False else: self._stdout, self._stderr = self._process.communicate() return True - + @requires_state(AppState.RUNNING | AppState.FINISHED) def join(self, timeout=None): # Override method as repetitive calls of 'is_finished()' # are not necessary as 'communicate()' already waits for the # finished application try: - self._stdout, self._stderr = self._process.communicate( - timeout=timeout - ) + self._stdout, self._stderr = self._process.communicate(timeout=timeout) except TimeoutExpired: self.cancel() - raise TimeoutError( - f"The application expired its timeout ({timeout:.1f} s)" - ) + raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)") self._state = AppState.FINISHED - + try: self.evaluate() except AppStateError: @@ -263,12 +264,11 @@ def join(self, timeout=None): else: self._state = AppState.JOINED self.clean_up() - - + def wait_interval(self): # Not used in this implementation of 'join()' raise NotImplementedError() - + def evaluate(self): super().evaluate() # Check if applicaion terminated correctly @@ -276,10 +276,9 @@ def evaluate(self): if exit_code != 0: err_msg = self.get_stderr().replace("\n", " ") raise SubprocessError( - f"'{self._bin_path}' returned with exit code {exit_code}: " - f"{err_msg}" + f"'{self._bin_path}' returned with exit code {exit_code}: " f"{err_msg}" ) - + def clean_up(self): if self.get_app_state() == AppState.CANCELLED: self._process.kill() @@ -290,7 +289,7 @@ def cleanup_tempfile(temp_file): Close a :class:`NamedTemporaryFile` and delete it manually, if `delete` is set to ``False``. This function is a small helper function intended for usage in - `LocalApp` subclasses. + `LocalApp` subclasses. The manual deletion is necessary, as Windows does not allow to open a :class:`NamedTemporaryFile` as second time @@ -302,5 +301,8 @@ def cleanup_tempfile(temp_file): The temporary file to be closed and deleted. """ temp_file.close() - if not temp_file.delete: - remove(temp_file.name) \ No newline at end of file + try: + remove(temp_file.name) + except FileNotFoundError: + # File was already deleted, e.g. due to `TemporaryFile(delete=True)` + pass diff --git a/src/biotite/application/mafft/__init__.py b/src/biotite/application/mafft/__init__.py index 52f86e0ac..19def8bad 100644 --- a/src/biotite/application/mafft/__init__.py +++ b/src/biotite/application/mafft/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.mafft" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/mafft/app.py b/src/biotite/application/mafft/app.py index 2d4a22530..84f3f6b9b 100644 --- a/src/biotite/application/mafft/app.py +++ b/src/biotite/application/mafft/app.py @@ -6,25 +6,19 @@ __author__ = "Patrick Kunzmann" __all__ = ["MafftApp"] -import re import os -from ..msaapp import MSAApp -from ..application import AppState, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.io.fasta.file import FastaFile -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree - +import re +from biotite.application.application import AppState, requires_state +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree _prefix_pattern = re.compile(r"\d*_") - class MafftApp(MSAApp): """ Perform a multiple sequence alignment using MAFFT. - + Parameters ---------- sequences : list of Sequence @@ -33,7 +27,7 @@ class MafftApp(MSAApp): Path of the MUSCLE binary. matrix : SubstitutionMatrix, optional A custom substitution matrix. - + Examples -------- @@ -51,19 +45,19 @@ class MafftApp(MSAApp): -BISMITE --IQLITE """ - + def __init__(self, sequences, bin_path="mafft", matrix=None): super().__init__(sequences, bin_path, matrix) self._tree = None self._out_tree_file_name = self.get_input_file_path() + ".tree" - + def run(self): args = [ "--quiet", "--auto", "--treeout", # Get the reordered alignment in order for - # get_alignment_order() to work properly + # get_alignment_order() to work properly "--reorder", ] if self.get_seqtype() == "protein": @@ -75,7 +69,7 @@ def run(self): args += [self.get_input_file_path()] self.set_arguments(args) super().run() - + def evaluate(self): with open(self.get_output_file_path(), "w") as f: # MAFFT outputs alignment to stdout @@ -89,7 +83,7 @@ def evaluate(self): # -> remove the '_' prefix newick = re.sub(_prefix_pattern, "", raw_newick) self._tree = Tree.from_newick(newick) - + def clean_up(self): os.remove(self._out_tree_file_name) @@ -97,26 +91,26 @@ def clean_up(self): def get_guide_tree(self): """ Get the guide tree created for the progressive alignment. - + Returns ------- tree : Tree The guide tree. """ return self._tree - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return True - + @staticmethod def supports_custom_protein_matrix(): return True diff --git a/src/biotite/application/msaapp.py b/src/biotite/application/msaapp.py index bf490872e..31eb0064c 100644 --- a/src/biotite/application/msaapp.py +++ b/src/biotite/application/msaapp.py @@ -7,22 +7,22 @@ __all__ = ["MSAApp"] import abc -from tempfile import NamedTemporaryFile from collections import OrderedDict +from tempfile import NamedTemporaryFile import numpy as np -from .localapp import LocalApp, cleanup_tempfile -from .application import AppState, requires_state -from ..sequence.seqtypes import NucleotideSequence, ProteinSequence -from ..sequence.io.fasta.file import FastaFile -from ..sequence.align.alignment import Alignment -from .util import map_sequence, map_matrix +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.util import map_matrix, map_sequence +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence class MSAApp(LocalApp, metaclass=abc.ABCMeta): """ This is an abstract base class for multiple sequence alignment software. - + It handles conversion of :class:`Sequence` objects to FASTA input and FASTA output to an :class:`Alignment` object. Inheriting subclasses only need to incorporate the file path @@ -41,10 +41,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta): sequences are mapped back into the original sequence types. The mapping does not work, when the alphabet of the exotic sequences is larger than the amino acid alphabet. - + Internally this creates a :class:`Popen` instance, which handles the execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -54,10 +54,10 @@ class MSAApp(LocalApp, metaclass=abc.ABCMeta): matrix : SubstitutionMatrix, optional A custom substitution matrix. """ - + def __init__(self, sequences, bin_path, matrix=None): super().__init__(bin_path) - + if len(sequences) < 2: raise ValueError("At least two sequences are required") # Check if all sequences share the same alphabet @@ -68,40 +68,39 @@ def __init__(self, sequences, bin_path, matrix=None): # Check matrix symmetry if matrix is not None and not matrix.is_symmetric(): raise ValueError( - "A symmetric matrix is required for " - "multiple sequence alignments" + "A symmetric matrix is required for " "multiple sequence alignments" ) - # Check whether the program supports the alignment for the given # sequence type - if ProteinSequence.alphabet.extends(alphabet) \ - and self.supports_protein(): - self._is_mapped = False - self._seqtype = "protein" - if matrix is not None: - if not self.supports_custom_protein_matrix(): - raise TypeError( - "The software does not support custom " - "substitution matrices for protein sequences" - ) - self._matrix = matrix - else: - self._matrix = None - - elif NucleotideSequence.alphabet_amb.extends(alphabet) \ - and self.supports_nucleotide(): - self._is_mapped = False - self._seqtype = "nucleotide" - if matrix is not None: - if not self.supports_custom_nucleotide_matrix(): - raise TypeError( - "The software does not support custom " - "substitution matrices for nucleotide sequences" - ) - self._matrix = matrix - else: - self._matrix = None + if ProteinSequence.alphabet.extends(alphabet) and self.supports_protein(): + self._is_mapped = False + self._seqtype = "protein" + if matrix is not None: + if not self.supports_custom_protein_matrix(): + raise TypeError( + "The software does not support custom " + "substitution matrices for protein sequences" + ) + self._matrix = matrix + else: + self._matrix = None + + elif ( + NucleotideSequence.alphabet_amb.extends(alphabet) + and self.supports_nucleotide() + ): + self._is_mapped = False + self._seqtype = "nucleotide" + if matrix is not None: + if not self.supports_custom_nucleotide_matrix(): + raise TypeError( + "The software does not support custom " + "substitution matrices for nucleotide sequences" + ) + self._matrix = matrix + else: + self._matrix = None else: # For all other sequence types, try to map the sequence into @@ -126,26 +125,16 @@ def __init__(self, sequences, bin_path, matrix=None): self._sequences = sequences # Sequence masquerades as protein self._seqtype = "protein" - self._mapped_sequences = [ - map_sequence(sequence) for sequence in sequences - ] + self._mapped_sequences = [map_sequence(sequence) for sequence in sequences] self._matrix = map_matrix(matrix) - self._sequences = sequences - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) - self._out_file = NamedTemporaryFile( - "r", suffix=".fa", delete=False - ) - self._matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) + self._out_file = NamedTemporaryFile("r", suffix=".fa", delete=False) + self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) def run(self): - sequences = self._sequences if not self._is_mapped \ - else self._mapped_sequences + sequences = self._sequences if not self._is_mapped else self._mapped_sequences sequences_file = FastaFile() for i, seq in enumerate(sequences): sequences_file[str(i)] = str(seq) @@ -155,7 +144,7 @@ def run(self): self._matrix_file.write(str(self._matrix)) self._matrix_file.flush() super().run() - + def evaluate(self): super().evaluate() alignment_file = FastaFile.read(self._out_file) @@ -169,26 +158,26 @@ def evaluate(self): # Also obtain original order self._order = np.zeros(len(seq_dict), dtype=int) for i, seq_index in enumerate(seq_dict): - self._order[i] = int(seq_index) - + self._order[i] = int(seq_index) + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) cleanup_tempfile(self._out_file) cleanup_tempfile(self._matrix_file) - + @requires_state(AppState.JOINED) def get_alignment(self): """ Get the resulting multiple sequence alignment. - + Returns ------- alignment : Alignment The global multiple sequence alignment. """ return self._alignment - + @requires_state(AppState.JOINED) def get_alignment_order(self): """ @@ -202,12 +191,12 @@ def get_alignment_order(self): order. This method returns the order of the sequences intended by the MSA software. - + Returns ------- order : ndarray, dtype=int The sequence order intended by the MSA software. - + Examples -------- Align sequences and restore the original order: @@ -220,39 +209,39 @@ def get_alignment_order(self): alignment = alignment[:, order] """ return self._order - + def get_input_file_path(self): """ Get input file path (FASTA format). - + PROTECTED: Do not call from outside. - + Returns ------- path : str Path of input file. """ return self._in_file.name - + def get_output_file_path(self): """ Get output file path (FASTA format). - + PROTECTED: Do not call from outside. - + Returns ------- path : str Path of output file. """ return self._out_file.name - + def get_matrix_file_path(self): """ Get file path for custom substitution matrix. - + PROTECTED: Do not call from outside. - + Returns ------- path : str or None @@ -260,7 +249,7 @@ def get_matrix_file_path(self): None if no matrix was given. """ return self._matrix_file.name if self._matrix is not None else None - + def get_seqtype(self): """ Get the type of aligned sequences. @@ -268,16 +257,16 @@ def get_seqtype(self): When a custom sequence type (neither nucleotide nor protein) is mapped onto a protein sequence, the return value is also ``'protein'``. - + PROTECTED: Do not call from outside. - + Returns ------- seqtype : {'nucleotide', 'protein'} Type of sequences to be aligned. """ return self._seqtype - + @staticmethod @abc.abstractmethod def supports_nucleotide(): @@ -289,11 +278,11 @@ def supports_nucleotide(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_protein(): @@ -305,11 +294,11 @@ def supports_protein(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_custom_nucleotide_matrix(): @@ -321,11 +310,11 @@ def supports_custom_nucleotide_matrix(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @staticmethod @abc.abstractmethod def supports_custom_protein_matrix(): @@ -337,19 +326,19 @@ def supports_custom_protein_matrix(): ------- support : bool True, if the class has support, false otherwise. - + PROTECTED: Override when inheriting. """ pass - + @classmethod def align(cls, sequences, bin_path=None, matrix=None): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`MSAApp` execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -359,7 +348,7 @@ def align(cls, sequences, bin_path=None, matrix=None): path will be used. matrix : SubstitutionMatrix, optional A custom substitution matrix. - + Returns ------- alignment : Alignment diff --git a/src/biotite/application/muscle/__init__.py b/src/biotite/application/muscle/__init__.py index 644e7a118..c75f0f8be 100644 --- a/src/biotite/application/muscle/__init__.py +++ b/src/biotite/application/muscle/__init__.py @@ -10,4 +10,4 @@ __author__ = "Patrick Kunzmann" from .app3 import * -from .app5 import * \ No newline at end of file +from .app5 import * diff --git a/src/biotite/application/muscle/app3.py b/src/biotite/application/muscle/app3.py index 8df72ce65..60118966a 100644 --- a/src/biotite/application/muscle/app3.py +++ b/src/biotite/application/muscle/app3.py @@ -6,25 +6,22 @@ __author__ = "Patrick Kunzmann" __all__ = ["MuscleApp"] -import re import numbers -import warnings +import re import subprocess +import warnings +from collections.abc import Sequence from tempfile import NamedTemporaryFile -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp -from ..application import AppState, VersionError, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.align.matrix import SubstitutionMatrix -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree +from biotite.application.application import AppState, VersionError, requires_state +from biotite.application.localapp import cleanup_tempfile +from biotite.application.msaapp import MSAApp +from biotite.sequence.phylo.tree import Tree class MuscleApp(MSAApp): """ Perform a multiple sequence alignment using MUSCLE version 3. - + Parameters ---------- sequences : list of Sequence @@ -33,11 +30,11 @@ class MuscleApp(MSAApp): Path of the MUSCLE binary. matrix : SubstitutionMatrix, optional A custom substitution matrix. - + See also -------- Muscle5App - + Examples -------- @@ -55,34 +52,32 @@ class MuscleApp(MSAApp): BISM-ITE -IQL-ITE """ - + def __init__(self, sequences, bin_path="muscle", matrix=None): major_version = get_version(bin_path)[0] if major_version != 3: - raise VersionError( - f"Muscle 3 is required, got version {major_version}" - ) - + raise VersionError(f"Muscle 3 is required, got version {major_version}") + super().__init__(sequences, bin_path, matrix) self._gap_open = None self._gap_ext = None self._terminal_penalty = None self._tree1 = None self._tree2 = None - self._out_tree1_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - self._out_tree2_file = NamedTemporaryFile( - "r", suffix=".tree", delete=False - ) - + self._out_tree1_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + self._out_tree2_file = NamedTemporaryFile("r", suffix=".tree", delete=False) + def run(self): args = [ "-quiet", - "-in", self.get_input_file_path(), - "-out", self.get_output_file_path(), - "-tree1", self._out_tree1_file.name, - "-tree2", self._out_tree2_file.name, + "-in", + self.get_input_file_path(), + "-out", + self.get_output_file_path(), + "-tree1", + self._out_tree1_file.name, + "-tree2", + self._out_tree2_file.name, ] if self.get_seqtype() == "protein": args += ["-seqtype", "protein"] @@ -91,7 +86,7 @@ def run(self): if self.get_matrix_file_path() is not None: args += ["-matrix", self.get_matrix_file_path()] if self._gap_open is not None and self._gap_ext is not None: - args += ["-gapopen", f"{self._gap_open:.1f}"] + args += ["-gapopen", f"{self._gap_open:.1f}"] args += ["-gapextend", f"{self._gap_ext:.1f}"] # When the gap penalty is set, # use the penalty also for hydrophobic regions @@ -100,7 +95,7 @@ def run(self): args += ["-center", "0.0"] self.set_arguments(args) super().run() - + def evaluate(self): super().evaluate() @@ -108,23 +103,19 @@ def evaluate(self): if len(newick) > 0: self._tree1 = Tree.from_newick(newick) else: - warnings.warn( - "MUSCLE did not write a tree file from the first iteration" - ) - + warnings.warn("MUSCLE did not write a tree file from the first iteration") + newick = self._out_tree2_file.read().replace("\n", "") if len(newick) > 0: self._tree2 = Tree.from_newick(newick) else: - warnings.warn( - "MUSCLE did not write a tree file from the second iteration" - ) - + warnings.warn("MUSCLE did not write a tree file from the second iteration") + def clean_up(self): super().clean_up() cleanup_tempfile(self._out_tree1_file) cleanup_tempfile(self._out_tree2_file) - + @requires_state(AppState.CREATED) def set_gap_penalty(self, gap_penalty): """ @@ -145,20 +136,20 @@ def set_gap_penalty(self, gap_penalty): if gap_penalty > 0: raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty - self._gap_ext= gap_penalty - elif type(gap_penalty) == tuple: + self._gap_ext = gap_penalty + elif isinstance(gap_penalty, Sequence): if gap_penalty[0] > 0 or gap_penalty[1] > 0: - raise ValueError("Gap penalty must be negative") + raise ValueError("Gap penalty must be negative") self._gap_open = gap_penalty[0] self._gap_ext = gap_penalty[1] else: raise TypeError("Gap penalty must be either float or tuple") - + @requires_state(AppState.JOINED) def get_guide_tree(self, iteration="identity"): """ Get the guide tree created for the progressive alignment. - + Parameters ---------- iteration : {'kmer', 'identity'} @@ -168,7 +159,7 @@ def get_guide_tree(self, iteration="identity"): If 'identity' the second iteration tree is returned. This tree uses distances based on the pairwise sequence identity after the first progressive alignment iteration. - + Returns ------- tree : Tree @@ -180,32 +171,31 @@ def get_guide_tree(self, iteration="identity"): return self._tree2 else: raise ValueError("Iteration must be 'kmer' or 'identity'") - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return True - + @classmethod - def align(cls, sequences, bin_path=None, matrix=None, - gap_penalty=None): + def align(cls, sequences, bin_path=None, matrix=None, gap_penalty=None): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`MuscleApp` execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -222,7 +212,7 @@ def align(cls, sequences, bin_path=None, matrix=None, The first value in the tuple is the gap opening penalty, the second value is the gap extension penalty. The values need to be negative. - + Returns ------- alignment : Alignment @@ -240,15 +230,11 @@ def align(cls, sequences, bin_path=None, matrix=None, def get_version(bin_path="muscle"): - output = subprocess.run( - [bin_path, "-version"], capture_output=True, text=True - ) + output = subprocess.run([bin_path, "-version"], capture_output=True, text=True) # Find matches for version string containing major and minor version - match = re.search("\d+\.\d+", output.stdout) + match = re.search(r"\d+\.\d+", output.stdout) if match is None: - raise subprocess.SubprocessError( - "Could not determine Muscle version" - ) + raise subprocess.SubprocessError("Could not determine Muscle version") version_string = match.group(0) splitted = version_string.split(".") - return int(splitted[0]), int(splitted[1]) \ No newline at end of file + return int(splitted[0]), int(splitted[1]) diff --git a/src/biotite/application/muscle/app5.py b/src/biotite/application/muscle/app5.py index 326c92227..cc1ef5e2a 100644 --- a/src/biotite/application/muscle/app5.py +++ b/src/biotite/application/muscle/app5.py @@ -6,31 +6,22 @@ __author__ = "Patrick Kunzmann" __all__ = ["Muscle5App"] -import numbers -import warnings -from tempfile import NamedTemporaryFile -from ..localapp import cleanup_tempfile -from ..msaapp import MSAApp -from ..application import AppState, VersionError, requires_state -from ...sequence.sequence import Sequence -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.align.matrix import SubstitutionMatrix -from ...sequence.align.alignment import Alignment -from ...sequence.phylo.tree import Tree -from .app3 import get_version +from biotite.application.application import AppState, VersionError, requires_state +from biotite.application.msaapp import MSAApp +from biotite.application.muscle.app3 import get_version class Muscle5App(MSAApp): """ Perform a multiple sequence alignment using MUSCLE version 5. - + Parameters ---------- sequences : list of Sequence The sequences to be aligned. bin_path : str, optional Path of the MUSCLE binary. - + See also -------- MuscleApp @@ -38,7 +29,7 @@ class Muscle5App(MSAApp): Notes ----- Alignment ensemble generation is not supported, yet. - + Examples -------- @@ -56,14 +47,14 @@ class Muscle5App(MSAApp): BI-SMITE -I-QLITE """ - + def __init__(self, sequences, bin_path="muscle"): major_version = get_version(bin_path)[0] if major_version < 5: raise VersionError( f"At least Muscle 5 is required, got version {major_version}" ) - + super().__init__(sequences, bin_path) self._mode = "align" self._consiters = None @@ -86,7 +77,7 @@ def set_iterations(self, consistency=None, refinement=None): self._consiters = consistency if refinement is not None: self._refineiters = refinement - + @requires_state(AppState.CREATED) def set_thread_number(self, number): """ @@ -110,48 +101,49 @@ def run(self): args = [ f"-{self._mode}", self.get_input_file_path(), - "-output", self.get_output_file_path(), + "-output", + self.get_output_file_path(), ] if self.get_seqtype() == "protein": args += ["-amino"] else: args += ["-nt"] if self._n_threads is not None: - args += ["-threads", str(self._n_threads)] + args += ["-threads", str(self._n_threads)] if self._consiters is not None: - args += ["-consiters", str(self._consiters)] + args += ["-consiters", str(self._consiters)] if self._refineiters is not None: - args += ["-refineiters", str(self._refineiters)] + args += ["-refineiters", str(self._refineiters)] self.set_arguments(args) super().run() - + def clean_up(self): super().clean_up() - + @staticmethod def supports_nucleotide(): return True - + @staticmethod def supports_protein(): return True - + @staticmethod def supports_custom_nucleotide_matrix(): return False - + @staticmethod def supports_custom_protein_matrix(): return False - + @classmethod def align(cls, sequences, bin_path="muscle"): """ Perform a multiple sequence alignment. - + This is a convenience function, that wraps the :class:`Muscle5App` execution. - + Parameters ---------- sequences : iterable object of Sequence @@ -159,7 +151,7 @@ def align(cls, sequences, bin_path="muscle"): bin_path : str, optional Path of the MSA software binary. By default, the default path will be used. - + Returns ------- alignment : Alignment diff --git a/src/biotite/application/sra/__init__.py b/src/biotite/application/sra/__init__.py index d68a49d3e..f69fccde6 100644 --- a/src/biotite/application/sra/__init__.py +++ b/src/biotite/application/sra/__init__.py @@ -15,4 +15,4 @@ __name__ = "biotite.application.sra" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/sra/app.py b/src/biotite/application/sra/app.py index 6f5a20955..7fc39ab4c 100644 --- a/src/biotite/application/sra/app.py +++ b/src/biotite/application/sra/app.py @@ -7,17 +7,21 @@ __all__ = ["FastaDumpApp", "FastqDumpApp"] import abc -from os.path import join -from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired import glob +from os.path import join +from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired from tempfile import TemporaryDirectory -from ..application import Application, AppState, AppStateError, \ - requires_state -from ...sequence.seqtypes import NucleotideSequence -from ...sequence.io.fastq.file import FastqFile -from ...sequence.io.fasta.file import FastaFile -from ...sequence.io.fastq.convert import get_sequences as get_sequences_and_scores -from ...sequence.io.fasta.convert import get_sequences +from biotite.application.application import ( + Application, + AppState, + AppStateError, + requires_state, +) +from biotite.sequence.io.fasta.convert import get_sequences +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.io.fastq.convert import get_sequences as get_sequences_and_scores +from biotite.sequence.io.fastq.file import FastqFile +from biotite.sequence.seqtypes import NucleotideSequence # Do not use LocalApp, as two programs are executed @@ -48,8 +52,13 @@ class _DumpApp(Application, metaclass=abc.ABCMeta): the score format. """ - def __init__(self, uid, output_path_prefix=None, - prefetch_path="prefetch", fasterq_dump_path="fasterq-dump"): + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): super().__init__() self._prefetch_path = prefetch_path self._fasterq_dump_path = fasterq_dump_path @@ -62,21 +71,16 @@ def __init__(self, uid, output_path_prefix=None, self._prefetch_process = None self._fasterq_dump_process = None - @requires_state(AppState.RUNNING | AppState.FINISHED) def join(self, timeout=None): # Override method as repetitive calls of 'is_finished()' # are not necessary as 'communicate()' already waits for the # finished application try: - _, self._stderr = self._process.communicate( - timeout=timeout - ) + _, self._stderr = self._process.communicate(timeout=timeout) except TimeoutExpired: self.cancel() - raise TimeoutError( - f"The application expired its timeout ({timeout:.1f} s)" - ) + raise TimeoutError(f"The application expired its timeout ({timeout:.1f} s)") self._state = AppState.FINISHED try: @@ -90,7 +94,6 @@ def join(self, timeout=None): self._state = AppState.JOINED self.clean_up() - def run(self): # Prefetch into a temp directory with file name equaling UID # This ensures that the ID in the header is not the temp prefix @@ -105,16 +108,14 @@ def run(self): command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8" ) - def is_finished(self): code = self._process.poll() - if code == None: + if code is None: return False else: - _, self._stderr = self._process.communicate() + _, self._stderr = self._process.communicate() return True - def evaluate(self): super().evaluate() # Check if applicaion terminated correctly @@ -128,26 +129,24 @@ def evaluate(self): self._file_names = ( # For entries with one read per spot - glob.glob(self._prefix + ".fastq") + + glob.glob(self._prefix + ".fastq") + + # For entries with multiple reads per spot glob.glob(self._prefix + "_*.fastq") ) # Only load FASTQ files into memory when needed self._fastq_files = None - def wait_interval(self): # Not used in this implementation of 'join()' raise NotImplementedError() - def clean_up(self): if self.get_app_state() == AppState.CANCELLED: self._process.kill() # Directory with temp files does not need to be deleted, # as temp dir is automatically deleted upon object destruction - @requires_state(AppState.CREATED) def get_prefetch_options(self): """ @@ -176,7 +175,6 @@ def get_fastq_dump_options(self): """ return "" - @requires_state(AppState.JOINED) def get_file_paths(self): """ @@ -189,7 +187,6 @@ def get_file_paths(self): """ return self._file_names - @requires_state(AppState.JOINED) @abc.abstractmethod def get_sequences(self): @@ -236,15 +233,18 @@ class FastqDumpApp(_DumpApp): the score format. """ - def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump", offset="Sanger"): - super().__init__( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + offset="Sanger", + ): + super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path) self._offset = offset self._fastq_files = None - @requires_state(AppState.JOINED) def get_fastq(self): """ @@ -265,20 +265,16 @@ def get_fastq(self): ] return self._fastq_files - @requires_state(AppState.JOINED) def get_sequences(self): return [ { - header: NucleotideSequence( - seq_str.replace("U","T").replace("X","N") - ) + header: NucleotideSequence(seq_str.replace("U", "T").replace("X", "N")) for header, (seq_str, _) in fastq_file.items() } for fastq_file in self.get_fastq() ] - @requires_state(AppState.JOINED) def get_sequences_and_scores(self): """ @@ -294,15 +290,17 @@ def get_sequences_and_scores(self): Each item in the list is a dictionary mapping identifiers to its corresponding sequence and score values. """ - return [ - get_sequences_and_scores(fastq_file) - for fastq_file in self.get_fastq() - ] - + return [get_sequences_and_scores(fastq_file) for fastq_file in self.get_fastq()] @classmethod - def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump", offset="Sanger"): + def fetch( + cls, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + offset="Sanger", + ): """ Get the sequences belonging to the UID from the *NCBI sequence read archive* (SRA). @@ -338,9 +336,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", Each item in the list is a dictionary mapping identifiers to its corresponding sequence. """ - app = cls( - uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset - ) + app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path, offset) app.start() app.join() return app.get_sequences() @@ -368,14 +364,16 @@ class FastaDumpApp(_DumpApp): respectively. """ - def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump"): - super().__init__( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + def __init__( + self, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): + super().__init__(uid, output_path_prefix, prefetch_path, fasterq_dump_path) self._fasta_files = None - @requires_state(AppState.CREATED) def get_prefetch_options(self): return @@ -383,12 +381,10 @@ def get_prefetch_options(self): # when https://github.com/ncbi/sra-tools/issues/883 is resolved # return "--eliminate-quals" - @requires_state(AppState.CREATED) def get_fastq_dump_options(self): return "--fasta" - @requires_state(AppState.JOINED) def get_fasta(self): """ @@ -404,20 +400,22 @@ def get_fasta(self): """ if self._fasta_files is None: self._fasta_files = [ - FastaFile.read(file_name) - for file_name in self.get_file_paths() + FastaFile.read(file_name) for file_name in self.get_file_paths() ] return self._fasta_files - @requires_state(AppState.JOINED) def get_sequences(self): return [get_sequences(fasta_file) for fasta_file in self.get_fasta()] - @classmethod - def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", - fasterq_dump_path="fasterq-dump"): + def fetch( + cls, + uid, + output_path_prefix=None, + prefetch_path="prefetch", + fasterq_dump_path="fasterq-dump", + ): """ Get the sequences belonging to the UID from the *NCBI sequence read archive* (SRA). @@ -448,9 +446,7 @@ def fetch(cls, uid, output_path_prefix=None, prefetch_path="prefetch", Each item in the list is a dictionary mapping identifiers to its corresponding sequence. """ - app = cls( - uid, output_path_prefix, prefetch_path, fasterq_dump_path - ) + app = cls(uid, output_path_prefix, prefetch_path, fasterq_dump_path) app.start() app.join() - return app.get_sequences() \ No newline at end of file + return app.get_sequences() diff --git a/src/biotite/application/tantan/__init__.py b/src/biotite/application/tantan/__init__.py index 6efc86610..7a829420a 100644 --- a/src/biotite/application/tantan/__init__.py +++ b/src/biotite/application/tantan/__init__.py @@ -9,4 +9,4 @@ __name__ = "biotite.application.tantan" __author__ = "Patrick Kunzmann" -from .app import * \ No newline at end of file +from .app import * diff --git a/src/biotite/application/tantan/app.py b/src/biotite/application/tantan/app.py index 077a5cbdd..6d7020569 100644 --- a/src/biotite/application/tantan/app.py +++ b/src/biotite/application/tantan/app.py @@ -6,17 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["TantanApp"] -from collections.abc import Sequence as SequenceABC import io +from collections.abc import Sequence as SequenceABC from tempfile import NamedTemporaryFile import numpy as np -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...sequence.alphabet import common_alphabet -from ...sequence.io.fasta.file import FastaFile -from ..util import map_sequence, map_matrix - +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.sequence.alphabet import common_alphabet +from biotite.sequence.io.fasta.file import FastaFile +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence MASKING_LETTER = "!" @@ -43,7 +41,7 @@ class TantanApp(LocalApp): References ---------- - + .. footbibliography:: Examples @@ -59,10 +57,10 @@ class TantanApp(LocalApp): True True True True True True True True False False False False False] >>> print(sequence, "\n" + "".join(["^" if e else " " for e in repeat_mask])) - GGCATCGATATATATATATAGTCAA - ^^^^^^^^^^^ + GGCATCGATATATATATATAGTCAA + ^^^^^^^^^^^ """ - + def __init__(self, sequence, matrix=None, bin_path="tantan"): super().__init__(bin_path) @@ -93,59 +91,43 @@ def __init__(self, sequence, matrix=None, bin_path="tantan"): ) self._is_protein = True else: - raise TypeError( - "A NucleotideSequence or ProteinSequence is required" - ) - + raise TypeError("A NucleotideSequence or ProteinSequence is required") + if matrix is None: self._matrix_file = None else: - common_alph = common_alphabet( - (seq.alphabet for seq in self._sequences) - ) + common_alph = common_alphabet((seq.alphabet for seq in self._sequences)) if common_alph is None: - raise ValueError( - "There is no common alphabet within the sequences" - ) + raise ValueError("There is no common alphabet within the sequences") if not matrix.get_alphabet1().extends(common_alph): raise ValueError( "The alphabet of the sequence(s) do not fit the matrix" ) if not matrix.is_symmetric(): raise ValueError("A symmetric matrix is required") - self._matrix_file = NamedTemporaryFile( - "w", suffix=".mat", delete=False - ) + self._matrix_file = NamedTemporaryFile("w", suffix=".mat", delete=False) self._matrix = matrix - - self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) def run(self): FastaFile.write_iter( self._in_file, - ( - (f"sequence_{i:d}", str(seq)) - for i, seq in enumerate(self._sequences) - ) + ((f"sequence_{i:d}", str(seq)) for i, seq in enumerate(self._sequences)), ) self._in_file.flush() if self._matrix is not None: self._matrix_file.write(str(self._matrix)) self._matrix_file.flush() - + args = [] if self._matrix is not None: args += ["-m", self._matrix_file.name] if self._is_protein: - args += ["-p"] - args += [ - "-x", MASKING_LETTER, - self._in_file.name - ] + args += ["-p"] + args += ["-x", MASKING_LETTER, self._in_file.name] self.set_arguments(args) super().run() - def evaluate(self): super().evaluate() @@ -154,18 +136,14 @@ def evaluate(self): self._masks = [] encoded_masking_letter = MASKING_LETTER.encode("ASCII")[0] for _, masked_seq_string in FastaFile.read_iter(out_file): - array = np.frombuffer( - masked_seq_string.encode("ASCII"), dtype=np.ubyte - ) + array = np.frombuffer(masked_seq_string.encode("ASCII"), dtype=np.ubyte) self._masks.append(array == encoded_masking_letter) - def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) if self._matrix_file is not None: cleanup_tempfile(self._matrix_file) - @requires_state(AppState.JOINED) def get_mask(self): @@ -186,7 +164,6 @@ def get_mask(self): else: return self._masks[0] - @staticmethod def mask_repeats(sequence, matrix=None, bin_path="tantan"): """ @@ -219,4 +196,4 @@ def mask_repeats(sequence, matrix=None, bin_path="tantan"): app = TantanApp(sequence, matrix, bin_path) app.start() app.join() - return app.get_mask() \ No newline at end of file + return app.get_mask() diff --git a/src/biotite/application/util.py b/src/biotite/application/util.py index ce544c417..4da2a342f 100644 --- a/src/biotite/application/util.py +++ b/src/biotite/application/util.py @@ -8,15 +8,15 @@ import numpy as np -from ..sequence.seqtypes import ProteinSequence -from ..sequence.align.matrix import SubstitutionMatrix +from biotite.sequence.align.matrix import SubstitutionMatrix +from biotite.sequence.seqtypes import ProteinSequence def map_sequence(sequence): """ Map a sequence with an arbitrary alphabet into a :class:`ProteinSequence`, in order to support arbitrary sequence - types in software that can handle protein sequences. + types in software that can handle protein sequences. """ if len(sequence.alphabet) > len(ProteinSequence.alphabet): # Cannot map into a protein sequence if the alphabet @@ -39,12 +39,11 @@ def map_matrix(matrix): Map a :class:`SubstitutionMatrix` with an arbitrary alphabet into a class:`SubstitutionMatrix` for protein sequences, in order to support arbitrary sequence types in software that can handle protein - sequences. + sequences. """ if matrix is None: raise TypeError( - "A substitution matrix must be provided for custom " - "sequence types" + "A substitution matrix must be provided for custom " "sequence types" ) # Create a protein substitution matrix with the values taken # from the original matrix @@ -54,6 +53,5 @@ def map_matrix(matrix): new_score_matrix = np.zeros((new_length, new_length)) new_score_matrix[:old_length, :old_length] = matrix.score_matrix() return SubstitutionMatrix( - ProteinSequence.alphabet, ProteinSequence.alphabet, - new_score_matrix - ) \ No newline at end of file + ProteinSequence.alphabet, ProteinSequence.alphabet, new_score_matrix + ) diff --git a/src/biotite/application/viennarna/rnaalifold.py b/src/biotite/application/viennarna/rnaalifold.py index aadc61b97..4604780aa 100644 --- a/src/biotite/application/viennarna/rnaalifold.py +++ b/src/biotite/application/viennarna/rnaalifold.py @@ -9,12 +9,12 @@ import copy from tempfile import NamedTemporaryFile import numpy as np -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile -from ...sequence.io.fasta import FastaFile, set_alignment -from ...structure.dotbracket import base_pairs_from_dot_bracket -from ...structure.bonds import BondList -from .util import build_constraint_string +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.viennarna.util import build_constraint_string +from biotite.sequence.io.fasta import FastaFile, set_alignment +from biotite.structure.bonds import BondList +from biotite.structure.dotbracket import base_pairs_from_dot_bracket class RNAalifoldApp(LocalApp): @@ -45,9 +45,7 @@ def __init__(self, alignment, temperature=37, bin_path="RNAalifold"): self._temperature = str(temperature) self._constraints = None self._enforce = None - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) self._constraints_file = NamedTemporaryFile( "w+", suffix=".constraints", delete=False ) @@ -57,15 +55,17 @@ def run(self): # -> Extremely high value for characters per line fasta_file = FastaFile(chars_per_line=np.iinfo(np.int32).max) set_alignment( - fasta_file, self._alignment, - seq_names=[str(i) for i in range(len(self._alignment.sequences))] + fasta_file, + self._alignment, + seq_names=[str(i) for i in range(len(self._alignment.sequences))], ) fasta_file.write(self._in_file) self._in_file.flush() options = [ "--noPS", - "-T", self._temperature, + "-T", + self._temperature, ] if self._enforce is True: options.append("--enforceConstraint") @@ -78,7 +78,7 @@ def run(self): self.set_arguments(options + [self._in_file.name]) super().run() - + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) @@ -97,7 +97,7 @@ def evaluate(self): self._free_energy = float(energy_contributions[0]) self._covariance_energy = float(energy_contributions[1]) self._dotbracket = dotbracket - + @requires_state(AppState.CREATED) def set_temperature(self, temperature): """ @@ -110,10 +110,17 @@ def set_temperature(self, temperature): The temperature. """ self._temperature = str(temperature) - + @requires_state(AppState.CREATED) - def set_constraints(self, pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None, enforce=False): + def set_constraints( + self, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, + enforce=False, + ): """ Add constraints of known paired or unpaired bases to the folding algorithm. @@ -138,15 +145,14 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None, the respective base pairs must form. By default (false), a constraint does only forbid formation of a pair that would conflict with this constraint. - + Warnings -------- If a constraint is given for a gap position in the consensus sequence, the software may find no base pairs at all. """ self._constraints = build_constraint_string( - len(self._alignment), - pairs, paired, unpaired, downstream, upstream + len(self._alignment), pairs, paired, unpaired, downstream, upstream ) self._enforce = enforce @@ -160,19 +166,19 @@ def get_free_energy(self): ------- free_energy : float The free energy. - + Notes ----- The total energy of the secondary structure regarding the minimization objective is the sum of the free energy and the covariance term. - + See also -------- get_covariance_energy """ return self._free_energy - + @requires_state(AppState.JOINED) def get_covariance_energy(self): """ @@ -183,19 +189,19 @@ def get_covariance_energy(self): ------- covariance_energy : float The energy of the covariance term. - + Notes ----- The total energy of the secondary structure regarding the minimization objective is the sum of the free energy and the covariance term. - + See also -------- get_free_energy """ return self._covariance_energy - + @requires_state(AppState.JOINED) def get_consensus_sequence_string(self): """ @@ -265,7 +271,7 @@ def get_base_pairs(self, sequence_index=None): pair_list = pair_list[trace != -1] # Convert back to array of base pairs, # remove unused BondType column - base_pairs = pair_list.as_array()[:,:2] + base_pairs = pair_list.as_array()[:, :2] return base_pairs @staticmethod @@ -300,5 +306,5 @@ def compute_secondary_structure(alignment, bin_path="RNAalifold"): return ( app.get_dot_bracket(), app.get_free_energy(), - app.get_covariance_energy() + app.get_covariance_energy(), ) diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py index 52fca90c2..37fb0e3d7 100644 --- a/src/biotite/application/viennarna/rnafold.py +++ b/src/biotite/application/viennarna/rnafold.py @@ -6,14 +6,13 @@ __author__ = "Tom David Müller, Patrick Kunzmann" __all__ = ["RNAfoldApp"] -import warnings from tempfile import NamedTemporaryFile import numpy as np -from ..application import AppState, requires_state -from ..localapp import LocalApp, cleanup_tempfile -from ...sequence.io.fasta import FastaFile, set_sequence -from ...structure.dotbracket import base_pairs_from_dot_bracket -from .util import build_constraint_string +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.application.viennarna.util import build_constraint_string +from biotite.sequence.io.fasta import FastaFile, set_sequence +from biotite.structure.dotbracket import base_pairs_from_dot_bracket class RNAfoldApp(LocalApp): @@ -51,9 +50,7 @@ def __init__(self, sequence, temperature=37, bin_path="RNAfold"): self._temperature = str(temperature) self._constraints = None self._enforce = None - self._in_file = NamedTemporaryFile( - "w", suffix=".fa", delete=False - ) + self._in_file = NamedTemporaryFile("w", suffix=".fa", delete=False) super().__init__(bin_path) def run(self): @@ -65,10 +62,11 @@ def run(self): fasta_file.lines.append(self._constraints) fasta_file.write(self._in_file) self._in_file.flush() - + options = [ "--noPS", - "-T", self._temperature, + "-T", + self._temperature, ] if self._enforce is True: options.append("--enforceConstraint") @@ -87,11 +85,11 @@ def evaluate(self): self._free_energy = free_energy self._dotbracket = dotbracket - + def clean_up(self): super().clean_up() cleanup_tempfile(self._in_file) - + @requires_state(AppState.CREATED) def set_temperature(self, temperature): """ @@ -104,10 +102,17 @@ def set_temperature(self, temperature): The temperature. """ self._temperature = str(temperature) - + @requires_state(AppState.CREATED) - def set_constraints(self, pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None, enforce=False): + def set_constraints( + self, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, + enforce=False, + ): """ Add constraints of known paired or unpaired bases to the folding algorithm. @@ -134,11 +139,10 @@ def set_constraints(self, pairs=None, paired=None, unpaired=None, of a pair that would conflict with this constraint. """ self._constraints = build_constraint_string( - len(self._sequence), - pairs, paired, unpaired, downstream, upstream + len(self._sequence), pairs, paired, unpaired, downstream, upstream ) self._enforce = enforce - + @requires_state(AppState.JOINED) def get_free_energy(self): """ @@ -162,25 +166,6 @@ def get_free_energy(self): """ return self._free_energy - @requires_state(AppState.JOINED) - def get_mfe(self): - """ - Get the free energy (kcal/mol) of the suggested - secondary structure. - - DEPRECATED: Use :meth:`get_free_energy()` instead. - - Returns - ------- - mfe : float - The minimum free energy. - """ - warnings.warn( - "'get_mfe()' is deprecated, use 'get_free_energy()' instead", - DeprecationWarning - ) - return self.get_free_energy() - @requires_state(AppState.JOINED) def get_dot_bracket(self): """ @@ -243,7 +228,7 @@ def get_base_pairs(self): @staticmethod def compute_secondary_structure(sequence, bin_path="RNAfold"): """ - Compute the minimum free energy secondary structure of a + Compute the minimum free energy secondary structure of a ribonucleic acid sequence using *ViennaRNA's* *RNAfold* software. This is a convenience function, that wraps the diff --git a/src/biotite/application/viennarna/rnaplot.py b/src/biotite/application/viennarna/rnaplot.py index 7eedea7ee..1f36f9142 100644 --- a/src/biotite/application/viennarna/rnaplot.py +++ b/src/biotite/application/viennarna/rnaplot.py @@ -6,13 +6,14 @@ __author__ = "Tom David Müller" __all__ = ["RNAplotApp"] -import numpy as np -from tempfile import NamedTemporaryFile -from os import remove from enum import IntEnum -from ..localapp import LocalApp, cleanup_tempfile -from ..application import AppState, requires_state -from ...structure.dotbracket import dot_bracket as dot_bracket_ +from os import remove +from tempfile import NamedTemporaryFile +import numpy as np +from biotite.application.application import AppState, requires_state +from biotite.application.localapp import LocalApp, cleanup_tempfile +from biotite.structure.dotbracket import dot_bracket as dot_bracket_ + class RNAplotApp(LocalApp): """ @@ -60,21 +61,28 @@ class Layout(IntEnum): This enum type represents the layout type of the plot according to the official *RNAplot* orientation. """ - RADIAL = 0, - NAVIEW = 1, - CIRCULAR = 2, - RNATURTLE = 3, + + RADIAL = (0,) + NAVIEW = (1,) + CIRCULAR = (2,) + RNATURTLE = (3,) RNAPUZZLER = 4 - def __init__(self, dot_bracket=None, base_pairs=None, length=None, - layout_type=Layout.NAVIEW, bin_path="RNAplot"): + def __init__( + self, + dot_bracket=None, + base_pairs=None, + length=None, + layout_type=Layout.NAVIEW, + bin_path="RNAplot", + ): super().__init__(bin_path) if dot_bracket is not None: self._dot_bracket = dot_bracket elif (base_pairs is not None) and (length is not None): self._dot_bracket = dot_bracket_( - base_pairs, length, max_pseudoknot_order = 0 + base_pairs, length, max_pseudoknot_order=0 )[0] else: raise ValueError( @@ -84,10 +92,10 @@ def __init__(self, dot_bracket=None, base_pairs=None, length=None, # Get the value of the enum type self._layout_type = str(int(layout_type)) - self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False) + self._in_file = NamedTemporaryFile("w", suffix=".fold", delete=False) def run(self): - self._in_file.write("N"*len(self._dot_bracket) + "\n") + self._in_file.write("N" * len(self._dot_bracket) + "\n") self._in_file.write(self._dot_bracket) self._in_file.flush() self.set_arguments( @@ -146,8 +154,11 @@ def get_coordinates(self): @staticmethod def compute_coordinates( - dot_bracket=None, base_pairs=None, length=None, - layout_type=Layout.NAVIEW, bin_path="RNAplot" + dot_bracket=None, + base_pairs=None, + length=None, + layout_type=Layout.NAVIEW, + bin_path="RNAplot", ): """ Get coordinates for a 2D representation of any unknotted RNA @@ -179,9 +190,13 @@ def compute_coordinates( The 2D coordinates. Each row represents the *x* and *y* coordinates for a total sequence length of *n*. """ - app = RNAplotApp(dot_bracket=dot_bracket, base_pairs=base_pairs, - length=length, layout_type=layout_type, - bin_path=bin_path) + app = RNAplotApp( + dot_bracket=dot_bracket, + base_pairs=base_pairs, + length=length, + layout_type=layout_type, + bin_path=bin_path, + ) app.start() app.join() - return app.get_coordinates() \ No newline at end of file + return app.get_coordinates() diff --git a/src/biotite/application/viennarna/util.py b/src/biotite/application/viennarna/util.py index df6149a2b..90bcd6c4e 100644 --- a/src/biotite/application/viennarna/util.py +++ b/src/biotite/application/viennarna/util.py @@ -7,12 +7,17 @@ __all__ = ["build_constraint_string"] import numpy as np -from ...structure.pseudoknots import pseudoknots +from biotite.structure.pseudoknots import pseudoknots -def build_constraint_string(sequence_length, - pairs=None, paired=None, unpaired=None, - downstream=None, upstream=None): +def build_constraint_string( + sequence_length, + pairs=None, + paired=None, + unpaired=None, + downstream=None, + upstream=None, +): """ Build a ViennaRNA constraint string. @@ -30,7 +35,7 @@ def build_constraint_string(sequence_length, Positions of bases that are paired with any downstream base. upstream : ndarray, shape=(n,), dtype=int or dtype=bool, optional Positions of bases that are paired with any upstream base. - + Returns ------- constraints : str @@ -45,21 +50,21 @@ def build_constraint_string(sequence_length, raise ValueError("Given pairs include pseudoknots") # Ensure the lower base comes first for each pair pairs = np.sort(pairs, axis=-1) - _set_constraints(constraints, pairs[:,0], "(") - _set_constraints(constraints, pairs[:,1], ")") + _set_constraints(constraints, pairs[:, 0], "(") + _set_constraints(constraints, pairs[:, 1], ")") _set_constraints(constraints, paired, "|") _set_constraints(constraints, unpaired, "x") _set_constraints(constraints, downstream, "<") _set_constraints(constraints, upstream, ">") - + return "".join(constraints) - + def _set_constraints(constraints, index, character): if index is None: return - + # Search for conflicts with other constraints potential_conflict_indices = np.where(constraints[index] != ".")[0] if len(potential_conflict_indices) > 0: @@ -68,5 +73,5 @@ def _set_constraints(constraints, index, character): f"Constraint '{character}' at position {conflict_i} " f"conflicts with existing constraint '{constraints[conflict_i]}'" ) - - constraints[index] = character \ No newline at end of file + + constraints[index] = character diff --git a/src/biotite/application/webapp.py b/src/biotite/application/webapp.py index afeaaddaf..6e76eb1cd 100644 --- a/src/biotite/application/webapp.py +++ b/src/biotite/application/webapp.py @@ -7,22 +7,22 @@ __all__ = ["WebApp", "RuleViolationError"] import abc -from .application import Application +from biotite.application.application import Application class WebApp(Application, metaclass=abc.ABCMeta): """ The base class for all web based applications. - + It allows for getting and setting the URL of the app and raises an :class:`RuleViolationError` when a subclass calls :func:`violate_rule()` (e.g. when the server was contacted too often.) - + Be careful, when calling func:`get_app_state()`. This may involve a server contact and therefore frequent calls may raise a :class:`RuleViolationError`. - + Parameters ---------- app_url : str @@ -31,19 +31,19 @@ class WebApp(Application, metaclass=abc.ABCMeta): If true, the application raises an :class:`RuleViolationError`, if the server rules are violated. (Default: True) """ - + def __init__(self, app_url, obey_rules=True): super().__init__() self._obey_rules = obey_rules self._app_url = app_url - + def violate_rule(self, msg=None): """ Indicate that a server rule was violated, i.e. this raises a :class:`RuleViolationError` unless `obey_rules` is false. - + PROTECTED: Do not call from outside. - + Parameters ---------- msg : str, optional @@ -51,16 +51,14 @@ def violate_rule(self, msg=None): """ if self._obey_rules: if msg is None: - raise RuleViolationError( - "The user guidelines would be violated" - ) + raise RuleViolationError("The user guidelines would be violated") else: raise RuleViolationError(msg) - + def app_url(self): """ Get the URL of the web app. - + Returns ------- url : str @@ -74,4 +72,5 @@ class RuleViolationError(Exception): Indicates that the user guidelines of the web application would be violated, if the program continued. """ - pass \ No newline at end of file + + pass diff --git a/src/biotite/copyable.py b/src/biotite/copyable.py index d9c389b63..30d8a85d5 100644 --- a/src/biotite/copyable.py +++ b/src/biotite/copyable.py @@ -12,22 +12,22 @@ class Copyable(metaclass=abc.ABCMeta): """ Base class for all objects, that should be copyable. - + The public method `copy()` first creates a fresh instance of the class of the instance, that is copied via the `__copy_create__()` method. All variables, that could not be set via the constructor, are then copied via `__copy_fill__()`, starting with the method in the uppermost base class and ending with the class of the instance to be copied. - + This approach solves the problem of encapsulated variables in superclasses. """ - + def copy(self): """ Create a deep copy of this object. - + Returns ------- copy @@ -36,36 +36,36 @@ def copy(self): clone = self.__copy_create__() self.__copy_fill__(clone) return clone - + def __copy_create__(self): """ Instantiate a new object of this class. - + Only the constructor should be called in this method. All further attributes, that need to be copied are handled in `__copy_fill__()` - + Do not call the `super()` method here. - + This method must be overridden, if the constructor takes parameters. - + Returns ------- copy A freshly instantiated copy of *self*. """ return type(self)() - + def __copy_fill__(self, clone): """ Copy all necessary attributes to the new object. - + Always call the `super()` method as first statement. - + Parameters ---------- clone The freshly instantiated copy of *self*. """ - pass \ No newline at end of file + pass diff --git a/src/biotite/database/__init__.py b/src/biotite/database/__init__.py index 36c544065..d4b733cb8 100644 --- a/src/biotite/database/__init__.py +++ b/src/biotite/database/__init__.py @@ -20,4 +20,4 @@ __name__ = "biotite.database" __author__ = "Patrick Kunzmann" -from .error import * \ No newline at end of file +from .error import * diff --git a/src/biotite/database/entrez/__init__.py b/src/biotite/database/entrez/__init__.py index 2b5488ce4..a27d11338 100644 --- a/src/biotite/database/entrez/__init__.py +++ b/src/biotite/database/entrez/__init__.py @@ -11,5 +11,5 @@ from .dbnames import * from .download import * +from .key import * from .query import * -from .key import * \ No newline at end of file diff --git a/src/biotite/database/entrez/check.py b/src/biotite/database/entrez/check.py index 52bcd3fdc..a9e2db5e9 100644 --- a/src/biotite/database/entrez/check.py +++ b/src/biotite/database/entrez/check.py @@ -7,8 +7,7 @@ __all__ = ["check_for_errors"] import json -from ..error import RequestError - +from biotite.database.error import RequestError # Taken from https://github.com/kblin/ncbi-entrez-error-messages _error_messages = [ @@ -58,4 +57,4 @@ def check_for_errors(message): for error_msg in _error_messages: # Often whitespace is also replaced by '+' in error message if error_msg.replace(" ", "") in message_end: - raise RequestError(error_msg) \ No newline at end of file + raise RequestError(error_msg) diff --git a/src/biotite/database/entrez/dbnames.py b/src/biotite/database/entrez/dbnames.py index dfa0a8e0a..e17796648 100644 --- a/src/biotite/database/entrez/dbnames.py +++ b/src/biotite/database/entrez/dbnames.py @@ -7,6 +7,7 @@ __all__ = ["get_database_name"] +# fmt: off _db_names = { "BioProject" : "bioproject", "BioSample" : "biosample", @@ -45,26 +46,27 @@ "UniGene" : "unigene", "UniSTS" : "unists" } +# fmt: on def get_database_name(database): """ Map a common NCBI Entrez database name to an E-utility database name. - + Parameters ---------- database : str Entrez database name. - + Returns ------- name : str E-utility database name. - + Examples -------- - + >>> print(get_database_name("Nucleotide")) nuccore """ @@ -86,4 +88,4 @@ def sanitize_database_name(db_name): # Is already E-utility database name return db_name else: - raise ValueError("Database '{db_name}' is not existing") \ No newline at end of file + raise ValueError("Database '{db_name}' is not existing") diff --git a/src/biotite/database/entrez/download.py b/src/biotite/database/entrez/download.py index d30ac41ea..2c2438d8e 100644 --- a/src/biotite/database/entrez/download.py +++ b/src/biotite/database/entrez/download.py @@ -6,22 +6,28 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch", "fetch_single_file"] -from os.path import isdir, isfile, join, getsize -import os -import glob import io +import os +from os.path import getsize, isdir, isfile, join import requests -from .check import check_for_errors -from .dbnames import sanitize_database_name -from .key import get_api_key -from ..error import RequestError - +from biotite.database.entrez.check import check_for_errors +from biotite.database.entrez.dbnames import sanitize_database_name +from biotite.database.entrez.key import get_api_key +from biotite.database.error import RequestError _fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" -def fetch(uids, target_path, suffix, db_name, ret_type, - ret_mode="text", overwrite=False, verbose=False): +def fetch( + uids, + target_path, + suffix, + db_name, + ret_type, + ret_mode="text", + overwrite=False, + verbose=False, +): """ Download files from the NCBI Entrez database in various formats. @@ -111,31 +117,28 @@ def fetch(uids, target_path, suffix, db_name, ret_type, file = join(target_path, id + "." + suffix) else: file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - param_dict = { - "db" : sanitize_database_name(db_name), - "id" : id, - "rettype" : ret_type, - "retmode" : ret_mode, - "tool" : "Biotite", - "mail" : "padix.key@gmail.com" - } - api_key = get_api_key() - if api_key is not None: - param_dict["api_key"] = api_key - r = requests.get(_fetch_url, params=param_dict) - content = r.text - check_for_errors(content) - if content.startswith(" Error"): - raise RequestError(content[8:]) - if file is None: - file = io.StringIO(content) - else: - with open(file, "w+") as f: - f.write(content) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + param_dict = { + "db": sanitize_database_name(db_name), + "id": id, + "rettype": ret_type, + "retmode": ret_mode, + "tool": "Biotite", + "mail": "padix.key@gmail.com", + } + api_key = get_api_key() + if api_key is not None: + param_dict["api_key"] = api_key + r = requests.get(_fetch_url, params=param_dict) + content = r.text + check_for_errors(content) + if content.startswith(" Error"): + raise RequestError(content[8:]) + if file is None: + file = io.StringIO(content) + else: + with open(file, "w+") as f: + f.write(content) files.append(file) if verbose: print("\nDone") @@ -146,8 +149,9 @@ def fetch(uids, target_path, suffix, db_name, ret_type, return files -def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text", - overwrite=False): +def fetch_single_file( + uids, file_name, db_name, ret_type, ret_mode="text", overwrite=False +): """ Almost the same as :func:`fetch()`, but the data for the given UIDs will be stored in a single file. @@ -188,24 +192,26 @@ def fetch_single_file(uids, file_name, db_name, ret_type, ret_mode="text", -------- fetch """ - if file_name is not None \ - and os.path.isfile(file_name) \ - and getsize(file_name) > 0 \ - and not overwrite: - # Do no redownload the already existing file - return file_name + if ( + file_name is not None + and os.path.isfile(file_name) + and getsize(file_name) > 0 + and not overwrite + ): + # Do no redownload the already existing file + return file_name uid_list_str = "" for id in uids: uid_list_str += id + "," # Remove terminal comma uid_list_str = uid_list_str[:-1] param_dict = { - "db" : sanitize_database_name(db_name), - "id" : uid_list_str, - "rettype" : ret_type, - "retmode" : ret_mode, - "tool" : "Biotite", - "mail" : "padix.key@gmail.com" + "db": sanitize_database_name(db_name), + "id": uid_list_str, + "rettype": ret_type, + "retmode": ret_mode, + "tool": "Biotite", + "mail": "padix.key@gmail.com", } api_key = get_api_key() if api_key is not None: diff --git a/src/biotite/database/entrez/key.py b/src/biotite/database/entrez/key.py index 2427fd13a..83e56869c 100644 --- a/src/biotite/database/entrez/key.py +++ b/src/biotite/database/entrez/key.py @@ -41,4 +41,4 @@ def set_api_key(key): The API key. """ global _API_KEY - _API_KEY = key \ No newline at end of file + _API_KEY = key diff --git a/src/biotite/database/entrez/query.py b/src/biotite/database/entrez/query.py index 1626735f6..f9b4867ea 100644 --- a/src/biotite/database/entrez/query.py +++ b/src/biotite/database/entrez/query.py @@ -6,22 +6,23 @@ __author__ = "Patrick Kunzmann" __all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"] -import requests import abc from xml.etree import ElementTree -from .check import check_for_errors -from .dbnames import sanitize_database_name -from ..error import RequestError -from .key import get_api_key - +import requests +from biotite.database.entrez.check import check_for_errors +from biotite.database.entrez.dbnames import sanitize_database_name +from biotite.database.entrez.key import get_api_key +from biotite.database.error import RequestError _search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + class Query(metaclass=abc.ABCMeta): """ Base class for a wrapper around a search term for the NCBI Entrez search service. """ + def __init__(self): pass @@ -85,7 +86,6 @@ def __str__(self): return "({:}) {:} ({:})".format(str(self._q1), self._op, self._q2) - class SimpleQuery(Query): """ A simple query for the NCBI Entrez search service without @@ -121,17 +121,59 @@ class SimpleQuery(Query): # Field identifiers are taken from # https://www.ncbi.nlm.nih.gov/books/NBK49540/ _fields = [ - "Accession", "All Fields", "Author", "EC/RN Number", "Feature Key", - "Filter", "Gene Name", "Genome Project", "Issue", "Journal", "Keyword", - "Modification Date", "Molecular Weight", "Organism", "Page Number", - "Primary Accession", "Properties", "Protein Name", "Publication Date", - "SeqID String", "Sequence Length", "Substance Name", "Text Word", - "Title", "Volume", + "Accession", + "All Fields", + "Author", + "EC/RN Number", + "Feature Key", + "Filter", + "Gene Name", + "Genome Project", + "Issue", + "Journal", + "Keyword", + "Modification Date", + "Molecular Weight", + "Organism", + "Page Number", + "Primary Accession", + "Properties", + "Protein Name", + "Publication Date", + "SeqID String", + "Sequence Length", + "Substance Name", + "Text Word", + "Title", + "Volume", # Abbreviations - "ACCN", "ALL", "AU", "AUTH", "ECNO", "FKEY", "FILT", "SB", "GENE", - "ISS", "JOUR", "KYWD", "MDAT", "MOLWT", "ORGN", "PAGE", "PACC", - "PORGN", "PROP", "PROT", "PDAT", "SQID", "SLEN", "SUBS", "WORD", "TI", - "TITL" "VOL" + "ACCN", + "ALL", + "AU", + "AUTH", + "ECNO", + "FKEY", + "FILT", + "SB", + "GENE", + "ISS", + "JOUR", + "KYWD", + "MDAT", + "MOLWT", + "ORGN", + "PAGE", + "PACC", + "PORGN", + "PROP", + "PROT", + "PDAT", + "SQID", + "SLEN", + "SUBS", + "WORD", + "TI", + "TITL" "VOL", ] def __init__(self, term, field=None): @@ -139,12 +181,9 @@ def __init__(self, term, field=None): if field is not None: if field not in SimpleQuery._fields: raise ValueError(f"Unknown field identifier '{field}'") - for invalid_string in \ - ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]: - if invalid_string in term: - raise ValueError( - f"Query contains illegal term {invalid_string}" - ) + for invalid_string in ['"', "AND", "OR", "NOT", "[", "]", "(", ")", "\t", "\n"]: + if invalid_string in term: + raise ValueError(f"Query contains illegal term {invalid_string}") if " " in term: # Encapsulate in quotes if spaces are in search term term = f'"{term}"' diff --git a/src/biotite/database/error.py b/src/biotite/database/error.py index 577e6ce73..271aa37e0 100644 --- a/src/biotite/database/error.py +++ b/src/biotite/database/error.py @@ -12,4 +12,5 @@ class RequestError(Exception): Indicates that the database returned a response with an error message or other malformed content. """ - pass \ No newline at end of file + + pass diff --git a/src/biotite/database/pubchem/__init__.py b/src/biotite/database/pubchem/__init__.py index 73c3a296d..30c4813bb 100644 --- a/src/biotite/database/pubchem/__init__.py +++ b/src/biotite/database/pubchem/__init__.py @@ -18,4 +18,4 @@ from .download import * from .query import * -from .throttle import * \ No newline at end of file +from .throttle import * diff --git a/src/biotite/database/pubchem/download.py b/src/biotite/database/pubchem/download.py index e7f1c22ed..85fa09e9e 100644 --- a/src/biotite/database/pubchem/download.py +++ b/src/biotite/database/pubchem/download.py @@ -6,24 +6,29 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch", "fetch_property"] +import io import numbers -import requests -from os.path import isdir, isfile, join, getsize import os -import io -import numpy as np -from .throttle import ThrottleStatus -from .error import parse_error_details -from ..error import RequestError - +from os.path import getsize, isdir, isfile, join +import requests +from biotite.database.error import RequestError +from biotite.database.pubchem.error import parse_error_details +from biotite.database.pubchem.throttle import ThrottleStatus _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" _binary_formats = ["png", "asnb"] -def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, - overwrite=False, verbose=False, - throttle_threshold=0.5, return_throttle_status=False): +def fetch( + cids, + format="sdf", + target_path=None, + as_structural_formula=False, + overwrite=False, + verbose=False, + throttle_threshold=0.5, + return_throttle_status=False, +): """ Download structure files from *PubChem* in various formats. @@ -109,8 +114,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, raise TypeError("CIDs must be given as integers, not as string") # Verbose output if verbose: - print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...", - end="\r") + print(f"Fetching file {i+1:d} / {len(cids):d} ({cid})...", end="\r") # Fetch file from database if target_path is not None: @@ -119,36 +123,33 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - record_type = "2d" if as_structural_formula else "3d" - r = requests.get( - _base_url + f"compound/cid/{cid}/{format.upper()}", - params={"record_type": record_type} - ) - if not r.ok: - raise RequestError(parse_error_details(r.text)) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + record_type = "2d" if as_structural_formula else "3d" + r = requests.get( + _base_url + f"compound/cid/{cid}/{format.upper()}", + params={"record_type": record_type}, + ) + if not r.ok: + raise RequestError(parse_error_details(r.text)) - if format.lower() in _binary_formats: - content = r.content - else: - content = r.text + if format.lower() in _binary_formats: + content = r.content + else: + content = r.text - if file is None: - if format in _binary_formats: - file = io.BytesIO(content) - else: - file = io.StringIO(content) + if file is None: + if format in _binary_formats: + file = io.BytesIO(content) else: - mode = "wb+" if format in _binary_formats else "w+" - with open(file, mode) as f: - f.write(content) + file = io.StringIO(content) + else: + mode = "wb+" if format in _binary_formats else "w+" + with open(file, mode) as f: + f.write(content) - throttle_status = ThrottleStatus.from_response(r) - if throttle_threshold is not None: - throttle_status.wait_if_busy(throttle_threshold) + throttle_status = ThrottleStatus.from_response(r) + if throttle_threshold is not None: + throttle_status.wait_if_busy(throttle_threshold) files.append(file) if verbose: @@ -164,8 +165,7 @@ def fetch(cids, format="sdf", target_path=None, as_structural_formula=False, return return_value -def fetch_property(cids, name, - throttle_threshold=0.5, return_throttle_status=False): +def fetch_property(cids, name, throttle_threshold=0.5, return_throttle_status=False): """ Download the given property for the given CID(s). @@ -230,15 +230,13 @@ def fetch_property(cids, name, # Property names may only contain letters and numbers if not name.isalnum(): - raise ValueError( - f"Property '{name}' contains invalid characters" - ) + raise ValueError(f"Property '{name}' contains invalid characters") # Use TXT format instead of CSV to avoid issues with ',' characters # within table elements r = requests.post( _base_url + f"compound/cid/property/{name}/TXT", - data={"cid": ','.join([str(cid) for cid in cids])} + data={"cid": ",".join([str(cid) for cid in cids])}, ) if not r.ok: raise RequestError(parse_error_details(r.text)) diff --git a/src/biotite/database/pubchem/error.py b/src/biotite/database/pubchem/error.py index cbbdc0dcd..963fac865 100644 --- a/src/biotite/database/pubchem/error.py +++ b/src/biotite/database/pubchem/error.py @@ -15,6 +15,6 @@ def parse_error_details(response_text): for message_line_indicator in ["Detail: ", "Message: "]: for line in response_text.splitlines(): if line.startswith(message_line_indicator): - return line[len(message_line_indicator):] + return line[len(message_line_indicator) :] # No 'Detail: ...' or 'Message: ' line found - return "Unknown error" \ No newline at end of file + return "Unknown error" diff --git a/src/biotite/database/pubchem/query.py b/src/biotite/database/pubchem/query.py index bb6eec92d..31a030e4a 100644 --- a/src/biotite/database/pubchem/query.py +++ b/src/biotite/database/pubchem/query.py @@ -4,20 +4,28 @@ __name__ = "biotite.database.pubchem" __author__ = "Patrick Kunzmann" -__all__ = ["Query", "NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery", - "FormulaQuery", "SuperstructureQuery", "SubstructureQuery", - "SimilarityQuery", "IdentityQuery", - "search"] +__all__ = [ + "Query", + "NameQuery", + "SmilesQuery", + "InchiQuery", + "InchiKeyQuery", + "FormulaQuery", + "SuperstructureQuery", + "SubstructureQuery", + "SimilarityQuery", + "IdentityQuery", + "search", +] -import copy import abc import collections +import copy import requests -from .error import parse_error_details -from .throttle import ThrottleStatus -from ..error import RequestError -from ...structure.io.mol.mol import MOLFile - +from biotite.database.error import RequestError +from biotite.database.pubchem.error import parse_error_details +from biotite.database.pubchem.throttle import ThrottleStatus +from biotite.structure.io.mol.mol import MOLFile _base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" @@ -258,9 +266,10 @@ def get_params(self): # Only set maximum number, if provided by the user # The PubChem default value for this might change over time if self._number is not None: - params["MaxRecords"] = self._number + params["MaxRecords"] = self._number return params + def _format_element(element, count): if count == 1: return element.capitalize() @@ -318,8 +327,8 @@ def __init__(self, **kwargs): ) if not query_key_found: raise TypeError( - "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' " - "or 'cid'") + "Expected exactly one of 'smiles', 'smarts', 'inchi', 'sdf' " "or 'cid'" + ) if "number" in kwargs: self._number = kwargs["number"] del kwargs["number"] @@ -346,14 +355,10 @@ def from_atoms(cls, atoms, *args, **kwargs): mol_file.set_structure(atoms) # Every MOL string with "$$$$" is a valid SDF string # Important: USE MS-style new lines - return cls( - *args, - sdf = "\r\n".join(mol_file.lines) + "\r\n$$$$\r\n", - **kwargs - ) + return cls(*args, sdf="\r\n".join(mol_file.lines) + "\r\n$$$$\r\n", **kwargs) def get_input_url_path(self): - input_string = f"compound/{self.search_type()}/{self._query_key}" + input_string = f"compound/{self.search_type()}/{self._query_key}" if self._query_key == "cid": # Put CID in URL and not in POST payload, # as PubChem is confused otherwise @@ -370,7 +375,7 @@ def get_params(self): # Only set maximum number, if provided by the user # The PubChem default value for this might change over time if self._number is not None: - params["MaxRecords"] = self._number + params["MaxRecords"] = self._number for key, val in self.search_options().items(): # Convert 'snake case' Python parameters # to 'camel case' request parameters @@ -472,13 +477,13 @@ class SuperOrSubstructureQuery(StructureQuery, metaclass=abc.ABCMeta): """ _option_defaults = { - "match_charges" : False, - "match_tautomers" : False, - "rings_not_embedded" : False, - "single_double_bonds_match" : True, - "chains_match_rings" : True, - "strip_hydrogen" : False, - "stereo" : "ignore", + "match_charges": False, + "match_tautomers": False, + "rings_not_embedded": False, + "single_double_bonds_match": True, + "chains_match_rings": True, + "strip_hydrogen": False, + "stereo": "ignore", } def __init__(self, **kwargs): @@ -706,7 +711,7 @@ def search_type(self): return f"fastsimilarity_{dim}" def search_options(self): - return {"threshold" : int(round(self._threshold * 100))} + return {"threshold": int(round(self._threshold * 100))} class IdentityQuery(StructureQuery): @@ -766,8 +771,6 @@ def get_params(self): return params - - def search(query, throttle_threshold=0.5, return_throttle_status=False): """ Get all CIDs that meet the given query requirements, @@ -812,7 +815,7 @@ def search(query, throttle_threshold=0.5, return_throttle_status=False): r = requests.post( _base_url + query.get_input_url_path() + "/cids/TXT", data=query.get_params(), - files=files + files=files, ) if not r.ok: raise RequestError(parse_error_details(r.text)) diff --git a/src/biotite/database/pubchem/throttle.py b/src/biotite/database/pubchem/throttle.py index 27cb09084..171c1a484 100644 --- a/src/biotite/database/pubchem/throttle.py +++ b/src/biotite/database/pubchem/throttle.py @@ -7,8 +7,8 @@ __all__ = ["ThrottleStatus"] -from dataclasses import dataclass import time +from dataclasses import dataclass @dataclass(frozen=True) @@ -67,8 +67,7 @@ def from_response(response): """ throttle_control = response.headers["X-Throttling-Control"] throttle_status = [ - substring.split(")")[0] for substring - in throttle_control.split("(")[1:] + substring.split(")")[0] for substring in throttle_control.split("(")[1:] ] # Remove '%' sign and convert to int count_status, time_status, service_status = [ @@ -96,4 +95,4 @@ def wait_if_busy(self, threshold=0.5, wait_time=1.0): threshold is exceeded. """ if self.count > threshold or self.time > threshold: - time.sleep(wait_time) \ No newline at end of file + time.sleep(wait_time) diff --git a/src/biotite/database/rcsb/__init__.py b/src/biotite/database/rcsb/__init__.py index c36dfb2b8..0e5faf41c 100644 --- a/src/biotite/database/rcsb/__init__.py +++ b/src/biotite/database/rcsb/__init__.py @@ -10,4 +10,4 @@ __author__ = "Patrick Kunzmann" from .download import * -from .query import * \ No newline at end of file +from .query import * diff --git a/src/biotite/database/rcsb/download.py b/src/biotite/database/rcsb/download.py index 4f9a9f5fe..230792dae 100644 --- a/src/biotite/database/rcsb/download.py +++ b/src/biotite/database/rcsb/download.py @@ -6,20 +6,17 @@ __author__ = "Patrick Kunzmann" __all__ = ["fetch"] -import requests -from os.path import isdir, isfile, join, getsize -import os -import glob import io -from ..error import RequestError - +import os +from os.path import getsize, isfile, join +import requests +from biotite.database.error import RequestError _standard_url = "https://files.rcsb.org/download/" -_mmtf_url = "https://mmtf.rcsb.org/v1.0/full/" _bcif_url = "https://models.rcsb.org/" _fasta_url = "https://www.rcsb.org/fasta/entry/" -_binary_formats = ["mmtf", "bcif"] +_binary_formats = ["bcif"] def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): @@ -34,7 +31,7 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): pdb_ids : str or iterable object of str A single PDB ID or a list of PDB IDs of the structure(s) to be downloaded. - format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'mmtf', 'fasta'} + format : {'pdb', 'pdbx', 'cif', 'mmcif', 'bcif', 'fasta'} The format of the files to be downloaded. ``'pdbx'``, ``'cif'`` and ``'mmcif'`` are synonyms for the same format. @@ -94,8 +91,7 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): for i, id in enumerate(pdb_ids): # Verbose output if verbose: - print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...", - end="\r") + print(f"Fetching file {i+1:d} / {len(pdb_ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: @@ -104,42 +100,35 @@ def fetch(pdb_ids, format, target_path=None, overwrite=False, verbose=False): # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: - if format == "pdb": - r = requests.get(_standard_url + id + ".pdb") - content = r.text - _assert_valid_file(content, id) - elif format in ["cif", "mmcif", "pdbx"]: - r = requests.get(_standard_url + id + ".cif") - content = r.text - _assert_valid_file(content, id) - elif format in ["bcif"]: - r = requests.get(_bcif_url + id + ".bcif") - content = r.content - _assert_valid_file(r.text, id) - elif format == "mmtf": - r = requests.get(_mmtf_url + id) - content = r.content - _assert_valid_file(r.text, id) - elif format == "fasta": - r = requests.get(_fasta_url + id) - content = r.text - _assert_valid_file(content, id) - else: - raise ValueError(f"Format '{format}' is not supported") - - if file is None: - if format in _binary_formats: - file = io.BytesIO(content) - else: - file = io.StringIO(content) + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: + if format == "pdb": + r = requests.get(_standard_url + id + ".pdb") + content = r.text + _assert_valid_file(content, id) + elif format in ["cif", "mmcif", "pdbx"]: + r = requests.get(_standard_url + id + ".cif") + content = r.text + _assert_valid_file(content, id) + elif format in ["bcif"]: + r = requests.get(_bcif_url + id + ".bcif") + content = r.content + _assert_valid_file(r.text, id) + elif format == "fasta": + r = requests.get(_fasta_url + id) + content = r.text + _assert_valid_file(content, id) + else: + raise ValueError(f"Format '{format}' is not supported") + + if file is None: + if format in _binary_formats: + file = io.BytesIO(content) else: - mode = "wb+" if format in _binary_formats else "w+" - with open(file, mode) as f: - f.write(content) + file = io.StringIO(content) + else: + mode = "wb+" if format in _binary_formats else "w+" + with open(file, mode) as f: + f.write(content) files.append(file) if verbose: @@ -158,10 +147,13 @@ def _assert_valid_file(response_text, pdb_id): """ # Structure file and FASTA file retrieval # have different error messages - if len(response_text) == 0 or any(err_msg in response_text for err_msg in [ - "404 Not Found", - "RCSB Protein Data Bank Error Page", - "No fasta files were found.", - "No valid PDB IDs were submitted.", - ]): + if len(response_text) == 0 or any( + err_msg in response_text + for err_msg in [ + "404 Not Found", + "RCSB Protein Data Bank Error Page", + "No fasta files were found.", + "No valid PDB IDs were submitted.", + ] + ): raise RequestError("PDB ID {:} is invalid".format(pdb_id)) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 7f131f3ee..e0e486875 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -4,28 +4,38 @@ __name__ = "biotite.database.rcsb" __author__ = "Patrick Kunzmann, Maximilian Dombrowsky" -__all__ = ["Query", "SingleQuery", "CompositeQuery", - "BasicQuery", "FieldQuery", - "SequenceQuery", "StructureQuery", "MotifQuery", - "Sorting", - "Grouping", "DepositGrouping", "IdentityGrouping", "UniprotGrouping", - "search", "count"] +__all__ = [ + "Query", + "SingleQuery", + "CompositeQuery", + "BasicQuery", + "FieldQuery", + "SequenceQuery", + "StructureQuery", + "MotifQuery", + "Sorting", + "Grouping", + "DepositGrouping", + "IdentityGrouping", + "UniprotGrouping", + "search", + "count", +] import abc -import json import copy +import json from datetime import datetime import numpy as np import requests -from ...sequence.seqtypes import NucleotideSequence -from ..error import RequestError - +from biotite.database.error import RequestError +from biotite.sequence.seqtypes import NucleotideSequence _search_url = "https://search.rcsb.org/rcsbsearch/v2/query" _scope_to_target = { "protein": "pdb_protein_sequence", - "rna": "pdb_rna_sequence", - "dna": "pdb_dna_sequence" + "rna": "pdb_rna_sequence", + "dna": "pdb_dna_sequence", } @@ -35,6 +45,7 @@ class Query(metaclass=abc.ABCMeta): This is the abstract base class for all queries. """ + @abc.abstractmethod def get_content(self): """ @@ -58,7 +69,6 @@ def __or__(self, query): return CompositeQuery([self, query], "or") - class SingleQuery(Query, metaclass=abc.ABCMeta): """ A terminal query node for the RCSB search API. @@ -69,6 +79,7 @@ class SingleQuery(Query, metaclass=abc.ABCMeta): This is the abstract base class for all queries that are terminal nodes. """ + @abc.abstractmethod def get_content(self): return {"parameters": {}} @@ -91,12 +102,11 @@ class CompositeQuery(Query): operator : {'or', 'and'} The type of combination. """ + def __init__(self, queries, operator): self._queries = queries if operator not in ("or", "and"): - raise ValueError( - f"Operator must be 'or' or 'and', not '{operator}'" - ) + raise ValueError(f"Operator must be 'or' or 'and', not '{operator}'") self._operator = operator def get_content(self): @@ -113,12 +123,11 @@ def get_content(self): content = { "type": "group", "logical_operator": self._operator, - "nodes": [query.get_content() for query in self._queries] + "nodes": [query.get_content() for query in self._queries], } return content - class BasicQuery(SingleQuery): """ A text query for searching for a given term across all available @@ -139,8 +148,9 @@ class BasicQuery(SingleQuery): >>> query = BasicQuery("tc5b") >>> print(sorted(search(query))) - ['1L2Y', '8ANG', '8ANH', '8ANI', '8ANM'] + ['1L2Y', '8ANG', '8ANH', '8ANI', '8ANM', '8QWW'] """ + def __init__(self, term): super().__init__() self._term = term @@ -212,7 +222,10 @@ class FieldQuery(SingleQuery): >>> print(sorted(search(query))) ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H'] """ - def __init__(self, field, molecular_definition=False, case_sensitive=False, **kwargs): + + def __init__( + self, field, molecular_definition=False, case_sensitive=False, **kwargs + ): super().__init__() self._negation = False self._field = field @@ -231,20 +244,25 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw if self._operator not in [ "exact_match", - "contains_words", "contains_phrase", - "greater", "less", "greater_or_equal", "less_or_equal", "equals", - "range", "range_closed", + "contains_words", + "contains_phrase", + "greater", + "less", + "greater_or_equal", + "less_or_equal", + "equals", + "range", + "range_closed", "is_in", - "exists" + "exists", ]: raise TypeError( - f"Constructor got an unexpected keyword argument " - f"'{self._operator}'" + f"Constructor got an unexpected keyword argument " f"'{self._operator}'" ) # Convert dates into ISO 8601 if isinstance(self._value, datetime): - self._value = _to_isoformat(self._value) + self._value = _to_isoformat(self._value) elif isinstance(self._value, (tuple, list, np.ndarray)): self._value = [ _to_isoformat(val) if isinstance(val, datetime) else val @@ -257,14 +275,14 @@ def __init__(self, field, molecular_definition=False, case_sensitive=False, **kw "from": self._value[0], "include_lower": False, "to": self._value[1], - "include_upper": False + "include_upper": False, } elif self._operator == "range_closed": self._value = { "from": self._value[0], "include_lower": True, "to": self._value[1], - "include_upper": True + "include_upper": True, } # Rename operators to names used in API @@ -332,8 +350,8 @@ class SequenceQuery(SingleQuery): >>> print(sorted(search(query))) ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2LL5', '2MJ9', '3UC7', '3UC8'] """ - def __init__(self, sequence, scope, - min_identity=0.0, max_expect_value=10000000.0): + + def __init__(self, sequence, scope, min_identity=0.0, max_expect_value=10000000.0): super().__init__() self._target = _scope_to_target.get(scope.lower()) if self._target is None: @@ -381,6 +399,7 @@ class MotifQuery(SingleQuery): ... "protein" ... ) """ + def __init__(self, pattern, pattern_type, scope): super().__init__() self._pattern = pattern @@ -424,27 +443,20 @@ class StructureQuery(SingleQuery): >>> print(sorted(search(query))) ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS'] """ + def __init__(self, pdb_id, chain=None, assembly=None, strict=True): super().__init__() - if (chain is None and assembly is None) \ - or (chain is not None and assembly is not None): - raise TypeError( - "Either the chain ID or assembly ID must be set" - ) + if (chain is None and assembly is None) or ( + chain is not None and assembly is not None + ): + raise TypeError("Either the chain ID or assembly ID must be set") elif chain is None: - self._value = { - "entry_id": pdb_id, - "asssembly_id": assembly - } + self._value = {"entry_id": pdb_id, "asssembly_id": assembly} else: - self._value = { - "entry_id": pdb_id, - "asym_id": chain - } + self._value = {"entry_id": pdb_id, "asym_id": chain} - self._operator = "strict_shape_match" if strict \ - else "relaxed_shape_match" + self._operator = "strict_shape_match" if strict else "relaxed_shape_match" def get_content(self): content = super().get_content() @@ -455,10 +467,7 @@ def get_content(self): return content - - class Sorting: - def __init__(self, field, descending=True): self._field = field self._descending = descending @@ -487,12 +496,7 @@ def get_content(self): ``'ranking_criteria_type'`` attributes. """ direction = "desc" if self._descending else "asc" - return { - "sort_by" : self._field, - "direction" : direction - } - - + return {"sort_by": self._field, "direction": direction} class Grouping(metaclass=abc.ABCMeta): @@ -539,7 +543,7 @@ def get_content(self): The content dictionary for the ``'group_by'`` attributes. """ if self._sorting is not None: - return {"ranking_criteria_type" : self._sorting.get_content()} + return {"ranking_criteria_type": self._sorting.get_content()} else: return {} @@ -627,6 +631,7 @@ class IdentityGrouping(Grouping): To choose the order a :class:`Sorting` object needs to be provided. """ + def __init__(self, similarity_cutoff, sort_by=None): super().__init__(sort_by) if similarity_cutoff not in (100, 95, 90, 70, 50, 30): @@ -677,11 +682,7 @@ def is_compatible_return_type(self, return_type): return return_type == "polymer_entity" - - - -def count(query, return_type="entry", group_by=None, - content_types=("experimental",)): +def count(query, return_type="entry", group_by=None, content_types=("experimental",)): """ Count PDB entries that meet the given query requirements, via the RCSB search API. @@ -737,9 +738,7 @@ def count(query, return_type="entry", group_by=None, >>> print(sorted(ids)) ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H'] """ - query_dict = _initialize_query_dict( - query, return_type, group_by, content_types - ) + query_dict = _initialize_query_dict(query, return_type, group_by, content_types) query_dict["request_options"]["return_counts"] = True @@ -761,8 +760,15 @@ def count(query, return_type="entry", group_by=None, raise RequestError(f"Error {r.status_code}") -def search(query, return_type="entry", range=None, sort_by=None, group_by=None, - return_groups=False, content_types=("experimental",)): +def search( + query, + return_type="entry", + range=None, + sort_by=None, + group_by=None, + return_groups=False, + content_types=("experimental",), +): """ Get all PDB IDs that meet the given query requirements, via the RCSB search API. @@ -862,19 +868,15 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, ... query, return_type="polymer_entity", return_groups=True, ... group_by=UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"), ... )) - {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']} + {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['1EJG_1', '3NIR_1']} """ - query_dict = _initialize_query_dict( - query, return_type, group_by, content_types - ) + query_dict = _initialize_query_dict(query, return_type, group_by, content_types) if group_by is not None: if return_groups: - query_dict["request_options"]["group_by_return_type"] \ - = "groups" + query_dict["request_options"]["group_by_return_type"] = "groups" else: - query_dict["request_options"]["group_by_return_type"] \ - = "representatives" + query_dict["request_options"]["group_by_return_type"] = "representatives" if sort_by is not None: if isinstance(sort_by, Sorting): @@ -890,7 +892,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, else: query_dict["request_options"]["paginate"] = { "start": int(range[0]), - "rows": int(range[1]) - int(range[0]) + "rows": int(range[1]) - int(range[0]), } r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) @@ -900,7 +902,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, return [result["identifier"] for result in r.json()["result_set"]] else: return { - group["identifier"] : [ + group["identifier"]: [ result["identifier"] for result in group["result_set"] ] for group in r.json()["group_set"] @@ -922,8 +924,11 @@ def _initialize_query_dict(query, return_type, group_by, content_types): `count()` and `search()` have in common. """ if return_type not in [ - "entry", "polymer_instance", "assembly", - "polymer_entity", "non_polymer_entity", + "entry", + "polymer_instance", + "assembly", + "polymer_entity", + "non_polymer_entity", ]: raise ValueError(f"'{return_type}' is an invalid return type") @@ -947,7 +952,7 @@ def _initialize_query_dict(query, return_type, group_by, content_types): query_dict = { "query": query.get_content(), "return_type": return_type, - "request_options": request_options + "request_options": request_options, } return query_dict @@ -956,4 +961,4 @@ def _to_isoformat(object): """ Convert a datetime into the specifc ISO 8601 format required by the RCSB. """ - return object.strftime("%Y-%m-%dT%H:%M:%SZ") \ No newline at end of file + return object.strftime("%Y-%m-%dT%H:%M:%SZ") diff --git a/src/biotite/database/uniprot/check.py b/src/biotite/database/uniprot/check.py index 4b00845d2..a1782e1ba 100644 --- a/src/biotite/database/uniprot/check.py +++ b/src/biotite/database/uniprot/check.py @@ -6,7 +6,7 @@ __author__ = "Maximilian Greil" __all__ = ["assert_valid_response"] -from ..error import RequestError +from biotite.database.error import RequestError # Taken from https://www.uniprot.org/help/api_retrieve_entries @@ -27,6 +27,9 @@ def assert_valid_response(response_status_code): raise RequestError("Gone. The resource you requested was removed.") elif response_status_code == 500: raise RequestError( - "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team.") + "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team." + ) elif response_status_code == 503: - raise RequestError("Service not available. The server is being updated, try again later.") + raise RequestError( + "Service not available. The server is being updated, try again later." + ) diff --git a/src/biotite/database/uniprot/download.py b/src/biotite/database/uniprot/download.py index 7faf37954..bacb40e96 100644 --- a/src/biotite/database/uniprot/download.py +++ b/src/biotite/database/uniprot/download.py @@ -6,11 +6,11 @@ __author__ = "Maximilian Greil" __all__ = ["fetch"] -from os.path import isdir, isfile, join, getsize -import os import io +import os +from os.path import getsize, isdir, isfile, join import requests -from .check import assert_valid_response +from biotite.database.uniprot.check import assert_valid_response _fetch_url = "https://rest.uniprot.org/" @@ -36,8 +36,7 @@ def _get_database_name(id): return "uniprotkb" -def fetch(ids, format, target_path=None, - overwrite=False, verbose=False): +def fetch(ids, format, target_path=None, overwrite=False, verbose=False): """ Download files from the UniProt in various formats. @@ -101,18 +100,14 @@ def fetch(ids, format, target_path=None, db_name = _get_database_name(id) # Verbose output if verbose: - print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", - end="\r") + print(f"Fetching file {i + 1:d} / {len(ids):d} ({id})...", end="\r") # Fetch file from database if target_path is not None: file = join(target_path, id + "." + format) else: # 'file = None' -> store content in a file-like object file = None - if file is None \ - or not isfile(file) \ - or getsize(file) == 0 \ - or overwrite: + if file is None or not isfile(file) or getsize(file) == 0 or overwrite: if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]: r = requests.get(_fetch_url + db_name + "/" + id + "." + format) content = r.text diff --git a/src/biotite/database/uniprot/query.py b/src/biotite/database/uniprot/query.py index 95e6f391d..687c61f5f 100644 --- a/src/biotite/database/uniprot/query.py +++ b/src/biotite/database/uniprot/query.py @@ -6,10 +6,9 @@ __author__ = "Maximilian Greil" __all__ = ["Query", "SimpleQuery", "CompositeQuery", "search"] -import requests import abc -from .check import assert_valid_response - +import requests +from biotite.database.uniprot.check import assert_valid_response _base_url = "https://rest.uniprot.org/uniprotkb/search/" @@ -122,22 +121,114 @@ class SimpleQuery(Query): # Field identifiers are taken from # https://www.uniprot.org/help/query-fields _fields = [ - "accession", "active", "ft_init_met", "ft_signal", "ft_transit", "ft_propep", "ft_chain", "ft_peptide", - "ft_topo_dom", "ft_transmem", "ft_intramem", "ft_domain", "ft_repeat", "ft_zn_fing", "ft_dna_bind", - "ft_region", "ft_coiled", "ft_motif", "ft_compbias", "ft_act_site", "ft_binding", "ft_site", "ft_non_std", - "ft_mod_res", "ft_lipid", "ft_carbohyd", "ft_disulfid", "ft_crosslnk", "ft_var_seq", "ft_variant", - "ft_mutagen", "ft_unsure", "ft_conflict", "ft_non_cons", "ft_non_ter", "ft_helix", "ft_turn", "ft_strand", - "lit_author", "protein_name", "chebi", "citation", "uniref_cluster_90", "xrefcount_pdb", "date_created", - "database", "xref", "ec", "cc_function", "cc_catalytic_activity", "cc_cofactor", "cc_activity_regulation", - "cc_biophysicochemical_properties", "cc_subunit", "cc_pathway", "cc_scl_term", "cc_tissue_specificity", - "cc_developmental_stage", "cc_induction", "cc_domain", "cc_ptm cc_rna_editing", "cc_mass_spectrometry", - "cc_polymorphism", "cc_disease", "cc_disruption_phenotype", "cc_allergen", "cc_toxic_dose", "cc_biotechnology", - "cc_pharmaceutical", "cc_miscellaneous", "cc_similarity", "cc_caution", "cc_sequence_caution", - "existence", "family", "fragment", "gene", "gene_exact", "go", "virus_host_name", "virus_host_id", - "accession_id", "inchikey", "protein_name", "interactor", "keyword", "length", "lineage", "mass", - "cc_mass_spectrometry", "date_modified", "protein_name", "organelle", "organism_name", "organism_id", - "plasmid", "proteome", "proteomecomponent", "sec_acc", "reviewed", "scope", "sequence", - "date_sequence_modified", "strain", "taxonomy_name", "taxonomy_id", "tissue", "cc_webresource" + "accession", + "active", + "ft_init_met", + "ft_signal", + "ft_transit", + "ft_propep", + "ft_chain", + "ft_peptide", + "ft_topo_dom", + "ft_transmem", + "ft_intramem", + "ft_domain", + "ft_repeat", + "ft_zn_fing", + "ft_dna_bind", + "ft_region", + "ft_coiled", + "ft_motif", + "ft_compbias", + "ft_act_site", + "ft_binding", + "ft_site", + "ft_non_std", + "ft_mod_res", + "ft_lipid", + "ft_carbohyd", + "ft_disulfid", + "ft_crosslnk", + "ft_var_seq", + "ft_variant", + "ft_mutagen", + "ft_unsure", + "ft_conflict", + "ft_non_cons", + "ft_non_ter", + "ft_helix", + "ft_turn", + "ft_strand", + "lit_author", + "protein_name", + "chebi", + "citation", + "uniref_cluster_90", + "xrefcount_pdb", + "date_created", + "database", + "xref", + "ec", + "cc_function", + "cc_catalytic_activity", + "cc_cofactor", + "cc_activity_regulation", + "cc_biophysicochemical_properties", + "cc_subunit", + "cc_pathway", + "cc_scl_term", + "cc_tissue_specificity", + "cc_developmental_stage", + "cc_induction", + "cc_domain", + "cc_ptm cc_rna_editing", + "cc_mass_spectrometry", + "cc_polymorphism", + "cc_disease", + "cc_disruption_phenotype", + "cc_allergen", + "cc_toxic_dose", + "cc_biotechnology", + "cc_pharmaceutical", + "cc_miscellaneous", + "cc_similarity", + "cc_caution", + "cc_sequence_caution", + "existence", + "family", + "fragment", + "gene", + "gene_exact", + "go", + "virus_host_name", + "virus_host_id", + "accession_id", + "inchikey", + "protein_name", + "interactor", + "keyword", + "length", + "lineage", + "mass", + "cc_mass_spectrometry", + "date_modified", + "protein_name", + "organelle", + "organism_name", + "organism_id", + "plasmid", + "proteome", + "proteomecomponent", + "sec_acc", + "reviewed", + "scope", + "sequence", + "date_sequence_modified", + "strain", + "taxonomy_name", + "taxonomy_id", + "tissue", + "cc_webresource", ] def __init__(self, field, term): @@ -146,14 +237,11 @@ def __init__(self, field, term): raise ValueError(f"Unknown field identifier '{field}'") if not _check_brackets(term): raise ValueError( - f"Query term contains illegal number of round brackets ( ) and/or square brackets [ ]" + "Query term contains illegal number of round brackets ( ) and/or square brackets [ ]" ) - for invalid_string in \ - ['"', "AND", "OR", "NOT", "\t", "\n"]: + for invalid_string in ['"', "AND", "OR", "NOT", "\t", "\n"]: if invalid_string in term: - raise ValueError( - f"Query contains illegal term {invalid_string}" - ) + raise ValueError(f"Query contains illegal term {invalid_string}") if " " in term: term = f'"{term}"' self._field = field @@ -198,12 +286,8 @@ def search(query, number=500): ['P12345'] """ - params = { - 'query': str(query), - 'format': 'list', - 'size': str(number) - } + params = {"query": str(query), "format": "list", "size": str(number)} r = requests.get(_base_url, params=params) content = r.text assert_valid_response(r.status_code) - return content.split('\n')[:-1] + return content.split("\n")[:-1] diff --git a/src/biotite/file.py b/src/biotite/file.py index 8094668aa..ec7047db6 100644 --- a/src/biotite/file.py +++ b/src/biotite/file.py @@ -4,16 +4,19 @@ __name__ = "biotite" __author__ = "Patrick Kunzmann" -__all__ = ["File", "TextFile", "InvalidFileError", - "SerializationError", "DeserializationError"] +__all__ = [ + "File", + "TextFile", + "InvalidFileError", + "SerializationError", + "DeserializationError", +] import abc +import copy import io -import warnings from os import PathLike - -from .copyable import Copyable -import copy +from biotite.copyable import Copyable class File(Copyable, metaclass=abc.ABCMeta): @@ -27,13 +30,6 @@ class File(Copyable, metaclass=abc.ABCMeta): :func:`write()` method is used. """ - def __init__(self): - # Support for deprecated instance method 'read()': - # When creating an instance, the 'read()' class method is - # replaced by the instance method, so that subsequent - # 'read()' calls are delegated to the instance method - self.read = self._deprecated_read - @classmethod @abc.abstractmethod def read(cls, file): @@ -54,23 +50,6 @@ def read(cls, file): """ pass - def _deprecated_read(self, file, *args, **kwargs): - """ - Support for deprecated instance method :func:`read()`. - - Internally this calls the :func:`read()` class method and - replaces the data in `self` with the data from the newly created - :class:`File` object - """ - warnings.warn( - "Instance method 'read()' is deprecated, " - "use class method instead", - DeprecationWarning - ) - cls = type(self) - new_file = cls.read(file, *args, **kwargs) - self.__dict__.update(new_file.__dict__) - @abc.abstractmethod def write(self, file): """ @@ -209,12 +188,14 @@ class InvalidFileError(Exception): either because the file does not contain the required data or because the file is malformed. """ + pass class SerializationError(Exception): pass + class DeserializationError(Exception): pass @@ -229,7 +210,7 @@ def wrap_string(text, width): """ lines = [] for i in range(0, len(text), width): - lines.append(text[i : i+width]) + lines.append(text[i : i + width]) return lines diff --git a/src/biotite/sequence/__init__.py b/src/biotite/sequence/__init__.py index afda0ab34..005a7c88c 100644 --- a/src/biotite/sequence/__init__.py +++ b/src/biotite/sequence/__init__.py @@ -76,9 +76,9 @@ __author__ = "Patrick Kunzmann" from .alphabet import * +from .annotation import * +from .codon import * +from .profile import * from .search import * from .seqtypes import * from .sequence import * -from .codon import * -from .annotation import * -from .profile import * diff --git a/src/biotite/sequence/align/__init__.py b/src/biotite/sequence/align/__init__.py index d548b11a3..7e90c32ad 100644 --- a/src/biotite/sequence/align/__init__.py +++ b/src/biotite/sequence/align/__init__.py @@ -191,8 +191,8 @@ from .buckets import * from .cigar import * from .kmeralphabet import * -from .kmertable import * from .kmersimilarity import * +from .kmertable import * from .localgapped import * from .localungapped import * from .matrix import * @@ -200,4 +200,4 @@ from .pairwise import * from .permutation import * from .selector import * -from .statistics import * \ No newline at end of file +from .statistics import * diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index 2f824c7f0..d33e3d051 100644 --- a/src/biotite/sequence/align/alignment.py +++ b/src/biotite/sequence/align/alignment.py @@ -5,16 +5,22 @@ __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -import numpy as np import numbers -import copy import textwrap -from ..alphabet import LetterAlphabet - +from collections.abc import Sequence +import numpy as np +from biotite.sequence.alphabet import LetterAlphabet -__all__ = ["Alignment", "get_codes", "get_symbols", - "get_sequence_identity", "get_pairwise_sequence_identity", - "score", "find_terminal_gaps", "remove_terminal_gaps"] +__all__ = [ + "Alignment", + "get_codes", + "get_symbols", + "get_sequence_identity", + "get_pairwise_sequence_identity", + "score", + "find_terminal_gaps", + "remove_terminal_gaps", +] class Alignment(object): @@ -22,7 +28,7 @@ class Alignment(object): An :class:`Alignment` object stores information about which symbols of *n* sequences are aligned to each other and it stores the corresponding alignment score. - + Instead of saving a list of aligned symbols, this class saves the original *n* sequences, that were aligned, and a so called *trace*, which indicate the aligned symbols of these sequences. @@ -31,16 +37,16 @@ class Alignment(object): Each element of the trace is the index in the corresponding sequence. A gap is represented by the value -1. - + Furthermore this class provides multiple utility functions for conversion into strings in order to make the alignment human readable. - + Unless an :class:`Alignment` object is the result of an multiple sequence alignment, the object will contain only two sequences. - + All attributes of this class are publicly accessible. - + Parameters ---------- sequences : list @@ -49,7 +55,7 @@ class Alignment(object): The alignment trace. score : int, optional Alignment score. - + Attributes ---------- sequences : list @@ -58,10 +64,10 @@ class Alignment(object): The alignment trace. score : int Alignment score. - + Examples -------- - + >>> seq1 = NucleotideSequence("CGTCAT") >>> seq2 = NucleotideSequence("TCATGC") >>> matrix = SubstitutionMatrix.std_nucleotide_matrix() @@ -95,8 +101,10 @@ def __init__(self, sequences, trace, score=None): def __repr__(self): """Represent Alignment a string for debugging.""" - return f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], " \ - f"np.{np.array_repr(self.trace)}, score={self.score})" + return ( + f"Alignment([{', '.join([seq.__repr__() for seq in self.sequences])}], " + f"np.{np.array_repr(self.trace)}, score={self.score})" + ) def _gapped_str(self, seq_index): seq_str = "" @@ -107,11 +115,11 @@ def _gapped_str(self, seq_index): else: seq_str += "-" return seq_str - + def get_gapped_sequences(self): """ Get a the string representation of the gapped sequences. - + Returns ------- sequences : list of str @@ -119,7 +127,7 @@ def get_gapped_sequences(self): as in `Alignment.sequences`. """ return [self._gapped_str(i) for i in range(len(self.sequences))] - + def __str__(self): # Check if any of the sequences # has an non-single letter alphabet @@ -143,32 +151,33 @@ def __str__(self): return ali_str[:-2] else: return super().__str__() - + def __getitem__(self, index): if isinstance(index, tuple): if len(index) > 2: raise IndexError("Only 1D or 2D indices are allowed") - if isinstance(index[0], numbers.Integral) or \ - isinstance(index[0], numbers.Integral): - raise IndexError( - "Integers are invalid indices for alignments, " - "a single sequence or alignment column cannot be " - "selected" - ) + if isinstance(index[0], numbers.Integral) or isinstance( + index[0], numbers.Integral + ): + raise IndexError( + "Integers are invalid indices for alignments, " + "a single sequence or alignment column cannot be " + "selected" + ) return Alignment( Alignment._index_sequences(self.sequences, index[1]), self.trace[index], - self.score + self.score, ) else: return Alignment(self.sequences, self.trace[index], self.score) - + def __iter__(self): raise TypeError("'Alignment' object is not iterable") - + def __len__(self): return len(self.trace) - + def __eq__(self, item): if not isinstance(item, Alignment): return False @@ -179,45 +188,41 @@ def __eq__(self, item): if self.score != item.score: return False return True - + @staticmethod def _index_sequences(sequences, index): - if isinstance(index, (list, tuple)) or \ - (isinstance(index, np.ndarray) and index.dtype != bool): - return [sequences[i] for i in index] + if isinstance(index, (list, tuple)) or ( + isinstance(index, np.ndarray) and index.dtype != bool + ): + return [sequences[i] for i in index] elif isinstance(index, np.ndarray) and index.dtype == bool: return [seq for seq, mask in zip(sequences, index) if mask] if isinstance(index, slice): return sequences[index] else: - raise IndexError( - f"Invalid alignment index type '{type(index).__name__}'" - ) - + raise IndexError(f"Invalid alignment index type '{type(index).__name__}'") + @staticmethod def trace_from_strings(seq_str_list): """ Create a trace from strings that represent aligned sequences. - + Parameters ---------- seq_str_list : list of str The strings, where each each one represents a sequence (with gaps) in an alignment. A ``-`` is interpreted as gap. - + Returns ------- trace : ndarray, dtype=int, shape=(n,2) The created trace. """ if len(seq_str_list) < 2: - raise ValueError( - "An alignment must contain at least two sequences" - ) + raise ValueError("An alignment must contain at least two sequences") seq_i = np.zeros(len(seq_str_list)) - trace = np.full(( len(seq_str_list[0]), len(seq_str_list) ), - -1, dtype=int) + trace = np.full((len(seq_str_list[0]), len(seq_str_list)), -1, dtype=int) # Get length of string (same length for all strings) # rather than length of list for pos_i in range(len(seq_str_list[0])): @@ -238,22 +243,22 @@ def get_codes(alignment): Instead of the indices of the aligned symbols (trace), the return value contains the corresponding symbol codes for each index. Gaps are still represented by *-1*. - + Parameters ---------- alignment : Alignment The alignment to get the sequence codes for. - + Returns ------- codes : ndarray, dtype=int, shape=(n,m) The sequence codes for the alignment. The shape is *(n,m)* for *n* sequences and *m* alignment cloumn. The array uses *-1* values for gaps. - + Examples -------- - + >>> seq1 = NucleotideSequence("CGTCAT") >>> seq2 = NucleotideSequence("TCATGC") >>> matrix = SubstitutionMatrix.std_nucleotide_matrix() @@ -267,14 +272,17 @@ def get_codes(alignment): """ trace = alignment.trace sequences = alignment.sequences - + # The number of sequences is the first dimension - codes = np.zeros((trace.shape[1], trace.shape[0]), dtype=int) + codes = np.zeros((trace.shape[1], trace.shape[0]), dtype=np.int64) for i in range(len(sequences)): + # Mark -1 explicitly as int64 to avoid that the unsigned dtype + # of the sequence code is used + # (https://numpy.org/neps/nep-0050-scalar-promotion.html) codes[i] = np.where( - trace[:,i] != -1, sequences[i].code[trace[:,i]], -1 + trace[:, i] != -1, sequences[i].code[trace[:, i]], np.int64(-1) ) - + return np.stack(codes) @@ -283,24 +291,24 @@ def get_symbols(alignment): Similar to :func:`get_codes()`, but contains the decoded symbols instead of codes. Gaps are still represented by *None* values. - + Parameters ---------- alignment : Alignment The alignment to get the symbols for. - + Returns ------- symbols : list of list The nested list of symbols. - + See Also -------- get_codes Examples -------- - + >>> seq1 = NucleotideSequence("CGTCAT") >>> seq2 = NucleotideSequence("TCATGC") >>> matrix = SubstitutionMatrix.std_nucleotide_matrix() @@ -317,8 +325,8 @@ def get_symbols(alignment): alphabet = alignment.sequences[i].get_alphabet() codes_wo_gaps = codes[i, codes[i] != -1] symbols_wo_gaps = alphabet.decode_multiple(codes_wo_gaps) - if not isinstance(symbols_wo_gaps, list): - symbols_wo_gaps = list(symbols_wo_gaps) + if isinstance(symbols_wo_gaps, np.ndarray): + symbols_wo_gaps = symbols_wo_gaps.tolist() symbols_for_seq = np.full(len(codes[i]), None, dtype=object) symbols_for_seq[codes[i] != -1] = symbols_wo_gaps symbols[i] = symbols_for_seq.tolist() @@ -331,7 +339,7 @@ def get_sequence_identity(alignment, mode="not_terminal"): The identity is equal to the matches divided by a measure for the length of the alignment that depends on the `mode` parameter. - + Parameters ---------- alignment : Alignment @@ -348,12 +356,12 @@ def get_sequence_identity(alignment, mode="not_terminal"): length of the shortest sequence. Default is *not_terminal*. - + Returns ------- identity : float The sequence identity, ranging between 0 and 1. - + See also -------- get_pairwise_sequence_identity @@ -363,12 +371,12 @@ def get_sequence_identity(alignment, mode="not_terminal"): # Count matches matches = 0 for i in range(codes.shape[1]): - column = codes[:,i] + column = codes[:, i] # One unique value -> all symbols match unique_symbols = np.unique(column) if len(unique_symbols) == 1 and unique_symbols[0] != -1: matches += 1 - + # Calculate length if mode == "all": length = len(alignment) @@ -394,7 +402,7 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): The identity is equal to the matches divided by a measure for the length of the alignment that depends on the `mode` parameter. - + Parameters ---------- alignment : Alignment, length=n @@ -411,12 +419,12 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): length of the shortest one of the two sequences. Default is *not_terminal*. - + Returns ------- identity : ndarray, dtype=float, shape=(n,n) The pairwise sequence identity, ranging between 0 and 1. - + See also -------- get_sequence_identity @@ -427,9 +435,11 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): # Count matches # Calculate at which positions the sequences are identical # and are not gaps - equality_matrix = (codes[:, np.newaxis, :] == codes[np.newaxis, :, :]) \ - & (codes[:, np.newaxis, :] != -1) \ - & (codes[np.newaxis, :, :] != -1) \ + equality_matrix = ( + (codes[:, np.newaxis, :] == codes[np.newaxis, :, :]) + & (codes[:, np.newaxis, :] != -1) + & (codes[np.newaxis, :, :] != -1) + ) # Sum these positions up matches = np.count_nonzero(equality_matrix, axis=-1) @@ -441,24 +451,23 @@ def get_pairwise_sequence_identity(alignment, mode="not_terminal"): for i in range(n_seq): for j in range(n_seq): # Find latest start and earliest stop of all sequences - start, stop = find_terminal_gaps(alignment[:, [i,j]]) + start, stop = find_terminal_gaps(alignment[:, [i, j]]) if stop <= start: raise ValueError( "Cannot calculate non-terminal identity, " "as the two sequences have no overlap" ) - length[i,j] = stop - start + length[i, j] = stop - start elif mode == "shortest": length = np.zeros((n_seq, n_seq)) for i in range(n_seq): for j in range(n_seq): - length[i,j] = min([ - len(alignment.sequences[i]), - len(alignment.sequences[j]) - ]) + length[i, j] = min( + [len(alignment.sequences[i]), len(alignment.sequences[j])] + ) else: raise ValueError(f"'{mode}' is an invalid calculation mode") - + return matches / length @@ -468,7 +477,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True): If the alignment contains more than two sequences, all pairwise scores are counted. - + Parameters ---------- alignment : Alignment @@ -485,7 +494,7 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True): terminal_penalty : bool, optional If true, gap penalties are applied to terminal gaps. (Default: True) - + Returns ------- score : int @@ -503,18 +512,18 @@ def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True): # Do not count self-similarity # and do not count similarity twice (not S(i,j) and S(j,i)) for i in range(codes.shape[0]): - for j in range(i+1, codes.shape[0]): + for j in range(i + 1, codes.shape[0]): code_i = column[i] code_j = column[j] # Ignore gaps if code_i != -1 and code_j != -1: score += matrix[code_i, code_j] - + # Sum gap penalties - if type(gap_penalty) == int: + if isinstance(gap_penalty, numbers.Real): gap_open = gap_penalty gap_ext = gap_penalty - elif type(gap_penalty) == tuple: + elif isinstance(gap_penalty, Sequence): gap_open = gap_penalty[0] gap_ext = gap_penalty[1] else: @@ -590,15 +599,15 @@ def find_terminal_gaps(alignment): """ trace = alignment.trace # Find for each sequence the positions of non-gap symbols - no_gap_pos = [np.where(trace[:,i] != -1)[0] for i in range(trace.shape[1])] + no_gap_pos = [np.where(trace[:, i] != -1)[0] for i in range(trace.shape[1])] # Find for each sequence the positions of the sequence start and end # in the alignment - firsts = [no_gap_pos[i][0 ] for i in range(trace.shape[1])] - lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])] + firsts = [no_gap_pos[i][0] for i in range(trace.shape[1])] + lasts = [no_gap_pos[i][-1] for i in range(trace.shape[1])] # The terminal gaps are before all sequences start and after any # sequence ends # Use exclusive stop -> -1 - return np.max(firsts), np.min(lasts) + 1 + return np.max(firsts).item(), np.min(lasts).item() + 1 def remove_terminal_gaps(alignment): @@ -655,4 +664,4 @@ def remove_terminal_gaps(alignment): "Cannot remove terminal gaps, since at least two sequences have " "no overlap and the resulting alignment would be empty" ) - return alignment[start : stop] \ No newline at end of file + return alignment[start:stop] diff --git a/src/biotite/sequence/align/buckets.py b/src/biotite/sequence/align/buckets.py index 79a1afadd..5b99ef890 100644 --- a/src/biotite/sequence/align/buckets.py +++ b/src/biotite/sequence/align/buckets.py @@ -6,11 +6,12 @@ __author__ = "Patrick Kunzmann" __all__ = ["bucket_number"] -from os.path import realpath, dirname, join +from os.path import dirname, join, realpath import numpy as np - _primes = None + + def bucket_number(n_kmers, load_factor=0.8): """ Find an appropriate number of buckets for a :class:`BucketKmerTable` @@ -54,16 +55,17 @@ def bucket_number(n_kmers, load_factor=0.8): """ global _primes if _primes is None: - with open( - join(dirname(realpath(__file__)), "primes.txt") - ) as file: - _primes = np.array([ - int(line) for line in file.read().splitlines() - if len(line) != 0 and line[0] != "#" - ]) + with open(join(dirname(realpath(__file__)), "primes.txt")) as file: + _primes = np.array( + [ + int(line) + for line in file.read().splitlines() + if len(line) != 0 and line[0] != "#" + ] + ) number = int(n_kmers / load_factor) index = np.searchsorted(_primes, number, side="left") if index == len(_primes): raise ValueError("Number of buckets too large") - return _primes[index] \ No newline at end of file + return _primes[index] diff --git a/src/biotite/sequence/align/cigar.py b/src/biotite/sequence/align/cigar.py index abe76cae6..a7735984d 100644 --- a/src/biotite/sequence/align/cigar.py +++ b/src/biotite/sequence/align/cigar.py @@ -8,13 +8,14 @@ import enum import numpy as np -from .alignment import Alignment, get_codes +from biotite.sequence.align.alignment import Alignment, get_codes class CigarOp(enum.IntEnum): """ An enum for the different CIGAR operations. """ + MATCH = 0 INSERTION = 1 DELETION = 2 @@ -46,23 +47,23 @@ def from_cigar_symbol(symbol): def to_cigar_symbol(self): return _op_to_str[self] + _str_to_op = { - "M" : CigarOp.MATCH, - "I" : CigarOp.INSERTION, - "D" : CigarOp.DELETION, - "N" : CigarOp.INTRON, - "S" : CigarOp.SOFT_CLIP, - "H" : CigarOp.HARD_CLIP, - "P" : CigarOp.PADDING, - "=" : CigarOp.EQUAL, - "X" : CigarOp.DIFFERENT, - "B" : CigarOp.BACK - } + "M": CigarOp.MATCH, + "I": CigarOp.INSERTION, + "D": CigarOp.DELETION, + "N": CigarOp.INTRON, + "S": CigarOp.SOFT_CLIP, + "H": CigarOp.HARD_CLIP, + "P": CigarOp.PADDING, + "=": CigarOp.EQUAL, + "X": CigarOp.DIFFERENT, + "B": CigarOp.BACK, +} _op_to_str = {v: k for k, v in _str_to_op.items()} -def read_alignment_from_cigar(cigar, position, - reference_sequence, segment_sequence): +def read_alignment_from_cigar(cigar, position, reference_sequence, segment_sequence): """ Create an :class:`Alignment` from a CIGAR string. @@ -147,20 +148,16 @@ def read_alignment_from_cigar(cigar, position, else: operations = np.asarray(cigar, dtype=int) if operations.ndim != 2: - raise ValueError( - "Expected array with shape (n,2)" - ) + raise ValueError("Expected array with shape (n,2)") if operations.shape[1] != 2: - raise ValueError( - "Expected (operation, length) pairs" - ) + raise ValueError("Expected (operation, length) pairs") if len(operations) == 0: return Alignment( [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int) ) - trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int) + trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int) clip_mask = np.ones(trace.shape[0], dtype=bool) i = 0 @@ -187,19 +184,23 @@ def read_alignment_from_cigar(cigar, position, elif op == CigarOp.HARD_CLIP: clip_mask[i : i + length] = False else: - raise ValueError( - f"CIGAR operation {op} is not implemented" - ) + raise ValueError(f"CIGAR operation {op} is not implemented") i += length # Remove clipped positions trace = trace[clip_mask] return Alignment([reference_sequence, segment_sequence], trace) -def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, - introns=(), distinguish_matches=False, - hard_clip=False, include_terminal_gaps=False, - as_string=True): +def write_alignment_to_cigar( + alignment, + reference_index=0, + segment_index=1, + introns=(), + distinguish_matches=False, + hard_clip=False, + include_terminal_gaps=False, + as_string=True, +): """ Convert an :class:`Alignment` into a CIGAR string. @@ -293,10 +294,10 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False) >>> for op, length in op_tuples: - ... print(CigarOp(op), length) - CigarOp.MATCH 9 - CigarOp.DELETION 2 - CigarOp.MATCH 12 + ... print(CigarOp(op).name, length) + MATCH 9 + DELETION 2 + MATCH 12 """ if not include_terminal_gaps: alignment = _remove_terminal_segment_gaps(alignment, segment_index) @@ -305,8 +306,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, seg_trace = alignment.trace[:, segment_index] operations = np.full(alignment.trace.shape[0], CigarOp.MATCH, dtype=int) - insertion_mask = (ref_trace == -1) - deletion_mask = (seg_trace == -1) + insertion_mask = ref_trace == -1 + deletion_mask = seg_trace == -1 if np.any(insertion_mask & deletion_mask): raise ValueError( "Alignment contains insertion and deletion at the same position" @@ -318,35 +319,27 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1, intron_mask = np.zeros(operations.shape[0], dtype=bool) for start, stop in introns: if start >= stop: - raise ValueError( - "Intron start must be smaller than intron stop" - ) + raise ValueError("Intron start must be smaller than intron stop") if start < 0: - raise ValueError( - "Intron start must not be negative" - ) + raise ValueError("Intron start must not be negative") intron_mask[(ref_trace >= start) & (ref_trace < stop)] = True if np.any(intron_mask & ~deletion_mask): - raise ValueError( - "Introns must be within gaps in the reference sequence" - ) + raise ValueError("Introns must be within gaps in the reference sequence") operations[intron_mask] = CigarOp.INTRON if distinguish_matches: symbol_codes = get_codes(alignment) ref_codes = symbol_codes[reference_index, :] seg_codes = symbol_codes[segment_index, :] - equal_mask = (ref_codes == seg_codes) - match_mask = (operations == CigarOp.MATCH) + equal_mask = ref_codes == seg_codes + match_mask = operations == CigarOp.MATCH operations[equal_mask & match_mask] = CigarOp.EQUAL operations[~equal_mask & match_mask] = CigarOp.DIFFERENT op_tuples = _aggregate_consecutive(operations) clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP - start_clip_length, end_clip_length = _find_clipped_bases( - alignment, segment_index - ) + start_clip_length, end_clip_length = _find_clipped_bases(alignment, segment_index) if start_clip_length != 0: start_clip = [(clip_op, start_clip_length)] else: @@ -386,9 +379,7 @@ def _find_clipped_bases(alignment, segment_index): # all previous bases are clipped... start_clip_length = seg_trace[0] # ...and the same applies for the last base - end_clip_length = ( - len(alignment.sequences[segment_index]) - seg_trace[-1] - 1 - ) + end_clip_length = len(alignment.sequences[segment_index]) - seg_trace[-1] - 1 return start_clip_length, end_clip_length @@ -431,4 +422,4 @@ def _op_tuples_from_cigar(cigar): op = CigarOp.from_cigar_symbol(char) op_tuples.append((op, count)) count = "" - return np.array(op_tuples, dtype=int) \ No newline at end of file + return np.array(op_tuples, dtype=int) diff --git a/src/biotite/sequence/align/kmeralphabet.pyx b/src/biotite/sequence/align/kmeralphabet.pyx index ef92c6075..02a1bbcda 100644 --- a/src/biotite/sequence/align/kmeralphabet.pyx +++ b/src/biotite/sequence/align/kmeralphabet.pyx @@ -33,7 +33,7 @@ class KmerAlphabet(Alphabet): This type of alphabet uses *k-mers* as symbols, i.e. all combinations of *k* symbols from its *base alphabet*. - + It's primary use is its :meth:`create_kmers()` method, that iterates over all overlapping *k-mers* in a :class:`Sequence` and encodes each one into its corresponding *k-mer* symbol code @@ -68,7 +68,7 @@ class KmerAlphabet(Alphabet): integers, that indicate the *informative* positions. For a continuous *k-mer* the `spacing` would be ``[0, 1, 2,...]``. - + Attributes ---------- base_alphabet : Alphabet @@ -79,7 +79,7 @@ class KmerAlphabet(Alphabet): spacing : None or ndarray, dtype=int The *k-mer* model in array form, if spaced *k-mers* are used, ``None`` otherwise. - + Notes ----- The symbol code for a *k-mer* :math:`s` calculates as @@ -94,7 +94,7 @@ class KmerAlphabet(Alphabet): References ---------- - + .. footbibliography:: Examples @@ -103,11 +103,11 @@ class KmerAlphabet(Alphabet): >>> base_alphabet = NucleotideSequence.unambiguous_alphabet() >>> print(base_alphabet.get_symbols()) - ['A', 'C', 'G', 'T'] + ('A', 'C', 'G', 'T') >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2) >>> print(kmer_alphabet.get_symbols()) - ['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT'] - + ('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT') + Encode and decode *k-mers*: >>> print(kmer_alphabet.encode("TC")) @@ -127,7 +127,7 @@ class KmerAlphabet(Alphabet): [3 1] Encode all overlapping continuous k-mers of a sequence: - + >>> sequence = NucleotideSequence("ATTGCT") >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code) >>> print(kmer_codes) @@ -146,7 +146,7 @@ class KmerAlphabet(Alphabet): >>> print([s[0] + s[1] + "_" + s[2] for s in strings]) ['BI_T', 'IQ_I', 'QT_T', 'TI_E'] """ - + def __init__(self, base_alphabet, k, spacing=None): if not isinstance(base_alphabet, Alphabet): raise TypeError( @@ -157,7 +157,7 @@ class KmerAlphabet(Alphabet): raise ValueError("k must be at least 2") self._base_alph = base_alphabet self._k = k - + base_alph_len = len(self._base_alph) self._radix_multiplier = np.array( [base_alph_len**n for n in reversed(range(0, self._k))], @@ -166,10 +166,10 @@ class KmerAlphabet(Alphabet): if spacing is None: self._spacing = None - + elif isinstance(spacing, str): self._spacing = _to_array_form(spacing) - + else: self._spacing = np.array(spacing, dtype=np.int64) self._spacing.sort() @@ -181,13 +181,13 @@ class KmerAlphabet(Alphabet): raise ValueError( "Spacing model contains duplicate values" ) - + if spacing is not None and len(self._spacing) != self._k: raise ValueError( f"Expected {self._k} informative positions, " f"but got {len(self._spacing)} positions in spacing" ) - + @property def base_alphabet(self): @@ -196,11 +196,11 @@ class KmerAlphabet(Alphabet): @property def k(self): return self._k - + @property def spacing(self): return None if self._spacing is None else self._spacing.copy() - + def get_symbols(self): """ @@ -210,10 +210,10 @@ class KmerAlphabet(Alphabet): Returns ------- - symbols : list - A list of all *k-mer* symbols, i.e. all possible + symbols : tuple + A tuple of all *k-mer* symbols, i.e. all possible combinations of *k* symbols from its *base alphabet*. - + Notes ----- In contrast the base :class:`Alphabet` and @@ -224,10 +224,10 @@ class KmerAlphabet(Alphabet): to be created first. """ if isinstance(self._base_alph, LetterAlphabet): - return ["".join(self.decode(code)) for code in range(len(self))] + return tuple(["".join(self.decode(code)) for code in range(len(self))]) else: - return [list(self.decode(code)) for code in range(len(self))] - + return tuple([list(self.decode(code)) for code in range(len(self))]) + def extends(self, alphabet): # A KmerAlphabet cannot really extend another KmerAlphabet: @@ -237,15 +237,15 @@ class KmerAlphabet(Alphabet): # A KmerAlphabet can only 'extend' another KmerAlphabet, # if the two alphabets are equal return alphabet == self - + def encode(self, symbol): return self.fuse(self._base_alph.encode_multiple(symbol)) - + def decode(self, code): return self._base_alph.decode_multiple(self.split(code)) - + def fuse(self, codes): """ @@ -261,7 +261,7 @@ class KmerAlphabet(Alphabet): ---------- codes : ndarray, dtype=int, shape=(k,) or shape=(n,k) The symbol codes from the base alphabet to be fused. - + Returns ------- kmer_codes : int or ndarray, dtype=np.int64, shape=(n,) @@ -292,13 +292,13 @@ class KmerAlphabet(Alphabet): ) if np.any(codes > len(self._base_alph)): raise AlphabetError("Given k-mer(s) contains invalid symbol code") - + orig_shape = codes.shape codes = np.atleast_2d(codes) kmer_code = np.sum(self._radix_multiplier * codes, axis=-1) # The last dimension is removed since it collpased in np.sum return kmer_code.reshape(orig_shape[:-1]) - + def split(self, kmer_code): """ split(kmer_code) @@ -313,7 +313,7 @@ class KmerAlphabet(Alphabet): ---------- kmer_code : int or ndarray, dtype=int, shape=(n,) The *k-mer* code(s). - + Returns ------- codes : ndarray, dtype=np.uint64, shape=(k,) or shape=(n,k) @@ -341,13 +341,13 @@ class KmerAlphabet(Alphabet): raise AlphabetError( f"Given k-mer symbol code is invalid for this alphabet" ) - + orig_shape = np.shape(kmer_code) split_codes = self._split( np.atleast_1d(kmer_code).astype(np.int64, copy=False) ) return split_codes.reshape(orig_shape + (self._k,)) - + @cython.boundscheck(False) @cython.wraparound(False) @cython.cdivision(True) @@ -360,7 +360,7 @@ class KmerAlphabet(Alphabet): cdef uint64[:,:] split_codes = np.empty( (codes.shape[0], self._k), dtype=np.uint64 ) - + cdef int k = self._k for i in range(codes.shape[0]): code = codes[i] @@ -369,9 +369,9 @@ class KmerAlphabet(Alphabet): symbol_code = code // val split_codes[i,n] = symbol_code code -= symbol_code * val - + return np.asarray(split_codes) - + def kmer_array_length(self, int64 length): """ @@ -385,7 +385,7 @@ class KmerAlphabet(Alphabet): ---------- length : int The length of the hypothetical sequence - + Returns ------- kmer_length : int @@ -400,7 +400,7 @@ class KmerAlphabet(Alphabet): spacing = self._spacing max_offset = self._spacing[len(spacing)-1] + 1 return length - max_offset + 1 - + def create_kmers(self, seq_code): """ @@ -418,7 +418,7 @@ class KmerAlphabet(Alphabet): ------- kmer_codes : ndarray, dtype=int64 The symbol codes for the *k-mers*. - + Examples -------- @@ -435,7 +435,7 @@ class KmerAlphabet(Alphabet): return self._create_continuous_kmers(seq_code) else: return self._create_spaced_kmers(seq_code) - + @cython.boundscheck(False) @cython.wraparound(False) def _create_continuous_kmers(self, CodeType[:] seq_code not None): @@ -460,7 +460,7 @@ class KmerAlphabet(Alphabet): cdef int64[:] kmers = np.empty( self.kmer_array_length(len(seq_code)), dtype=np.int64 ) - + cdef CodeType code cdef int64 kmer, prev_kmer # Compute first k-mer using naive approach @@ -471,7 +471,7 @@ class KmerAlphabet(Alphabet): raise AlphabetError(f"Symbol code {code} is out of range") kmer += radix_multiplier[i] * code kmers[0] = kmer - + # Compute all following k-mers from the previous one prev_kmer = kmer for i in range(1, kmers.shape[0]): @@ -481,7 +481,7 @@ class KmerAlphabet(Alphabet): kmer = ( ( # Remove first symbol - (prev_kmer - seq_code[i - 1] * end_radix_multiplier) + (prev_kmer - seq_code[i - 1] * end_radix_multiplier) # Shift k-mer to left * alphabet_length ) @@ -490,9 +490,9 @@ class KmerAlphabet(Alphabet): ) kmers[i] = kmer prev_kmer = kmer - + return np.asarray(kmers) - + @cython.boundscheck(False) @cython.wraparound(False) def _create_spaced_kmers(self, CodeType[:] seq_code not None): @@ -515,7 +515,7 @@ class KmerAlphabet(Alphabet): cdef int64[:] kmers = np.empty( self.kmer_array_length(len(seq_code)), dtype=np.int64 ) - + cdef CodeType code cdef int64 kmer cdef int64 offset @@ -528,18 +528,18 @@ class KmerAlphabet(Alphabet): raise AlphabetError(f"Symbol code {code} is out of range") kmer += radix_multiplier[j] * code kmers[i] = kmer - + return np.asarray(kmers) - + def __str__(self): return str(self.get_symbols()) - + def __repr__(self): return f"KmerAlphabet({repr(self._base_alph)}, " \ f"{self._k}, {repr(self._spacing)})" - + def __eq__(self, item): if item is self: @@ -550,15 +550,19 @@ class KmerAlphabet(Alphabet): return False if self._k != item._k: return False - + if self._spacing is None: if item._spacing is not None: return False elif np.any(self._spacing != item._spacing): return False - + return True - + + + def __hash__(self): + return hash((self._base_alph, self._k, tuple(self._spacing.tolist()))) + def __len__(self): return int(len(self._base_alph) ** self._k) diff --git a/src/biotite/sequence/align/kmertable.pyx b/src/biotite/sequence/align/kmertable.pyx index 98cc62dee..90d7b0569 100644 --- a/src/biotite/sequence/align/kmertable.pyx +++ b/src/biotite/sequence/align/kmertable.pyx @@ -1352,7 +1352,8 @@ cdef class KmerTable: def __iter__(self): - return iter(self.get_kmers()) + for kmer in self.get_kmers(): + yield kmer.item() def __reversed__(self): @@ -3394,7 +3395,7 @@ def _to_string(table): else: symbols = str(tuple(symbols)) line = symbols + ": " + ", ".join( - [str(tuple(pos)) for pos in table[kmer]] + [str((ref_id.item(), pos.item())) for ref_id, pos in table[kmer]] ) lines.append(line) return "\n".join(lines) diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 7f7d4f9eb..2a7d23437 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -5,11 +5,9 @@ __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -from ..sequence import Sequence -from ..seqtypes import NucleotideSequence, ProteinSequence -from ..alphabet import Alphabet -import numpy as np import os +import numpy as np +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence __all__ = ["SubstitutionMatrix"] @@ -21,54 +19,54 @@ class SubstitutionMatrix(object): A :class:`SubstitutionMatrix` maps each possible pairing of a symbol of a first alphabet with a symbol of a second alphabet to a score (integer). - + The class uses a 2-D (m x n) :class:`ndarray` (dtype=:attr:`numpy.int32`), where each element stores the score for a symbol pairing, indexed by the symbol codes of the respective symbols in an *m*-length alphabet 1 and an *n*-length alphabet 2. - + There are 3 ways to creates instances: - + At first a 2-D :class:`ndarray` containing the scores can be directly provided. - + Secondly a dictionary can be provided, where the keys are pairing tuples and values are the corresponding scores. The pairing tuples consist of a symbol of alphabet 1 as first element and a symbol of alphabet 2 as second element. Parings have to be provided for each possible combination. - + At last a valid matrix name can be given, which is loaded from the internal matrix database. The following matrices are avaliable: - + - Nucleotide substitution matrices from NCBI database - **NUC** - Also usable with ambiguous alphabet - + - Protein substitution matrices from NCBI database - + - **PAM** - **BLOSUM** - **MATCH** - Only differentiates between match and mismatch - **IDENTITY** - Strongly penalizes mismatches - **GONNET** - Not usable with default protein alphabet - **DAYHOFF** - + - Corrected protein substitution matrices :footcite:`Hess2016`, **** is the BLOCKS version, the matrix is based on - + - **BLOSUM_** - **RBLOSUM_** - **CorBLOSUM_** - + A list of all available matrix names is returned by :meth:`list_db()`. - + Since this class can handle two different alphabets, it is possible to align two different types of sequences. - + Objects of this class are immutable. - + Parameters ---------- alphabet1 : Alphabet, length=m @@ -79,23 +77,23 @@ class SubstitutionMatrix(object): Either a symbol code indexed :class:`ndarray` containing the scores, or a dictionary mapping the symbol pairing to scores, or a string referencing a matrix in the internal database. - + Raises ------ KeyError If the matrix dictionary misses a symbol given in the alphabet. - + References ---------- - + .. footbibliography:: - + Examples -------- - + Creating a matrix for two different (nonsense) alphabets via a matrix dictionary: - + >>> alph1 = Alphabet(["foo","bar"]) >>> alph2 = Alphabet([1,2,3]) >>> matrix_dict = {("foo",1):5, ("foo",2):10, ("foo",3):15, @@ -119,17 +117,16 @@ class SubstitutionMatrix(object): C 0 1 0 0 G 0 0 1 0 T 0 0 0 1 - + Creating a matrix via database name: - + >>> alph = ProteinSequence.alphabet >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50") """ - + # Directory of matrix files - _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "matrix_data") - + _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data") + def __init__(self, alphabet1, alphabet2, score_matrix): self._alph1 = alphabet1 self._alph2 = alphabet2 @@ -147,16 +144,19 @@ def __init__(self, alphabet1, alphabet2, score_matrix): matrix_dict = SubstitutionMatrix.dict_from_db(score_matrix) self._fill_with_matrix_dict(matrix_dict) else: - raise TypeError("Matrix must be either a dictionary, " - "an 2-D ndarray or a string") + raise TypeError( + "Matrix must be either a dictionary, " "an 2-D ndarray or a string" + ) # This class is immutable and has a getter function for the # score matrix -> make the score matrix read-only self._matrix.setflags(write=False) def __repr__(self): """Represent SubstitutionMatrix as a string for debugging.""" - return f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " \ - f"np.{np.array_repr(self._matrix)})" + return ( + f"SubstitutionMatrix({self._alph1.__repr__()}, {self._alph2.__repr__()}, " + f"np.{np.array_repr(self._matrix)})" + ) def __eq__(self, item): if not isinstance(item, SubstitutionMatrix): @@ -173,40 +173,39 @@ def __ne__(self, item): return not self == item def _fill_with_matrix_dict(self, matrix_dict): - self._matrix = np.zeros(( len(self._alph1), len(self._alph2) ), - dtype=np.int32) + self._matrix = np.zeros((len(self._alph1), len(self._alph2)), dtype=np.int32) for i in range(len(self._alph1)): for j in range(len(self._alph2)): sym1 = self._alph1.decode(i) sym2 = self._alph2.decode(j) - self._matrix[i,j] = int(matrix_dict[sym1, sym2]) - + self._matrix[i, j] = int(matrix_dict[sym1, sym2]) + def get_alphabet1(self): """ - Get the first alphabet. - + Get the first alphabet. + Returns ------- alphabet : Alphabet The first alphabet. """ return self._alph1 - + def get_alphabet2(self): """ - Get the second alphabet. - + Get the second alphabet. + Returns ------- alphabet : Alphabet The second alphabet. """ return self._alph2 - + def score_matrix(self): """ Get the 2-D :class:`ndarray` containing the score values. - + Returns ------- matrix : ndarray, shape=(m,n), dtype=np.int32 @@ -214,12 +213,12 @@ def score_matrix(self): The array is read-only. """ return self._matrix - + def transpose(self): """ Get a copy of this instance, where the alphabets are interchanged. - + Returns ------- transposed : SubstitutionMatrix @@ -229,7 +228,7 @@ def transpose(self): new_alph2 = self._alph1 new_matrix = np.transpose(self._matrix) return SubstitutionMatrix(new_alph1, new_alph2, new_matrix) - + def is_symmetric(self): """ Check whether the substitution matrix is symmetric, @@ -242,35 +241,36 @@ def is_symmetric(self): True, if both alphabets are identical and the score matrix is symmetric, false otherwise. """ - return self._alph1 == self._alph2 \ - and np.array_equal(self._matrix, np.transpose(self._matrix)) - + return self._alph1 == self._alph2 and np.array_equal( + self._matrix, np.transpose(self._matrix) + ) + def get_score_by_code(self, code1, code2): """ Get the substitution score of two symbols, represented by their code. - + Parameters ---------- code1, code2 : int Symbol codes of the two symbols to be aligned. - + Returns ------- score : int The substitution / alignment score. """ return self._matrix[code1, code2] - + def get_score(self, symbol1, symbol2): """ Get the substitution score of two symbols. - + Parameters ---------- symbol1, symbol2 : object Symbols to be aligned. - + Returns ------- score : int @@ -279,19 +279,19 @@ def get_score(self, symbol1, symbol2): code1 = self._alph1.encode(symbol1) code2 = self._alph2.encode(symbol2) return self._matrix[code1, code2] - + def shape(self): """ Get the shape (i.e. the length of both alphabets) of the subsitution matrix. - + Returns ------- shape : tuple Matrix shape. """ return (len(self._alph1), len(self._alph2)) - + def __str__(self): # Create matrix in NCBI format string = " " @@ -306,18 +306,18 @@ def __str__(self): # Remove terminal line break string = string[:-1] return string - + @staticmethod def dict_from_str(string): """ Create a matrix dictionary from a string in NCBI matrix format. - + Symbols of the first alphabet are taken from the left column, symbols of the second alphabet are taken from the top row. - + The keys of the dictionary consist of tuples containing the aligned symbols and the values are the corresponding scores. - + Returns ------- matrix_dict : dict @@ -329,22 +329,22 @@ def dict_from_str(string): symbols2 = [e for e in lines[0].split()] scores = np.array([line.split()[1:] for line in lines[1:]]).astype(int) scores = np.transpose(scores) - + matrix_dict = {} for i in range(len(symbols1)): for j in range(len(symbols2)): - matrix_dict[(symbols1[i], symbols2[j])] = scores[i,j] + matrix_dict[(symbols1[i], symbols2[j])] = scores[i, j] return matrix_dict - + @staticmethod def dict_from_db(matrix_name): """ Create a matrix dictionary from a valid matrix name in the internal matrix database. - + The keys of the dictionary consist of tuples containing the aligned symbols and the values are the corresponding scores. - + Returns ------- matrix_dict : dict @@ -353,12 +353,12 @@ def dict_from_db(matrix_name): filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat" with open(filename, "r") as f: return SubstitutionMatrix.dict_from_str(f.read()) - + @staticmethod def list_db(): """ List all matrix names in the internal database. - + Returns ------- db_list : list @@ -367,27 +367,26 @@ def list_db(): files = os.listdir(SubstitutionMatrix._db_dir) # Remove '.mat' from files return [file[:-4] for file in sorted(files)] - - + @staticmethod def std_protein_matrix(): """ Get the default :class:`SubstitutionMatrix` for protein sequence alignments, which is BLOSUM62. - + Returns ------- matrix : SubstitutionMatrix Default matrix. """ return _matrix_blosum62 - + @staticmethod def std_nucleotide_matrix(): """ Get the default :class:`SubstitutionMatrix` for DNA sequence alignments. - + Returns ------- matrix : SubstitutionMatrix @@ -395,11 +394,11 @@ def std_nucleotide_matrix(): """ return _matrix_nuc -# Preformatted BLOSUM62 and NUC substitution matrix from NCBI -_matrix_blosum62 = SubstitutionMatrix(ProteinSequence.alphabet, - ProteinSequence.alphabet, - "BLOSUM62") -_matrix_nuc = SubstitutionMatrix(NucleotideSequence.alphabet_amb, - NucleotideSequence.alphabet_amb, - "NUC") +# Preformatted BLOSUM62 and NUC substitution matrix from NCBI +_matrix_blosum62 = SubstitutionMatrix( + ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" +) +_matrix_nuc = SubstitutionMatrix( + NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" +) diff --git a/src/biotite/sequence/align/multiple.pyx b/src/biotite/sequence/align/multiple.pyx index 2c7b8e50d..ba9b35db4 100644 --- a/src/biotite/sequence/align/multiple.pyx +++ b/src/biotite/sequence/align/multiple.pyx @@ -236,7 +236,7 @@ def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True, # Create new matrix with neutral gap symbol gap_symbol = GapSymbol.instance() new_alphabet = Alphabet( - matrix.get_alphabet1().get_symbols() + [gap_symbol] + matrix.get_alphabet1().get_symbols() + (gap_symbol,) ) new_score_matrix = np.zeros( (len(new_alphabet), len(new_alphabet)), dtype=np.int32 diff --git a/src/biotite/sequence/align/permutation.pyx b/src/biotite/sequence/align/permutation.pyx index 64a84289f..ebe85a835 100644 --- a/src/biotite/sequence/align/permutation.pyx +++ b/src/biotite/sequence/align/permutation.pyx @@ -85,7 +85,7 @@ class RandomPermutation(Permutation): This class uses a simple full-period *linear congruential generator* (LCG) to provide pseudo-randomized values: - .. math:: \text{order} = (a c_\text{k-mer} + 1) \mod 2^64. + .. math:: \text{order} = (a \, c_\text{k-mer} + 1) \mod 2^{64}. The factor :math:`a` is taken from :footcite:`Steele2021` to ensure full periodicity and good random behavior. @@ -186,6 +186,9 @@ class FrequencyPermutation(Permutation): The minimum and maximum value, the permutated value (i.e. the return value of :meth:`permute()`) can take. + kmer_alphabet : KmerAlphabet + The *k-mer* alphabet that defines the range of possible *k-mers* + that should be permuted. Notes ----- @@ -226,11 +229,11 @@ class FrequencyPermutation(Permutation): >>> permutation = FrequencyPermutation.from_table(kmer_table) >>> order = permutation.permute(kmer_codes) >>> print(order) - [ 0 24 20 19 16 15 14 13 12 22 21 10 11 8 7 18 6 5 4 3 23 2 1 9 + [ 0 22 18 19 1 2 3 4 5 23 20 6 7 8 9 21 10 11 12 13 24 14 15 16 17] >>> kmer_codes = kmer_codes[np.argsort(order)] >>> print(["..."] + ["".join(kmer_alph.decode(c)) for c in kmer_codes[-10:]]) - ['...', 'ba', 'ar', 'rr', 'da', 'ad', 'ac', 'ca', 'br', 'ra', 'ab'] + ['...', 'rc', 'rd', 'rr', 'ac', 'ad', 'ca', 'da', 'ab', 'br', 'ra'] """ def __init__(self, kmer_alphabet, counts): @@ -240,7 +243,9 @@ class FrequencyPermutation(Permutation): f"but {len(counts)} counts were given" ) # 'order' maps a permutation to a k-mer - order = np.argsort(counts) + # Stability is important to get the same k-mer subset selection + # on different architectures + order = np.argsort(counts, kind="stable") # '_permutation_table' should perform the reverse mapping self._permutation_table = _invert_mapping(order) self._kmer_alph = kmer_alphabet @@ -259,8 +264,11 @@ class FrequencyPermutation(Permutation): return self._kmer_alph + @staticmethod def from_table(kmer_table): """ + from_table(kmer_table) + Create a :class:`FrequencyPermutation` from the *k-mer* counts of a :class:`KmerTable`. diff --git a/src/biotite/sequence/align/selector.pyx b/src/biotite/sequence/align/selector.pyx index 1bf68127f..8bfff8721 100644 --- a/src/biotite/sequence/align/selector.pyx +++ b/src/biotite/sequence/align/selector.pyx @@ -10,10 +10,8 @@ __all__ = ["MinimizerSelector", "SyncmerSelector", "CachedSyncmerSelector", cimport cython cimport numpy as np -from numbers import Integral import numpy as np from .kmeralphabet import KmerAlphabet -from ..alphabet import AlphabetError ctypedef np.int64_t int64 @@ -21,7 +19,7 @@ ctypedef np.uint32_t uint32 # Obtained from 'np.iinfo(np.int64).max' -DEF MAX_INT_64 = 9223372036854775807 +cdef int64 MAX_INT_64 = 9223372036854775807 class MinimizerSelector: @@ -54,7 +52,7 @@ class MinimizerSelector: This standard order is often the lexicographical order, which is known to yield suboptimal *density* in many cases :footcite:`Roberts2004`. - + Attributes ---------- kmer_alphabet : KmerAlphabet @@ -73,7 +71,7 @@ class MinimizerSelector: References ---------- - + .. footbibliography:: Examples @@ -122,12 +120,12 @@ class MinimizerSelector: self._window = window self._kmer_alph = kmer_alphabet self._permutation = permutation - + @property def kmer_alphabet(self): return self._kmer_alph - + @property def window(self): return self._window @@ -135,7 +133,7 @@ class MinimizerSelector: @property def permutation(self): return self._permutation - + def select(self, sequence, bint alphabet_check=True): """ @@ -154,7 +152,7 @@ class MinimizerSelector: of the sequence and the alphabet of the :class:`MinimizerSelector` is not checked to gain additional performance. - + Returns ------- minimizer_indices : ndarray, dtype=np.uint32 @@ -162,7 +160,7 @@ class MinimizerSelector: minimizers : ndarray, dtype=np.int64 The *k-mers* that are the selected minimizers, returned as *k-mer* code. - + Notes ----- Duplicate minimizers are omitted, i.e. if two windows have the @@ -176,7 +174,7 @@ class MinimizerSelector: ) kmers = self._kmer_alph.create_kmers(sequence.code) return self.select_from_kmers(kmers) - + def select_from_kmers(self, kmers): """ @@ -191,7 +189,7 @@ class MinimizerSelector: minimizers in. The *k-mer* codes correspond to the *k-mers* encoded by the given `kmer_alphabet`. - + Returns ------- minimizer_indices : ndarray, dtype=np.uint32 @@ -199,7 +197,7 @@ class MinimizerSelector: appears. minimizers : ndarray, dtype=np.int64 The corresponding *k-mers* codes of the minimizers. - + Notes ----- Duplicate minimizers are omitted, i.e. if two windows have the @@ -267,7 +265,7 @@ class SyncmerSelector: *k-mer*. By default, the minimum position needs to be at the start of the *k-mer*, which is termed *open syncmer*. - + Attributes ---------- alphabet : Alphabet @@ -276,7 +274,7 @@ class SyncmerSelector: The :class:`KmerAlphabet` for *k* and *s*, respectively. permutation : Permutation The permutation. - + See also -------- CachedSyncmerSelector @@ -291,7 +289,7 @@ class SyncmerSelector: References ---------- - + .. footbibliography:: Examples @@ -337,7 +335,7 @@ class SyncmerSelector: self._alphabet = alphabet self._kmer_alph = KmerAlphabet(alphabet, k) self._smer_alph = KmerAlphabet(alphabet, s) - + self._permutation = permutation self._offset = np.asarray(offset, dtype=np.int64) @@ -353,7 +351,7 @@ class SyncmerSelector: ) if len(np.unique(self._offset)) != len(self._offset): raise ValueError("Offset must contain unique values") - + @property def alphabet(self): @@ -362,7 +360,7 @@ class SyncmerSelector: @property def kmer_alphabet(self): return self._kmer_alph - + @property def smer_alphabet(self): return self._smer_alph @@ -370,7 +368,7 @@ class SyncmerSelector: @property def permutation(self): return self._permutation - + def select(self, sequence, bint alphabet_check=True): """ @@ -389,7 +387,7 @@ class SyncmerSelector: of the sequence and the alphabet of the :class:`SyncmerSelector` is not checked to gain additional performance. - + Returns ------- syncmer_indices : ndarray, dtype=np.uint32 @@ -428,7 +426,7 @@ class SyncmerSelector: relative_min_pos = min_pos - np.arange(len(kmers)) syncmer_pos = self._filter_syncmer_pos(relative_min_pos) return syncmer_pos, kmers[syncmer_pos] - + def select_from_kmers(self, kmers): """ @@ -442,7 +440,7 @@ class SyncmerSelector: ---------- kmers : ndarray, dtype=np.int64 The *k-mer* codes to select the syncmers from. - + Returns ------- syncmer_indices : ndarray, dtype=np.uint32 @@ -459,9 +457,9 @@ class SyncmerSelector: :class:`Sequence` objects. """ cdef int64 i - + symbol_codes_for_each_kmer = self._kmer_alph.split(kmers) - + cdef int64[:] min_pos = np.zeros( len(symbol_codes_for_each_kmer), dtype=np.int64 ) @@ -477,10 +475,10 @@ class SyncmerSelector: f"sort keys for {len(smers)} s-mers" ) min_pos[i] = np.argmin(ordering) - + syncmer_pos = self._filter_syncmer_pos(min_pos) return syncmer_pos, kmers[syncmer_pos] - + def _filter_syncmer_pos(self, min_pos): """ @@ -538,7 +536,7 @@ class CachedSyncmerSelector(SyncmerSelector): *k-mer*. By default, the minimum position needs to be at the start of the *k-mer*, which is termed *open syncmer*. - + Attributes ---------- alphabet : Alphabet @@ -547,7 +545,7 @@ class CachedSyncmerSelector(SyncmerSelector): The :class:`KmerAlphabet` for *k* and *s*, respectively. permutation : Permutation The permutation. - + See also -------- SyncmerSelector @@ -562,7 +560,7 @@ class CachedSyncmerSelector(SyncmerSelector): References ---------- - + .. footbibliography:: Examples @@ -584,7 +582,7 @@ class CachedSyncmerSelector(SyncmerSelector): >>> print(["".join(kmer_alph.decode(kmer)) for kmer in syncmers]) ['GGCAA', 'AAGTG', 'AGTGA', 'GTGAC'] """ - + def __init__(self, alphabet, k, s, permutation=None, offset=(0,)): super().__init__(alphabet, k, s, permutation, offset) # Check for all possible *k-mers*, whether they are syncmers @@ -593,7 +591,7 @@ class CachedSyncmerSelector(SyncmerSelector): # Convert the index array into a boolean mask self._syncmer_mask = np.zeros(len(self.kmer_alphabet), dtype=bool) self._syncmer_mask[syncmer_indices] = True - + def select(self, sequence, bint alphabet_check=True): """ @@ -612,7 +610,7 @@ class CachedSyncmerSelector(SyncmerSelector): of the sequence and the alphabet of the :class:`CachedSyncmerSelector` is not checked to gain additional performance. - + Returns ------- syncmer_indices : ndarray, dtype=np.uint32 @@ -628,7 +626,7 @@ class CachedSyncmerSelector(SyncmerSelector): ) kmers = self.kmer_alphabet.create_kmers(sequence.code) return self.select_from_kmers(kmers) - + def select_from_kmers(self, kmers): """ @@ -642,7 +640,7 @@ class CachedSyncmerSelector(SyncmerSelector): ---------- kmers : ndarray, dtype=np.int64 The *k-mer* codes to select the syncmers from. - + Returns ------- syncmer_indices : ndarray, dtype=np.uint32 @@ -660,7 +658,7 @@ class MincodeSelector: Selects the :math:`1/\text{compression}` *smallest* *k-mers* from :class:`KmerAlphabet`. :footcite:`Edgar2021` - + '*Small*' refers to the lexicographical order, or alternatively a custom order if `permutation` is given. The *Mincode* approach tries to reduce the number of *k-mers* from a @@ -682,7 +680,7 @@ class MincodeSelector: By default, the standard order of the :class:`KmerAlphabet` is used. This standard order is often the lexicographical order. - + Attributes ---------- kmer_alphabet : KmerAlphabet @@ -695,10 +693,10 @@ class MincodeSelector: All *k-mers*, that are smaller than this value are selected. permutation : Permutation The permutation. - + References ---------- - + .. footbibliography:: Examples @@ -735,12 +733,12 @@ class MincodeSelector: permutation_offset = permutation.min permutation_range = permutation.max - permutation.min + 1 self._threshold = permutation_offset + permutation_range / compression - + @property def kmer_alphabet(self): return self._kmer_alph - + @property def compression(self): return self._compression @@ -752,7 +750,7 @@ class MincodeSelector: @property def permutation(self): return self._permutation - + def select(self, sequence, bint alphabet_check=True): """ @@ -771,7 +769,7 @@ class MincodeSelector: of the sequence and the alphabet of the :class:`MincodeSelector` is not checked to gain additional performance. - + Returns ------- mincode_indices : ndarray, dtype=np.uint32 @@ -786,7 +784,7 @@ class MincodeSelector: ) kmers = self._kmer_alph.create_kmers(sequence.code) return self.select_from_kmers(kmers) - + def select_from_kmers(self, kmers): """ @@ -800,7 +798,7 @@ class MincodeSelector: ---------- kmers : ndarray, dtype=np.int64 The *k-mer* codes to select the *Mincode k-mers* from. - + Returns ------- mincode_indices : ndarray, dtype=np.uint32 @@ -820,7 +818,7 @@ class MincodeSelector: mincode_pos = ordering < self._threshold return mincode_pos, kmers[mincode_pos] - + @cython.boundscheck(False) @cython.wraparound(False) @@ -835,7 +833,7 @@ def _minimize(int64[:] kmers, int64[:] ordering, uint32 window, instead of 'x - (window-1)/2' to 'x + (window-1)/2'. """ cdef uint32 seq_i - + cdef uint32 n_windows = kmers.shape[0] - (window - 1) # Pessimistic array allocation size # -> Expect that every window has a new minimizer @@ -865,14 +863,14 @@ def _minimize(int64[:] kmers, int64[:] ordering, uint32 window, reverse_argcummin = reverse_argcummins[seq_i] forward_cummin = ordering[forward_argcummin] reverse_cummin = ordering[reverse_argcummin] - + # At ties the leftmost position is taken, # which stems from the reverse pass if forward_cummin < reverse_cummin: combined_argcummin = forward_argcummin else: combined_argcummin = reverse_argcummin - + # If the same minimizer position was observed before, the # duplicate is simply ignored, if 'include_duplicates' is false if include_duplicates or combined_argcummin != prev_argcummin: @@ -899,7 +897,7 @@ cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size): cdef uint32 current_min_i = 0 cdef int64 current_min, current_val cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32) - + # Any actual value will be smaller than this placeholder current_min = MAX_INT_64 for seq_i in range(values.shape[0]): @@ -911,7 +909,7 @@ cdef _chunk_wise_forward_argcummin(int64[:] values, uint32 chunk_size): current_min_i = seq_i current_min = current_val min_pos[seq_i] = current_min_i - + return min_pos @cython.boundscheck(False) @@ -930,7 +928,7 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size): - There are issues in selecting the leftmost argument - An offset is necessary to ensure alignment of chunks with forward pass - + Hence, a separate 'reverse' variant of the function was implemented. """ cdef uint32 seq_i @@ -938,7 +936,7 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size): cdef uint32 current_min_i = 0 cdef int64 current_min, current_val cdef uint32[:] min_pos = np.empty(values.shape[0], dtype=np.uint32) - + current_min = MAX_INT_64 for seq_i in reversed(range(values.shape[0])): # The chunk beginning is a small difference to forward @@ -952,5 +950,5 @@ cdef _chunk_wise_reverse_argcummin(int64[:] values, uint32 chunk_size): current_min_i = seq_i current_min = current_val min_pos[seq_i] = current_min_i - + return min_pos diff --git a/src/biotite/sequence/align/statistics.py b/src/biotite/sequence/align/statistics.py index 19a8c9aba..72a783ac5 100644 --- a/src/biotite/sequence/align/statistics.py +++ b/src/biotite/sequence/align/statistics.py @@ -7,8 +7,8 @@ __all__ = ["EValueEstimator"] import numpy as np -from ..seqtypes import GeneralSequence -from .pairwise import align_optimal +from biotite.sequence.align.pairwise import align_optimal +from biotite.sequence.seqtypes import GeneralSequence class EValueEstimator: @@ -29,7 +29,7 @@ class EValueEstimator: of random sequence alignments in :meth:`from_samples()` :footcite:`Altschul1986`, which may be time consuming. If these parameters are known, the constructor can be used instead. - + Based on the sampled parameters, the decadic logarithm of the E-value can be quickly calculated via :meth:`log_evalue()`. @@ -39,7 +39,7 @@ class EValueEstimator: The :math:`\lambda` parameter. k : float The :math:`K` parameter. - + Notes ----- The calculated E-value is a rough estimation that gets more @@ -102,8 +102,9 @@ def __init__(self, lam, k): self._k = k @staticmethod - def from_samples(alphabet, matrix, gap_penalty, frequencies, - sample_length=1000, sample_size=1000): + def from_samples( + alphabet, matrix, gap_penalty, frequencies, sample_length=1000, sample_size=1000 + ): r""" Create an :class:`EValueEstimator` with :math:`\lambda` and :math:`K` estimated via sampling alignments of random sequences @@ -137,13 +138,13 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, The number of sampled sequences. The accuracy of the estimated parameters and E-values, but also the runtime increases with the sample size. - + Returns ------- estimator : EValueEstimator A :class:`EValueEstimator` with sampled :math:`\lambda` and :math:`K` parameters. - + Notes ----- The sampling process generates random sequences based on @@ -167,15 +168,15 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, raise ValueError("A symmetric substitution matrix is required") if not matrix.get_alphabet1().extends(alphabet): raise ValueError( - "The substitution matrix is not compatible " - "with the given alphabet" + "The substitution matrix is not compatible " "with the given alphabet" ) - score_matrix = matrix.score_matrix()[:len(alphabet), :len(alphabet)] - if np.sum( - score_matrix \ - * frequencies[np.newaxis, :] \ - * frequencies[:, np.newaxis] - ) >= 0: + score_matrix = matrix.score_matrix()[: len(alphabet), : len(alphabet)] + if ( + np.sum( + score_matrix * frequencies[np.newaxis, :] * frequencies[:, np.newaxis] + ) + >= 0 + ): raise ValueError( "Invalid substitution matrix, the expected similarity " "score between two random symbols is not negative" @@ -183,9 +184,7 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, # Generate the sequence code for the random sequences random_sequence_code = np.random.choice( - len(alphabet), - size=(sample_size, 2, sample_length), - p=frequencies + len(alphabet), size=(sample_size, 2, sample_length), p=frequencies ) # Sample the alignments of random sequences @@ -193,28 +192,27 @@ def from_samples(alphabet, matrix, gap_penalty, frequencies, for i in range(sample_size): seq1 = GeneralSequence(alphabet) seq2 = GeneralSequence(alphabet) - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_scores[i] = align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=gap_penalty, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=gap_penalty, max_number=1 )[0].score - + # Use method of moments to estimate parameters lam = np.pi / np.sqrt(6 * np.var(sample_scores)) u = np.mean(sample_scores) - np.euler_gamma / lam k = np.exp(lam * u) / sample_length**2 - + return EValueEstimator(lam, k) @property def lam(self): return self._lam - + @property def k(self): return self._k - + def log_evalue(self, score, seq1_length, seq2_length): r""" Calculate the decadic logarithm of the E-value for a given @@ -223,11 +221,11 @@ def log_evalue(self, score, seq1_length, seq2_length): The E-value and the logarithm of the E-value is calculated as .. math:: - + E = Kmn e^{-\lambda s} \log_{10} E = (\log_{10} Kmn) - \frac{\lambda s}{\ln 10}, - + where :math:`s` is the similarity score and :math:`m` and :math:`n` are the lengths of the aligned sequences. @@ -245,12 +243,12 @@ def log_evalue(self, score, seq1_length, seq2_length): this is usually either the combined length of all sequences in the database or the length of the hit sequence multiplied by the number of sequences in the database. - + Returns ------- log_e : float The decadic logarithm of the E-value. - + Notes ----- This method returns the logarithm of the E-value instead of @@ -261,5 +259,6 @@ def log_evalue(self, score, seq1_length, seq2_length): seq1_length = np.asarray(seq1_length) seq2_length = np.asarray(seq2_length) - return np.log10(self._k * seq1_length * seq2_length) \ - - self._lam * score / np.log(10) \ No newline at end of file + return np.log10( + self._k * seq1_length * seq2_length + ) - self._lam * score / np.log(10) diff --git a/src/biotite/sequence/alphabet.py b/src/biotite/sequence/alphabet.py index 2f0d409e6..c08022cec 100644 --- a/src/biotite/sequence/alphabet.py +++ b/src/biotite/sequence/alphabet.py @@ -4,14 +4,18 @@ __name__ = "biotite.sequence" __author__ = "Patrick Kunzmann" -__all__ = ["Alphabet", "LetterAlphabet", "AlphabetMapper", "AlphabetError", - "common_alphabet"] +__all__ = [ + "Alphabet", + "LetterAlphabet", + "AlphabetMapper", + "AlphabetError", + "common_alphabet", +] -import copy -from numbers import Integral import string +from numbers import Integral import numpy as np -from .codec import encode_chars, decode_to_chars, map_sequence_code +from biotite.sequence.codec import decode_to_chars, encode_chars, map_sequence_code class Alphabet(object): @@ -100,14 +104,14 @@ class Alphabet(object): def __init__(self, symbols): if len(symbols) == 0: raise ValueError("Symbol list is empty") - self._symbols = copy.deepcopy(list(symbols)) + self._symbols = tuple(symbols) self._symbol_dict = {} for i, symbol in enumerate(symbols): self._symbol_dict[symbol] = i def __repr__(self): """Represent Alphabet as a string for debugging.""" - return f'Alphabet({self._symbols})' + return f"Alphabet({self._symbols})" def get_symbols(self): """ @@ -115,10 +119,10 @@ def get_symbols(self): Returns ------- - symbols : list - Copy of the internal list of symbols. + symbols : tuple + The symbols. """ - return copy.deepcopy(self._symbols) + return self._symbols def extends(self, alphabet): """ @@ -139,8 +143,7 @@ def extends(self, alphabet): elif len(alphabet) > len(self): return False else: - return alphabet.get_symbols() \ - == self.get_symbols()[:len(alphabet)] + return alphabet.get_symbols() == self.get_symbols()[: len(alphabet)] def encode(self, symbol): """ @@ -164,9 +167,7 @@ def encode(self, symbol): try: return self._symbol_dict[symbol] except KeyError: - raise AlphabetError( - f"Symbol {repr(symbol)} is not in the alphabet" - ) + raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet") def decode(self, code): """ @@ -238,12 +239,11 @@ def is_letter_alphabet(self): have length 1 and are printable. """ for symbol in self: - if not isinstance(symbol, (str, bytes)) \ - or len(symbol) > 1: - return False + if not isinstance(symbol, (str, bytes)) or len(symbol) > 1: + return False if isinstance(symbol, str): symbol = symbol.encode("ASCII") - if symbol not in LetterAlphabet.PRINATBLES: + if symbol not in LetterAlphabet.PRINTABLES: return False return True @@ -260,7 +260,11 @@ def __contains__(self, symbol): return symbol in self.get_symbols() def __hash__(self): - return hash(tuple(self._symbols)) + symbols = self.get_symbols() + if isinstance(symbols, tuple): + return hash(symbols) + else: + return hash(tuple(symbols)) def __eq__(self, item): if item is self: @@ -292,8 +296,9 @@ class LetterAlphabet(Alphabet): in this list. """ - PRINATBLES = (string.digits + string.ascii_letters + string.punctuation) \ - .encode("ASCII") + PRINTABLES = (string.digits + string.ascii_letters + string.punctuation).encode( + "ASCII" + ) def __init__(self, symbols): if len(symbols) == 0: @@ -304,7 +309,7 @@ def __init__(self, symbols): raise ValueError(f"Symbol '{symbol}' is not a single letter") if isinstance(symbol, str): symbol = symbol.encode("ASCII") - if symbol not in LetterAlphabet.PRINATBLES: + if symbol not in LetterAlphabet.PRINTABLES: raise ValueError( f"Symbol {repr(symbol)} is not printable or whitespace" ) @@ -312,47 +317,33 @@ def __init__(self, symbols): # Direct 'astype' conversion is not allowed by numpy # -> frombuffer() self._symbols = np.frombuffer( - np.array(self._symbols, dtype="|S1"), - dtype=np.ubyte + np.array(self._symbols, dtype="|S1"), dtype=np.ubyte ) def __repr__(self): """Represent LetterAlphabet as a string for debugging.""" - return f'LetterAlphabet({self.get_symbols()})' + return f"LetterAlphabet({self.get_symbols()})" def extends(self, alphabet): if alphabet is self: return True - elif type(alphabet) == LetterAlphabet: + elif isinstance(alphabet, LetterAlphabet): if len(alphabet._symbols) > len(self._symbols): return False - return np.all( - alphabet._symbols == self._symbols[:len(alphabet._symbols)] - ) + return np.all(alphabet._symbols == self._symbols[: len(alphabet._symbols)]) else: return super().extends(alphabet) def get_symbols(self): - """ - Get the symbols in the alphabet. - - Returns - ------- - symbols : list - Copy of the internal list of symbols. - """ - return [symbol.decode("ASCII") for symbol - in self._symbols_as_bytes()] + return tuple([symbol.decode("ASCII") for symbol in self._symbols_as_bytes()]) def encode(self, symbol): if not isinstance(symbol, (str, bytes)) or len(symbol) > 1: raise AlphabetError(f"Symbol '{symbol}' is not a single letter") indices = np.where(self._symbols == ord(symbol))[0] if len(indices) == 0: - raise AlphabetError( - f"Symbol {repr(symbol)} is not in the alphabet" - ) - return indices[0] + raise AlphabetError(f"Symbol {repr(symbol)} is not in the alphabet") + return indices[0].item() def decode(self, code, as_bytes=False): if code < 0 or code >= len(self._symbols): @@ -382,13 +373,10 @@ def encode_multiple(self, symbols, dtype=None): elif isinstance(symbols, bytes): symbols = np.frombuffer(symbols, dtype=np.ubyte) elif isinstance(symbols, np.ndarray): - symbols = np.frombuffer( - symbols.astype(dtype="|S1"), dtype=np.ubyte - ) + symbols = np.frombuffer(symbols.astype(dtype="|S1"), dtype=np.ubyte) else: symbols = np.frombuffer( - np.array(list(symbols), dtype="|S1"), - dtype=np.ubyte + np.array(list(symbols), dtype="|S1"), dtype=np.ubyte ) return encode_chars(alphabet=self._symbols, symbols=symbols) @@ -435,7 +423,6 @@ def _symbols_as_bytes(self): return np.frombuffer(self._symbols, dtype="|S1") - class AlphabetMapper(object): """ This class is used for symbol code conversion from a source @@ -486,8 +473,7 @@ def __init__(self, source_alphabet, target_alphabet): else: self._necessary_mapping = True self._mapper = np.zeros( - len(source_alphabet), - dtype=AlphabetMapper._dtype(len(target_alphabet)) + len(source_alphabet), dtype=AlphabetMapper._dtype(len(target_alphabet)) ) for old_code in range(len(source_alphabet)): symbol = source_alphabet.decode(old_code) @@ -500,26 +486,25 @@ def __getitem__(self, code): return self._mapper[code] else: return code - if not isinstance(code, np.ndarray) \ - or code.dtype not in (np.uint8, np.uint16, np.uint32, np.uint64): - code = np.array(code, dtype=np.uint64) + if not isinstance(code, np.ndarray) or code.dtype not in ( + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ): + code = np.array(code, dtype=np.uint64) if self._necessary_mapping: mapped_code = np.empty(len(code), dtype=self._mapper.dtype) - map_sequence_code( - self._mapper, - code, - mapped_code - ) + map_sequence_code(self._mapper, code, mapped_code) return mapped_code else: return code - @staticmethod def _dtype(alphabet_size): - _size_uint8 = np.iinfo(np.uint8 ).max +1 - _size_uint16 = np.iinfo(np.uint16).max +1 - _size_uint32 = np.iinfo(np.uint32).max +1 + _size_uint8 = np.iinfo(np.uint8).max + 1 + _size_uint16 = np.iinfo(np.uint16).max + 1 + _size_uint32 = np.iinfo(np.uint32).max + 1 if alphabet_size <= _size_uint8: return np.uint8 elif alphabet_size <= _size_uint16: @@ -535,6 +520,7 @@ class AlphabetError(Exception): This exception is raised, when a code or a symbol is not in an :class:`Alphabet`. """ + pass @@ -563,4 +549,4 @@ def common_alphabet(alphabets): common_alphabet = alphabet else: return None - return common_alphabet \ No newline at end of file + return common_alphabet diff --git a/src/biotite/sequence/annotation.py b/src/biotite/sequence/annotation.py index cb2a9267e..dac70b993 100644 --- a/src/biotite/sequence/annotation.py +++ b/src/biotite/sequence/annotation.py @@ -6,17 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Location", "Feature", "Annotation", "AnnotatedSequence"] -import numbers import copy +import numbers import sys -from enum import Flag, Enum, auto +from enum import Enum, Flag, auto import numpy as np -from .sequence import Sequence -from ..copyable import Copyable -from .seqtypes import NucleotideSequence +from biotite.copyable import Copyable -class Location(): +class Location: """ A :class:`Location` defines at which base(s)/residue(s) a feature is located. @@ -63,24 +61,25 @@ class Defect(Flag): - **BETWEEN** - The position is between to consecutive bases/residues. """ - NONE = 0 - MISS_LEFT = auto() - MISS_RIGHT = auto() - BEYOND_LEFT = auto() + + NONE = 0 + MISS_LEFT = auto() + MISS_RIGHT = auto() + BEYOND_LEFT = auto() BEYOND_RIGHT = auto() - UNK_LOC = auto() - BETWEEN = auto() + UNK_LOC = auto() + BETWEEN = auto() class Strand(Enum): """ This enum type describes the strand of the feature location. This is not relevant for protein sequence features. """ + FORWARD = auto() REVERSE = auto() - def __init__(self, first, last, strand=Strand.FORWARD, - defect=Defect.NONE): + def __init__(self, first, last, strand=Strand.FORWARD, defect=Defect.NONE): if first > last: raise ValueError( "The first position cannot be higher than the last position" @@ -92,8 +91,10 @@ def __init__(self, first, last, strand=Strand.FORWARD, def __repr__(self): """Represent Location as a string for debugging.""" - return f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, ' \ - f'defect={"Location." + str(self._defect)})' + return ( + f'Location({self._first}, {self._last}, strand={"Location." + str(self._strand)}, ' + f'defect={"Location." + str(self._defect)})' + ) @property def first(self): @@ -122,10 +123,12 @@ def __str__(self): def __eq__(self, item): if not isinstance(item, Location): return False - return ( self.first == item.first - and self.last == item.last - and self.strand == item.strand - and self.defect == item.defect) + return ( + self.first == item.first + and self.last == item.last + and self.strand == item.strand + and self.defect == item.defect + ) def __hash__(self): return hash((self._first, self._last, self._strand, self._defect)) @@ -208,9 +211,11 @@ def get_location_range(self): def __eq__(self, item): if not isinstance(item, Feature): return False - return ( self._key == item._key - and self._locs == item._locs - and self._qual == item._qual) + return ( + self._key == item._key + and self._locs == item._locs + and self._qual == item._qual + ) def __lt__(self, item): if not isinstance(item, Feature): @@ -223,7 +228,7 @@ def __lt__(self, item): return True elif first > it_first: return False - else: # First is equal + else: # First is equal return last > it_last def __gt__(self, item): @@ -237,7 +242,7 @@ def __gt__(self, item): return True elif first < it_first: return False - else: # First is equal + else: # First is equal return last < it_last @property @@ -253,7 +258,7 @@ def qual(self): return copy.copy(self._qual) def __hash__(self): - return hash(( self._key, self._locs, frozenset(self._qual.items()) )) + return hash((self._key, self._locs, frozenset(self._qual.items()))) class Annotation(Copyable): @@ -337,7 +342,7 @@ class Annotation(Copyable): ... gene = f.qual["gene"] ... loc_str = "".join([f"{loc} {loc.defect}" for loc in f.locs]) ... print(gene, loc_str) - test5 40-149 > Defect.MISS_RIGHT|MISS_LEFT + test5 40-149 > Defect.MISS_LEFT|MISS_RIGHT test2 40-50 > Defect.MISS_LEFT test3 100-130 > Defect.NONE """ @@ -350,7 +355,9 @@ def __init__(self, features=None): def __repr__(self): """Represent Annotation as a string for debugging.""" - return f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])' + return ( + f'Annotation([{", ".join([feat.__repr__() for feat in self._features])}])' + ) def __copy_create__(self): return Annotation(self._features) @@ -403,7 +410,7 @@ def get_location_range(self): if loc.last > last: last = loc.last # Exclusive stop -> +1 - return first, last+1 + return first, last + 1 def del_feature(self, feature): """ @@ -475,9 +482,7 @@ def __getitem__(self, index): if loc.last > i_last: defect |= Location.Defect.MISS_RIGHT last = i_last - locs_in_scope.append(Location( - first, last, loc.strand, defect - )) + locs_in_scope.append(Location(first, last, loc.strand, defect)) if len(locs_in_scope) > 0: # The feature is present in the new annotation # if any of the original locations is in the new @@ -488,15 +493,12 @@ def __getitem__(self, index): sub_annot.add_feature(new_feature) return sub_annot else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __delitem__(self, item): if not isinstance(item, Feature): raise TypeError( - f"Only 'Feature' objects are supported, " - f"not {type(item).__name__}" + f"Only 'Feature' objects are supported, " f"not {type(item).__name__}" ) self.del_feature(item) @@ -626,8 +628,10 @@ def __init__(self, annotation, sequence, sequence_start=1): def __repr__(self): """Represent AnnotatedSequence as a string for debugging.""" - return f'AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, ' \ - f'sequence_start={self._seqstart})' + return ( + f"AnnotatedSequence({self._annotation.__repr__()}, {self._sequence.__repr__()}, " + f"sequence_start={self._seqstart})" + ) @property def sequence_start(self): @@ -643,7 +647,8 @@ def annotation(self): def __copy_create__(self): return AnnotatedSequence( - self._annotation.copy(), self._sequence.copy, self._seqstart) + self._annotation.copy(), self._sequence.copy, self._seqstart + ) def reverse_complement(self, sequence_start=1): """ @@ -676,10 +681,12 @@ def reverse_complement(self, sequence_start=1): # (seq_len-1) -> last sequence index # (loc.last-self._seqstart) -> location to index # ... + rev_seqstart -> index to location - rev_loc_first \ - = (seq_len-1) - (loc.last-self._seqstart) + rev_seqstart - rev_loc_last \ - = (seq_len-1) - (loc.first-self._seqstart) + rev_seqstart + rev_loc_first = ( + (seq_len - 1) - (loc.last - self._seqstart) + rev_seqstart + ) + rev_loc_last = ( + (seq_len - 1) - (loc.first - self._seqstart) + rev_seqstart + ) if loc.strand == Location.Strand.FORWARD: rev_loc_strand = Location.Strand.REVERSE @@ -700,17 +707,14 @@ def reverse_complement(self, sequence_start=1): if loc.defect & Location.Defect.BETWEEN: rev_loc_defect |= Location.Defect.BETWEEN - rev_locs.append(Location( - rev_loc_first, rev_loc_last, - rev_loc_strand, rev_loc_defect - )) - rev_features.append(Feature( - feature.key, rev_locs, feature.qual - )) + rev_locs.append( + Location( + rev_loc_first, rev_loc_last, rev_loc_strand, rev_loc_defect + ) + ) + rev_features.append(Feature(feature.key, rev_locs, feature.qual)) - return AnnotatedSequence( - Annotation(rev_features), rev_sequence, rev_seqstart - ) + return AnnotatedSequence(Annotation(rev_features), rev_sequence, rev_seqstart) def __getitem__(self, index): if isinstance(index, Feature): @@ -730,24 +734,20 @@ def __getitem__(self, index): pass elif strand is None: strand = loc.strand - else: # loc.strand != strand + else: # loc.strand != strand raise ValueError( "All locations of the feature must have the same " "strand direction" ) if strand == Location.Strand.FORWARD: - sorted_locs = sorted( - locs, key=lambda loc: loc.first - ) + sorted_locs = sorted(locs, key=lambda loc: loc.first) else: - sorted_locs = sorted( - locs, key=lambda loc: loc.last, reverse=True - ) + sorted_locs = sorted(locs, key=lambda loc: loc.last, reverse=True) # Merge the sequences corresponding to the ordered locations for loc in sorted_locs: slice_start = loc.first - self._seqstart # +1 due to exclusive stop - slice_stop = loc.last - self._seqstart +1 + slice_stop = loc.last - self._seqstart + 1 add_seq = self._sequence[slice_start:slice_stop] if loc.strand == Location.Strand.REVERSE: add_seq = add_seq.reverse().complement() @@ -775,17 +775,17 @@ def __getitem__(self, index): rel_seq_start = self._seqstart else: rel_seq_start = index.start - return AnnotatedSequence(self._annotation[index], - self._sequence[seq_start:seq_stop], - rel_seq_start) + return AnnotatedSequence( + self._annotation[index], + self._sequence[seq_start:seq_stop], + rel_seq_start, + ) elif isinstance(index, numbers.Integral): return self._sequence[index - self._seqstart] else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __setitem__(self, index, item): if isinstance(index, Feature): @@ -796,10 +796,11 @@ def __setitem__(self, index, item): for loc in index.locs: slice_start = loc.first - self._seqstart # +1 due to exclusive stop - slice_stop = loc.last - self._seqstart +1 + slice_stop = loc.last - self._seqstart + 1 interval_size = slice_stop - slice_start - self._sequence[slice_start:slice_stop] \ - = sub_seq[sub_seq_i : sub_seq_i + interval_size] + self._sequence[slice_start:slice_stop] = sub_seq[ + sub_seq_i : sub_seq_i + interval_size + ] sub_seq_i += interval_size elif isinstance(index, slice): # Sequence start correction @@ -817,13 +818,13 @@ def __setitem__(self, index, item): # Item is a symbol self._sequence[index - self._seqstart] = item else: - raise TypeError( - f"'{type(index).__name__}' instances are invalid indices" - ) + raise TypeError(f"'{type(index).__name__}' instances are invalid indices") def __eq__(self, item): if not isinstance(item, AnnotatedSequence): return False - return ( self.annotation == item.annotation - and self.sequence == item.sequence - and self._seqstart == item._seqstart) + return ( + self.annotation == item.annotation + and self.sequence == item.sequence + and self._seqstart == item._seqstart + ) diff --git a/src/biotite/sequence/codon.py b/src/biotite/sequence/codon.py index 67d2ab291..13a5d64d8 100644 --- a/src/biotite/sequence/codon.py +++ b/src/biotite/sequence/codon.py @@ -7,11 +7,10 @@ __all__ = ["CodonTable"] import copy -from os.path import join, dirname, realpath -import numpy as np from numbers import Integral -from .seqtypes import NucleotideSequence, ProteinSequence - +from os.path import dirname, join, realpath +import numpy as np +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence # Abbreviations _NUC_ALPH = NucleotideSequence.alphabet_unamb @@ -20,7 +19,7 @@ # Multiplier array that converts a codon in code representation # into a unique integer _radix = len(_NUC_ALPH) -_radix_multiplier = np.array([_radix**n for n in (2,1,0)], dtype=int) +_radix_multiplier = np.array([_radix**n for n in (2, 1, 0)], dtype=int) class CodonTable(object): @@ -29,14 +28,14 @@ class CodonTable(object): amino acid. It also defines start codons. A :class:`CodonTable` takes/outputs either the symbols or code of the codon/amino acid. - + Furthermore, this class is able to give a list of codons that corresponds to a given amino acid. - + The :func:`load()` method allows loading of NCBI codon tables. - + Objects of this class are immutable. - + Parameters ---------- codon_dict : dict of (str -> str) @@ -47,27 +46,27 @@ class CodonTable(object): starts : iterable object of str The start codons. Each entry must be a string of length 3 (all upper case). - + Examples -------- - + Get the amino acid coded by a given codon (symbol and code): - + >>> table = CodonTable.default_table() >>> print(table["ATG"]) M >>> print(table[(1,2,3)]) 14 - + Get the codons coding for a given amino acid (symbol and code): - + >>> table = CodonTable.default_table() >>> print(table["M"]) ('ATG',) >>> print(table[14]) ((0, 2, 0), (0, 2, 2), (1, 2, 0), (1, 2, 1), (1, 2, 2), (1, 2, 3)) """ - + # For efficient mapping of codon codes to amino acid codes, # especially in in the 'map_codon_codes()' function, the class # maps each possible codon into a unique number using a radix based @@ -77,7 +76,7 @@ class CodonTable(object): # file for builtin codon tables from NCBI _table_file = join(dirname(realpath(__file__)), "codon_tables.txt") - + def __init__(self, codon_dict, starts): # Check if 'starts' is iterable object of length 3 string for start in starts: @@ -100,12 +99,10 @@ def __init__(self, codon_dict, starts): if (self._codons == -1).any(): # Find the missing codon missing_index = np.where(self._codons == -1)[0][0] - codon_code = CodonTable._to_codon(missing_index) + codon_code = CodonTable._to_codon(missing_index) codon = _NUC_ALPH.decode_multiple(codon_code) codon_str = "".join(codon) - raise ValueError( - f"Codon dictionary does not contain codon '{codon_str}'" - ) + raise ValueError(f"Codon dictionary does not contain codon '{codon_str}'") def __repr__(self): """Represent CodonTable as a string for debugging.""" @@ -131,8 +128,10 @@ def __getitem__(self, item): codon_numbers = np.where(self._codons == aa_code)[0] codon_codes = CodonTable._to_codon(codon_numbers) codons = tuple( - ["".join(_NUC_ALPH.decode_multiple(codon_code)) - for codon_code in codon_codes] + [ + "".join(_NUC_ALPH.decode_multiple(codon_code)) + for codon_code in codon_codes + ] ) return codons elif len(item) == 3: @@ -147,37 +146,36 @@ def __getitem__(self, item): elif isinstance(item, int): # Code for amino acid -> return possible codon codes codon_numbers = np.where(self._codons == item)[0] - codon_codes = tuple(CodonTable._to_codon(codon_numbers)) - codon_codes = tuple([tuple(code) for code in codon_codes]) + codon_codes = tuple( + [tuple(code.tolist()) for code in CodonTable._to_codon(codon_numbers)] + ) return codon_codes else: # Code for codon as any iterable object # Code for codon -> return corresponding amino acid codes if len(item) != 3: - raise ValueError( - f"{item} is an invalid sequence code for a codon" - ) + raise ValueError(f"{item} is an invalid sequence code for a codon") codon_number = CodonTable._to_number(item) aa_code = self._codons[codon_number] return aa_code - + def map_codon_codes(self, codon_codes): """ Efficiently map multiple codons to the corresponding amino acids. - + Parameters ---------- codon_codes : ndarray, dtype=int, shape=(n,3) The codons to be translated into amino acids. The codons are given as symbol codes. *n* is the amount of codons. - + Returns ------- aa_codes : ndarray, dtype=int, shape=(n,) The amino acids as symbol codes. - + Examples -------- >>> dna = NucleotideSequence("ATGGTTTAA") @@ -208,46 +206,50 @@ def map_codon_codes(self, codon_codes): codon_numbers = CodonTable._to_number(codon_codes) aa_codes = self._codons[codon_numbers] return aa_codes - + def codon_dict(self, code=False): """ Get the codon to amino acid mappings dictionary. - + Parameters ---------- code : bool If true, the dictionary contains keys and values as code. Otherwise, the dictionary contains strings for codons and amino acid. (Default: False) - + Returns ------- codon_dict : dict The dictionary mapping codons to amino acids. """ if code: - return {tuple(CodonTable._to_codon(codon_number)): aa_code - for codon_number, aa_code in enumerate(self._codons)} + return { + tuple(CodonTable._to_codon(codon_number)): aa_code + for codon_number, aa_code in enumerate(self._codons) + } else: - return {"".join(_NUC_ALPH.decode_multiple(codon_code)): - _PROT_ALPH.decode(aa_code) - for codon_code, aa_code - in self.codon_dict(code=True).items()} - + return { + "".join(_NUC_ALPH.decode_multiple(codon_code)): _PROT_ALPH.decode( + aa_code + ) + for codon_code, aa_code in self.codon_dict(code=True).items() + } + def is_start_codon(self, codon_codes): codon_numbers = CodonTable._to_number(codon_codes) return np.isin(codon_numbers, self._starts) - + def start_codons(self, code=False): """ Get the start codons of the codon table. - + Parameters ---------- code : bool If true, the code will be returned instead of strings. (Default: False) - + Returns ------- start_codons : tuple @@ -256,25 +258,29 @@ def start_codons(self, code=False): """ if code: return tuple( - [tuple(CodonTable._to_codon(codon_number)) - for codon_number in self._starts] + [ + tuple(CodonTable._to_codon(codon_number)) + for codon_number in self._starts + ] ) else: return tuple( - ["".join(_NUC_ALPH.decode_multiple(codon_code)) - for codon_code in self.start_codons(code=True)] + [ + "".join(_NUC_ALPH.decode_multiple(codon_code)) + for codon_code in self.start_codons(code=True) + ] ) - + def with_start_codons(self, starts): """ Create an new :class:`CodonTable` with the same codon mappings, but changed start codons. - + Parameters ---------- starts : iterable object of str The new start codons. - + Returns ------- new_table : CodonTable @@ -287,17 +293,17 @@ def with_start_codons(self, starts): ) new_table._starts = CodonTable._to_number(start_codon_codes) return new_table - + def with_codon_mappings(self, codon_dict): """ Create an new :class:`CodonTable` with partially changed codon mappings. - + Parameters ---------- codon_dict : dict of (str -> str) The changed codon mappings. - + Returns ------- new_table : CodonTable @@ -328,9 +334,9 @@ def __str__(self): else: string += " " # Add space for next codon - string += " "*3 + string += " " * 3 # Remove terminal space - string = string [:-6] + string = string[:-6] # Jump to next line string += "\n" # Add empty line @@ -353,10 +359,10 @@ def _to_codon(numbers): if not isinstance(numbers, np.ndarray): numbers = np.array(list(numbers), dtype=int) codons = np.zeros(numbers.shape + (3,), dtype=int) - for n in (2,1,0): + for n in (2, 1, 0): val = _radix**n digit = numbers // val - codons[..., -(n+1)] = digit + codons[..., -(n + 1)] = digit numbers = numbers - digit * val return codons @@ -364,14 +370,14 @@ def _to_codon(numbers): def load(table_name): """ Load a NCBI codon table. - + Parameters ---------- table_name : str or int If a string is given, it is interpreted as official NCBI codon table name (e.g. "Vertebrate Mitochondrial"). An integer is interpreted as NCBI codon table ID. - + Returns ------- table : CodonTable @@ -380,7 +386,7 @@ def load(table_name): # Loads codon tables from codon_tables.txt with open(CodonTable._table_file, "r") as f: lines = f.read().split("\n") - + # Extract data for codon table from file table_found = False aa = None @@ -391,11 +397,11 @@ def load(table_name): for line in lines: if not line: table_found = False - if type(table_name) == int and line.startswith("id"): + if isinstance(table_name, Integral) and line.startswith("id"): # remove identifier 'id' if table_name == int(line[2:]): table_found = True - elif type(table_name) == str and line.startswith("name"): + elif isinstance(table_name, str) and line.startswith("name"): # Get list of table names from lines # (separated with ';') # remove identifier 'name' @@ -404,7 +410,7 @@ def load(table_name): table_found = True if table_found: if line.startswith("AA"): - #Remove identifier + # Remove identifier aa = line[5:].strip() elif line.startswith("Init"): init = line[5:].strip() @@ -414,19 +420,24 @@ def load(table_name): base2 = line[5:].strip() elif line.startswith("Base3"): base3 = line[5:].strip() - + # Create codon table from data - if aa is not None and init is not None \ - and base1 is not None and base2 is not None and base3 is not None: - symbol_dict = {} - starts = [] - # aa, init and baseX all have the same length - for i in range(len(aa)): - codon = base1[i] + base2[i] + base3[i] - if init[i] == "i": - starts.append(codon) - symbol_dict[codon] = aa[i] - return CodonTable(symbol_dict, starts) + if ( + aa is not None + and init is not None + and base1 is not None + and base2 is not None + and base3 is not None + ): + symbol_dict = {} + starts = [] + # aa, init and baseX all have the same length + for i in range(len(aa)): + codon = base1[i] + base2[i] + base3[i] + if init[i] == "i": + starts.append(codon) + symbol_dict[codon] = aa[i] + return CodonTable(symbol_dict, starts) else: raise ValueError(f"Codon table '{table_name}' was not found") @@ -434,7 +445,7 @@ def load(table_name): def table_names(): """ The possible codon table names for :func:`load()`. - + Returns ------- names : list of str @@ -447,14 +458,14 @@ def table_names(): if line.startswith("name"): names.extend([name.strip() for name in line[4:].split(";")]) return names - + @staticmethod def default_table(): """ The default codon table. The table is equal to the NCBI "Standard" codon table, with the difference that only "ATG" is a start codon. - + Returns ------- table : CodonTable diff --git a/src/biotite/sequence/graphics/__init__.py b/src/biotite/sequence/graphics/__init__.py index b1dbbf051..4b0b39b9f 100644 --- a/src/biotite/sequence/graphics/__init__.py +++ b/src/biotite/sequence/graphics/__init__.py @@ -29,5 +29,5 @@ from .colorschemes import * from .dendrogram import * from .features import * -from .plasmid import * from .logo import * +from .plasmid import * diff --git a/src/biotite/sequence/graphics/alignment.py b/src/biotite/sequence/graphics/alignment.py index b84c7be0d..f3bdb6380 100644 --- a/src/biotite/sequence/graphics/alignment.py +++ b/src/biotite/sequence/graphics/alignment.py @@ -4,15 +4,22 @@ __name__ = "biotite.sequence.graphics" __author__ = "Patrick Kunzmann" -__all__ = ["SymbolPlotter", "LetterPlotter", "LetterSimilarityPlotter", - "LetterTypePlotter","ArrayPlotter", - "plot_alignment", "plot_alignment_similarity_based", - "plot_alignment_type_based","plot_alignment_array"] +__all__ = [ + "SymbolPlotter", + "LetterPlotter", + "LetterSimilarityPlotter", + "LetterTypePlotter", + "ArrayPlotter", + "plot_alignment", + "plot_alignment_similarity_based", + "plot_alignment_type_based", + "plot_alignment_array", +] import abc import numpy as np -from ...visualize import colors -from .colorschemes import get_color_scheme +from biotite.sequence.graphics.colorschemes import get_color_scheme +from biotite.visualize import colors class SymbolPlotter(metaclass=abc.ABCMeta): @@ -81,8 +88,7 @@ class LetterPlotter(SymbolPlotter, metaclass=abc.ABCMeta): :class:`matplotlib.Text` instance of each symbol. """ - def __init__(self, axes, color_symbols=False, - font_size=None, font_param=None): + def __init__(self, axes, color_symbols=False, font_size=None, font_param=None): super().__init__(axes) self._color_symbols = color_symbols self._font_size = font_size @@ -101,9 +107,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): box = Rectangle(bbox.p0, bbox.width, bbox.height) self.axes.add_patch(box) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param) + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, + ) text.set_clip_on(True) if self._color_symbols: @@ -196,17 +208,16 @@ class LetterSimilarityPlotter(LetterPlotter): because *a* does also occur in *b*\ :sub:`i`. """ - def __init__(self, axes, matrix=None, color_symbols=False, - font_size=None, font_param=None): - + def __init__( + self, axes, matrix=None, color_symbols=False, font_size=None, font_param=None + ): super().__init__(axes, color_symbols, font_size, font_param) if matrix is not None: self._matrix = matrix.score_matrix() else: self._matrix = None # Default colormap - self._cmap = self._generate_colormap(colors["dimgreen"], - self._color_symbols) + self._cmap = self._generate_colormap(colors["dimgreen"], self._color_symbols) def set_color(self, color=None, cmap=None): """ @@ -257,8 +268,7 @@ def get_color(self, alignment, column_i, seq_i): similarities[i] = 0 else: code2 = alignment.sequences[i].code[index2] - similarities[i] = self._get_similarity(self._matrix, - code1, code2) + similarities[i] = self._get_similarity(self._matrix, code1, code2) # Delete self-similarity similarities = np.delete(similarities, seq_i) similarity = np.average(similarities) @@ -283,14 +293,18 @@ def _generate_colormap(color, to_black): if to_black: # From color to black cmap_val = np.stack( - [np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [color[i], 0]) + for i in range(len(color)) + ] ).transpose() else: # From white to color cmap_val = np.stack( - [np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) - for i in range(len(color))] + [ + np.interp(np.linspace(0, 1, 100), [0, 1], [1, color[i]]) + for i in range(len(color)) + ] ).transpose() return ListedColormap(cmap_val) @@ -310,7 +324,7 @@ class LetterTypePlotter(LetterPlotter): The alphabet of the alignment(s) to be plotted color_scheme : str or list of (tuple or str), optional Either a valid color scheme name - (e.g. ``"rainbow"``, ``"clustalx"``, ``blossom``, etc.) + (e.g. ``"flower"``, ``"clustalx"``, ``blossom``, etc.) or a list of *Matplotlib* compatible colors. The list length must be at least as long as the length of the alphabet used by the sequences. @@ -325,12 +339,19 @@ class LetterTypePlotter(LetterPlotter): :class:`matplotlib.Text` instance of each symbol. """ - def __init__(self, axes, alphabet, color_scheme=None, color_symbols=False, - font_size=None, font_param=None): + def __init__( + self, + axes, + alphabet, + color_scheme=None, + color_symbols=False, + font_size=None, + font_param=None, + ): super().__init__(axes, color_symbols, font_size, font_param) if color_scheme is None: - self._colors = get_color_scheme("rainbow", alphabet) + self._colors = get_color_scheme("flower", alphabet) elif isinstance(color_scheme, str): self._colors = get_color_scheme(color_scheme, alphabet) else: @@ -346,7 +367,7 @@ def get_color(self, alignment, column_i, seq_i): class ArrayPlotter(LetterPlotter): - ''' + """ This :class:`SymbolPlotter` quantitatively decorates sequences alignments, with molecular recognition data obtained from e.g. microarrays. Symbols are visualized as characters on a colored background box. The color of a given box represents the recognition @@ -371,15 +392,14 @@ class ArrayPlotter(LetterPlotter): Additional parameters that is given to the :class:`matplotlib.Text` instance of each symbol. - ''' - def __init__(self, axes, fl_score, color_symbols=False, - font_size=None, font_param=None): + """ + def __init__( + self, axes, fl_score, color_symbols=False, font_size=None, font_param=None + ): super().__init__(axes, color_symbols, font_size, font_param) self.fl_score = fl_score - self._cmap = self._generate_colormap(colors["dimorange"], - self._color_symbols) - + self._cmap = self._generate_colormap(colors["dimorange"], self._color_symbols) def get_color(self, alignment, column_i, seq_i): index1 = alignment.trace[column_i, seq_i] @@ -389,7 +409,6 @@ def get_color(self, alignment, column_i, seq_i): spot_signal = self._get_signal(self.fl_score, column_i, seq_i) return self._cmap(spot_signal) - def _get_signal(self, fl_score, column_i, seq_i): if fl_score is None: signal = 0.0 @@ -400,7 +419,6 @@ def _get_signal(self, fl_score, column_i, seq_i): def get_cmap(self): return self._cmap - def plot_symbol(self, bbox, alignment, column_i, seq_i): from matplotlib.patches import Rectangle @@ -422,9 +440,15 @@ def plot_symbol(self, bbox, alignment, column_i, seq_i): box = Rectangle(bbox.p0, bbox.width, bbox.height) self.axes.add_patch(box) text = self.axes.text( - bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2, - symbol, color="black", ha="center", va="center", - size=self._font_size, **self._font_param) + bbox.x0 + bbox.width / 2, + bbox.y0 + bbox.height / 2, + symbol, + color="black", + ha="center", + va="center", + size=self._font_size, + **self._font_param, + ) text.set_clip_on(True) if self._color_symbols: @@ -455,11 +479,20 @@ def _generate_colormap(color, to_black): return ListedColormap(cmap_val) -def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, - show_numbers=False, number_size=None, number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, symbol_spacing=None): +def plot_alignment( + axes, + alignment, + symbol_plotter, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + symbol_spacing=None, +): """ Plot a pairwise or multiple sequence alignment. @@ -545,7 +578,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, ) for i, func in enumerate(number_functions): if func is None: - number_functions[i] = (lambda x: x + 1) + number_functions[i] = lambda x: x + 1 seq_num = alignment.trace.shape[1] seq_len = alignment.trace.shape[0] @@ -573,7 +606,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, for i in range(seq_len): y = y_start for j in range(seq_num): - bbox = Bbox([[x, y], [x+1, y+1]]) + bbox = Bbox([[x, y], [x + 1, y + 1]]) symbol_plotter.plot_symbol(bbox, alignment, i, j) y += 1 line_pos += 1 @@ -583,8 +616,7 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, y_start += seq_num + spacing else: x += 1 - if (symbol_spacing - and (i + 1) % symbol_spacing == 0): + if symbol_spacing and (i + 1) % symbol_spacing == 0: line_pos += 1 x += 1 @@ -613,14 +645,12 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, y = 0.5 for i in range(line_count): for j in range(seq_num): - if i == line_count-1: + if i == line_count - 1: # Last line -> get number of last column in trace trace_pos = len(alignment.trace) - 1 else: - trace_pos = (i+1) * symbols_per_line - 1 - seq_index = _get_last_valid_index( - alignment, trace_pos, j - ) + trace_pos = (i + 1) * symbols_per_line - 1 + seq_index = _get_last_valid_index(alignment, trace_pos, j) # if -1 -> terminal gap # -> skip number for this sequence in this line if seq_index != -1: @@ -636,18 +666,14 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, axes.set_xlim(0, symbols_to_print) # Y-axis starts from top - lim = seq_num*line_count + spacing*(line_count-1) + lim = seq_num * line_count + spacing * (line_count - 1) axes.set_ylim(lim, 0) number_axes.set_ylim(lim, 0) axes.set_frame_on(False) number_axes.set_frame_on(False) # Remove ticks and set label and number size - axes.yaxis.set_tick_params( - left=False, right=False, labelsize=label_size - ) - number_axes.yaxis.set_tick_params( - left=False, right=False, labelsize=number_size - ) + axes.yaxis.set_tick_params(left=False, right=False, labelsize=label_size) + number_axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size) if show_line_position: axes.xaxis.set_tick_params( @@ -659,15 +685,25 @@ def plot_alignment(axes, alignment, symbol_plotter, symbols_per_line=50, ) -def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, - color=None, cmap=None, matrix=None, - color_symbols=False, symbol_spacing=None, - symbol_size=None, symbol_param=None): +def plot_alignment_similarity_based( + axes, + alignment, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color=None, + cmap=None, + matrix=None, + color_symbols=False, + symbol_spacing=None, + symbol_size=None, + symbol_param=None, +): r""" Plot a pairwise or multiple sequence alignment highlighting the similarity per alignment column. @@ -788,31 +824,47 @@ def plot_alignment_similarity_based(axes, alignment, symbols_per_line=50, because *a* does also occur in *b*\ :sub:`i`. """ symbol_plotter = LetterSimilarityPlotter( - axes, matrix=matrix, font_size=symbol_size, font_param=symbol_param, - color_symbols=color_symbols + axes, + matrix=matrix, + font_size=symbol_size, + font_param=symbol_param, + color_symbols=color_symbols, ) if color is not None or cmap is not None: symbol_plotter.set_color(color=color, cmap=cmap) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) -def plot_alignment_type_based(axes, alignment, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, - labels=None, label_size=None, - show_line_position=False, - spacing=1, - color_scheme=None, color_symbols=False, - symbol_size=None, symbol_param=None, - symbol_spacing=None): +def plot_alignment_type_based( + axes, + alignment, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color_scheme=None, + color_symbols=False, + symbol_size=None, + symbol_param=None, + symbol_spacing=None, +): """ Plot a pairwise or multiple sequence alignment coloring each symbol based on the symbol type. @@ -873,7 +925,7 @@ def plot_alignment_type_based(axes, alignment, symbols_per_line=50, is equal to the size of a symbol box. color_scheme : str or list of (tuple or str), optional Either a valid color scheme name - (e.g. ``"rainbow"``, ``"clustalx"``, ``blossom``, etc.) + (e.g. ``"flower"``, ``"clustalx"``, ``blossom``, etc.) or a list of *Matplotlib* compatible colors. The list length must be at least as long as the length of the alphabet used by the sequences. @@ -897,27 +949,48 @@ def plot_alignment_type_based(axes, alignment, symbols_per_line=50, """ alphabet = alignment.sequences[0].get_alphabet() symbol_plotter = LetterTypePlotter( - axes, alphabet, font_size=symbol_size, font_param=symbol_param, - color_symbols=color_symbols, color_scheme=color_scheme + axes, + alphabet, + font_size=symbol_size, + font_param=symbol_param, + color_symbols=color_symbols, + color_scheme=color_scheme, ) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) -def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50, - show_numbers=False, number_size=None, - number_functions=None, labels=None, label_size=None, - show_line_position=False, spacing=1, color=None, - cmap=None, symbol_spacing=None, - symbol_size=None, symbol_param=None): - ''' +def plot_alignment_array( + axes, + alignment, + fl_score, + symbols_per_line=50, + show_numbers=False, + number_size=None, + number_functions=None, + labels=None, + label_size=None, + show_line_position=False, + spacing=1, + color=None, + cmap=None, + symbol_spacing=None, + symbol_size=None, + symbol_param=None, +): + """ Plot a pairwise sequence alignment using an :class:`ArrayPlotter` instance. @@ -995,19 +1068,27 @@ def plot_alignment_array(axes, alignment, fl_score, symbols_per_line=50, A '*' represents a sequence match on the alignment A '-' represents a sequence gap on the alignment - ''' + """ symbol_plotter = ArrayPlotter( - axes, fl_score = fl_score, font_size = symbol_size, font_param = symbol_param, + axes, + fl_score=fl_score, + font_size=symbol_size, + font_param=symbol_param, ) plot_alignment( - axes=axes, alignment=alignment, symbol_plotter=symbol_plotter, + axes=axes, + alignment=alignment, + symbol_plotter=symbol_plotter, symbols_per_line=symbols_per_line, - show_numbers=show_numbers, number_size=number_size, + show_numbers=show_numbers, + number_size=number_size, number_functions=number_functions, - labels=labels, label_size=label_size, + labels=labels, + label_size=label_size, show_line_position=show_line_position, - spacing=spacing, symbol_spacing=symbol_spacing + spacing=spacing, + symbol_spacing=symbol_spacing, ) diff --git a/src/biotite/sequence/graphics/colorschemes.py b/src/biotite/sequence/graphics/colorschemes.py index 049cddbb4..d38879c91 100644 --- a/src/biotite/sequence/graphics/colorschemes.py +++ b/src/biotite/sequence/graphics/colorschemes.py @@ -6,12 +6,11 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_color_scheme", "list_color_scheme_names", "load_color_scheme"] -import numpy as np -import json -from os.path import join, dirname, realpath import glob +import json import os -from ..alphabet import Alphabet +from os.path import dirname, join, realpath +from biotite.sequence.alphabet import Alphabet def load_color_scheme(file_name): @@ -26,13 +25,13 @@ def load_color_scheme(file_name): ---------- file_name : str The file name of the JSON file containing the scheme. - + Returns ------- scheme : dict A dictionary representing the color scheme, It contains the following keys, if the input file is proper: - + - **name** - Name of the scheme. - **alphabet** - :class:`Alphabet` instance describing the type of sequence the scheme can be used for. @@ -71,7 +70,7 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"): default : str or tuple, optional A *Matplotlib* compatible color that is used for symbols that have no defined color in the scheme. - + Returns ------- colors : list @@ -99,11 +98,10 @@ def get_color_scheme(name, alphabet, default="#FFFFFF"): if scheme["name"] == name and scheme["alphabet"].extends(alphabet): colors = scheme["colors"] # Replace None values with default color - colors = [color if color is not None else default - for color in colors] + colors = [color if color is not None else default for color in colors] # Only return colors that are in scope of this alphabet # and not the extended alphabet - return colors[:len(alphabet)] + return colors[: len(alphabet)] raise ValueError(f"Unkown scheme '{name}' for given alphabet") @@ -117,7 +115,7 @@ def list_color_scheme_names(alphabet): The alphbet to get the color scheme names for. The alphabet of the scheme must equal or extend this parameter, to be included in the list. - + Returns ------- schemes : list of str @@ -136,4 +134,4 @@ def list_color_scheme_names(alphabet): for file_name in glob.glob(_scheme_dir + os.sep + "*.json"): scheme = load_color_scheme(file_name) - _color_schemes.append(scheme) \ No newline at end of file + _color_schemes.append(scheme) diff --git a/src/biotite/sequence/graphics/dendrogram.py b/src/biotite/sequence/graphics/dendrogram.py index f351c891f..254702443 100644 --- a/src/biotite/sequence/graphics/dendrogram.py +++ b/src/biotite/sequence/graphics/dendrogram.py @@ -8,9 +8,18 @@ import numpy as np -def plot_dendrogram(axes, tree, orientation="left", use_distances=True, - labels=None, label_size=None, color="black", - show_distance=True, **kwargs): + +def plot_dendrogram( + axes, + tree, + orientation="left", + use_distances=True, + labels=None, + label_size=None, + color="black", + show_distance=True, + **kwargs, +): """ Plot a dendrogram from a (phylogenetic) tree. @@ -24,7 +33,7 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, If true, the `distance` attribute of the :class:`TreeNode` objects are used as distance measure. Otherwise the topological distance is used. - labels : list of str, optional + labels : list of str, optional The leaf node labels. The label of a leaf node is the entry at the position of its `index` attribute. @@ -40,9 +49,9 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, Additional parameters that are used to draw the dendrogram lines. """ - + indices = tree.root.get_indices() - leaf_dict = {indices[i] : i for i in indices} + leaf_dict = {indices[i]: i for i in indices} # Required for setting the plot limits max_distance = 0 @@ -50,12 +59,12 @@ def plot_dendrogram(axes, tree, orientation="left", use_distances=True, def _plot_node(node, distance): """ Draw the lines from the given node to its children. - + Parameters ---------- dist : float the distance of the node from root - + Returns ------- pos : float @@ -88,31 +97,43 @@ def _plot_node(node, distance): if orientation in ["left", "right"]: # Line connecting the childs axes.plot( - [distance, distance], [child_pos[0], child_pos[-1]], - color=color, marker="None", **kwargs + [distance, distance], + [child_pos[0], child_pos[-1]], + color=color, + marker="None", + **kwargs, ) # Lines depicting the distances of the childs for child_dist, pos in zip(child_distances, child_pos): axes.plot( - [distance, child_dist], [pos, pos], - color=color, marker="None", **kwargs + [distance, child_dist], + [pos, pos], + color=color, + marker="None", + **kwargs, ) elif orientation in ["bottom", "top"]: # Line connecting the childs axes.plot( - [child_pos[0], child_pos[-1]], [distance, distance], - color=color, marker="None", **kwargs + [child_pos[0], child_pos[-1]], + [distance, distance], + color=color, + marker="None", + **kwargs, ) # Lines depicting the distances of the childs for child_dist, pos in zip(child_distances, child_pos): axes.plot( - [pos, pos], [distance, child_dist], - color=color, marker="None", **kwargs + [pos, pos], + [distance, child_dist], + color=color, + marker="None", + **kwargs, ) else: raise ValueError(f"'{orientation}' is not a valid orientation") return center_pos - + _plot_node(tree.root, 0) if labels is not None: @@ -133,12 +154,18 @@ def _plot_node(node, distance): axes.set_yticks(np.arange(0, len(indices))) axes.set_yticklabels(labels) axes.yaxis.set_tick_params( - left=False, right=False, labelleft=False, labelright=True, - labelsize=label_size + left=False, + right=False, + labelleft=False, + labelright=True, + labelsize=label_size, ) axes.xaxis.set_tick_params( - bottom=True, top=False, labelbottom=show_distance, labeltop=False, - labelsize=label_size + bottom=True, + top=False, + labelbottom=show_distance, + labeltop=False, + labelsize=label_size, ) elif orientation == "right": axes.set_xlim(max_distance, zero_limit) @@ -146,12 +173,18 @@ def _plot_node(node, distance): axes.set_yticks(np.arange(0, len(indices))) axes.set_yticklabels(labels) axes.yaxis.set_tick_params( - left=False, right=False, labelleft=True, labelright=False, - labelsize=label_size + left=False, + right=False, + labelleft=True, + labelright=False, + labelsize=label_size, ) axes.xaxis.set_tick_params( - bottom=True, top=False, labelbottom=show_distance, labeltop=False, - labelsize=label_size + bottom=True, + top=False, + labelbottom=show_distance, + labeltop=False, + labelsize=label_size, ) elif orientation == "bottom": axes.set_ylim(zero_limit, max_distance) @@ -159,12 +192,18 @@ def _plot_node(node, distance): axes.set_xticks(np.arange(0, len(indices))) axes.set_xticklabels(labels) axes.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=False, labeltop=True, - labelsize=label_size + bottom=False, + top=False, + labelbottom=False, + labeltop=True, + labelsize=label_size, ) axes.yaxis.set_tick_params( - left=True, right=False, labelleft=show_distance, labelright=False, - labelsize=label_size + left=True, + right=False, + labelleft=show_distance, + labelright=False, + labelsize=label_size, ) elif orientation == "top": axes.set_ylim(max_distance, zero_limit) @@ -172,13 +211,19 @@ def _plot_node(node, distance): axes.set_xticks(np.arange(0, len(indices))) axes.set_xticklabels(labels) axes.xaxis.set_tick_params( - bottom=False, top=False, labelbottom=True, labeltop=False, - labelsize=label_size + bottom=False, + top=False, + labelbottom=True, + labeltop=False, + labelsize=label_size, ) axes.yaxis.set_tick_params( - left=True, right=False, labelleft=show_distance, labelright=False, - labelsize=label_size + left=True, + right=False, + labelleft=show_distance, + labelright=False, + labelsize=label_size, ) else: raise ValueError(f"'{orientation}' is not a valid orientation") - axes.set_frame_on(False) \ No newline at end of file + axes.set_frame_on(False) diff --git a/src/biotite/sequence/graphics/features.py b/src/biotite/sequence/graphics/features.py index e3c6711ee..6fe25fa41 100644 --- a/src/biotite/sequence/graphics/features.py +++ b/src/biotite/sequence/graphics/features.py @@ -4,22 +4,35 @@ __name__ = "biotite.sequence.graphics" __author__ = "Patrick Kunzmann" -__all__ = ["plot_feature_map", "FeaturePlotter", "MiscFeaturePlotter", - "CodingPlotter", "PromoterPlotter", "TerminatorPlotter", - "RBSPlotter"] +__all__ = [ + "plot_feature_map", + "FeaturePlotter", + "MiscFeaturePlotter", + "CodingPlotter", + "PromoterPlotter", + "TerminatorPlotter", + "RBSPlotter", +] -import copy import abc -import numpy as np -from ...visualize import colors, AdaptiveFancyArrow -from ..annotation import Annotation, Feature, Location - - -def plot_feature_map(axes, annotation, loc_range=None, - multi_line=True, symbols_per_line=1000, - show_numbers=False, number_size=None, line_width=0.05, - show_line_position=False, spacing=0.25, - feature_plotters=None, style_param=None): +from biotite.sequence.annotation import Location +from biotite.visualize import AdaptiveFancyArrow, colors + + +def plot_feature_map( + axes, + annotation, + loc_range=None, + multi_line=True, + symbols_per_line=1000, + show_numbers=False, + number_size=None, + line_width=0.05, + show_line_position=False, + spacing=0.25, + feature_plotters=None, + style_param=None, +): """ Plot a sequence annotation, by showing the range of each feature on one or multiple position depicting line(s). @@ -87,8 +100,8 @@ def plot_feature_map(axes, annotation, loc_range=None, features. When two features overlap, their drawing area does also overlap. """ - from matplotlib.transforms import Bbox from matplotlib.patches import Rectangle + from matplotlib.transforms import Bbox if loc_range is None: loc_range = annotation.get_location_range() @@ -98,13 +111,13 @@ def plot_feature_map(axes, annotation, loc_range=None, else: # Line length covers the entire location range symbols_per_line = loc_range_length - + plotters = [ PromoterPlotter(), TerminatorPlotter(), RBSPlotter(), CodingPlotter(), - MiscFeaturePlotter() + MiscFeaturePlotter(), ] if feature_plotters is not None: plotters = list(feature_plotters) + plotters @@ -116,7 +129,6 @@ def plot_feature_map(axes, annotation, loc_range=None, if loc_range_length % symbols_per_line != 0: line_count += 1 - ### Draw lines ### remaining_symbols = loc_range_length y = 0.5 @@ -127,14 +139,19 @@ def plot_feature_map(axes, annotation, loc_range=None, else: # Last line -> Line spans to end of annotation line_length = remaining_symbols - axes.add_patch(Rectangle( - (0, y-line_width/2), line_length, line_width, - color="gray", linewidth=0 - )) + axes.add_patch( + Rectangle( + (0, y - line_width / 2), + line_length, + line_width, + color="gray", + linewidth=0, + ) + ) # Increment by spacing and width (=1) of feature y += spacing + 1 remaining_symbols -= symbols_per_line - + ### Draw features ### line_start_loc = loc_range[0] y = 0 @@ -160,15 +177,12 @@ def plot_feature_map(axes, annotation, loc_range=None, width = loc_len height = 1 bbox = Bbox.from_bounds(x, y, width, height) - plotter.draw( - axes, feature, bbox, loc, - style_param=style_param - ) + plotter.draw(axes, feature, bbox, loc, style_param=style_param) # Increment by spacing and width (=1) of feature y += spacing + 1 remaining_symbols += symbols_per_line line_start_loc += symbols_per_line - + ### Draw position numbers ### ticks = [] tick_labels = [] @@ -176,11 +190,11 @@ def plot_feature_map(axes, annotation, loc_range=None, # Numbers at center height of each feature line -> 0.5 y = 0.5 for i in range(line_count): - if i == line_count-1: + if i == line_count - 1: # Last line -> get number of last column in trace - loc = loc_range[1] -1 + loc = loc_range[1] - 1 else: - loc = loc_range[0] + ((i+1) * symbols_per_line) -1 + loc = loc_range[0] + ((i + 1) * symbols_per_line) - 1 ticks.append(y) tick_labels.append(str(loc)) # Increment by spacing and width of feature (1) @@ -188,20 +202,17 @@ def plot_feature_map(axes, annotation, loc_range=None, axes.set_yticks(ticks) axes.set_yticklabels(tick_labels) - axes.set_xlim(0, symbols_per_line) # Y-axis starts from top - axes.set_ylim(1*line_count + spacing*(line_count-1), 0) + axes.set_ylim(1 * line_count + spacing * (line_count - 1), 0) axes.set_frame_on(False) # Draw location numbers on right side axes.get_yaxis().set_tick_params( left=False, right=False, labelleft=False, labelright=True ) # Remove ticks and set number font size - axes.yaxis.set_tick_params( - left=False, right=False, labelsize=number_size - ) - + axes.yaxis.set_tick_params(left=False, right=False, labelsize=number_size) + if show_line_position: axes.xaxis.set_tick_params( top=False, bottom=True, labeltop=False, labelbottom=True @@ -236,7 +247,7 @@ def matches(self, feature): ---------- feature : Feature The sequence feature to be checked. - + Returns ------- compatibility : bool @@ -244,7 +255,7 @@ def matches(self, feature): false otherwise. """ pass - + @abc.abstractmethod def draw(self, axes, feature, bbox, location, style_param): """ @@ -284,7 +295,7 @@ class CodingPlotter(FeaturePlotter): The width of the arrow head as fraction of the feature drawing area height. """ - + def __init__(self, tail_width=0.5, head_width=0.8): self._tail_width = tail_width self._head_width = head_width @@ -294,9 +305,9 @@ def matches(self, feature): return True else: return False - + def draw(self, axes, feature, bbox, loc, style_param): - y = bbox.y0 + bbox.height/2 + y = bbox.y0 + bbox.height / 2 dy = 0 if loc.strand == Location.Strand.FORWARD: x = bbox.x0 @@ -304,25 +315,35 @@ def draw(self, axes, feature, bbox, loc, style_param): else: x = bbox.x1 dx = -bbox.width - - if ( - loc.strand == Location.Strand.FORWARD - and loc.defect & Location.Defect.MISS_RIGHT - ) or ( - loc.strand == Location.Strand.REVERSE - and loc.defect & Location.Defect.MISS_LEFT - ): - # If the feature extends into the prevoius or next line - # do not draw an arrow head - draw_head = False + + if ( + loc.strand == Location.Strand.FORWARD + and loc.defect & Location.Defect.MISS_RIGHT + ) or ( + loc.strand == Location.Strand.REVERSE + and loc.defect & Location.Defect.MISS_LEFT + ): + # If the feature extends into the prevoius or next line + # do not draw an arrow head + draw_head = False else: - draw_head = True - + draw_head = True + # Create head with 90 degrees tip -> head width/length ratio = 1/2 - axes.add_patch(AdaptiveFancyArrow( - x, y, dx, dy, self._tail_width, self._head_width, head_ratio=0.5, - draw_head=draw_head, color=colors["dimgreen"], linewidth=0 - )) + axes.add_patch( + AdaptiveFancyArrow( + x, + y, + dx, + dy, + self._tail_width, + self._head_width, + head_ratio=0.5, + draw_head=draw_head, + color=colors["dimgreen"], + linewidth=0, + ) + ) if feature.key == "CDS": if "product" not in feature.qual: @@ -332,17 +353,23 @@ def draw(self, axes, feature, bbox, loc, style_param): else: label = feature.qual["product"] elif feature.key == "gene": - if "gene" not in feature.qual: + if "gene" not in feature.qual: label = None else: label = feature.qual["gene"] - + if label is not None: - center_x = bbox.x0 + bbox.width/2 - center_y = bbox.y0 + bbox.height/2 + center_x = bbox.x0 + bbox.width / 2 + center_y = bbox.y0 + bbox.height / 2 axes.text( - center_x, center_y, label, color="black", - ha="center", va="center", size=11) + center_x, + center_y, + label, + color="black", + ha="center", + va="center", + size=11, + ) class MiscFeaturePlotter(FeaturePlotter): @@ -363,17 +390,20 @@ def __init__(self, height=0.4): def matches(self, feature): return True - + def draw(self, axes, feature, bbox, loc, style_param): from matplotlib.patches import Rectangle rect = Rectangle( - (bbox.x0, bbox.y0 + bbox.height/2 * (1-self._height)), - bbox.width, bbox.height*self._height, - color=colors["dimorange"], linewidth=0 + (bbox.x0, bbox.y0 + bbox.height / 2 * (1 - self._height)), + bbox.width, + bbox.height * self._height, + color=colors["dimorange"], + linewidth=0, ) axes.add_patch(rect) + class PromoterPlotter(FeaturePlotter): """ A plotter for *regulatory* features with the *promoter* or @@ -394,8 +424,7 @@ class PromoterPlotter(FeaturePlotter): as fraction of the halffeature drawing area height. """ - def __init__(self, line_width=2, head_width=2, - head_length=6, head_height=0.8): + def __init__(self, line_width=2, head_width=2, head_length=6, head_height=0.8): self._line_width = line_width self._head_width = head_width self._head_length = head_length @@ -404,43 +433,42 @@ def __init__(self, line_width=2, head_width=2, def matches(self, feature): if feature.key == "regulatory": if "regulatory_class" in feature.qual: - if feature.qual["regulatory_class"] in ["promoter","TATA_box"]: + if feature.qual["regulatory_class"] in ["promoter", "TATA_box"]: return True return False - + def draw(self, axes, feature, bbox, loc, style_param): - from matplotlib.patches import FancyArrowPatch, ArrowStyle + from matplotlib.patches import ArrowStyle, FancyArrowPatch from matplotlib.path import Path - x_center = bbox.x0 + bbox.width/2 - y_center = bbox.y0 + bbox.height/2 + x_center = bbox.x0 + bbox.width / 2 + y_center = bbox.y0 + bbox.height / 2 path = Path( vertices=[ (bbox.x0, y_center), - (bbox.x0, y_center - bbox.height/2 * self._head_height), - (bbox.x1, y_center - bbox.height/2 * self._head_height), + (bbox.x0, y_center - bbox.height / 2 * self._head_height), + (bbox.x1, y_center - bbox.height / 2 * self._head_height), ], - codes=[ - Path.MOVETO, - Path.CURVE3, - Path.CURVE3 - ] + codes=[Path.MOVETO, Path.CURVE3, Path.CURVE3], ) style = ArrowStyle.CurveFilledB( head_width=self._head_width, head_length=self._head_length ) arrow = FancyArrowPatch( - path=path, arrowstyle=style, linewidth=self._line_width, - color="black" + path=path, arrowstyle=style, linewidth=self._line_width, color="black" ) axes.add_patch(arrow) - + if "note" in feature.qual: axes.text( - x_center, y_center + bbox.height/4, feature.qual["note"], - color="black", ha="center", va="center", - size=9 + x_center, + y_center + bbox.height / 4, + feature.qual["note"], + color="black", + ha="center", + va="center", + size=9, ) @@ -465,14 +493,17 @@ def matches(self, feature): if feature.qual["regulatory_class"] == "terminator": return True return False - - def draw(self, axes, feature, bbox, loc, style_param): - x = bbox.x0 + bbox.width/2 + def draw(self, axes, feature, bbox, loc, style_param): + x = bbox.x0 + bbox.width / 2 axes.plot( - (x, x), (bbox.y0, bbox.y1), color="black", - linestyle="-", linewidth=self._bar_width, marker="None" + (x, x), + (bbox.y0, bbox.y1), + color="black", + linestyle="-", + linewidth=self._bar_width, + marker="None", ) @@ -499,12 +530,15 @@ def matches(self, feature): if feature.qual["regulatory_class"] == "ribosome_binding_site": return True return False - + def draw(self, axes, feature, bbox, loc, style_param): from matplotlib.patches import Ellipse ellipse = Ellipse( - (bbox.x0 + bbox.width/2, bbox.y0 + bbox.height/2), - bbox.width, self._height*bbox.height, - color=colors["dimorange"], linewidth=0) - axes.add_patch(ellipse) \ No newline at end of file + (bbox.x0 + bbox.width / 2, bbox.y0 + bbox.height / 2), + bbox.width, + self._height * bbox.height, + color=colors["dimorange"], + linewidth=0, + ) + axes.add_patch(ellipse) diff --git a/src/biotite/sequence/graphics/logo.py b/src/biotite/sequence/graphics/logo.py index b4f18a7b8..3fc32a052 100644 --- a/src/biotite/sequence/graphics/logo.py +++ b/src/biotite/sequence/graphics/logo.py @@ -7,12 +7,9 @@ __all__ = ["plot_sequence_logo"] import numpy as np -from ...visualize import set_font_size_in_coord -from ..alphabet import LetterAlphabet -from .colorschemes import get_color_scheme -import warnings -from ..align import Alignment -from .. import SequenceProfile +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.graphics.colorschemes import get_color_scheme +from biotite.visualize import set_font_size_in_coord def plot_sequence_logo(axes, profile, scheme=None, **kwargs): @@ -36,41 +33,35 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs): The logo is created based on this profile. scheme : str or list of (tuple or str) Either a valid color scheme name - (e.g. ``"rainbow"``, ``"clustalx"``, ``blossom``, etc.) + (e.g. ``"flower"``, ``"clustalx"``, ``blossom``, etc.) or a list of *Matplotlib* compatible colors. The list length must be at least as long as the length of the alphabet used by the `profile`. **kwargs Additional `text parameters `_. - + References ---------- - + .. footbibliography:: """ - from matplotlib.text import Text - - if isinstance(profile, Alignment): - warnings.warn("Using an alignment for this method is deprecated; use a profile instead", DeprecationWarning) - profile = SequenceProfile.from_alignment(profile) - alphabet = profile.alphabet if not isinstance(alphabet, LetterAlphabet): raise TypeError("The sequences' alphabet must be a letter alphabet") if scheme is None: - colors = get_color_scheme("rainbow", alphabet) + colors = get_color_scheme("flower", alphabet) elif isinstance(scheme, str): colors = get_color_scheme(scheme, alphabet) else: colors = scheme - + # 'color' and 'size' property is not passed on to text kwargs.pop("color", None) - kwargs.pop("size", None) - + kwargs.pop("size", None) + frequencies, entropies, max_entropy = _get_entropy(profile) - stack_heights = (max_entropy - entropies) + stack_heights = max_entropy - entropies symbols_heights = stack_heights[:, np.newaxis] * frequencies index_order = np.argsort(symbols_heights, axis=1) for i in range(symbols_heights.shape[0]): @@ -79,21 +70,25 @@ def plot_sequence_logo(axes, profile, scheme=None, **kwargs): start_height = 0 for j in index_order[i]: # Stack the symbols at position on top of the preceeding one - height = symbols_heights[i,j] + height = symbols_heights[i, j] if height > 0: symbol = alphabet.decode(j) text = axes.text( - i+0.5, start_height, symbol, - ha="left", va="bottom", color=colors[j], + i + 0.5, + start_height, + symbol, + ha="left", + va="bottom", + color=colors[j], # Best results are obtained with this font size size=1, - **kwargs + **kwargs, ) text.set_clip_on(True) set_font_size_in_coord(text, width=1, height=height) start_height += height - axes.set_xlim(0.5, len(profile.symbols)+0.5) + axes.set_xlim(0.5, len(profile.symbols) + 0.5) axes.set_ylim(0, max_entropy) @@ -103,8 +98,7 @@ def _get_entropy(profile): # 0 * log2(0) = 0 -> Convert NaN to 0 no_zeros = freq != 0 pre_entropies = np.zeros(freq.shape) - pre_entropies[no_zeros] \ - = freq[no_zeros] * np.log2(freq[no_zeros]) + pre_entropies[no_zeros] = freq[no_zeros] * np.log2(freq[no_zeros]) entropies = -np.sum(pre_entropies, axis=1) max_entropy = np.log2(len(profile.alphabet)) - return freq, entropies, max_entropy \ No newline at end of file + return freq, entropies, max_entropy diff --git a/src/biotite/sequence/graphics/plasmid.py b/src/biotite/sequence/graphics/plasmid.py index 8527dc8d7..08972fce9 100644 --- a/src/biotite/sequence/graphics/plasmid.py +++ b/src/biotite/sequence/graphics/plasmid.py @@ -6,20 +6,29 @@ __author__ = "Patrick Kunzmann" __all__ = ["plot_plasmid_map"] -import copy +import re import warnings -import abc import numpy as np -import re -from ...visualize import colors -from ..annotation import Annotation, Feature, Location - - -def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, - tick_step=200, ring_width=0.01, feature_width=0.06, - spacing=0.01, arrow_head_length=0.04, label=None, - face_properties=None, label_properties=None, - omit_oversized_labels=True, feature_formatter=None): +from biotite.sequence.annotation import Feature, Location +from biotite.visualize import colors + + +def plot_plasmid_map( + axes, + annotation, + plasmid_size, + tick_length=0.02, + tick_step=200, + ring_width=0.01, + feature_width=0.06, + spacing=0.01, + arrow_head_length=0.04, + label=None, + face_properties=None, + label_properties=None, + omit_oversized_labels=True, + feature_formatter=None, +): """ Plot a plasmid map using the sequence features in the given :class:`Annotation`. @@ -84,26 +93,26 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, the following tuple: - *directional* : bool - + True, if the direction of the feature should be indicated by an arrow. Otherwise, the feature is plotted is arc. - + - *face_color* : tuple or str, optional - + A *Matplotlib* compatible color for the feature arrow/arc. - + - *label_color* : tuple or str, optional - + A *Matplotlib* compatible color for the feature label. - + - *label* : str or None - + The label to be displayed for this feature. None, if no label should be displayed. """ from matplotlib.projections.polar import PolarAxes - + if not isinstance(axes, PolarAxes): raise TypeError("The given axes must be a 'PolarAxes'") @@ -118,16 +127,13 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, if feature_formatter is None: feature_formatter = _default_feature_formatter - ### Setup matplotlib ### # The x-coordinate is given as angle (rad) # Full circle -> 2*pi - axes.set_xlim(0, 2*np.pi) + axes.set_xlim(0, 2 * np.pi) axes.set_ylim(0, 1) axes.yaxis.set_visible(False) - axes.xaxis.set_tick_params( - bottom=False, labelbottom=True - ) + axes.xaxis.set_tick_params(bottom=False, labelbottom=True) axes.set_theta_zero_location("N") axes.set_theta_direction("clockwise") axes.spines["polar"].set_visible(False) @@ -142,32 +148,39 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, axes.xaxis.set_ticks([_loc_to_rad(tick, plasmid_size) for tick in ticks]) axes.xaxis.set_ticklabels(tick_labels) ### Draw plasmid ring with ticks and central label ### - + # Plasmid ring # Use 'barh()' instead of a Rectangle patch to ensure that the axes # is properly initialized # Otherwise the feature rectangles are not curved, but straight axes.barh( - 1-ring_width-tick_length, 2*np.pi, ring_width, - align="edge", color="black" + 1 - ring_width - tick_length, 2 * np.pi, ring_width, align="edge", color="black" ) - + # Ticks (ticks itself, not the tick labels) for tick in ticks: angle = _loc_to_rad(tick, plasmid_size) axes.plot( - (angle, angle), (1-tick_length, 1), - color="black", linewidth=1, linestyle="-" + (angle, angle), + (1 - tick_length, 1), + color="black", + linewidth=1, + linestyle="-", ) - + # Central plasmid label if label is not None: axes.text( - 0, 0, label, ha="center", va="center", - color="black", size=32, fontweight="bold" + 0, + 0, + label, + ha="center", + va="center", + color="black", + size=32, + fontweight="bold", ) - ### Draw plasmid interior ### inner_radius = 1 - ring_width - tick_length features = sorted( @@ -177,28 +190,51 @@ def plot_plasmid_map(axes, annotation, plasmid_size, tick_length=0.02, ], # Features are sorted by the length of their location range # The shortest come first - key = lambda feature: np.diff(feature.get_location_range())[0], - reverse = True + key=lambda feature: np.diff(feature.get_location_range())[0], + reverse=True, + ) + axes.add_artist( + PlasmidMap( + axes, + 0, + features, + plasmid_size, + inner_radius, + feature_width, + spacing, + arrow_head_length, + label, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ) ) - axes.add_artist(PlasmidMap( - axes, 0, features, plasmid_size, inner_radius, feature_width, spacing, - arrow_head_length, label, face_properties, label_properties, - omit_oversized_labels, feature_formatter - )) try: # Only create these classes when matplotlib is installed from matplotlib.artist import Artist + from matplotlib.patches import Polygon, Rectangle from matplotlib.transforms import Bbox - from matplotlib.patches import Rectangle, Polygon - class PlasmidMap(Artist): - def __init__(self, axes, zorder, features, plasmid_size, radius, - feature_width, spacing, arrow_head_length, label, - face_properties, label_properties, omit_oversized_labels, - feature_formatter): + def __init__( + self, + axes, + zorder, + features, + plasmid_size, + radius, + feature_width, + spacing, + arrow_head_length, + label, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ): super().__init__() self._axes = axes self.zorder = zorder @@ -212,30 +248,36 @@ def __init__(self, axes, zorder, features, plasmid_size, radius, for feature in features: indicators_for_feature = [] for loc in feature.locs: - # Set proper positions in 'draw()' method + # Set proper positions in 'draw()' method bbox = Bbox.from_extents(0, 0, 0, 0) # Draw features as curved arrows (feature indicator) - indicator = axes.add_artist(Feature_Indicator( - axes, self.zorder + 1, feature, loc, bbox, - arrow_head_length, face_properties, label_properties, - omit_oversized_labels, feature_formatter - )) + indicator = axes.add_artist( + FeatureIndicator( + axes, + self.zorder + 1, + feature, + loc, + bbox, + arrow_head_length, + face_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ) + ) indicators_for_feature.append(indicator) self._all_indicators.append(indicators_for_feature) - def draw(self, renderer, *args, **kwargs): # Find the maximum amount of feature rows # (used for overlapping features) - row_count = int( - self._radius // (self._feature_width + self._spacing) - ) + row_count = int(self._radius // (self._feature_width + self._spacing)) # Tracks the location ranges of feature that were added to # a row in order to check if that row is occupied ranges_in_row = [[] for i in range(row_count)] # Stores the bottom coordinate (radius) for each row row_bottoms = [ - self._radius - (row+1) * (self._feature_width + self._spacing) + self._radius - (row + 1) * (self._feature_width + self._spacing) for row in range(row_count) ] @@ -258,11 +300,13 @@ def draw(self, renderer, *args, **kwargs): # 'Normal feature' if first <= curr_last and last >= curr_first: is_occupied = True - else: # first < 1 + else: # first < 1 # Location is over periodic boundary - if first + self._plasmid_size <= curr_last \ - or last >= curr_first: - is_occupied = True + if ( + first + self._plasmid_size <= curr_last + or last >= curr_first + ): + is_occupied = True if not is_occupied: # Row is not occupied by another feature # in the location range of the new feature @@ -273,12 +317,10 @@ def draw(self, renderer, *args, **kwargs): else: # Location is over periodic boundary # Split into 'end' and 'start' part - ranges_in_row[row_i].append(( - first + self._plasmid_size, self._plasmid_size - )) - ranges_in_row[row_i].append(( - 1, last - )) + ranges_in_row[row_i].append( + (first + self._plasmid_size, self._plasmid_size) + ) + ranges_in_row[row_i].append((1, last)) row_bottom = row_bottoms[row_i] break if row_bottom is None: @@ -288,24 +330,30 @@ def draw(self, renderer, *args, **kwargs): "radius or decrease the feature width or spacing" ) else: - for loc, indicator in zip( - feature.locs, indicators_for_feature - ): + for loc, indicator in zip(feature.locs, indicators_for_feature): # Calculate arrow shape parameters - row_center = row_bottom + self._feature_width/2 row_top = row_bottom + self._feature_width start_ang = _loc_to_rad(loc.first, self._plasmid_size) - stop_ang = _loc_to_rad(loc.last, self._plasmid_size) + stop_ang = _loc_to_rad(loc.last, self._plasmid_size) bbox = Bbox.from_extents( start_ang, row_bottom, stop_ang, row_top ) indicator.set_bbox(bbox) - - class Feature_Indicator(Artist): - def __init__(self, axes, zorder, feature, loc, bbox, head_length, - arrow_properties, label_properties, omit_oversized_labels, - feature_formatter): + class FeatureIndicator(Artist): + def __init__( + self, + axes, + zorder, + feature, + loc, + bbox, + head_length, + arrow_properties, + label_properties, + omit_oversized_labels, + feature_formatter, + ): super().__init__() self._axes = axes self.zorder = zorder @@ -313,44 +361,59 @@ def __init__(self, axes, zorder, feature, loc, bbox, head_length, self._bbox = bbox self._head_length = head_length self._omit_oversized_labels = omit_oversized_labels - + # Determine how to draw the feature - directional, face_color, label_color, label \ - = feature_formatter(feature) - + directional, face_color, label_color, label = feature_formatter(feature) + # Draw arrow as composition of a rectangle and a triangle, # as FancyArrow does not properly work for polar plots - self._arrow_tail = axes.add_patch(Rectangle( - # Set positions in 'draw()' method - (0, 0), 0, 0, - # Line width is set to 1 to avoid strange artifact in - # the transition from rectangle (tail) to polygon (head) - color=face_color, linewidth=1, zorder = self.zorder + 1, - **arrow_properties - )) - + self._arrow_tail = axes.add_patch( + Rectangle( + # Set positions in 'draw()' method + (0, 0), + 0, + 0, + # Line width is set to 1 to avoid strange artifact in + # the transition from rectangle (tail) to polygon (head) + color=face_color, + linewidth=1, + zorder=self.zorder + 1, + **arrow_properties, + ) + ) + if directional: # Only draw any arrow head when feature has a direction, # otherwise simply draw the tail (rectangle) - self._arrow_head = axes.add_patch(Polygon( - # Set positions in 'draw()' method - [(0, 0), (0, 0), (0, 0)], - color=face_color, linewidth=1, zorder = self.zorder + 1, - **arrow_properties - )) + self._arrow_head = axes.add_patch( + Polygon( + # Set positions in 'draw()' method + [(0, 0), (0, 0), (0, 0)], + color=face_color, + linewidth=1, + zorder=self.zorder + 1, + **arrow_properties, + ) + ) else: self._arrow_head = None if label is not None: label_properties["color"] = label_color - self._label = axes.add_artist(CurvedText( - # Set positions in 'draw()' method - axes, self.zorder + 1, 0, 0, label, label_properties - )) + self._label = axes.add_artist( + CurvedText( + # Set positions in 'draw()' method + axes, + self.zorder + 1, + 0, + 0, + label, + label_properties, + ) + ) else: self._label = None - def set_bbox(self, bbox): self._bbox = bbox @@ -359,17 +422,15 @@ def set_bbox(self, bbox): if self._label is not None: self._label.set_position(center_x, center_y) - def draw(self, renderer, *args, **kwargs): bbox = self._bbox - center_x = (bbox.x0 + bbox.x1) / 2 center_y = (bbox.y0 + bbox.y1) / 2 # Constant absolute width for all arrows # irrespective of the radius in the polar plot # Calculate actual angle from given absolute width head_length = self._head_length / center_y - + # Check if the head should be drawn if self._arrow_head is None: head_length = 0 @@ -382,39 +443,38 @@ def draw(self, renderer, *args, **kwargs): rect_pos = (bbox.x0, bbox.y0) # (x0, y0), (x1, y1), (x2, y2) triangle_coord = [ - (bbox.x1 - head_length, bbox.y0), # base 1 - (bbox.x1 - head_length, bbox.y1), # base 2 - (bbox.x1, center_y) # tip + (bbox.x1 - head_length, bbox.y0), # base 1 + (bbox.x1 - head_length, bbox.y1), # base 2 + (bbox.x1, center_y), # tip ] else: - rect_pos = (bbox.x0+head_length, bbox.y0) + rect_pos = (bbox.x0 + head_length, bbox.y0) triangle_coord = [ - (bbox.x0 + head_length, bbox.y0), # base 1 - (bbox.x0 + head_length, bbox.y1), # base 2 - (bbox.x0, center_y) # tip + (bbox.x0 + head_length, bbox.y0), # base 1 + (bbox.x0 + head_length, bbox.y1), # base 2 + (bbox.x0, center_y), # tip ] - + # Update coordinates of sub-artists self._arrow_tail.set_xy(rect_pos) - self._arrow_tail.set_width(bbox.width-head_length) + self._arrow_tail.set_width(bbox.width - head_length) self._arrow_tail.set_height(bbox.height) if self._arrow_head is not None: self._arrow_head.set_xy(triangle_coord) - + if self._label is not None: # Do not draw the labels if it is larger than the # indicator - if self._omit_oversized_labels \ - and self._label.get_total_angle(renderer) > bbox.width: - self._label.set_visible(False) + if ( + self._omit_oversized_labels + and self._label.get_total_angle(renderer) > bbox.width + ): + self._label.set_visible(False) else: self._label.set_visible(True) - - class CurvedText(Artist): - def __init__(self, axes, zorder, angle, radius, string, - text_properties): + def __init__(self, axes, zorder, angle, radius, string, text_properties): super().__init__() self._axes = axes self.zorder = zorder @@ -425,44 +485,35 @@ def __init__(self, axes, zorder, angle, radius, string, for word in _split_into_words(string): text = axes.text( # Set position in 'draw()' method - 0, 0, + 0, + 0, word, - ha="center", va="center", + ha="center", + va="center", zorder=self.zorder + 1, **text_properties, ) self._texts.append(text) - def set_visible(self, visible): super().set_visible(visible) for text in self._texts: text.set_visible(visible) - def set_position(self, angle, radius): self._angle = angle self._radius = radius - def get_total_angle(self, renderer): return np.sum(self.get_word_angles(renderer)) - def get_word_angles(self, renderer): ax_px_radius = self._axes.get_window_extent(renderer).width / 2 ax_unit_radius = self._axes.get_ylim()[1] - circle_px_circumference = ax_px_radius * 2*np.pi \ - * (self._radius / ax_unit_radius) + circle_px_circumference = ( + ax_px_radius * 2 * np.pi * (self._radius / ax_unit_radius) + ) - rad_angle = 360 - np.rad2deg(self._angle) - # Avoid to draw the text upside down, when drawn on the - # bottom half of the map - if rad_angle > 90 and rad_angle < 270: - turn_around = True - else: - turn_around = False - angles = [] for text in self._texts: orig_rot = text.get_rotation() @@ -477,14 +528,12 @@ def get_word_angles(self, renderer): # In this case, assign a fixed width if np.isnan(word_px_width): word_px_width = 5.0 - word_angle \ - = 2*np.pi * word_px_width / circle_px_circumference + word_angle = 2 * np.pi * word_px_width / circle_px_circumference angles.append(word_angle) # Restore text.set_rotation(orig_rot) text.set_visible(orig_visible) return angles - def draw(self, renderer, *args, **kwargs): angles = self.get_word_angles(renderer) @@ -497,7 +546,7 @@ def draw(self, renderer, *args, **kwargs): turn_around = True else: turn_around = False - + # Now that the angle for each word is known, # the appropriate position and rotation can be set if turn_around: @@ -526,20 +575,18 @@ def draw(self, renderer, *args, **kwargs): pass - - def _loc_to_rad(loc, plasmid_size): if loc > plasmid_size: raise ValueError( f"Location {loc} is larger then the plasmid size of {plasmid_size}" ) # Location starts at 1 -> (loc-1) - return ((loc-1) / plasmid_size) * 2*np.pi + return ((loc - 1) / plasmid_size) * 2 * np.pi def _rad_to_loc(rad, plasmid_size): # Location starts at 1 -> + 1 - return rad / (2*np.pi) * plasmid_size + 1 + return rad / (2 * np.pi) * plasmid_size + 1 def _merge_over_periodic_boundary(feature, plasmid_size): @@ -547,7 +594,7 @@ def _merge_over_periodic_boundary(feature, plasmid_size): # Only one location -> no merge possible return feature first_loc = None - last_loc = None + last_loc = None # Find total first location of the feature for loc in feature.locs: if first_loc is None or loc.first < first_loc.first: @@ -558,38 +605,43 @@ def _merge_over_periodic_boundary(feature, plasmid_size): last_loc = loc # If the first and last location meet at the periodic boundary of # the plasmid -> merge them - if first_loc.first == 1 and last_loc.last == plasmid_size \ - and first_loc.strand == last_loc.strand: - new_locs = set(feature.locs) - new_locs.remove(first_loc) - new_locs.remove(last_loc) - new_locs.add(Location( + if ( + first_loc.first == 1 + and last_loc.last == plasmid_size + and first_loc.strand == last_loc.strand + ): + new_locs = set(feature.locs) + new_locs.remove(first_loc) + new_locs.remove(last_loc) + new_locs.add( + Location( # the fist base is now at negative location # by shifting by one plasmid 'period' - first = last_loc.first - plasmid_size, - last = first_loc.last, - strand = first_loc.strand, - defect = first_loc.defect | last_loc.defect - )) - return Feature(feature.key, new_locs, feature.qual) + first=last_loc.first - plasmid_size, + last=first_loc.last, + strand=first_loc.strand, + defect=first_loc.defect | last_loc.defect, + ) + ) + return Feature(feature.key, new_locs, feature.qual) else: return feature # ' ', '-' and '_' are word delimiters separators = re.compile(r"\s|_|-") + + def _split_into_words(string): - match_indices = sorted( - [match.start() for match in separators.finditer(string)] - ) + match_indices = sorted([match.start() for match in separators.finditer(string)]) current_index = 0 words = [] for i in match_indices: # Add word up to delimiter - words.append(string[current_index : i]) + words.append(string[current_index:i]) # Add delimiter - words.append(string[i : i+1]) - current_index = i+1 + words.append(string[i : i + 1]) + current_index = i + 1 # If there is a word after the last delimiter, add it too if current_index < len(string): words.append(string[current_index:]) @@ -618,44 +670,43 @@ def _default_feature_formatter(f): else: label = None return False, "black", "white", label - + # Origin of Replication elif f.key == "rep_origin": - return False, "indigo", "white", \ - f.qual.get("standard_name", "ori") - + return False, "indigo", "white", f.qual.get("standard_name", "ori") + # Coding sequences elif f.key in ["gene", "CDS", "rRNA"]: label = f.qual.get("product") if label is None: label = f.qual.get("gene") return True, colors["orange"], "black", label - + elif f.key == "regulatory": # Promoters if f.qual.get("regulatory_class") in [ "promoter", "TATA_box", "minus_35_signal", - "minus_10_signal" + "minus_10_signal", ]: return True, colors["dimgreen"], "black", f.qual.get("note") - + # Terminators elif f.qual.get("regulatory_class") in "terminator": return False, "firebrick", "white", f.qual.get("note") - + # RBS elif f.qual.get("regulatory_class") == "ribosome_binding_site": return False, colors["brightorange"], "white", None - + # Primers elif f.key == "primer_bind": return True, "royalblue", "black", f.qual.get("note") - + # Binding proteins elif f.key == "protein_bind": return False, colors["lightgreen"], "black", f.qual.get("note") - + # Misc - return True, "dimgray", "white", f.qual.get("note") \ No newline at end of file + return True, "dimgray", "white", f.qual.get("note") diff --git a/src/biotite/sequence/io/fasta/__init__.py b/src/biotite/sequence/io/fasta/__init__.py index 5aa14febe..8fad54b21 100644 --- a/src/biotite/sequence/io/fasta/__init__.py +++ b/src/biotite/sequence/io/fasta/__init__.py @@ -18,5 +18,5 @@ __name__ = "biotite.sequence.io.fasta" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py index 0e8ca854a..0a73240dd 100644 --- a/src/biotite/sequence/io/fasta/convert.py +++ b/src/biotite/sequence/io/fasta/convert.py @@ -7,13 +7,18 @@ import warnings from collections import OrderedDict -from ...sequence import Sequence -from ...alphabet import AlphabetError, LetterAlphabet -from ...seqtypes import NucleotideSequence, ProteinSequence -from ...align.alignment import Alignment +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.alphabet import AlphabetError, LetterAlphabet +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence -__all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences", - "get_alignment", "set_alignment"] +__all__ = [ + "get_sequence", + "get_sequences", + "set_sequence", + "set_sequences", + "get_alignment", + "set_alignment", +] def get_sequence(fasta_file, header=None, seq_type=None): @@ -180,8 +185,10 @@ def get_alignment(fasta_file, additional_gap_chars=("_",), seq_type=None): for i, seq_str in enumerate(seq_strings): seq_strings[i] = seq_str.replace(char, "-") # Remove gaps for creation of sequences - sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type) - for seq_str in seq_strings] + sequences = [ + _convert_to_sequence(seq_str.replace("-", ""), seq_type) + for seq_str in seq_strings + ] trace = Alignment.trace_from_strings(seq_strings) return Alignment(sequences, trace, score=None) @@ -212,44 +219,29 @@ def set_alignment(fasta_file, alignment, seq_names): def _convert_to_sequence(seq_str, seq_type=None): - - # Define preprocessing of preimplemented sequence types - - # Replace selenocysteine with cysteine - # and pyrrolysine with lysine - process_protein_sequence = ( - lambda x : x.upper().replace("U", "C").replace("O", "K") - ) - # For nucleotides uracil is represented by thymine and there is only - # one letter for completely unknown nucleotides - process_nucleotide_sequence = ( - lambda x : x.upper().replace("U","T").replace("X","N") - ) - # Set manually selected sequence type - if seq_type is not None: # Do preprocessing as done without manual selection if seq_type == NucleotideSequence: - seq_str = process_nucleotide_sequence(seq_str) + seq_str = _process_nucleotide_sequence(seq_str) elif seq_type == ProteinSequence: if "U" in seq_str: warnings.warn( "ProteinSequence objects do not support selenocysteine " "(U), occurrences were substituted by cysteine (C)" ) - seq_str = process_protein_sequence(seq_str) + seq_str = _process_protein_sequence(seq_str) # Return the converted sequence return seq_type(seq_str) # Attempt to automatically determine sequence type try: - return NucleotideSequence(process_nucleotide_sequence(seq_str)) + return NucleotideSequence(_process_nucleotide_sequence(seq_str)) except AlphabetError: pass try: - prot_seq = ProteinSequence(process_protein_sequence(seq_str)) + prot_seq = ProteinSequence(_process_protein_sequence(seq_str)) # Raise Warning after conversion into 'ProteinSequence' # to wait for potential 'AlphabetError' if "U" in seq_str: @@ -259,15 +251,34 @@ def _convert_to_sequence(seq_str, seq_type=None): ) return prot_seq except AlphabetError: - raise ValueError("FASTA data cannot be converted either to " - "'NucleotideSequence' nor to 'ProteinSequence'") + raise ValueError( + "FASTA data cannot be converted either to " + "'NucleotideSequence' nor to 'ProteinSequence'" + ) + + +def _process_protein_sequence(x): + """ + Replace selenocysteine with cysteine and pyrrolysine with lysine. + """ + return x.upper().replace("U", "C").replace("O", "K") + + +def _process_nucleotide_sequence(x): + """ + For nucleotides uracil is represented by thymine and there is only + one letter for completely unknown nucleotides + """ + return x.upper().replace("U", "T").replace("X", "N") def _convert_to_string(sequence, as_rna): if not isinstance(sequence.get_alphabet(), LetterAlphabet): - raise ValueError("Only sequences using single letter alphabets " - "can be stored in a FASTA file") + raise ValueError( + "Only sequences using single letter alphabets " + "can be stored in a FASTA file" + ) if isinstance(sequence, NucleotideSequence) and as_rna: - return(str(sequence).replace("T", "U")) + return str(sequence).replace("T", "U") else: - return(str(sequence)) + return str(sequence) diff --git a/src/biotite/sequence/io/fasta/file.py b/src/biotite/sequence/io/fasta/file.py index 89eab5398..e0fe20ad7 100644 --- a/src/biotite/sequence/io/fasta/file.py +++ b/src/biotite/sequence/io/fasta/file.py @@ -6,21 +6,21 @@ __author__ = "Patrick Kunzmann" __all__ = ["FastaFile"] -from ....file import TextFile, InvalidFileError, wrap_string from collections import OrderedDict from collections.abc import MutableMapping +from biotite.file import InvalidFileError, TextFile, wrap_string class FastaFile(TextFile, MutableMapping): """ This class represents a file in FASTA format. - + A FASTA file contains so called *header* lines, beginning with ``>``, that describe following sequence. The corresponding sequence starts at the line after the header line and ends at the next header line or at the end of file. The header along with its sequence forms an entry. - + This class is used in a dictionary like manner, implementing the :class:`MutableMapping` interface: Headers (without the leading ``>``) are used as keys, @@ -35,10 +35,10 @@ class FastaFile(TextFile, MutableMapping): after which a line break is inserted. Only relevant, when adding sequences to a file. Default is 80. - + Examples -------- - + >>> import os.path >>> file = FastaFile() >>> file["seq1"] = "ATACT" @@ -61,17 +61,17 @@ class FastaFile(TextFile, MutableMapping): {'seq2': 'AAAATT'} >>> file.write(os.path.join(path_to_directory, "test.fasta")) """ - + def __init__(self, chars_per_line=80): super().__init__() self._chars_per_line = chars_per_line self._entries = OrderedDict() - + @classmethod def read(cls, file, chars_per_line=80): """ Read a FASTA file. - + Parameters ---------- file : file-like object or str @@ -82,7 +82,7 @@ def read(cls, file, chars_per_line=80): after which a line break is inserted. Only relevant, when adding sequences to a file. Default is 80. - + Returns ------- file_object : FastaFile @@ -90,24 +90,23 @@ def read(cls, file, chars_per_line=80): """ file = super().read(file, chars_per_line) # Filter out empty and comment lines - file.lines = [line for line in file.lines - if len(line.strip()) != 0 and line[0] != ";"] + file.lines = [ + line for line in file.lines if len(line.strip()) != 0 and line[0] != ";" + ] if len(file.lines) == 0: raise InvalidFileError("File is empty or contains only comments") file._find_entries() return file - + def __setitem__(self, header, seq_str): if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings as keys" - ) + raise IndexError("'FastaFile' only supports header strings as keys") if not isinstance(seq_str, str): - raise TypeError("'FastaFile' only supports sequence strings " - "as values") + raise TypeError("'FastaFile' only supports sequence strings " "as values") # Create lines for new header and sequence (with line breaks) - new_lines = [">" + header.replace("\n","").strip()] + \ - wrap_string(seq_str, width=self._chars_per_line) + new_lines = [">" + header.replace("\n", "").strip()] + wrap_string( + seq_str, width=self._chars_per_line + ) if header in self: # Delete lines of entry corresponding to the header, # if existing @@ -118,83 +117,75 @@ def __setitem__(self, header, seq_str): # Simply append lines # Add entry in a more efficient way than '_find_entries()' # for this simple case - self._entries[header] = ( - len(self.lines), - len(self.lines) + len(new_lines) - ) + self._entries[header] = (len(self.lines), len(self.lines) + len(new_lines)) self.lines += new_lines - + def __getitem__(self, header): if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings as keys" - ) + raise IndexError("'FastaFile' only supports header strings as keys") start, stop = self._entries[header] # Concatenate sequence string from following lines - seq_string = "".join( - [line.strip() for line in self.lines[start+1 : stop]] - ) + seq_string = "".join([line.strip() for line in self.lines[start + 1 : stop]]) return seq_string - + def __delitem__(self, header): start, stop = self._entries[header] del self.lines[start:stop] del self._entries[header] self._find_entries() - + def __len__(self): return len(self._entries) - + def __iter__(self): return self._entries.__iter__() - + def __contains__(self, identifer): return identifer in self._entries - + def _find_entries(self): if len(self.lines) > 0 and self.lines[0][0] != ">": raise InvalidFileError( f"File starts with '{self.lines[0][0]}' instead of '>'" ) - + header_i = [] for i, line in enumerate(self.lines): if line[0] == ">": header_i.append(i) - + self._entries = OrderedDict() for j in range(len(header_i)): # Remove leading '>' from header header = self.lines[header_i[j]].strip()[1:] start = header_i[j] - if j < len(header_i) -1: + if j < len(header_i) - 1: # Header in mid or start of file # -> stop is start of next header - stop = header_i[j+1] + stop = header_i[j + 1] else: # Last header -> entry stops at end of file stop = len(self.lines) self._entries[header] = (start, stop) - @staticmethod def read_iter(file): """ Create an iterator over each sequence of the given FASTA file. - + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Yields ------ header : str The header of the current sequence. seq_str : str The current sequence as string. - + Notes ----- This approach gives the same results as @@ -221,7 +212,6 @@ def read_iter(file): # Yield final entry if header is not None: yield header, "".join(seq_str_list) - @staticmethod def write_iter(file, items, chars_per_line=80): @@ -235,7 +225,7 @@ def write_iter(file, items, chars_per_line=80): Hence, this static method may save a large amount of memory if a large file should be written, especially if the `items` are provided as generator. - + Parameters ---------- file : file-like object or str @@ -256,23 +246,20 @@ def write_iter(file, items, chars_per_line=80): This method does not test, whether the given identifiers are unambiguous. """ + def line_generator(): for item in items: header, seq_str = item if not isinstance(header, str): - raise IndexError( - "'FastaFile' only supports header strings" - ) + raise IndexError("'FastaFile' only supports header strings") if not isinstance(seq_str, str): - raise TypeError( - "'FastaFile' only supports sequence strings" - ) - + raise TypeError("'FastaFile' only supports sequence strings") + # Yield header line - yield ">" + header.replace("\n","").strip() + yield ">" + header.replace("\n", "").strip() # Yield sequence line(s) for line in wrap_string(seq_str, width=chars_per_line): yield line - - TextFile.write_iter(file, line_generator()) \ No newline at end of file + + TextFile.write_iter(file, line_generator()) diff --git a/src/biotite/sequence/io/fastq/__init__.py b/src/biotite/sequence/io/fastq/__init__.py index d763198b1..cff2e7097 100644 --- a/src/biotite/sequence/io/fastq/__init__.py +++ b/src/biotite/sequence/io/fastq/__init__.py @@ -15,5 +15,5 @@ __name__ = "biotite.sequence.io.fastq" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/fastq/convert.py b/src/biotite/sequence/io/fastq/convert.py index 868536c6e..5b743fcd7 100644 --- a/src/biotite/sequence/io/fastq/convert.py +++ b/src/biotite/sequence/io/fastq/convert.py @@ -6,10 +6,7 @@ __author__ = "Patrick Kunzmann" from collections import OrderedDict -from ...sequence import Sequence -from ...alphabet import AlphabetError, LetterAlphabet -from ...seqtypes import NucleotideSequence -from ...align.alignment import Alignment +from biotite.sequence.seqtypes import NucleotideSequence __all__ = ["get_sequence", "get_sequences", "set_sequence", "set_sequences"] @@ -17,7 +14,7 @@ def get_sequence(fastq_file, header=None): """ Get a sequence and quality scores from a `FastqFile` instance. - + Parameters ---------- fastq_file : FastqFile @@ -25,7 +22,7 @@ def get_sequence(fastq_file, header=None): header : str, optional The identifier to get the sequence and scores from. By default, the first sequence of the file is returned. - + Returns ------- sequence : NucleotideSequence @@ -43,7 +40,7 @@ def get_sequence(fastq_file, header=None): break if seq_str is None: raise ValueError("File does not contain any sequences") - processed_seq_str = seq_str.replace("U","T").replace("X","N") + processed_seq_str = seq_str.replace("U", "T").replace("X", "N") return NucleotideSequence(processed_seq_str), scores @@ -51,12 +48,12 @@ def get_sequences(fastq_file): """ Get a dictionary from a `FastqFile` instance, where identifiers are keys and sequence-score-tuples are values. - + Parameters ---------- fastq_file : FastqFile The `Fastqile` to be accessed. - + Returns ------- seq_dict : dict @@ -65,7 +62,7 @@ def get_sequences(fastq_file): """ seq_dict = OrderedDict() for header, (seq_str, scores) in fastq_file.items(): - processed_seq_str = seq_str.replace("U","T").replace("X","N") + processed_seq_str = seq_str.replace("U", "T").replace("X", "N") seq_dict[header] = NucleotideSequence(processed_seq_str), scores return seq_dict @@ -73,7 +70,7 @@ def get_sequences(fastq_file): def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False): """ Set a sequence and a quality score array in a `FastqFile` instance. - + Parameters ---------- fastq_file : FastqFile @@ -96,7 +93,7 @@ def set_sequence(fastq_file, sequence, scores, header=None, as_rna=False): def set_sequences(fastq_file, sequence_dict, as_rna=False): """ Set sequences in a `FastqFile` instance from a dictionary. - + Parameters ---------- fastq_file : FastqFile @@ -115,6 +112,6 @@ def set_sequences(fastq_file, sequence_dict, as_rna=False): def _convert_to_string(sequence, as_rna): if as_rna: - return(str(sequence).replace("T", "U")) + return str(sequence).replace("T", "U") else: - return(str(sequence)) \ No newline at end of file + return str(sequence) diff --git a/src/biotite/sequence/io/fastq/file.py b/src/biotite/sequence/io/fastq/file.py index 5b00674cf..c6c85c6cb 100644 --- a/src/biotite/sequence/io/fastq/file.py +++ b/src/biotite/sequence/io/fastq/file.py @@ -5,23 +5,21 @@ __name__ = "biotite.sequence.io.fastq" __author__ = "Patrick Kunzmann" -import warnings -from numbers import Integral from collections import OrderedDict from collections.abc import MutableMapping +from numbers import Integral import numpy as np -from ....file import TextFile, InvalidFileError, wrap_string -from ...seqtypes import NucleotideSequence +from biotite.file import InvalidFileError, TextFile, wrap_string __all__ = ["FastqFile"] _OFFSETS = { - "Sanger" : 33, - "Solexa" : 64, - "Illumina-1.3" : 64, - "Illumina-1.5" : 64, - "Illumina-1.8" : 33, + "Sanger": 33, + "Solexa": 64, + "Illumina-1.3": 64, + "Illumina-1.5": 64, + "Illumina-1.8": 33, } @@ -47,7 +45,7 @@ class FastqFile(TextFile, MutableMapping): An identifier string (without the leading ``@``) is used as index to get and set the corresponding sequence and quality. ``del`` removes an entry in the file. - + Parameters ---------- offset : int or {'Sanger', 'Solexa', 'Illumina-1.3', 'Illumina-1.5', 'Illumina-1.8'} @@ -61,10 +59,10 @@ class FastqFile(TextFile, MutableMapping): Only relevant, when adding sequences to a file. By default each sequence (and score string) is put into one line. - + Examples -------- - + >>> import os.path >>> file = FastqFile(offset="Sanger") >>> file["seq1"] = str(NucleotideSequence("ATACT")), [0,3,10,7,12] @@ -91,18 +89,18 @@ class FastqFile(TextFile, MutableMapping): 0.96=GD >>> file.write(os.path.join(path_to_directory, "test.fastq")) """ - + def __init__(self, offset, chars_per_line=None): super().__init__() self._chars_per_line = chars_per_line self._entries = OrderedDict() self._offset = _convert_offset(offset) - + @classmethod def read(cls, file, offset, chars_per_line=None): """ Read a FASTQ file. - + Parameters ---------- file : file-like object or str @@ -119,7 +117,7 @@ def read(cls, file, offset, chars_per_line=None): Only relevant, when adding sequences to a file. By default each sequence (and score string) is put into one line. - + Returns ------- file_object : FastqFile @@ -134,31 +132,7 @@ def read(cls, file, offset, chars_per_line=None): raise InvalidFileError("File is empty") file._find_entries() return file - - def get_sequence(self, identifier): - """ - Get the sequence for the specified identifier. - - DEPRECATED: Use :meth:`get_seq_string()` or - :func:`get_sequence()` instead. - Parameters - ---------- - identifier : str - The identifier of the sequence. - - Returns - ------- - sequence : NucleotideSequence - The sequence corresponding to the identifier. - """ - warnings.warn( - "'get_sequence()' is deprecated, use the 'get_seq_string()'" - "method or 'fasta.get_sequence()' function instead", - DeprecationWarning - ) - return NucleotideSequence(self.get_seq_string(identifier)) - def get_seq_string(self, identifier): """ Get the string representing the sequence for the specified @@ -168,22 +142,19 @@ def get_seq_string(self, identifier): ---------- identifier : str The identifier of the sequence. - + Returns ------- sequence : str The sequence corresponding to the identifier. """ if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports identifier strings as keys" - ) - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] + raise IndexError("'FastqFile' only supports identifier strings as keys") + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] # Concatenate sequence string from the sequence lines - seq_str = "".join(self.lines[seq_start : seq_stop]) + seq_str = "".join(self.lines[seq_start:seq_stop]) return seq_str - + def get_quality(self, identifier): """ Get the quality scores for the specified identifier. @@ -192,24 +163,20 @@ def get_quality(self, identifier): ---------- identifier : str The identifier of the quality scores. - + Returns ------- scores : ndarray, dtype=int The quality scores corresponding to the identifier. """ if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports identifier strings as keys" - ) - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] + raise IndexError("'FastqFile' only supports identifier strings as keys") + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] # Concatenate sequence string from the score lines return _score_str_to_scores( - "".join(self.lines[score_start : score_stop]), - self._offset + "".join(self.lines[score_start:score_stop]), self._offset ) - + def __setitem__(self, identifier, item): sequence, scores = item if len(sequence) != len(scores): @@ -218,24 +185,22 @@ def __setitem__(self, identifier, item): f"but score length is {len(scores)}" ) if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports strings as identifier" - ) + raise IndexError("'FastqFile' only supports strings as identifier") # Delete lines of entry corresponding to the identifier, # if already existing if identifier in self: del self[identifier] - + # Create new lines # Start with identifier line - new_lines = ["@" + identifier.replace("\n","").strip()] + new_lines = ["@" + identifier.replace("\n", "").strip()] # Append new lines with sequence string (with line breaks) seq_start_i = len(new_lines) if self._chars_per_line is None: new_lines.append(str(sequence)) else: new_lines += wrap_string(sequence, width=self._chars_per_line) - seq_stop_i =len(new_lines) + seq_stop_i = len(new_lines) # Append sequence-score separator new_lines += ["+"] # Append scores @@ -261,29 +226,28 @@ def __setitem__(self, identifier, item): len(self.lines) + seq_start_i, len(self.lines) + seq_stop_i, len(self.lines) + score_start_i, - len(self.lines) + score_stop_i + len(self.lines) + score_stop_i, ) self.lines += new_lines - + def __getitem__(self, identifier): return self.get_seq_string(identifier), self.get_quality(identifier) - + def __delitem__(self, identifier): - seq_start, seq_stop, score_start, score_stop \ - = self._entries[identifier] - del self.lines[seq_start-1 : score_stop] + seq_start, seq_stop, score_start, score_stop = self._entries[identifier] + del self.lines[seq_start - 1 : score_stop] del self._entries[identifier] self._find_entries() - + def __len__(self): return len(self._entries) - + def __iter__(self): return self._entries.__iter__() - + def __contains__(self, identifer): return identifer in self._entries - + def _find_entries(self): self._entries = OrderedDict() in_sequence = False @@ -302,7 +266,7 @@ def _find_entries(self): if not in_scores and not in_sequence and line[0] == "@": # Identifier line identifier = line[1:] - seq_start_i = i+1 + seq_start_i = i + 1 # Next line is sequence in_sequence = True # Reset @@ -314,7 +278,7 @@ def _find_entries(self): in_sequence = False in_scores = True seq_stop_i = i - score_start_i = i+1 + score_start_i = i + 1 else: # Still in sequence seq_len += len(line) @@ -330,9 +294,12 @@ def _find_entries(self): in_scores = False # Record this entry self._entries[identifier] = ( - seq_start_i, seq_stop_i, score_start_i, score_stop_i + seq_start_i, + seq_stop_i, + score_start_i, + score_stop_i, ) - else: # score_len > seq_len + else: # score_len > seq_len raise InvalidFileError( f"The amount of scores is not equal to the sequence " f"length for the sequence in line {seq_start_i+1} " @@ -343,14 +310,13 @@ def _find_entries(self): # must have properly ended if in_sequence or in_scores: raise InvalidFileError("The last entry in the file is incomplete") - @staticmethod def read_iter(file, offset): """ Create an iterator over each sequence (and corresponding scores) of the given FASTQ file. - + Parameters ---------- file : file-like object or str @@ -361,7 +327,7 @@ def read_iter(file, offset): ASCII code. Can either be directly the value, or a string that indicates the score format. - + Yields ------ identifier : str @@ -369,7 +335,7 @@ def read_iter(file, offset): sequence : tuple(str, ndarray) The current sequence as string and its corresponding quality scores as :class:`ndarray`. - + Notes ----- This approach gives the same results as @@ -377,7 +343,7 @@ def read_iter(file, offset): and much more memory efficient. """ offset = _convert_offset(offset) - + identifier = None seq_str_list = [] score_str_list = [] @@ -391,7 +357,7 @@ def read_iter(file, offset): # Ignore empty lines if len(line) == 0: continue - + if not in_scores and not in_sequence and line[0] == "@": # Track new entry identifier = line[1:] @@ -401,7 +367,7 @@ def read_iter(file, offset): score_len = 0 seq_str_list = [] score_str_list = [] - + elif in_sequence: if line[0] == "+": # End of sequence start of scores @@ -411,7 +377,7 @@ def read_iter(file, offset): # Still in sequence seq_len += len(line) seq_str_list.append(line) - + elif in_scores: score_len += len(line) score_str_list.append(line) @@ -422,20 +388,15 @@ def read_iter(file, offset): # -> End of entry in_scores = False # yield this entry - scores = _score_str_to_scores( - "".join(score_str_list), - offset - ) + scores = _score_str_to_scores("".join(score_str_list), offset) yield identifier, ("".join(seq_str_list), scores) - else: # score_len > seq_len + else: # score_len > seq_len raise InvalidFileError( - f"The amount of scores is not equal to the sequence " - f"length" + "The amount of scores is not equal to the sequence " "length" ) - + else: - raise InvalidFileError(f"FASTQ file is invalid") - + raise InvalidFileError("FASTQ file is invalid") @staticmethod def write_iter(file, items, offset, chars_per_line=None): @@ -449,7 +410,7 @@ def write_iter(file, items, offset, chars_per_line=None): Hence, this static method may save a large amount of memory if a large file should be written, especially if the `items` are provided as generator. - + Parameters ---------- file : file-like object or str @@ -487,12 +448,10 @@ def line_generator(): f"but score length is {len(scores)}" ) if not isinstance(identifier, str): - raise IndexError( - "'FastqFile' only supports strings as identifier" - ) - + raise IndexError("'FastqFile' only supports strings as identifier") + # Yield identifier line - yield "@" + identifier.replace("\n","").strip() + yield "@" + identifier.replace("\n", "").strip() # Yield sequence line(s) if chars_per_line is None: @@ -500,10 +459,10 @@ def line_generator(): else: for line in wrap_string(sequence, width=chars_per_line): yield line - + # Yield separator yield "+" - + # Yield scores score_chars = _scores_to_score_str(scores, offset) if chars_per_line is None: @@ -511,7 +470,7 @@ def line_generator(): else: for line in wrap_string(score_chars, width=chars_per_line): yield line - + TextFile.write_iter(file, line_generator()) @@ -519,15 +478,11 @@ def _score_str_to_scores(score_str, offset): """ Convert an ASCII string into actual score values. """ - scores = np.frombuffer( - bytearray( - score_str, encoding="ascii" - ), - dtype=np.int8 - ) + scores = np.frombuffer(bytearray(score_str, encoding="ascii"), dtype=np.int8) scores -= offset return scores + def _scores_to_score_str(scores, offset): """ Convert score values into an ASCII string. @@ -535,6 +490,7 @@ def _scores_to_score_str(scores, offset): scores = np.asarray(scores) + offset return scores.astype(np.int8, copy=False).tobytes().decode("ascii") + def _convert_offset(offset_val_or_string): """ If the given offset is a string return the corresponding numerical @@ -543,9 +499,9 @@ def _convert_offset(offset_val_or_string): if isinstance(offset_val_or_string, Integral): return offset_val_or_string elif isinstance(offset_val_or_string, str): - return _OFFSETS[offset_val_or_string] + return _OFFSETS[offset_val_or_string] else: raise TypeError( f"The offset must be either an integer or a string " f"indicating the format, not {type(offset_val_or_string).__name__}" - ) \ No newline at end of file + ) diff --git a/src/biotite/sequence/io/genbank/__init__.py b/src/biotite/sequence/io/genbank/__init__.py index bccb3feab..11f745f10 100644 --- a/src/biotite/sequence/io/genbank/__init__.py +++ b/src/biotite/sequence/io/genbank/__init__.py @@ -11,7 +11,7 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann" -from .file import * from .annotation import * +from .file import * +from .metadata import * from .sequence import * -from .metadata import * \ No newline at end of file diff --git a/src/biotite/sequence/io/genbank/annotation.py b/src/biotite/sequence/io/genbank/annotation.py index fcd5e072b..223a67ddb 100644 --- a/src/biotite/sequence/io/genbank/annotation.py +++ b/src/biotite/sequence/io/genbank/annotation.py @@ -12,10 +12,8 @@ import re import warnings -from ....file import InvalidFileError -from ...annotation import Annotation, Feature, Location -from .file import GenBankFile - +from biotite.file import InvalidFileError +from biotite.sequence.annotation import Annotation, Feature, Location _KEY_START = 5 _QUAL_START = 21 @@ -46,7 +44,6 @@ def get_annotation(gb_file, include_only=None): raise InvalidFileError("File has multiple 'FEATURES' fields") lines, _ = fields[0] - ### Parse all lines to create an index of features, # i.e. pairs of the feature key # and the text belonging to the respective feature @@ -60,13 +57,12 @@ def get_annotation(gb_file, include_only=None): # Store old feature key and value feature_list.append((feature_key, feature_value)) # Track new key - feature_key = line[_KEY_START : _QUAL_START-1].strip() + feature_key = line[_KEY_START : _QUAL_START - 1].strip() feature_value = "" feature_value += line[_QUAL_START:] + " " # Store last feature key and value (loop already exited) feature_list.append((feature_key, feature_value)) - ### Process only relevant features and put them into an Annotation annotation = Annotation() # Regex to separate qualifiers from each other @@ -92,7 +88,7 @@ def get_annotation(gb_file, include_only=None): loc_string = qualifier_parts.pop(0).strip() try: locs = _parse_locs(loc_string) - except: + except Exception: warnings.warn( f"'{loc_string}' is an unsupported location identifier, " f"skipping feature" @@ -114,7 +110,7 @@ def get_annotation(gb_file, include_only=None): # -> split at whitespaces, # as keys do not contain whitespaces for subpart in part.split(): - if not "=" in subpart: + if "=" not in subpart: # Qualifier without value, e.g. '/pseudo' # -> store immediately # Remove "/" -> subpart[1:] @@ -147,11 +143,11 @@ def get_annotation(gb_file, include_only=None): def _parse_locs(loc_str): locs = [] if loc_str.startswith(("join", "order")): - str_list = loc_str[loc_str.index("(")+1:loc_str.rindex(")")].split(",") + str_list = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")].split(",") for s in str_list: locs.extend(_parse_locs(s.strip())) elif loc_str.startswith("complement"): - compl_str = loc_str[loc_str.index("(")+1:loc_str.rindex(")")] + compl_str = loc_str[loc_str.index("(") + 1 : loc_str.rindex(")")] compl_locs = [ Location(loc.first, loc.last, Location.Strand.REVERSE, loc.defect) for loc in _parse_locs(compl_str) @@ -214,8 +210,6 @@ def _set_qual(qual_dict, key, val): qual_dict[key] = val - - def set_annotation(gb_file, annotation): """ Set the *FEATURES* field of a GenBank file with an annotation. @@ -236,12 +230,12 @@ def set_annotation(gb_file, annotation): for key, values in feature.qual.items(): if values is None: line = " " * _QUAL_START - line += f'/{key}' + line += f"/{key}" lines.append(line) else: for val in values.split("\n"): line = " " * _QUAL_START - line += f'/{key}="{val}"' + line += f'/{key}="{val}"' lines.append(line) gb_file.set_field("FEATURES", lines) @@ -254,11 +248,11 @@ def _convert_to_loc_string(locs): if len(locs) == 1: loc = list(locs)[0] loc_first_str = str(loc.first) - loc_last_str = str(loc.last) + loc_last_str = str(loc.last) if loc.defect & Location.Defect.BEYOND_LEFT: loc_first_str = "<" + loc_first_str if loc.defect & Location.Defect.BEYOND_RIGHT: - loc_last_str = ">" + loc_last_str + loc_last_str = ">" + loc_last_str if loc.first == loc.last: loc_string = loc_first_str elif loc.defect & Location.Defect.UNK_LOC: @@ -270,8 +264,6 @@ def _convert_to_loc_string(locs): if loc.strand == Location.Strand.REVERSE: loc_string = f"complement({loc_string})" else: - loc_string = ",".join( - [_convert_to_loc_string([loc]) for loc in locs] - ) + loc_string = ",".join([_convert_to_loc_string([loc]) for loc in locs]) loc_string = f"join({loc_string})" return loc_string diff --git a/src/biotite/sequence/io/genbank/file.py b/src/biotite/sequence/io/genbank/file.py index 72a225647..0fdd99c63 100644 --- a/src/biotite/sequence/io/genbank/file.py +++ b/src/biotite/sequence/io/genbank/file.py @@ -6,14 +6,16 @@ __author__ = "Patrick Kunzmann" __all__ = ["GenBankFile", "MultiFile"] -#import textwrap +# import textwrap import copy -#import re + +# import re import io -from ....file import TextFile, InvalidFileError from collections import OrderedDict -#from ...annotation import Location, Feature, Annotation, AnnotatedSequence -#from ...seqtypes import NucleotideSequence, ProteinSequence +from biotite.file import InvalidFileError, TextFile + +# from ...annotation import Location, Feature, Annotation, AnnotatedSequence +# from ...seqtypes import NucleotideSequence, ProteinSequence class GenBankFile(TextFile): @@ -33,7 +35,7 @@ class GenBankFile(TextFile): Some fields may occur multiple times, e.g. the *REFERENCE* field. A sample GenBank file can be viewed at ``_. - + This class provides a low-level interface for parsing, editing and writing GenBank files. It works like a list of field entries, where a field consists of the @@ -47,7 +49,7 @@ class GenBankFile(TextFile): The subfields are represented by a dictionary, with subfield names being keys and the corresponding lines being values. The *FEATURES* and *ORIGIN* fields have no subfields. - + Every entry can be obtained, set and deleted via the index operator. Notes @@ -55,7 +57,7 @@ class GenBankFile(TextFile): This class does not support location identifiers with references to other Entrez database entries, e.g. ``join(1..100,J00194.1:100..202)``. - + Examples -------- Create a GenBank file from scratch: @@ -79,9 +81,9 @@ class GenBankFile(TextFile): ['One line', 'A second line'] >>> print(subfields) OrderedDict([('SUBFIELD1', ['Single Line']), ('SUBFIELD2', ['Two', 'lines'])]) - + Adding an additional field: - + >>> file.insert(0, "OTHERFIELD", ["Another line"]) >>> print(len(file)) 2 @@ -174,18 +176,18 @@ def __init__(self): # and names of categories self._field_pos = [] self._find_field_indices() - + @classmethod def read(cls, file): """ Read a GenBank file. - + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Returns ------- file_object : GenBankFile @@ -194,16 +196,16 @@ def read(cls, file): file = super().read(file) file._find_field_indices() return file - + def get_fields(self, name): """ Get all *GenBank* fields associated with a given field name. - + Parameters ---------- name : str The field name. - + Returns ------- fields : list of (list of str, OrderedDict of str -> str) @@ -218,17 +220,17 @@ def get_fields(self, name): indices = self.get_indices(name) # Omit the field name return [self[i][1:] for i in indices] - + def get_indices(self, name): """ Get the indices to all *GenBank* fields associated with a given field name. - + Parameters ---------- name : str The field name. - + Returns ------- fields : list of int @@ -242,7 +244,7 @@ def get_indices(self, name): if fname == name: indices.append(i) return indices - + def set_field(self, name, content, subfield_dict=None): """ Set a *GenBank* field with the given content. @@ -250,7 +252,7 @@ def set_field(self, name, content, subfield_dict=None): If the field already exists in the file, the field is overwritten, otherwise a new field is created at the end of the file. - + Parameters ---------- name : str @@ -261,7 +263,7 @@ def set_field(self, name, content, subfield_dict=None): The subfields of the field. The dictionary maps subfield names to the content lines of the respective subfield. - + Raises ------ InvalidFileError @@ -283,13 +285,13 @@ def set_field(self, name, content, subfield_dict=None): def __getitem__(self, index): index = self._translate_idx(index) start, stop, name = self._field_pos[index] - + if name in ["FEATURES", "ORIGIN"]: # For those two fields return the complete lines, # beginning with the line after the field name - content = self._get_field_content(start+1, stop, indent=0) + content = self._get_field_content(start + 1, stop, indent=0) subfield_dict = OrderedDict() - + else: # For all metadata fields use the # standard GenBank indentation (=12) @@ -297,11 +299,11 @@ def __getitem__(self, index): subfield_dict = OrderedDict() subfield_start = None first_subfield_start = None - for i in range(start+1, stop): + header = None + for i in range(start + 1, stop): line = self.lines[i] - # Check if line contains a new subfield - # (Header beginning from first column) if len(line) != 0 and line[:12].strip() != "": + # New header -> new subfield if first_subfield_start is None: first_subfield_start = i # Store previous subfield @@ -320,12 +322,10 @@ def __getitem__(self, index): # that are not part of a subfield if first_subfield_start is not None: stop = first_subfield_start - content = self._get_field_content( - start, stop, indent=12 - ) - + content = self._get_field_content(start, stop, indent=12) + return name, content, subfield_dict - + def __setitem__(self, index, item): index = self._translate_idx(index) if not isinstance(item, tuple): @@ -342,7 +342,7 @@ def __setitem__(self, index, item): "Expected a tuple of name, content and optionally subfields" ) inserted_lines = self._to_lines(name, content, subfields) - + # Stop of field to be replaced is start of new field start, old_stop, _ = self._field_pos[index] # If not the last element is set, @@ -355,12 +355,12 @@ def __setitem__(self, index, item): # Shift the start/stop indices of the following fields # by the amount of created fields shift = len(inserted_lines) - (old_stop - start) - for i in range(index+1, len(self._field_pos)): + for i in range(index + 1, len(self._field_pos)): old_start, old_stop, fname = self._field_pos[i] - self._field_pos[i] = old_start+shift, old_stop+shift, fname + self._field_pos[i] = old_start + shift, old_stop + shift, fname # Add new entry - self._field_pos[index] = start, start+len(inserted_lines), name.upper() - + self._field_pos[index] = start, start + len(inserted_lines), name.upper() + def __delitem__(self, index): index = self._translate_idx(index) start, stop, _ = self._field_pos[index] @@ -369,17 +369,17 @@ def __delitem__(self, index): shift = stop - start for i in range(index, len(self._field_pos)): old_start, old_stop, name = self._field_pos[i] - self._field_pos[i] = old_start-shift, old_stop-shift, name - del self.lines[start : stop] + self._field_pos[i] = old_start - shift, old_stop - shift, name + del self.lines[start:stop] del self._field_pos[index] - + def __len__(self): return len(self._field_pos) def insert(self, index, name, content, subfields=None): """ Insert a *GenBank* field at the given position. - + Parameters ---------- index : int @@ -398,12 +398,12 @@ def insert(self, index, name, content, subfields=None): """ index = self._translate_idx(index, length_exclusive=False) inserted_lines = self._to_lines(name, content, subfields) - + # Stop of previous field is start of new field if index == 0: start = 0 else: - _, start, _ = self._field_pos[index-1] + _, start, _ = self._field_pos[index - 1] # If the new lines are not inserted at the end, # the following lines need to be added, too if start is not len(self.lines): @@ -416,17 +416,16 @@ def insert(self, index, name, content, subfields=None): shift = len(inserted_lines) for i in range(index, len(self._field_pos)): old_start, old_stop, fname = self._field_pos[i] - self._field_pos[i] = old_start+shift, old_stop+shift, fname + self._field_pos[i] = old_start + shift, old_stop + shift, fname # Add new entry self._field_pos.insert( - index, - (start, start+len(inserted_lines), name.upper()) + index, (start, start + len(inserted_lines), name.upper()) ) - + def append(self, name, content, subfields=None): """ Create a new *GenBank* field at the end of the file. - + Parameters ---------- name : str @@ -440,7 +439,6 @@ def append(self, name, content, subfields=None): """ self.insert(len(self), name, content, subfields) - def _find_field_indices(self): """ Identify the start and exclusive stop indices of lines @@ -469,10 +467,10 @@ def _find_field_indices(self): def _get_field_content(self, start, stop, indent): if indent == 0: - return self.lines[start : stop] + return self.lines[start:stop] else: - return [line[12:] for line in self.lines[start : stop]] - + return [line[12:] for line in self.lines[start:stop]] + def _to_lines(self, name, content, subfields): """ Convert the field name, field content und subfield dictionary @@ -480,22 +478,22 @@ def _to_lines(self, name, content, subfields): """ if subfields is None: subfields = {} - + name = name.strip().upper() if len(name) == 0: - raise ValueError(f"Must give a non emtpy name") - subfields = OrderedDict({ - subfield_name.upper().strip() : subfield_lines - for subfield_name, subfield_lines in subfields.items() - }) - + raise ValueError("Must give a non emtpy name") + subfields = OrderedDict( + { + subfield_name.upper().strip(): subfield_lines + for subfield_name, subfield_lines in subfields.items() + } + ) + # Create lines for new field if name == "FEATURES": # Header line plus all actual feature lines lines = copy.copy(content) - lines.insert( - 0, "FEATURES" + " "*13 + "Location/Qualifiers" - ) + lines.insert(0, "FEATURES" + " " * 13 + "Location/Qualifiers") elif name == "ORIGIN": # Header line plus all actual sequence lines lines = copy.copy(content) @@ -504,19 +502,19 @@ def _to_lines(self, name, content, subfields): name_column = [] content_column = [] # Create a line for the field name and empty lines - # for each additional line required by the content - name_column += [name] + [""] * (len(content)-1) + # for each additional line required by the content + name_column += [name] + [""] * (len(content) - 1) content_column += content for subfield_name, subfield_lines in subfields.items(): - name_column += [" " + subfield_name] \ - + [""] * (len(subfield_lines)-1) + name_column += [" " + subfield_name] + [""] * (len(subfield_lines) - 1) content_column += subfield_lines - lines = [f"{n_col:12}{c_col}" for n_col, c_col - in zip(name_column, content_column)] - + lines = [ + f"{n_col:12}{c_col}" + for n_col, c_col in zip(name_column, content_column) + ] + return lines - def _translate_idx(self, index, length_exclusive=True): """ Check index boundaries and convert negative index to positive @@ -539,15 +537,15 @@ class MultiFile(TextFile): """ This class represents a file in *GenBank* or *GenPept* format, that contains multiple entries, for more than one UID. - + The information for each UID are appended to each other in such a file. Objects of this class can be iterated to obtain a :class:`GenBankFile` for each entry in the file. - + Examples -------- - + >>> import os.path >>> file_name = fetch_single_file( ... ["1L2Y_A", "3O5R_A", "5UGO_A"], @@ -568,8 +566,8 @@ def __iter__(self): line = self.lines[i] if line.strip() == "//": # Create file with lines corresponding to that file - file_content = "\n".join(self.lines[start_i : i+1]) + file_content = "\n".join(self.lines[start_i : i + 1]) file = GenBankFile.read(io.StringIO(file_content)) # Reset file start index start_i = i - yield file \ No newline at end of file + yield file diff --git a/src/biotite/sequence/io/genbank/metadata.py b/src/biotite/sequence/io/genbank/metadata.py index f4d25004f..477c0fbf2 100644 --- a/src/biotite/sequence/io/genbank/metadata.py +++ b/src/biotite/sequence/io/genbank/metadata.py @@ -8,17 +8,24 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann, Natasha Jaffe" -__all__ = ["get_locus", "get_definition", "get_accession", "get_version", - "get_gi", "get_db_link", "get_source", - "set_locus"] +__all__ = [ + "get_locus", + "get_definition", + "get_accession", + "get_version", + "get_gi", + "get_db_link", + "get_source", + "set_locus", +] + +from biotite.file import InvalidFileError -from ....file import InvalidFileError -from .file import GenBankFile def get_locus(gb_file): """ Parse the *LOCUS* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile @@ -39,10 +46,10 @@ def get_locus(gb_file): The GenBank division to which the file belongs. date : str, optional The date of last modification. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> name, length, mol_type, is_circular, division, date = get_locus(file) @@ -68,59 +75,57 @@ def get_locus(gb_file): # The first field will always be the ID name = fields[0] - # The second field will always be the length followed + # The second field will always be the length followed # by units (eg 1224 aa) length = int(fields[1]) - # The third field *should* be the molecular type + # The third field *should* be the molecular type # but sometimes this is missing. This gets tricky # because sometimes the next field, circular/linear, # is missing, too. The field after that, division, # is a 3 letter all caps token. Unfortunately, mol_type - # is also often a 3 letter all caps token (eg DNA)! + # is also often a 3 letter all caps token (eg DNA)! # Fortunately, GenBank publishes the set list of divisions # here: https://www.ncbi.nlm.nih.gov/genbank/samplerecord , # so we can check against that set when determining whether # the current token represents the molecular type. divisions = ( - 'PRI', # primate sequences - 'ROD', # rodent sequences - 'MAM', # other mammalian sequences - 'VRT', # other vertebrate sequences - 'INV', # invertebrate sequences - 'PLN', # plant, fungal, and algal sequences - 'BCT', # bacterial sequences - 'VRL', # viral sequences - 'PHG', # bacteriophage sequences - 'SYN', # synthetic sequences - 'UNA', # unannotated sequences - 'EST', # EST sequences (expressed sequence tags) - 'PAT', # patent sequences - 'STS', # STS sequences (sequence tagged sites) - 'GSS', # GSS sequences (genome survey sequences) - 'HTG', # HTG sequences (high-throughput genomic sequences) - 'HTC', # unfinished high-throughput cDNA sequencing - 'ENV', # environmental sampling sequences - 'CON', + "PRI", # primate sequences + "ROD", # rodent sequences + "MAM", # other mammalian sequences + "VRT", # other vertebrate sequences + "INV", # invertebrate sequences + "PLN", # plant, fungal, and algal sequences + "BCT", # bacterial sequences + "VRL", # viral sequences + "PHG", # bacteriophage sequences + "SYN", # synthetic sequences + "UNA", # unannotated sequences + "EST", # EST sequences (expressed sequence tags) + "PAT", # patent sequences + "STS", # STS sequences (sequence tagged sites) + "GSS", # GSS sequences (genome survey sequences) + "HTG", # HTG sequences (high-throughput genomic sequences) + "HTC", # unfinished high-throughput cDNA sequencing + "ENV", # environmental sampling sequences + "CON", ) - # NOTE: Remember that fields[2] is the unit for length, + # NOTE: Remember that fields[2] is the unit for length, # eg bp or aa, so we move to fields[3] here. - if fields[3] not in ('linear', 'circular') \ - and fields[3] not in divisions: + if fields[3] not in ("linear", "circular") and fields[3] not in divisions: mol_type = fields[3] next_idx = 4 else: mol_type = None next_idx = 3 - - # The next field should be the token 'linear' or 'circular', + # The next field should be the token 'linear' or 'circular', # but sometimes this is missing - if 'linear' == fields[next_idx]: + if "linear" == fields[next_idx]: is_circular = False next_idx += 1 - elif 'circular' == fields[next_idx]: + elif "circular" == fields[next_idx]: is_circular = True next_idx += 1 else: @@ -136,23 +141,24 @@ def get_locus(gb_file): return name, length, mol_type, is_circular, division, date + def get_definition(gb_file): """ Parse the *DEFINITION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *DEFINITION* field from. - + Returns ------- definition : str Content of the *DEFINITION* field. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> print(get_definition(file)) @@ -161,23 +167,24 @@ def get_definition(gb_file): lines, _ = _expect_single_field(gb_file, "DEFINITION") return " ".join([line.strip() for line in lines]) + def get_accession(gb_file): """ Parse the *ACCESSION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *ACCESSION* field from. - + Returns ------- accession : str The accession ID of the file. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> print(get_accession(file)) @@ -187,16 +194,17 @@ def get_accession(gb_file): # 'ACCESSION' field has only one line return lines[0] + def get_version(gb_file): """ Parse the version from the *VERSION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *VERSION* field from. - + Returns ------- version : str @@ -206,16 +214,17 @@ def get_version(gb_file): # 'VERSION' field has only one line return lines[0].split()[0] + def get_gi(gb_file): """ Parse the GI from the *VERSION* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *VERSION* field from. - + Returns ------- gi : str @@ -229,24 +238,25 @@ def get_gi(gb_file): # Truncate GI return int(version_info[1][3:]) + def get_db_link(gb_file): """ Parse the *DBLINK* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *DBLINK* field from. - + Returns ------- link_dict : dict A dictionary storing the database links, with the database name as key, and the corresponding ID as value. - + Examples -------- - + >>> import os.path >>> file = GenBankFile.read(os.path.join(path_to_sequences, "ec_bl21.gb")) >>> for key, val in get_db_link(file).items(): @@ -265,12 +275,12 @@ def get_db_link(gb_file): def get_source(gb_file): """ Parse the *SOURCE* field of a GenBank or GenPept file. - + Parameters ---------- gb_file : GenBankFile The GenBank file to read the *SOURCE* field from. - + Returns ------- accession : str @@ -290,12 +300,12 @@ def _expect_single_field(gb_file, name): return fields[0] - -def set_locus(gb_file, name, length, mol_type=None, is_circular=False, - division=None, date=None): +def set_locus( + gb_file, name, length, mol_type=None, is_circular=False, division=None, date=None +): """ Set the *LOCUS* field of a GenBank file. - + Parameters ---------- gb_file : GenBankFile @@ -319,6 +329,8 @@ def set_locus(gb_file, name, length, mol_type=None, is_circular=False, circularity = "circular" if is_circular else "linear" division = "" if division is None else division date = "" if date is None else date - line = f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " \ - f"{circularity:8} {division:3} {date:11}" - gb_file.set_field("LOCUS", [line]) \ No newline at end of file + line = ( + f"{name:18} {length:>9} {restype_abbr} {mol_type:^10} " + f"{circularity:8} {division:3} {date:11}" + ) + gb_file.set_field("LOCUS", [line]) diff --git a/src/biotite/sequence/io/genbank/sequence.py b/src/biotite/sequence/io/genbank/sequence.py index 26ec645bb..f5b194746 100644 --- a/src/biotite/sequence/io/genbank/sequence.py +++ b/src/biotite/sequence/io/genbank/sequence.py @@ -8,16 +8,19 @@ __name__ = "biotite.sequence.io.genbank" __author__ = "Patrick Kunzmann" -__all__ = ["get_raw_sequence", "get_sequence", "get_annotated_sequence", - "set_sequence", "set_annotated_sequence"] +__all__ = [ + "get_raw_sequence", + "get_sequence", + "get_annotated_sequence", + "set_sequence", + "set_annotated_sequence", +] import re -from ....file import InvalidFileError -from ...seqtypes import ProteinSequence, NucleotideSequence -from ...annotation import AnnotatedSequence -from .file import GenBankFile -from .annotation import get_annotation, set_annotation - +from biotite.file import InvalidFileError +from biotite.sequence.annotation import AnnotatedSequence +from biotite.sequence.io.genbank.annotation import get_annotation, set_annotation +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence _SYMBOLS_PER_CHUNK = 10 _SEQ_CHUNKS_PER_LINE = 6 @@ -112,7 +115,7 @@ def _convert_seq_str(seq_str, format): if len(seq_str) == 0: raise InvalidFileError("The file's 'ORIGIN' field is empty") if format == "gb": - return NucleotideSequence(seq_str.replace("U","T").replace("X","N")) + return NucleotideSequence(seq_str.replace("U", "T").replace("X", "N")) elif format == "gp": return ProteinSequence(seq_str.replace("U", "C").replace("O", "K")) else: @@ -125,8 +128,6 @@ def _get_seq_start(origin_content): return int(origin_content[0].split()[0]) - - def set_sequence(gb_file, sequence, sequence_start=1): """ Set the *ORIGIN* field of a GenBank file with a sequence. @@ -167,6 +168,4 @@ def set_annotated_sequence(gb_file, annot_sequence): The annotated sequence that is put into the GenBank file. """ set_annotation(gb_file, annot_sequence.annotation) - set_sequence( - gb_file, annot_sequence.sequence, annot_sequence.sequence_start - ) \ No newline at end of file + set_sequence(gb_file, annot_sequence.sequence, annot_sequence.sequence_start) diff --git a/src/biotite/sequence/io/general.py b/src/biotite/sequence/io/general.py index 09b7c2722..c76e11b72 100644 --- a/src/biotite/sequence/io/general.py +++ b/src/biotite/sequence/io/general.py @@ -9,31 +9,27 @@ __name__ = "biotite.sequence.io" __author__ = "Patrick Kunzmann" -__all__ = ["load_sequence", "save_sequence", - "load_sequences", "save_sequences"] +__all__ = ["load_sequence", "save_sequence", "load_sequences", "save_sequences"] -import itertools import os.path -import io from collections import OrderedDict import numpy as np -from ..seqtypes import NucleotideSequence, ProteinSequence -from ..alphabet import Alphabet +from biotite.sequence.seqtypes import NucleotideSequence def load_sequence(file_path): """ Load a sequence from a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the file extension. - + Parameters ---------- file_path : str The path to the sequence file. - + Returns ------- sequence : Sequence @@ -42,11 +38,13 @@ def load_sequence(file_path): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, get_sequence + from biotite.sequence.io.fasta import FastaFile, get_sequence + file = FastaFile.read(file_path) return get_sequence(file) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile.read(file_path, offset="Sanger") @@ -56,7 +54,8 @@ def load_sequence(file_path): break return sequence elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import GenBankFile, get_sequence + from biotite.sequence.io.genbank import GenBankFile, get_sequence + format = "gp" if suffix == ".gp" else "gb" file = GenBankFile.read(file_path) return get_sequence(file, format) @@ -68,10 +67,10 @@ def save_sequence(file_path, sequence): """ Save a sequence into a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the given file extension. - + Parameters ---------- file_path : str @@ -82,12 +81,14 @@ def save_sequence(file_path, sequence): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, set_sequence + from biotite.sequence.io.fasta import FastaFile, set_sequence + file = FastaFile() set_sequence(file, sequence) file.write(file_path) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile(offset="Sanger") @@ -96,7 +97,8 @@ def save_sequence(file_path, sequence): file["sequence"] = str(sequence), scores file.write(file_path) elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import GenBankFile, set_locus, set_sequence + from biotite.sequence.io.genbank import GenBankFile, set_locus, set_sequence + file = GenBankFile() set_locus(file, "sequence", len(sequence)) set_sequence(file, sequence) @@ -109,37 +111,42 @@ def load_sequences(file_path): """ Load multiple sequences from a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the file extension. - + Parameters ---------- file_path : str The path to the sequence file. - + Returns ------- sequences : dict of (str, Sequence) The sequences in the file. This dictionary maps each header name to - the respective sequence. + the respective sequence. """ # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, get_sequences + from biotite.sequence.io.fasta import FastaFile, get_sequences + file = FastaFile.read(file_path) return get_sequences(file) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile.read(file_path, offset="Sanger") - return {identifier : NucleotideSequence(seq_str) - for identifier, (seq_str, scores) in file.items()} + return { + identifier: NucleotideSequence(seq_str) + for identifier, (seq_str, scores) in file.items() + } elif suffix in [".gb", ".gbk", ".gp"]: - from .genbank import MultiFile, get_definition, get_sequence + from biotite.sequence.io.genbank import MultiFile, get_definition, get_sequence + file = MultiFile.read(file_path) format = "gp" if suffix == ".gp" else "gb" sequences = OrderedDict() @@ -154,10 +161,10 @@ def save_sequences(file_path, sequences): """ Save multiple sequences into a sequence file without the need to manually instantiate a :class:`File` object. - + Internally this function uses a :class:`File` object, based on the given file extension. - + Parameters ---------- file_path : str @@ -169,12 +176,14 @@ def save_sequences(file_path, sequences): # We only need the suffix here filename, suffix = os.path.splitext(file_path) if suffix in [".fasta", ".fa", ".mpfa", ".fna", ".fsa"]: - from .fasta import FastaFile, set_sequences + from biotite.sequence.io.fasta import FastaFile, set_sequences + file = FastaFile() set_sequences(file, sequences) file.write(file_path) elif suffix in [".fastq", ".fq"]: - from .fastq import FastqFile + from biotite.sequence.io.fastq import FastqFile + # Quality scores are irrelevant for this function # -> Offset is irrelevant file = FastqFile(offset="Sanger") diff --git a/src/biotite/sequence/io/gff/__init__.py b/src/biotite/sequence/io/gff/__init__.py index f544a0ddd..52bac129c 100644 --- a/src/biotite/sequence/io/gff/__init__.py +++ b/src/biotite/sequence/io/gff/__init__.py @@ -14,7 +14,7 @@ GFF 3 files. This means, that you cannot directly access the the parent or child of a feature. However, the ``Id`` and ``Name`` attributes are stored in the - qualifiers of the created :class:`Feature` objects. + qualifiers of the created :class:`Feature` objects. Hence, it is possible to implement such a data structure from this information. """ @@ -22,5 +22,5 @@ __name__ = "biotite.sequence.io.gff" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/sequence/io/gff/convert.py b/src/biotite/sequence/io/gff/convert.py index 9c8782f65..8f3fb75f2 100644 --- a/src/biotite/sequence/io/gff/convert.py +++ b/src/biotite/sequence/io/gff/convert.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_annotation", "set_annotation"] -from ...annotation import Location, Feature, Annotation +from biotite.sequence.annotation import Annotation, Feature, Location def get_annotation(gff_file): @@ -22,12 +22,12 @@ def get_annotation(gff_file): Thus, for entries with the same ``ID``, the *type* and *attributes* are only parsed once and the locations are aggregated from each entry. - + Parameters ---------- gff_file : GFFFile The file tro extract the :class:`Annotation` object from. - + Returns ------- annotation : Annotation @@ -45,9 +45,7 @@ def get_annotation(gff_file): # (beginning of the file) if current_key is not None: # Beginning of new feature -> Save previous feature - annot.add_feature( - Feature(current_key, current_locs, current_qual) - ) + annot.add_feature(Feature(current_key, current_locs, current_qual)) # Track new feature current_key = type current_locs = [Location(start, end, strand)] @@ -61,15 +59,14 @@ def get_annotation(gff_file): return annot -def set_annotation(gff_file, annotation, - seqid=None, source=None, is_stranded=True): +def set_annotation(gff_file, annotation, seqid=None, source=None, is_stranded=True): """ Write an :class:`Annotation` object into a GFF3 file. Each feature will get one entry for each location it has. :class:`Feature` objects with multiple locations require the ``ID`` qualifier in its :attr:`Feature.qual` attribute. - + Parameters ---------- gff_file : GFFFile @@ -87,14 +84,13 @@ def set_annotation(gff_file, annotation, for feature in sorted(annotation): if len(feature.locs) > 1 and "ID" not in feature.qual: raise ValueError( - "The 'Id' qualifier is required " - "for features with multiple locations" + "The 'Id' qualifier is required " "for features with multiple locations" ) ## seqid ## if seqid is not None and " " in seqid: raise ValueError("The 'seqid' must not contain whitespaces") ## source ## - #Nothing to be done + # Nothing to be done ## type ## type = feature.key ## strand ## @@ -128,6 +124,5 @@ def set_annotation(gff_file, annotation, else: phase = None gff_file.append( - seqid, source, type, start, end, - score, strand, phase, attributes - ) \ No newline at end of file + seqid, source, type, start, end, score, strand, phase, attributes + ) diff --git a/src/biotite/sequence/io/gff/file.py b/src/biotite/sequence/io/gff/file.py index f708712d2..c151bd869 100644 --- a/src/biotite/sequence/io/gff/file.py +++ b/src/biotite/sequence/io/gff/file.py @@ -6,19 +6,17 @@ __author__ = "Patrick Kunzmann" __all__ = ["GFFFile"] -import copy import string -from urllib.parse import quote, unquote import warnings -from ....file import TextFile, InvalidFileError -from ...annotation import Location - +from urllib.parse import quote, unquote +from biotite.file import InvalidFileError, TextFile +from biotite.sequence.annotation import Location # All punctuation characters except # percent, semicolon, equals, ampersand, comma -_NOT_QUOTED = "".join( - [char for char in string.punctuation if char not in "%;=&,"] -) + " " +_NOT_QUOTED = ( + "".join([char for char in string.punctuation if char not in "%;=&,"]) + " " +) class GFFFile(TextFile): @@ -61,7 +59,7 @@ class GFFFile(TextFile): The content after the ``##FASTA`` directive is simply ignored. Please provide the sequence via a separate file or read the FASTA data directly via the :attr:`lines` attribute: - + >>> import os.path >>> from io import StringIO >>> gff_file = GFFFile.read(os.path.join(path_to_sequences, "indexing_test.gff3")) @@ -121,7 +119,7 @@ class GFFFile(TextFile): ##Example directive param1 param2 SomeSeqID Biotite CDS 1 99 . + 0 ID=FeatureID;product=A protein """ - + def __init__(self): super().__init__() # Maps entry indices to line indices @@ -132,18 +130,18 @@ def __init__(self): self._has_fasta = None self._index_entries() self.append_directive("gff-version", "3") - + @classmethod def read(cls, file): """ Read a GFF3 file. - + Parameters ---------- file : file-like object or str The file to be read. Alternatively a file path can be supplied. - + Returns ------- file_object : GFFFile @@ -152,18 +150,29 @@ def read(cls, file): file = super().read(file) file._index_entries() return file - - def insert(self, index, seqid, source, type, start, end, - score, strand, phase, attributes=None): + + def insert( + self, + index, + seqid, + source, + type, + start, + end, + score, + strand, + phase, + attributes=None, + ): """ Insert an entry at the given index. - + Parameters ---------- index : int Index where the entry is inserted. If the index is equal to the length of the file, the entry - is appended at the end of the file. + is appended at the end of the file. seqid : str The ID of the reference sequence. source : str @@ -184,22 +193,23 @@ def insert(self, index, seqid, source, type, start, end, Additional properties of the feature. """ if index == len(self): - self.append(seqid, source, type, start, end, - score, strand, phase, attributes) + self.append( + seqid, source, type, start, end, score, strand, phase, attributes + ) else: line_index = self._entries[index] line = GFFFile._create_line( - seqid, source, type, start, end, - score, strand, phase, attributes + seqid, source, type, start, end, score, strand, phase, attributes ) self.lines.insert(line_index, line) self._index_entries() - - def append(self, seqid, source, type, start, end, - score, strand, phase, attributes=None): + + def append( + self, seqid, source, type, start, end, score, strand, phase, attributes=None + ): """ Append an entry to the end of the file. - + Parameters ---------- seqid : str @@ -232,11 +242,11 @@ def append(self, seqid, source, type, start, end, self.lines.append(line) # Fast update of entry index by adding last line self._entries.append(len(self.lines) - 1) - + def append_directive(self, directive, *args): """ Append a directive line to the end of the file. - + Parameters ---------- directive : str @@ -245,13 +255,13 @@ def append_directive(self, directive, *args): Optional parameters for the directive. Each argument is simply appended to the directive, separated by a single space character. - + Raises ------ NotImplementedError If the ``##FASTA`` directive is used, which is not supported. - + Examples -------- @@ -262,17 +272,15 @@ def append_directive(self, directive, *args): ##Example directive param1 param2 """ if directive.startswith("FASTA"): - raise NotImplementedError( - "Adding FASTA information is not supported" - ) + raise NotImplementedError("Adding FASTA information is not supported") directive_line = "##" + directive + " " + " ".join(args) self._directives.append((directive_line[2:], len(self.lines))) self.lines.append(directive_line) - + def directives(self): """ Get the directives in the file. - + Returns ------- directives : list of tuple(str, int) @@ -283,7 +291,7 @@ def directives(self): """ # Sort in line order return sorted(self._directives, key=lambda directive: directive[1]) - + def __setitem__(self, index, item): seqid, source, type, start, end, score, strand, phase, attrib = item line = GFFFile._create_line( @@ -292,15 +300,13 @@ def __setitem__(self, index, item): line_index = self._entries[index] self.lines[line_index] = line - def __getitem__(self, index): - if (index >= 0 and index >= len(self)) or \ - (index < 0 and -index > len(self)): - raise IndexError( - f"Index {index} is out of range for GFFFile with " - f"{len(self)} entries" - ) - + if (index >= 0 and index >= len(self)) or (index < 0 and -index > len(self)): + raise IndexError( + f"Index {index} is out of range for GFFFile with " + f"{len(self)} entries" + ) + line_index = self._entries[index] # Columns are tab separated s = self.lines[line_index].strip().split("\t") @@ -324,15 +330,15 @@ def __getitem__(self, index): attrib = GFFFile._parse_attributes(attrib) return seqid, source, type, start, end, score, strand, phase, attrib - + def __delitem__(self, index): line_index = self._entries[index] del self.lines[line_index] self._index_entries() - + def __len__(self): return len(self._entries) - + def _index_entries(self): """ Parse the file for comment and directive lines. @@ -374,15 +380,12 @@ def _index_entries(self): self._entries = self._entries[:entry_counter] @staticmethod - def _create_line(seqid, source, type, start, end, - score, strand, phase, attributes): + def _create_line(seqid, source, type, start, end, score, strand, phase, attributes): """ Create a line for a newly created entry. """ - seqid = quote(seqid.strip(), safe=_NOT_QUOTED) \ - if seqid is not None else "." - source = quote(source.strip(), safe=_NOT_QUOTED) \ - if source is not None else "." + seqid = quote(seqid.strip(), safe=_NOT_QUOTED) if seqid is not None else "." + source = quote(source.strip(), safe=_NOT_QUOTED) if source is not None else "." type = type.strip() # Perform checks @@ -394,7 +397,7 @@ def _create_line(seqid, source, type, start, end, raise ValueError("'type' must not be empty") if seqid[0] == ">": raise ValueError("'seqid' must not start with '>'") - + score = str(score) if score is not None else "." if strand == Location.Strand.FORWARD: strand = "+" @@ -403,16 +406,31 @@ def _create_line(seqid, source, type, start, end, else: strand = "." phase = str(phase) if phase is not None else "." - attributes = ";".join( - [quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED) - for key, val in attributes.items()] - ) if attributes is not None and len(attributes) > 0 else "." + attributes = ( + ";".join( + [ + quote(key, safe=_NOT_QUOTED) + "=" + quote(val, safe=_NOT_QUOTED) + for key, val in attributes.items() + ] + ) + if attributes is not None and len(attributes) > 0 + else "." + ) return "\t".join( - [seqid, source, type, str(start), str(end), - str(score), strand, phase, attributes] + [ + seqid, + source, + type, + str(start), + str(end), + str(score), + strand, + phase, + attributes, + ] ) - + @staticmethod def _parse_attributes(attributes): """ @@ -426,9 +444,7 @@ def _parse_attributes(attributes): for entry in attrib_entries: compounds = entry.split("=") if len(compounds) != 2: - raise InvalidFileError( - f"Attribute entry '{entry}' is invalid" - ) + raise InvalidFileError(f"Attribute entry '{entry}' is invalid") key, val = compounds attrib_dict[unquote(key)] = unquote(val) - return attrib_dict \ No newline at end of file + return attrib_dict diff --git a/src/biotite/sequence/phylo/__init__.py b/src/biotite/sequence/phylo/__init__.py index d70caa681..5d29f1a9e 100644 --- a/src/biotite/sequence/phylo/__init__.py +++ b/src/biotite/sequence/phylo/__init__.py @@ -31,6 +31,6 @@ __name__ = "biotite.sequence.phylo" __author__ = "Patrick Kunzmann" +from .nj import * from .tree import * from .upgma import * -from .nj import * \ No newline at end of file diff --git a/src/biotite/sequence/profile.py b/src/biotite/sequence/profile.py index 1a140e1f9..d208b2b3f 100644 --- a/src/biotite/sequence/profile.py +++ b/src/biotite/sequence/profile.py @@ -4,9 +4,13 @@ import warnings import numpy as np -from .seqtypes import NucleotideSequence, ProteinSequence, GeneralSequence -from .alphabet import LetterAlphabet -from .align.alignment import get_codes +from biotite.sequence.align.alignment import get_codes +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.seqtypes import ( + GeneralSequence, + NucleotideSequence, + ProteinSequence, +) __name__ = "biotite.sequence" __author__ = "Maximilian Greil" @@ -73,7 +77,7 @@ class SequenceProfile(object): be created from an indefinite number of aligned sequences. With :meth:`sequence_probability_from_matrix()` the probability of a - sequence can be calculated based on the before calculated position + sequence can be calculated based on the before calculated position probability matrix of this instance of object SequenceProfile. With :meth:`sequence_score_from_matrix()` the score of a sequence @@ -154,8 +158,10 @@ def gaps(self, new_gaps): def __repr__(self): """Represent SequenceProfile as a string for debugging.""" - return f"SequenceProfile(np.{np.array_repr(self.symbols)}, " \ - f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))" + return ( + f"SequenceProfile(np.{np.array_repr(self.symbols)}, " + f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))" + ) def __eq__(self, item): if not isinstance(item, SequenceProfile): @@ -204,16 +210,16 @@ def from_alignment(alignment, alphabet=None): for alph in (seq.alphabet for seq in alignment.sequences): if not alphabet.extends(alph): raise ValueError( - f"The given alphabet is incompatible with a least one " + "The given alphabet is incompatible with a least one " "alphabet of the given sequences" ) symbols = np.zeros((len(sequences[0]), len(alphabet)), dtype=int) gaps = np.zeros(len(sequences[0]), dtype=int) sequences = np.transpose(sequences) for i in range(len(sequences)): - row = np.where(sequences[i, ] == -1, len(alphabet), sequences[i, ]) + row = np.where(sequences[i,] == -1, len(alphabet), sequences[i,]) count = np.bincount(row, minlength=len(alphabet) + 1) - symbols[i, ] = count[0:len(alphabet)] + symbols[i,] = count[0 : len(alphabet)] gaps[i] = count[-1] return SequenceProfile(symbols, gaps, alphabet) @@ -248,10 +254,21 @@ def to_consensus(self, as_general=False): def _dna_to_consensus(self): codes = { - (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'T', - (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M', - (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V', - (0, 1, 2, 3): 'N' + (0,): "A", + (1,): "C", + (2,): "G", + (3,): "T", + (0, 2): "R", + (1, 3): "Y", + (1, 2): "S", + (0, 3): "W", + (2, 3): "K", + (0, 1): "M", + (1, 2, 3): "B", + (0, 2, 3): "D", + (0, 1, 3): "H", + (0, 1, 2): "V", + (0, 1, 2, 3): "N", } consensus = "" maxes = np.max(self.symbols, axis=1) @@ -261,10 +278,21 @@ def _dna_to_consensus(self): def _rna_to_consensus(self): codes = { - (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'U', - (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M', - (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V', - (0, 1, 2, 3): 'N' + (0,): "A", + (1,): "C", + (2,): "G", + (3,): "U", + (0, 2): "R", + (1, 3): "Y", + (1, 2): "S", + (0, 3): "W", + (2, 3): "K", + (0, 1): "M", + (1, 2, 3): "B", + (0, 2, 3): "D", + (0, 1, 3): "H", + (0, 1, 2): "V", + (0, 1, 2, 3): "N", } consensus = "" maxes = np.max(self.symbols, axis=1) @@ -307,7 +335,7 @@ def probability_matrix(self, pseudocount=0): .. math:: P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p} - + :math:`S`: The symbol. :math:`C_S`: The count of symbol :math:`S` at the sequence @@ -330,11 +358,10 @@ def probability_matrix(self, pseudocount=0): The calculated the position probability matrix. """ if pseudocount < 0: - raise ValueError( - f"Pseudocount can not be smaller than zero." - ) - return (self.symbols + pseudocount / self.symbols.shape[1]) / \ - (np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount) + raise ValueError("Pseudocount can not be smaller than zero.") + return (self.symbols + pseudocount / self.symbols.shape[1]) / ( + np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount + ) def log_odds_matrix(self, background_frequencies=None, pseudocount=0): r""" @@ -346,7 +373,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0): .. math:: W(S) = \log_2 \left( \frac{P(S)}{B_S} \right) - + :math:`S`: The symbol. :math:`P(S)`: The probability of symbol :math:`S` at the @@ -363,7 +390,7 @@ def log_odds_matrix(self, background_frequencies=None, pseudocount=0): background_frequencies: ndarray, shape=(k,), dtype=float, optional The background frequencies for each symbol in the alphabet. By default, a uniform distribution is assumed. - + Returns ------- pwm: ndarray, dtype=float, shape=(n,k) @@ -383,7 +410,7 @@ def sequence_probability(self, sequence, pseudocount=0): Calculate probability of a sequence based on the position probability matrix (PPM). - The sequence probability is the product of the probability of + The sequence probability is the product of the probability of the respective symbol over all sequence positions. Parameters @@ -419,7 +446,7 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0): Calculate score of a sequence based on the position weight matrix (PWM). - The score is the sum of weights (log-odds scores) of + The score is the sum of weights (log-odds scores) of the respective symbol over all sequence positions. Parameters @@ -442,7 +469,9 @@ def sequence_score(self, sequence, background_frequencies=None, pseudocount=0): """ if background_frequencies is None: background_frequencies = 1 / len(self.alphabet) - pwm = self.log_odds_matrix(background_frequencies=background_frequencies, pseudocount=pseudocount) + pwm = self.log_odds_matrix( + background_frequencies=background_frequencies, pseudocount=pseudocount + ) if len(sequence) != len(pwm): raise ValueError( f"The given sequence has a different length ({len(sequence)}) than " diff --git a/src/biotite/sequence/search.py b/src/biotite/sequence/search.py index c57e7d119..96af23d03 100644 --- a/src/biotite/sequence/search.py +++ b/src/biotite/sequence/search.py @@ -4,8 +4,7 @@ __name__ = "biotite.sequence" __author__ = "Patrick Kunzmann" -__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", - "find_symbol_last"] +__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", "find_symbol_last"] import numpy as np @@ -13,7 +12,7 @@ def find_subsequence(sequence, query): """ Find a subsequence in a sequence. - + Parameters ---------- sequence : Sequence @@ -21,26 +20,26 @@ def find_subsequence(sequence, query): query : Sequence The potential subsequence. Its alphabet must extend the `sequence` alphabet. - + Returns ------- match_indices : ndarray The starting indices in `sequence`, where `query` has been found. The array is empty if no match has been found. - + Raises ------ ValueError If the `query` alphabet does not extend the `sequence` alphabet. - + Examples -------- - + >>> main_seq = NucleotideSequence("ACTGAATGA") >>> sub_seq = NucleotideSequence("TGA") >>> print(find_subsequence(main_seq, sub_seq)) [2 6] - + """ if not sequence.get_alphabet().extends(query.get_alphabet()): raise ValueError("The sequences alphabets are not equal") @@ -52,17 +51,18 @@ def find_subsequence(sequence, query): match_indices.append(i) return np.array(match_indices) + def find_symbol(sequence, symbol): """ Find a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. - + Returns ------- match_indices : ndarray @@ -71,17 +71,18 @@ def find_symbol(sequence, symbol): code = sequence.get_alphabet().encode(symbol) return np.where(sequence.code == code)[0] + def find_symbol_first(sequence, symbol): """ Find first occurence of a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. - + Returns ------- first_index : int @@ -92,18 +93,19 @@ def find_symbol_first(sequence, symbol): if len(match_i) == 0: return -1 return np.min(match_i) - + + def find_symbol_last(sequence, symbol): """ Find last occurence of a symbol in a sequence. - + Parameters ---------- sequence : Sequence The sequence to find the symbol in. symbol : object The symbol to be found in `sequence`. - + Returns ------- flast_index : int diff --git a/src/biotite/sequence/seqtypes.py b/src/biotite/sequence/seqtypes.py index 76254e13f..e09527c35 100644 --- a/src/biotite/sequence/seqtypes.py +++ b/src/biotite/sequence/seqtypes.py @@ -6,17 +6,16 @@ __author__ = "Patrick Kunzmann", "Thomas Nevolianis" __all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"] -from .sequence import Sequence -from .alphabet import LetterAlphabet, AlphabetError, AlphabetMapper import numpy as np -import copy +from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet +from biotite.sequence.sequence import Sequence class GeneralSequence(Sequence): """ This class allows the creation of a sequence with custom :class:`Alphabet` without the need to subclass :class:`Sequence`. - + Parameters ---------- alphabet : Alphabet @@ -27,22 +26,24 @@ class GeneralSequence(Sequence): may also be a :class:`str` object. By default the sequence is empty. """ - + def __init__(self, alphabet, sequence=()): self._alphabet = alphabet super().__init__(sequence) def __repr__(self): """Represent GeneralSequence as a string for debugging.""" - return f"GeneralSequence(Alphabet({self._alphabet}), " \ - f"[{', '.join([repr(symbol) for symbol in self.symbols])}])" + return ( + f"GeneralSequence(Alphabet({self._alphabet}), " + f"[{', '.join([repr(symbol) for symbol in self.symbols])}])" + ) def __copy_create__(self): return GeneralSequence(self._alphabet) - + def get_alphabet(self): return self._alphabet - + def as_type(self, sequence): """ Convert the :class:`GeneralSequence` into a sequence of another @@ -58,12 +59,12 @@ def as_type(self, sequence): of this object. The alphabet must equal or extend the alphabet of this object. - + Returns ------- sequence : Sequence The input `sequence` with replaced sequence code. - + Raises ------ AlphabetError @@ -78,16 +79,17 @@ def as_type(self, sequence): sequence.code = self.code return sequence + class NucleotideSequence(Sequence): """ Representation of a nucleotide sequence (DNA or RNA). - + This class may have one of two different alphabets: :attr:`unambiguous_alphabet()` contains only the unambiguous DNA letters 'A', 'C', 'G' and 'T'. - :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous + :attr:`ambiguous_alphabet()` uses an extended alphabet for ambiguous letters. - + Parameters ---------- sequence : iterable object, optional @@ -100,35 +102,36 @@ class NucleotideSequence(Sequence): ambiguous letters in the sequence, the ambiguous alphabet is used. """ - - alphabet_unamb = LetterAlphabet(["A","C","G","T"]) - alphabet_amb = LetterAlphabet( - ["A","C","G","T","R","Y","W","S", - "M","K","H","B","V","D","N"] + + alphabet_unamb = LetterAlphabet(["A", "C", "G", "T"]) + alphabet_amb = LetterAlphabet( + ["A", "C", "G", "T", "R", "Y", "W", "S", "M", "K", "H", "B", "V", "D", "N"] ) - - compl_symbol_dict = {"A" : "T", - "C" : "G", - "G" : "C", - "T" : "A", - "M" : "K", - "R" : "Y", - "W" : "W", - "S" : "S", - "Y" : "R", - "K" : "M", - "V" : "B", - "H" : "D", - "D" : "H", - "B" : "V", - "N" : "N"} + + compl_symbol_dict = { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "M": "K", + "R": "Y", + "W": "W", + "S": "S", + "Y": "R", + "K": "M", + "V": "B", + "H": "D", + "D": "H", + "B": "V", + "N": "N", + } # List comprehension does not work in this scope _compl_symbols = [] for _symbol in alphabet_amb.get_symbols(): _compl_symbols.append(compl_symbol_dict[_symbol]) _compl_alphabet_unamb = LetterAlphabet(_compl_symbols) _compl_mapper = AlphabetMapper(_compl_alphabet_unamb, alphabet_amb) - + def __init__(self, sequence=[], ambiguous=None): if isinstance(sequence, str): sequence = sequence.upper() @@ -164,28 +167,28 @@ def __copy_create__(self): else: seq_copy = NucleotideSequence(ambiguous=False) return seq_copy - + def get_alphabet(self): return self._alphabet - + def complement(self): """ Get the complement nucleotide sequence. - + Returns ------- complement : NucleotideSequence The complement sequence. - + Examples -------- - + >>> dna_seq = NucleotideSequence("ACGCTT") >>> print(dna_seq.complement()) TGCGAA >>> print(dna_seq.reverse().complement()) AAGCGT - + """ # Interpreting the sequence code of this object in the # complementary alphabet gives the complementary symbols @@ -194,18 +197,18 @@ def complement(self): # alphabet into the original alphabet compl_code = NucleotideSequence._compl_mapper[self.code] return self.copy(compl_code) - + def translate(self, complete=False, codon_table=None, met_start=False): """ Translate the nucleotide sequence into a protein sequence. - + If `complete` is true, the entire sequence is translated, beginning with the first codon and ending with the last codon, even if stop codons occur during the translation. - + Otherwise this method returns possible ORFs in the sequence, even if not stop codon occurs in an ORF. - + Parameters ---------- complete : bool, optional @@ -222,7 +225,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): Otherwise the translation starts with the amino acid the codon codes for. Only applies, if `complete` is false. (Default: False) - + Returns ------- protein : ProteinSequence or list of ProteinSequence @@ -233,15 +236,15 @@ def translate(self, complete=False, codon_table=None, met_start=False): pos : list of tuple (int, int) Is only returned if `complete` is false. The list contains a tuple for each ORF. - The first element of the tuple is the index of the + The first element of the tuple is the index of the :class:`NucleotideSequence`, where the translation starts. The second element is the exclusive stop index, it represents the first nucleotide in the :class:`NucleotideSequence` after a stop codon. - + Examples -------- - + >>> dna_seq = NucleotideSequence("AATGATGCTATAGAT") >>> prot_seq = dna_seq.translate(complete=True) >>> print(prot_seq) @@ -251,29 +254,32 @@ def translate(self, complete=False, codon_table=None, met_start=False): ... print(seq) MML* ML* - + """ if self._alphabet != NucleotideSequence.alphabet_unamb: raise AlphabetError("Translation requires unambiguous alphabet") # Determine codon_table if codon_table is None: # Import at this position to avoid circular import - from .codon import CodonTable + from biotite.sequence.codon import CodonTable + codon_table = CodonTable.default_table() - + if complete: if len(self) % 3 != 0: - raise ValueError("Sequence length needs to be a multiple of 3 " - "for complete translation") + raise ValueError( + "Sequence length needs to be a multiple of 3 " + "for complete translation" + ) # Reshape code into (n,3), with n being the amount of codons codons = self.code.reshape(-1, 3) protein_seq = ProteinSequence() protein_seq.code = codon_table.map_codon_codes(codons) return protein_seq - + else: stop_code = ProteinSequence.alphabet.encode("*") - met_code = ProteinSequence.alphabet.encode("M") + met_code = ProteinSequence.alphabet.encode("M") protein_seqs = [] pos = [] code = self.code @@ -282,7 +288,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): # The frame length is always a multiple of 3 # If there is a trailing partial codon, remove it frame_length = ((len(code) - shift) // 3) * 3 - frame = code[shift : shift+frame_length] + frame = code[shift : shift + frame_length] # Reshape frame into (n,3), with n being the amount of codons frame_codons = frame.reshape(-1, 3) # At first, translate frame completely @@ -297,8 +303,7 @@ def translate(self, complete=False, codon_table=None, met_start=False): stops = np.where(code_from_start == stop_code)[0] # Find first stop codon after start codon # Include stop -> stops[0] + 1 - stop_i = stops[0] + 1 if len(stops) > 0 \ - else len(code_from_start) + stop_i = stops[0] + 1 if len(stops) > 0 else len(code_from_start) code_from_start_to_stop = code_from_start[:stop_i] prot_seq = ProteinSequence() if met_start: @@ -310,13 +315,13 @@ def translate(self, complete=False, codon_table=None, met_start=False): protein_seqs.append(prot_seq) # Codon indices are transformed # to nucleotide sequence indices - pos.append((shift + start_i*3, shift + (start_i+stop_i)*3)) + pos.append((shift + start_i * 3, shift + (start_i + stop_i) * 3)) # Sort by start position order = np.argsort([start for start, stop in pos]) pos = [pos[i] for i in order] protein_seqs = [protein_seqs[i] for i in order] return protein_seqs, pos - + @staticmethod def unambiguous_alphabet(): """ @@ -329,7 +334,7 @@ def unambiguous_alphabet(): The unambiguous nucleotide alphabet. """ return NucleotideSequence.alphabet_unamb - + @staticmethod def ambiguous_alphabet(): """ @@ -348,10 +353,10 @@ def ambiguous_alphabet(): class ProteinSequence(Sequence): """ Representation of a protein sequence. - + Furthermore this class offers a conversion of amino acids from 3-letter code into 1-letter code and vice versa. - + Parameters ---------- sequence : iterable object, optional @@ -359,7 +364,7 @@ class ProteinSequence(Sequence): string. May take upper or lower case letters. If a list is given, the list elements can be 1-letter or 3-letter amino acid representations. By default the sequence is empty. - + Notes ----- The :class:`Alphabet` of this :class:`Sequence` class does not @@ -370,106 +375,138 @@ class ProteinSequence(Sequence): """ _codon_table = None - - alphabet = LetterAlphabet(["A","C","D","E","F","G","H","I","K","L", - "M","N","P","Q","R","S","T","V","W","Y", - "B","Z","X","*"]) + + alphabet = LetterAlphabet( + [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + "B", + "Z", + "X", + "*", + ] + ) # Masses are taken from # https://web.expasy.org/findmod/findmod_masses.html#AA - _mol_weight_average = np.array([ - 71.0788, # A - 103.1388, # C - 115.0886, # D - 129.1155, # E - 147.1766, # F - 57.0519, # G - 137.1411, # H - 113.1594, # I - 128.1741, # K - 113.1594, # L - 131.1926, # M - 114.1038, # N - 97.1167, # P - 128.1307, # Q - 156.1875, # R - 87.0782, # S - 101.1051, # T - 99.1326, # V - 186.2132, # W - 163.1760, # Y - np.nan, # B - np.nan, # Z - np.nan, # X - np.nan, # * - ]) - - _mol_weight_monoisotopic = np.array([ - 71.03711, # A - 103.00919, # C - 115.02694, # D - 129.04259, # E - 147.06841, # F - 57.02146, # G - 137.05891, # H - 113.08406, # I - 128.09496, # K - 113.08406, # L - 131.04049, # M - 114.04293, # N - 97.05276, # P - 128.05858, # Q - 156.10111, # R - 87.03203, # S - 101.04768, # T - 99.06841, # V - 186.07931, # W - 163.06333, # Y - np.nan, # B - np.nan, # Z - np.nan, # X - np.nan, # * - ]) - - _dict_1to3 = {"A" : "ALA", - "C" : "CYS", - "D" : "ASP", - "E" : "GLU", - "F" : "PHE", - "G" : "GLY", - "H" : "HIS", - "I" : "ILE", - "K" : "LYS", - "L" : "LEU", - "M" : "MET", - "N" : "ASN", - "P" : "PRO", - "Q" : "GLN", - "R" : "ARG", - "S" : "SER", - "T" : "THR", - "V" : "VAL", - "W" : "TRP", - "Y" : "TYR", - "B" : "ASX", - "Z" : "GLX", - "X" : "UNK", - "*" : " * "} - + _mol_weight_average = np.array( + [ + 71.0788, # A + 103.1388, # C + 115.0886, # D + 129.1155, # E + 147.1766, # F + 57.0519, # G + 137.1411, # H + 113.1594, # I + 128.1741, # K + 113.1594, # L + 131.1926, # M + 114.1038, # N + 97.1167, # P + 128.1307, # Q + 156.1875, # R + 87.0782, # S + 101.1051, # T + 99.1326, # V + 186.2132, # W + 163.1760, # Y + np.nan, # B + np.nan, # Z + np.nan, # X + np.nan, # * + ] + ) + + _mol_weight_monoisotopic = np.array( + [ + 71.03711, # A + 103.00919, # C + 115.02694, # D + 129.04259, # E + 147.06841, # F + 57.02146, # G + 137.05891, # H + 113.08406, # I + 128.09496, # K + 113.08406, # L + 131.04049, # M + 114.04293, # N + 97.05276, # P + 128.05858, # Q + 156.10111, # R + 87.03203, # S + 101.04768, # T + 99.06841, # V + 186.07931, # W + 163.06333, # Y + np.nan, # B + np.nan, # Z + np.nan, # X + np.nan, # * + ] + ) + + _dict_1to3 = { + "A": "ALA", + "C": "CYS", + "D": "ASP", + "E": "GLU", + "F": "PHE", + "G": "GLY", + "H": "HIS", + "I": "ILE", + "K": "LYS", + "L": "LEU", + "M": "MET", + "N": "ASN", + "P": "PRO", + "Q": "GLN", + "R": "ARG", + "S": "SER", + "T": "THR", + "V": "VAL", + "W": "TRP", + "Y": "TYR", + "B": "ASX", + "Z": "GLX", + "X": "UNK", + "*": " * ", + } + _dict_3to1 = {} for _key, _value in _dict_1to3.items(): _dict_3to1[_value] = _key _dict_3to1["SEC"] = "C" _dict_3to1["MSE"] = "M" - + def __init__(self, sequence=()): dict_3to1 = ProteinSequence._dict_3to1 - alph = ProteinSequence.alphabet # Convert 3-letter codes to single letter codes, # if list contains 3-letter codes - sequence = [dict_3to1[symbol.upper()] if len(symbol) == 3 - else symbol.upper() for symbol in sequence] + sequence = [ + dict_3to1[symbol.upper()] if len(symbol) == 3 else symbol.upper() + for symbol in sequence + ] super().__init__(sequence) def __repr__(self): @@ -478,11 +515,11 @@ def __repr__(self): def get_alphabet(self): return ProteinSequence.alphabet - + def remove_stops(self): """ Remove *stop signals* from the sequence. - + Returns ------- no_stop : ProteinSequence @@ -493,34 +530,34 @@ def remove_stops(self): seq_code = no_stop.code no_stop.code = seq_code[seq_code != stop_code] return no_stop - + @staticmethod def convert_letter_3to1(symbol): """ Convert a 3-letter to a 1-letter amino acid representation. - + Parameters ---------- symbol : string 3-letter amino acid representation. - + Returns ------- convert : string 1-letter amino acid representation. """ return ProteinSequence._dict_3to1[symbol.upper()] - + @staticmethod def convert_letter_1to3(symbol): """ Convert a 1-letter to a 3-letter amino acid representation. - + Parameters ---------- symbol : string 1-letter amino acid representation. - + Returns ------- convert : string @@ -531,7 +568,7 @@ def convert_letter_1to3(symbol): def get_molecular_weight(self, monoisotopic=False): """ Calculate the molecular weight of this protein. - + Average protein molecular weight is calculated by the addition of average isotopic masses of the amino acids in the protein and the average isotopic mass of one water @@ -550,7 +587,6 @@ def get_molecular_weight(self, monoisotopic=False): if np.isnan(weight): raise ValueError( - "Sequence contains ambiguous amino acids, " - "cannot calculate weight" + "Sequence contains ambiguous amino acids, " "cannot calculate weight" ) return weight diff --git a/src/biotite/sequence/sequence.py b/src/biotite/sequence/sequence.py index f9a69dfb0..4040fcc0e 100644 --- a/src/biotite/sequence/sequence.py +++ b/src/biotite/sequence/sequence.py @@ -10,16 +10,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Sequence"] -import numbers import abc +import numbers import numpy as np -from .alphabet import Alphabet, LetterAlphabet -from ..copyable import Copyable - +from biotite.copyable import Copyable +from biotite.sequence.alphabet import LetterAlphabet -_size_uint8 = np.iinfo(np.uint8 ).max +1 -_size_uint16 = np.iinfo(np.uint16).max +1 -_size_uint32 = np.iinfo(np.uint32).max +1 +_size_uint8 = np.iinfo(np.uint8).max + 1 +_size_uint16 = np.iinfo(np.uint16).max + 1 +_size_uint32 = np.iinfo(np.uint32).max + 1 class Sequence(Copyable, metaclass=abc.ABCMeta): @@ -277,12 +276,10 @@ def get_symbol_frequency(self): corresponding number of occurences in the sequence as values. """ - counts = np.bincount( - self._seq_code, minlength=len(self.get_alphabet()) - ) + counts = np.bincount(self._seq_code, minlength=len(self.get_alphabet())) return { - symbol: count for symbol, count - in zip(self.get_alphabet().get_symbols(), counts) + symbol: count + for symbol, count in zip(self.get_alphabet().get_symbols(), counts) } def __getitem__(self, index): @@ -329,12 +326,13 @@ def __eq__(self, item): def __str__(self): alph = self.get_alphabet() if isinstance(alph, LetterAlphabet): - return alph.decode_multiple(self._seq_code, as_bytes=True)\ - .tobytes().decode("ASCII") - else: - return ", ".join( - [str(e) for e in alph.decode_multiple(self._seq_code)] + return ( + alph.decode_multiple(self._seq_code, as_bytes=True) + .tobytes() + .decode("ASCII") ) + else: + return ", ".join([str(e) for e in alph.decode_multiple(self._seq_code)]) def __add__(self, sequence): if self.get_alphabet().extends(sequence.get_alphabet()): diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py index 0685b0e61..df9776324 100644 --- a/src/biotite/structure/__init__.py +++ b/src/biotite/structure/__init__.py @@ -104,9 +104,11 @@ __author__ = "Patrick Kunzmann" from .atoms import * +from .basepairs import * from .bonds import * from .box import * from .celllist import * +from .chains import * from .charges import * from .compare import * from .density import * @@ -122,11 +124,9 @@ from .rdf import * from .repair import * from .residues import * -from .chains import * from .sasa import * from .sequence import * from .sse import * from .superimpose import * from .transform import * -from .basepairs import * -# util and resutil are used internally +# util and segments are used internally diff --git a/src/biotite/structure/atoms.py b/src/biotite/structure/atoms.py index 47f97de7d..ad6228807 100644 --- a/src/biotite/structure/atoms.py +++ b/src/biotite/structure/atoms.py @@ -4,19 +4,27 @@ """ This module contains the main types of the ``structure`` subpackage: -:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`. +:class:`Atom`, :class:`AtomArray` and :class:`AtomArrayStack`. """ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["Atom", "AtomArray", "AtomArrayStack", - "array", "stack", "repeat", "from_template", "coord"] +__all__ = [ + "Atom", + "AtomArray", + "AtomArrayStack", + "array", + "stack", + "repeat", + "from_template", + "coord", +] -import numbers import abc +import numbers import numpy as np -from .bonds import BondList -from ..copyable import Copyable +from biotite.copyable import Copyable +from biotite.structure.bonds import BondList class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta): @@ -26,7 +34,7 @@ class _AtomArrayBase(Copyable, metaclass=abc.ABCMeta): It implements functionality for annotation arrays and also rudimentarily for coordinates. """ - + def __init__(self, length): """ Create the annotation arrays @@ -43,14 +51,14 @@ def __init__(self, length): self.add_annotation("hetero", dtype=bool) self.add_annotation("atom_name", dtype="U6") self.add_annotation("element", dtype="U2") - + def array_length(self): """ Get the length of the atom array. - + This value is equivalent to the length of each annotation array. For :class:`AtomArray` it is the same as ``len(array)``. - + Returns ------- length : int @@ -71,15 +79,15 @@ def shape(self): shape : tuple of int Shape of the object. """ - return - + return + def add_annotation(self, category, dtype): """ Add an annotation category, if not already existing. - + Initially the new annotation is filled with the *zero* representation of the given type. - + Parameters ---------- category : str @@ -87,19 +95,18 @@ def add_annotation(self, category, dtype): dtype : type or str A type instance or a valid *NumPy* *dtype* string. Defines the type of the annotation - + See Also -------- set_annotation """ if category not in self._annot: - self._annot[str(category)] = np.zeros(self._array_length, - dtype=dtype) - + self._annot[str(category)] = np.zeros(self._array_length, dtype=dtype) + def del_annotation(self, category): """ Removes an annotation category. - + Parameters ---------- category : str @@ -107,32 +114,30 @@ def del_annotation(self, category): """ if category in self._annot: del self._annot[str(category)] - + def get_annotation(self, category): """ Return an annotation array. - + Parameters ---------- category : str The annotation category to be returned. - + Returns ------- array : ndarray The annotation array. """ if category not in self._annot: - raise ValueError( - f"Annotation category '{category}' is not existing" - ) + raise ValueError(f"Annotation category '{category}' is not existing") return self._annot[category] - + def set_annotation(self, category, array): """ Set an annotation array. If the annotation category does not exist yet, the category is created. - + Parameters ---------- category : str @@ -143,28 +148,25 @@ def set_annotation(self, category, array): """ if len(array) != self._array_length: raise IndexError( - f"Expected array length {self._array_length}, " - f"but got {len(array)}" + f"Expected array length {self._array_length}, " f"but got {len(array)}" ) if category in self._annot: # Keep the dtype if the annotation already exists - self._annot[category] = np.asarray( - array, dtype=self._annot[category].dtype - ) + self._annot[category] = np.asarray(array, dtype=self._annot[category].dtype) else: self._annot[category] = np.asarray(array) - + def get_annotation_categories(self): """ Return a list containing all annotation array categories. - + Returns ------- categories : list The list containing the names of each annotation array. """ return list(self._annot.keys()) - + def _subarray(self, index): # Index is one dimensional (boolean mask, index array) new_coord = self._coord[..., index, :] @@ -180,10 +182,9 @@ def _subarray(self, index): if self._box is not None: new_object._box = self._box for annotation in self._annot: - new_object._annot[annotation] = (self._annot[annotation] - .__getitem__(index)) + new_object._annot[annotation] = self._annot[annotation].__getitem__(index) return new_object - + def _set_element(self, index, atom): try: if isinstance(index, (numbers.Integral, np.ndarray)): @@ -191,12 +192,10 @@ def _set_element(self, index, atom): self._annot[name][index] = atom._annot[name] self._coord[..., index, :] = atom.coord else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") except KeyError: raise KeyError("The annotations of the 'Atom' are incompatible") - + def _del_element(self, index): if isinstance(index, numbers.Integral): for name in self._annot: @@ -208,20 +207,18 @@ def _del_element(self, index): mask[index] = False self._bonds = self._bonds[mask] else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def equal_annotations(self, item): """ Check, if this object shares equal annotation arrays with the given :class:`AtomArray` or :class:`AtomArrayStack`. - + Parameters ---------- item : AtomArray or AtomArrayStack The object to compare the annotation arrays with. - + Returns ------- equality : bool @@ -235,24 +232,24 @@ def equal_annotations(self, item): if not np.array_equal(self._annot[name], item._annot[name]): return False return True - + def equal_annotation_categories(self, item): """ Check, if this object shares equal annotation array categories with the given :class:`AtomArray` or :class:`AtomArrayStack`. - + Parameters ---------- item : AtomArray or AtomArrayStack The object to compare the annotation arrays with. - + Returns ------- equality : bool True, if the annotation array names are equal. """ return sorted(self._annot.keys()) == sorted(item._annot.keys()) - + def __getattr__(self, attr): """ If the attribute is an annotation, the annotation is returned @@ -273,7 +270,7 @@ def __getattr__(self, attr): raise AttributeError( f"'{type(self).__name__}' object has no attribute '{attr}'" ) - + def __setattr__(self, attr, value): """ If the attribute is an annotation, the :attr:`value` is saved @@ -287,15 +284,13 @@ def __setattr__(self, attr, value): if isinstance(self, AtomArray): if value.ndim != 2: raise ValueError( - "A 2-dimensional ndarray is expected " - "for an AtomArray" - ) + "A 2-dimensional ndarray is expected " "for an AtomArray" + ) elif isinstance(self, AtomArrayStack): if value.ndim != 3: raise ValueError( - "A 3-dimensional ndarray is expected " - "for an AtomArrayStack" - ) + "A 3-dimensional ndarray is expected " "for an AtomArrayStack" + ) if value.shape[-2] != self._array_length: raise ValueError( f"Expected array length {self._array_length}, " @@ -304,7 +299,7 @@ def __setattr__(self, attr, value): if value.shape[-1] != 3: raise TypeError("Expected 3 coordinates for each atom") super().__setattr__("_coord", value.astype(np.float32, copy=False)) - + elif attr == "bonds": if isinstance(value, BondList): if value.get_atom_count() != self._array_length: @@ -318,22 +313,21 @@ def __setattr__(self, attr, value): super().__setattr__("_bonds", None) else: raise TypeError("Value must be 'BondList'") - + elif attr == "box": if isinstance(value, np.ndarray): if isinstance(self, AtomArray): if value.ndim != 2: raise ValueError( - "A 2-dimensional ndarray is expected " - "for an AtomArray" - ) - else: # AtomArrayStack + "A 2-dimensional ndarray is expected " "for an AtomArray" + ) + else: # AtomArrayStack if value.ndim != 3: raise ValueError( "A 3-dimensional ndarray is expected " "for an AtomArrayStack" - ) - if value.shape[-2:] != (3,3): + ) + if value.shape[-2:] != (3, 3): raise TypeError("Box must be a 3x3 matrix (three vectors)") box = value.astype(np.float32, copy=False) super().__setattr__("_box", box) @@ -342,14 +336,14 @@ def __setattr__(self, attr, value): super().__setattr__("_box", None) else: raise TypeError("Box must be ndarray of floats or None") - + elif attr == "_annot": super().__setattr__(attr, value) elif attr in self._annot: self.set_annotation(attr, value) else: super().__setattr__(attr, value) - + def __dir__(self): attr = super().__dir__() attr.append("coord") @@ -358,7 +352,7 @@ def __dir__(self): for name in self._annot.keys(): attr.append(name) return attr - + def __eq__(self, item): """ See Also @@ -376,30 +370,31 @@ def __eq__(self, item): if not np.array_equal(self._box, item._box): return False return np.array_equal(self._coord, item._coord) - + def __len__(self): """ The length of the annotation arrays. - + Returns ------- length : int Length of the annotation arrays. """ return self._array_length - + def __add__(self, array): - if type(self) != type(array): + if not isinstance(self, type(array)): raise TypeError("Can only concatenate two arrays or two stacks") # Create either new array or stack, depending of the own type if isinstance(self, AtomArray): - concat = AtomArray(length = self._array_length+array._array_length) + concat = AtomArray(length=self._array_length + array._array_length) if isinstance(self, AtomArrayStack): - concat = AtomArrayStack(self.stack_depth(), - self._array_length + array._array_length) - + concat = AtomArrayStack( + self.stack_depth(), self._array_length + array._array_length + ) + concat._coord = np.concatenate((self._coord, array.coord), axis=-2) - + # Transfer only annotations, # which are existent in both operands arr_categories = list(array._annot.keys()) @@ -407,29 +402,29 @@ def __add__(self, array): if category in arr_categories: annot = self._annot[category] arr_annot = array._annot[category] - concat._annot[category] = np.concatenate((annot,arr_annot)) - + concat._annot[category] = np.concatenate((annot, arr_annot)) + # Concatenate bonds lists, # if at least one of them contains bond information if self._bonds is not None or array._bonds is not None: bonds1 = self._bonds bonds2 = array._bonds if bonds1 is None: - bonds1 = BondList(self._array_length) + bonds1 = BondList(self._array_length) if bonds2 is None: bonds2 = BondList(array._array_length) concat._bonds = bonds1 + bonds2 - + # Copy box if self._box is not None: concat._box = np.copy(self._box) return concat - + def __copy_fill__(self, clone): super().__copy_fill__(clone) self._copy_annotations(clone) clone._coord = np.copy(self._coord) - + def _copy_annotations(self, clone): for name in self._annot: clone._annot[name] = np.copy(self._annot[name]) @@ -437,23 +432,23 @@ def _copy_annotations(self, clone): clone._box = np.copy(self._box) if self._bonds is not None: clone._bonds = self._bonds.copy() - + class Atom(Copyable): """ A representation of a single atom. - + The coordinates an annotations can be accessed directly. A detailed description of each annotation category can be viewed :doc:`here `. - + Parameters ---------- coord: list or ndarray The x, y and z coordinates. kwargs Atom annotations as key value pair. - + Attributes ---------- {annot} : scalar @@ -463,19 +458,19 @@ class Atom(Copyable): shape : tuple of int Shape of the object. In case of an :class:`Atom`, the tuple is empty. - + Examples -------- - + >>> atom = Atom([1,2,3], chain_id="A") >>> atom.atom_name = "CA" >>> print(atom.atom_name) CA >>> print(atom.coord) [1. 2. 3.] - + """ - + def __init__(self, coord, **kwargs): self._annot = {} self._annot["chain_id"] = "" @@ -500,17 +495,17 @@ def __repr__(self): """Represent Atom as a string for debugging.""" # print out key-value pairs and format strings in quotation marks annot_parts = [ - f'{key}="{value}"' if isinstance(value, str) else f'{key}={value}' + f'{key}="{value}"' if isinstance(value, str) else f"{key}={value}" for key, value in self._annot.items() ] - annot = ', '.join(annot_parts) - return f'Atom(np.{np.array_repr(self.coord)}, {annot})' + annot = ", ".join(annot_parts) + return f"Atom(np.{np.array_repr(self.coord)}, {annot})" @property def shape(self): return () - + def __getattr__(self, attr): if attr in super().__getattribute__("_annot"): return self._annot[attr] @@ -518,7 +513,7 @@ def __getattr__(self, attr): raise AttributeError( f"'{type(self).__name__}' object has no attribute '{attr}'" ) - + def __setattr__(self, attr, value): if attr == "_annot": super().__setattr__(attr, value) @@ -526,16 +521,18 @@ def __setattr__(self, attr, value): super().__setattr__(attr, value) else: self._annot[attr] = value - + def __str__(self): hetero = "HET" if self.hetero else "" - return f"{hetero:3} {self.chain_id:3} " \ - f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} " \ - f"{self.atom_name:6} {self.element:2} " \ - f"{self.coord[0]:8.3f} " \ - f"{self.coord[1]:8.3f} " \ - f"{self.coord[2]:8.3f}" - + return ( + f"{hetero:3} {self.chain_id:3} " + f"{self.res_id:5d}{self.ins_code:1} {self.res_name:3} " + f"{self.atom_name:6} {self.element:2} " + f"{self.coord[0]:8.3f} " + f"{self.coord[1]:8.3f} " + f"{self.coord[2]:8.3f}" + ) + def __eq__(self, item): if not isinstance(item, Atom): return False @@ -547,18 +544,18 @@ def __eq__(self, item): if self._annot[name] != item._annot[name]: return False return True - + def __ne__(self, item): return not self == item - + def __copy_create__(self): return Atom(self.coord, **self._annot) - + class AtomArray(_AtomArrayBase): """ An array representation of a model consisting of multiple atoms. - + An :class:`AtomArray` can be seen as a list of :class:`Atom` instances. Instead of using directly a list, this class uses an *NumPy* @@ -573,14 +570,14 @@ class AtomArray(_AtomArrayBase): or :func:`set_annotation()`. A detailed description of each annotation category can be viewed :doc:`here `. - + In order to get an an subarray of an :class:`AtomArray`, *NumPy* style indexing is used. This includes slices, boolean arrays, index arrays and even *Ellipsis* notation. Using a single integer as index returns a single :class:`Atom` instance. - + Inserting or appending an :class:`AtomArray` to another :class:`AtomArray` is done with the '+' operator. Only the annotation categories, which are existing in both arrays, @@ -611,7 +608,7 @@ class AtomArray(_AtomArrayBase): ---------- length : int The fixed amount of atoms in the array. - + Attributes ---------- {annot} : ndarray @@ -629,44 +626,44 @@ class AtomArray(_AtomArrayBase): Shape of the atom array. The single value in the tuple is the length of the atom array. - + Examples -------- Creating an atom array from atoms: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") >>> atom_array = array([atom1, atom2, atom3]) >>> print(atom_array.array_length()) 3 - + Accessing an annotation array: - + >>> print(atom_array.chain_id) ['A' 'A' 'B'] - + Accessing the coordinates: - + >>> print(atom_array.coord) [[1. 2. 3.] [2. 3. 4.] [3. 4. 5.]] - + *NumPy* style filtering: - + >>> atom_array = atom_array[atom_array.chain_id == "A"] >>> print(atom_array.array_length()) 2 - + Inserting an atom array: - + >>> insert = array([Atom([7,8,9], chain_id="C")]) >>> atom_array = atom_array[0:1] + insert + atom_array[1:2] >>> print(atom_array.chain_id) ['A' 'C' 'A'] """ - + def __init__(self, length): super().__init__(length) if length is None: @@ -676,13 +673,13 @@ def __init__(self, length): def __repr__(self): """Represent AtomArray as a string for debugging.""" - atoms = '' + atoms = "" for i in range(0, self.array_length()): if len(atoms) == 0: - atoms = '\n\t' + self.get_atom(i).__repr__() + atoms = "\n\t" + self.get_atom(i).__repr__() else: - atoms = atoms + ',\n\t' + self.get_atom(i).__repr__() - return f'array([{atoms}\n])' + atoms = atoms + ",\n\t" + self.get_atom(i).__repr__() + return f"array([{atoms}\n])" @property def shape(self): @@ -703,33 +700,33 @@ def shape(self): -------- array_length """ - return self.array_length(), + return (self.array_length(),) def get_atom(self, index): """ Obtain the atom instance of the array at the specified index. - + The same as ``array[index]``, if `index` is an integer. - + Parameters ---------- index : int Index of the atom. - + Returns ------- atom : Atom - Atom at position `index`. + Atom at position `index`. """ kwargs = {} for name, annotation in self._annot.items(): kwargs[name] = annotation[index] - return Atom(coord = self._coord[index], kwargs=kwargs) - + return Atom(coord=self._coord[index], kwargs=kwargs) + def __iter__(self): """ Iterate through the array. - + Yields ------ atom : Atom @@ -738,16 +735,16 @@ def __iter__(self): while i < len(self): yield self.get_atom(i) i += 1 - + def __getitem__(self, index): """ Obtain a subarray or the atom instance at the specified index. - + Parameters ---------- index : object All index types *NumPy* accepts, are valid. - + Returns ------- sub_array : Atom or AtomArray @@ -763,16 +760,14 @@ def __getitem__(self, index): # If first index is "...", just ignore the first index return self.__getitem__(index[1]) else: - raise IndexError( - "'AtomArray' does not accept multidimensional indices" - ) + raise IndexError("'AtomArray' does not accept multidimensional indices") else: return self._subarray(index) - + def __setitem__(self, index, atom): """ Set the atom at the specified array position. - + Parameters ---------- index : int @@ -781,38 +776,38 @@ def __setitem__(self, index, atom): The atom to be set. """ self._set_element(index, atom) - + def __delitem__(self, index): """ Deletes the atom at the specified array position. - + Parameters ---------- index : int The position where the atom should be deleted. """ self._del_element(index) - + def __len__(self): """ The length of the array. - + Returns ------- length : int Length of the array. """ return self.array_length() - + def __eq__(self, item): """ Check if the array equals another :class:`AtomArray`. - + Parameters ---------- item : object Object to campare the array with. - + Returns ------- equal : bool @@ -824,15 +819,15 @@ def __eq__(self, item): if not isinstance(item, AtomArray): return False return True - + def __str__(self): """ Get a string representation of the array. - + Each line contains the attributes of one atom. """ return "\n".join([str(atom) for atom in self]) - + def __copy_create__(self): return AtomArray(self.array_length()) @@ -841,7 +836,7 @@ class AtomArrayStack(_AtomArrayBase): """ A collection of multiple :class:`AtomArray` instances, where each atom array has equal annotation arrays. - + Effectively, this means that each atom is occuring in every array in the stack at differing coordinates. This situation arises e.g. in NMR-elucidated or simulated structures. Since the annotations are @@ -849,7 +844,7 @@ class AtomArrayStack(_AtomArrayBase): coordinate array is 3-D (m x n x 3). A detailed description of each annotation category can be viewed :doc:`here `. - + Indexing works similar to :class:`AtomArray`, with the difference, that two index dimensions are possible: The first index dimension specifies the array(s), the second index @@ -857,24 +852,24 @@ class AtomArrayStack(_AtomArrayBase): in :class:`AtomArray`). Using a single integer as first dimension index returns a single :class:`AtomArray` instance. - + Concatenation of atoms for each array in the stack is done using the '+' operator. For addition of atom arrays onto the stack use the :func:`stack()` method. The :attr:`box` attribute has the shape *m x 3 x 3*, as the cell might be different for each frame in the atom array stack. - + Parameters ---------- depth : int The fixed amount of arrays in the stack. When indexing, this is the length of the first dimension. - + length : int The fixed amount of atoms in each array in the stack. When indexing, this is the length of the second dimension. - + Attributes ---------- {annot} : ndarray, shape=(n,) @@ -892,15 +887,15 @@ class AtomArrayStack(_AtomArrayBase): Shape of the stack. The numbers correspond to the stack depth and array length, respectively. - + See also -------- AtomArray - + Examples -------- Creating an atom array stack from two arrays: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -925,40 +920,40 @@ class AtomArrayStack(_AtomArrayBase): [5. 6. 7.] [6. 7. 8.]]] """ - + def __init__(self, depth, length): super().__init__(length) - if depth == None or length == None: + if depth is None or length is None: self._coord = None else: self._coord = np.full((depth, length, 3), np.nan, dtype=np.float32) def __repr__(self): """Represent AtomArrayStack as a string for debugging.""" - arrays = '' + arrays = "" for i in range(0, self.stack_depth()): if len(arrays) == 0: - arrays = '\n\t' + self.get_array(i).__repr__() + arrays = "\n\t" + self.get_array(i).__repr__() else: - arrays = arrays + ',\n\t' + self.get_array(i).__repr__() - return f'stack([{arrays}\n])' + arrays = arrays + ",\n\t" + self.get_array(i).__repr__() + return f"stack([{arrays}\n])" def get_array(self, index): """ Obtain the atom array instance of the stack at the specified index. - + The same as ``stack[index]``, if `index` is an integer. - + Parameters ---------- index : int Index of the atom array. - + Returns ------- array : AtomArray - AtomArray at position `index`. + AtomArray at position `index`. """ array = AtomArray(self.array_length()) for name in self._annot: @@ -970,14 +965,14 @@ def get_array(self, index): array._box = self._box[index] return array - + def stack_depth(self): """ Get the depth of the stack. - + This value represents the amount of atom arrays in the stack. It is the same as ``len(array)``. - + Returns ------- length : int @@ -1005,7 +1000,7 @@ def shape(self): def __iter__(self): """ Iterate through the array. - + Yields ------ array : AtomArray @@ -1014,17 +1009,17 @@ def __iter__(self): while i < len(self): yield self.get_array(i) i += 1 - + def __getitem__(self, index): """ Obtain the atom array instance or an substack at the specified index. - + Parameters ---------- index : object All index types *NumPy* accepts are valid. - + Returns ------- sub_array : AtomArray or AtomArrayStack @@ -1033,7 +1028,7 @@ def __getitem__(self, index): Otherwise an :class:`AtomArrayStack` with reduced depth and length is returned. In case the index is a tuple(int, int) an :class:`Atom` - instance is returned. + instance is returned. """ if isinstance(index, numbers.Integral): return self.get_array(index) @@ -1050,7 +1045,7 @@ def __getitem__(self, index): if isinstance(index[1], numbers.Integral): # Prevent reduction in dimensionality # in second dimension - new_stack = self._subarray(slice(index[1], index[1]+1)) + new_stack = self._subarray(slice(index[1], index[1] + 1)) else: new_stack = self._subarray(index[1]) if index[0] is not Ellipsis: @@ -1065,14 +1060,13 @@ def __getitem__(self, index): if self._box is not None: new_stack._box = self._box[index] return new_stack - - + def __setitem__(self, index, array): """ Set the atom array at the specified stack position. - + The array and the stack must have equal annotation arrays. - + Parameters ---------- index : int @@ -1081,26 +1075,20 @@ def __setitem__(self, index, array): The atom array to be set. """ if not self.equal_annotations(array): - raise ValueError( - "The stack and the array have unequal annotations" - ) + raise ValueError("The stack and the array have unequal annotations") if self.bonds != array.bonds: - raise ValueError( - "The stack and the array have unequal bonds" - ) + raise ValueError("The stack and the array have unequal bonds") if isinstance(index, numbers.Integral): self.coord[index] = array.coord if self.box is not None: self.box[index] = array.box else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def __delitem__(self, index): """ Deletes the atom array at the specified stack position. - + Parameters ---------- index : int @@ -1109,14 +1097,12 @@ def __delitem__(self, index): if isinstance(index, numbers.Integral): self._coord = np.delete(self._coord, index, axis=0) else: - raise TypeError( - f"Index must be integer, not '{type(index).__name__}'" - ) - + raise TypeError(f"Index must be integer, not '{type(index).__name__}'") + def __len__(self): """ The depth of the stack, i.e. the amount of models. - + Returns ------- depth : int @@ -1124,16 +1110,16 @@ def __len__(self): """ # length is determined by length of coord attribute return self._coord.shape[0] - + def __eq__(self, item): """ Check if the array equals another :class:`AtomArray` - + Parameters ---------- item : object Object to campare the array with. - + Returns ------- equal : bool @@ -1145,20 +1131,20 @@ def __eq__(self, item): if not isinstance(item, AtomArrayStack): return False return True - + def __str__(self): """ Get a string representation of the stack. - + :class:`AtomArray` strings eparated by blank lines and a line indicating the index. """ string = "" for i, array in enumerate(self): - string += "Model " + str(i+1) + "\n" + string += "Model " + str(i + 1) + "\n" string += str(array) + "\n" + "\n" return string - + def __copy_create__(self): return AtomArrayStack(self.stack_depth(), self.array_length()) @@ -1166,23 +1152,23 @@ def __copy_create__(self): def array(atoms): """ Create an :class:`AtomArray` from a list of :class:`Atom`. - + Parameters ---------- atoms : iterable object of Atom The atoms to be combined in an array. All atoms must share the same annotation categories. - + Returns ------- array : AtomArray The listed atoms as array. - + Examples -------- - + Creating an atom array from atoms: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -1204,7 +1190,7 @@ def array(atoms): array = AtomArray(len(atoms)) # Add all (also optional) annotation categories for name in names: - array.add_annotation(name, dtype=type(atoms[0]._annot[name])) + array.add_annotation(name, dtype=type(atoms[0]._annot[name])) # Add all atoms to AtomArray for i in range(len(atoms)): for name in names: @@ -1216,23 +1202,23 @@ def array(atoms): def stack(arrays): """ Create an :class:`AtomArrayStack` from a list of :class:`AtomArray`. - + Parameters ---------- arrays : iterable object of AtomArray The atom arrays to be combined in a stack. All atom arrays must have an equal number of atoms and equal annotation arrays. - + Returns ------- stack : AtomArrayStack The stacked atom arrays. - + Examples -------- Creating an atom array stack from two arrays: - + >>> atom1 = Atom([1,2,3], chain_id="A") >>> atom2 = Atom([2,3,4], chain_id="A") >>> atom3 = Atom([3,4,5], chain_id="B") @@ -1272,7 +1258,7 @@ def stack(arrays): array_stack = AtomArrayStack(array_count, ref_array.array_length()) for name, annotation in ref_array._annot.items(): array_stack._annot[name] = annotation - coord_list = [array._coord for array in arrays] + coord_list = [array._coord for array in arrays] array_stack._coord = np.stack(coord_list, axis=0) # Take bond list from first array array_stack._bonds = ref_array._bonds @@ -1296,14 +1282,14 @@ def repeat(atoms, coord): The length of first dimension determines the number of repeats. If `atoms` is an :class:`AtomArray` 3 dimensions, otherwise 4 dimensions are required. - + Returns ------- repeated: AtomArray, shape=(n*k,) or AtomArrayStack, shape=(m,n*k) The repeated atoms. Whether an :class:`AtomArray` or an :class:`AtomArrayStack` is returned depends on the input `atoms`. - + Examples -------- @@ -1336,7 +1322,7 @@ def repeat(atoms, coord): raise ValueError( f"Expected 4 dimensions for the coordinate array, got {coord.ndim}" ) - + repetitions = len(coord) orig_length = atoms.array_length() new_length = orig_length * repetitions @@ -1358,24 +1344,24 @@ def repeat(atoms, coord): ) repeated = AtomArrayStack(atoms.stack_depth(), new_length) repeated.coord = coord.reshape((atoms.stack_depth(), new_length, 3)) - + else: raise TypeError( f"Expected 'AtomArray' or 'AtomArrayStack', " f"but got {type(atoms).__name__}" ) - + for category in atoms.get_annotation_categories(): annot = np.tile(atoms.get_annotation(category), repetitions) repeated.set_annotation(category, annot) if atoms.bonds is not None: repeated_bonds = atoms.bonds.copy() - for _ in range(repetitions-1): + for _ in range(repetitions - 1): repeated_bonds += atoms.bonds repeated.bonds = repeated_bonds if atoms.box is not None: repeated.box = atoms.box.copy() - + return repeated @@ -1383,7 +1369,7 @@ def from_template(template, coord, box=None): """ Create an :class:`AtomArrayStack` using template atoms and given coordinates. - + Parameters ---------- template : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) @@ -1393,7 +1379,7 @@ def from_template(template, coord, box=None): The coordinates for each model of the returned stack. box : ndarray, optional, dtype=float, shape=(l,3,3) The box for each model of the returned stack. - + Returns ------- array_stack : AtomArrayStack @@ -1409,7 +1395,7 @@ def from_template(template, coord, box=None): # Create empty stack with no models new_stack = AtomArrayStack(0, template.array_length()) - + for category in template.get_annotation_categories(): annot = template.get_annotation(category) new_stack.set_annotation(category, annot) @@ -1417,30 +1403,30 @@ def from_template(template, coord, box=None): new_stack.bonds = template.bonds.copy() if box is not None: new_stack.box = box.copy() - + # After setting the coordinates the number of models is the number # of models in the new coordinates new_stack.coord = coord - + return new_stack def coord(item): """ Get the atom coordinates of the given array. - + This may be directly and :class:`Atom`, :class:`AtomArray` or :class:`AtomArrayStack` or alternatively an (n x 3) or (m x n x 3) :class:`ndarray` containing the coordinates. - + Parameters ---------- item : Atom or AtomArray or AtomArrayStack or ndarray Returns the :attr:`coord` attribute, if `item` is an :class:`Atom`, :class:`AtomArray` or :class:`AtomArrayStack`. Directly returns the input, if `item` is a :class:`ndarray`. - + Returns ------- coord : ndarray diff --git a/src/biotite/structure/basepairs.py b/src/biotite/structure/basepairs.py index 371477cd0..a1b255fc2 100644 --- a/src/biotite/structure/basepairs.py +++ b/src/biotite/structure/basepairs.py @@ -8,23 +8,33 @@ __name__ = "biotite.structure" __author__ = "Tom David Müller" -__all__ = ["base_pairs", "map_nucleotide", "base_stacking", "base_pairs_edge", - "Edge", "base_pairs_glycosidic_bond", "GlycosidicBond"] +__all__ = [ + "base_pairs", + "map_nucleotide", + "base_stacking", + "base_pairs_edge", + "Edge", + "base_pairs_glycosidic_bond", + "GlycosidicBond", +] -import numpy as np import warnings from enum import IntEnum -from .atoms import Atom, array -from .superimpose import superimpose -from .filter import filter_nucleotides -from .celllist import CellList -from .hbond import hbond -from .error import IncompleteStructureWarning, UnexpectedStructureWarning, \ - BadStructureError -from .util import distance, norm_vector -from .residues import get_residue_starts_for, get_residue_masks -from .info.standardize import standardize_order -from .compare import rmsd +import numpy as np +from biotite.structure.atoms import Atom, array +from biotite.structure.celllist import CellList +from biotite.structure.compare import rmsd +from biotite.structure.error import ( + BadStructureError, + IncompleteStructureWarning, + UnexpectedStructureWarning, +) +from biotite.structure.filter import filter_nucleotides +from biotite.structure.hbond import hbond +from biotite.structure.info.standardize import standardize_order +from biotite.structure.residues import get_residue_masks, get_residue_starts_for +from biotite.structure.superimpose import superimpose +from biotite.structure.util import distance, norm_vector def _get_std_adenine(): @@ -43,31 +53,29 @@ def _get_std_adenine(): ring center, :class:`ndarray` containing the coordinates of the imidazole ring center """ - atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A") - atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A") - atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A") - atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A") - atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A") - atom6 = Atom([1.611, 0.909, 0.000], atom_name="N6", res_name="A") - atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A") - atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A") - atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A") - atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A") + atom1 = Atom([-1.291, 4.498, 0.000], atom_name="N9", res_name="A") + atom2 = Atom([0.024, 4.897, 0.000], atom_name="C8", res_name="A") + atom3 = Atom([0.877, 3.902, 0.000], atom_name="N7", res_name="A") + atom4 = Atom([0.071, 2.771, 0.000], atom_name="C5", res_name="A") + atom5 = Atom([0.369, 1.398, 0.000], atom_name="C6", res_name="A") + atom6 = Atom([1.611, 0.909, 0.000], atom_name="N6", res_name="A") + atom7 = Atom([-0.668, 0.532, 0.000], atom_name="N1", res_name="A") + atom8 = Atom([-1.912, 1.023, 0.000], atom_name="C2", res_name="A") + atom9 = Atom([-2.320, 2.290, 0.000], atom_name="N3", res_name="A") + atom10 = Atom([-1.267, 3.124, 0.000], atom_name="C4", res_name="A") adenine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, - atom9, atom10] + [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10] ) # Get the midpoint between the N1 and C4 atoms midpoint = np.mean([atom7.coord, atom10.coord], axis=-2) # Calculate the coordinates of the aromatic ring centers pyrimidine_center = np.mean( - [atom4.coord, atom5.coord, atom7.coord, - atom8.coord, atom9.coord, atom10.coord], axis=-2 + [atom4.coord, atom5.coord, atom7.coord, atom8.coord, atom9.coord, atom10.coord], + axis=-2, ) imidazole_center = np.mean( - [atom1.coord, atom2.coord, atom3.coord, - atom4.coord, atom10.coord], axis=-2 + [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom10.coord], axis=-2 ) return adenine, (midpoint, pyrimidine_center, imidazole_center) @@ -75,37 +83,35 @@ def _get_std_adenine(): def _get_std_cytosine(): """ - Get standard base variables for cytosine. + Get standard base variables for cytosine. - Returns - ------- - standard_base : AtomArray - Standard coordinates nomenclature of the cytosine base as - :class:`AtomArray` with nomenclature of PDB File Format V3 - coordinates : tuple (ndarray, ndarray, dtype=float) - :class:`ndarray` containing the center according to the SCHNaP- - paper referenced in the function ``base_pairs``, - :class:`ndarray` containing the coordinates of the pyrimidine - ring center + Returns + ------- + standard_base : AtomArray + Standard coordinates nomenclature of the cytosine base as + :class:`AtomArray` with nomenclature of PDB File Format V3 + coordinates : tuple (ndarray, ndarray, dtype=float) + :class:`ndarray` containing the center according to the SCHNaP- + paper referenced in the function ``base_pairs``, + :class:`ndarray` containing the coordinates of the pyrimidine + ring center """ - atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C") - atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C") - atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C") - atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C") - atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C") - atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C") - atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C") - atom8 = Atom([-0.023, 5.068, 0.000], atom_name="C6", res_name="C") - cytosine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8] - ) + atom1 = Atom([-1.285, 4.542, 0.000], atom_name="N1", res_name="C") + atom2 = Atom([-1.472, 3.158, 0.000], atom_name="C2", res_name="C") + atom3 = Atom([-2.628, 2.709, 0.000], atom_name="O2", res_name="C") + atom4 = Atom([-0.391, 2.344, 0.000], atom_name="N3", res_name="C") + atom5 = Atom([0.837, 2.868, 0.000], atom_name="C4", res_name="C") + atom6 = Atom([1.875, 2.027, 0.000], atom_name="N4", res_name="C") + atom7 = Atom([1.056, 4.275, 0.000], atom_name="C5", res_name="C") + atom8 = Atom([-0.023, 5.068, 0.000], atom_name="C6", res_name="C") + cytosine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom8.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom8.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord], + axis=-2, ) return cytosine, (midpoint, pyrimidine_center) @@ -127,32 +133,37 @@ def _get_std_guanine(): ring center, :class:`ndarray` containing the coordinates of the imidazole ring center """ - atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G") - atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G") - atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G") - atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G") - atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G") - atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G") - atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G") - atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G") - atom9 = Atom([-2.949, 0.139, -0.001], atom_name="N2", res_name="G") - atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G") - atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G") + atom1 = Atom([-1.289, 4.551, 0.000], atom_name="N9", res_name="G") + atom2 = Atom([0.023, 4.962, 0.000], atom_name="C8", res_name="G") + atom3 = Atom([0.870, 3.969, 0.000], atom_name="N7", res_name="G") + atom4 = Atom([0.071, 2.833, 0.000], atom_name="C5", res_name="G") + atom5 = Atom([0.424, 1.460, 0.000], atom_name="C6", res_name="G") + atom6 = Atom([1.554, 0.955, 0.000], atom_name="O6", res_name="G") + atom7 = Atom([-0.700, 0.641, 0.000], atom_name="N1", res_name="G") + atom8 = Atom([-1.999, 1.087, 0.000], atom_name="C2", res_name="G") + atom9 = Atom([-2.949, 0.139, -0.001], atom_name="N2", res_name="G") + atom10 = Atom([-2.342, 2.364, 0.001], atom_name="N3", res_name="G") + atom11 = Atom([-1.265, 3.177, 0.000], atom_name="C4", res_name="G") guanine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, - atom9, atom10, atom11] + [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9, atom10, atom11] ) # Get the midpoint between the N1 and C4 atoms midpoint = np.mean([atom7.coord, atom11.coord], axis=-2) # Calculate the coordinates of the aromatic ring centers pyrimidine_center = np.mean( - [atom4.coord, atom5.coord, atom7.coord, - atom8.coord, atom10.coord, atom11.coord], axis=-2 + [ + atom4.coord, + atom5.coord, + atom7.coord, + atom8.coord, + atom10.coord, + atom11.coord, + ], + axis=-2, ) imidazole_center = np.mean( - [atom1.coord, atom2.coord, atom3.coord, - atom4.coord, atom11.coord], axis=-2 + [atom1.coord, atom2.coord, atom3.coord, atom4.coord, atom11.coord], axis=-2 ) return guanine, (midpoint, pyrimidine_center, imidazole_center) @@ -173,25 +184,23 @@ def _get_std_thymine(): :class:`ndarray` containing the coordinates of the pyrimidine ring center """ - atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T") - atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T") - atom3 = Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T") - atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T") - atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T") - atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T") - atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T") - atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T") - atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T") - thymine = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9] - ) + atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="T") + atom2 = Atom([-1.462, 3.135, 0.000], atom_name="C2", res_name="T") + atom3 = Atom([-2.562, 2.608, 0.000], atom_name="O2", res_name="T") + atom4 = Atom([-0.298, 2.407, 0.000], atom_name="N3", res_name="T") + atom5 = Atom([0.994, 2.897, 0.000], atom_name="C4", res_name="T") + atom6 = Atom([1.944, 2.119, 0.000], atom_name="O4", res_name="T") + atom7 = Atom([1.106, 4.338, 0.000], atom_name="C5", res_name="T") + atom8 = Atom([2.466, 4.961, 0.001], atom_name="C7", res_name="T") + atom9 = Atom([-0.024, 5.057, 0.000], atom_name="C6", res_name="T") + thymine = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8, atom9]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom9.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom9.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom9.coord], + axis=-2, ) return thymine, (midpoint, pyrimidine_center) @@ -212,30 +221,28 @@ def _get_std_uracil(): :class:`ndarray` containing the coordinates of the pyrimidine ring center """ - atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U") - atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U") - atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U") - atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", res_name="U") - atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U") - atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U") - atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U") - atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U") - uracil = array( - [atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8] - ) + atom1 = Atom([-1.284, 4.500, 0.000], atom_name="N1", res_name="U") + atom2 = Atom([-1.462, 3.131, 0.000], atom_name="C2", res_name="U") + atom3 = Atom([-2.563, 2.608, 0.000], atom_name="O2", res_name="U") + atom4 = Atom([-0.302, 2.397, 0.000], atom_name="N3", res_name="U") + atom5 = Atom([0.989, 2.884, 0.000], atom_name="C4", res_name="U") + atom6 = Atom([1.935, 2.094, -0.001], atom_name="O4", res_name="U") + atom7 = Atom([1.089, 4.311, 0.000], atom_name="C5", res_name="U") + atom8 = Atom([-0.024, 5.053, 0.000], atom_name="C6", res_name="U") + uracil = array([atom1, atom2, atom3, atom4, atom5, atom6, atom7, atom8]) # Get the midpoint between the N3 and C6 atoms midpoint = np.mean([atom4.coord, atom8.coord], axis=-2) # Calculate the coordinates of the aromatic ring center pyrimidine_center = np.mean( - [atom1.coord, atom2.coord, atom4.coord, - atom5.coord, atom7.coord, atom8.coord], axis=-2 + [atom1.coord, atom2.coord, atom4.coord, atom5.coord, atom7.coord, atom8.coord], + axis=-2, ) return uracil, (midpoint, pyrimidine_center) -_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine() +_STD_ADENINE, _STD_ADENINE_RING_CENTERS = _get_std_adenine() _STD_CYTOSINE, _STD_CYTOSINE_RING_CENTERS = _get_std_cytosine() _STD_GUANINE, _STD_GUANINE_RING_CENTERS = _get_std_guanine() _STD_THYMINE, _STD_THYMINE_RING_CENTERS = _get_std_thymine() @@ -247,35 +254,35 @@ def _get_std_uracil(): _GUANINE_CONTAINING_NUCLEOTIDES = ["G", "DG"] _URACIL_CONTAINING_NUCLEOTIDES = ["U", "DU"] _REFERENCE_NUCLEOTIDE_NAMES = ( - _ADENINE_CONTAINING_NUCLEOTIDES + - _THYMINE_CONTAINING_NUCLEOTIDES + - _CYTOSINE_CONTAINING_NUCLEOTIDES + - _GUANINE_CONTAINING_NUCLEOTIDES + - _URACIL_CONTAINING_NUCLEOTIDES + _ADENINE_CONTAINING_NUCLEOTIDES + + _THYMINE_CONTAINING_NUCLEOTIDES + + _CYTOSINE_CONTAINING_NUCLEOTIDES + + _GUANINE_CONTAINING_NUCLEOTIDES + + _URACIL_CONTAINING_NUCLEOTIDES ) # Atoms that are part of respective base edges according to the # Leontis-Westhof nomenclature _WATSON_CRICK_EDGE = { - "A" : ["N6", "N1"], - "G" : ["O6", "N1", "N2"], - "U" : ["O4", "N3", "O2"], - "T" : ["O4", "N3", "O2"], - "C" : ["N4", "N3", "O2"] + "A": ["N6", "N1"], + "G": ["O6", "N1", "N2"], + "U": ["O4", "N3", "O2"], + "T": ["O4", "N3", "O2"], + "C": ["N4", "N3", "O2"], } _HOOGSTEEN_EDGE = { - "A" : ["N6", "N7"], - "G" : ["O6", "N7"], - "U" : ["O4"], - "T" : ["O4"], - "C" : ["N4"] + "A": ["N6", "N7"], + "G": ["O6", "N7"], + "U": ["O4"], + "T": ["O4"], + "C": ["N4"], } _SUGAR_EDGE = { - "A" : ["N3", "O2'"], - "G" : ["N2", "N3", "O2'"], - "U" : ["O2", "O2'"], - "T" : ["O2", "O2'"], - "C" : ["O2", "O2'"] + "A": ["N3", "O2'"], + "G": ["N2", "N3", "O2'"], + "U": ["O2", "O2'"], + "T": ["O2", "O2'"], + "C": ["O2", "O2'"], } _EDGES = [_WATSON_CRICK_EDGE, _HOOGSTEEN_EDGE, _SUGAR_EDGE] @@ -284,9 +291,10 @@ class Edge(IntEnum): """ This enum type represents the interacting edge for a given base. """ - INVALID = 0, - WATSON_CRICK = 1, - HOOGSTEEN = 2, + + INVALID = (0,) + WATSON_CRICK = (1,) + HOOGSTEEN = (2,) SUGAR = 3 @@ -295,9 +303,10 @@ class GlycosidicBond(IntEnum): This enum type represents the relative glycosidic bond orientation for a given base pair. """ + INVALID = 0 - CIS = 1, - TRANS = 2, + CIS = (1,) + TRANS = (2,) def base_pairs_edge(atom_array, base_pairs): @@ -370,19 +379,19 @@ def base_pairs_edge(atom_array, base_pairs): The resulting integers can be interpreted as :class:`Edge` ``Enum``: >>> for interaction in interacting_edges: - ... print(Edge(interaction[0]), Edge(interaction[1])) - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK - Edge.WATSON_CRICK Edge.WATSON_CRICK + ... print(f"{Edge(interaction[0]).name} to {Edge(interaction[1]).name}") + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK + WATSON_CRICK to WATSON_CRICK References ---------- @@ -390,7 +399,7 @@ def base_pairs_edge(atom_array, base_pairs): .. footbibliography:: """ # Result-``ndarray`` matches the dimensions of the input array - results = np.zeros_like(base_pairs, dtype='uint8') + results = np.zeros_like(base_pairs, dtype="uint8") # Get the residue masks for each residue base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten()) @@ -441,16 +450,15 @@ def _get_edge_matrix(atom_array, base_masks): ) # filter out donor/acceptor heteroatoms and flatten for easy # iteration - hbonds = hbonds[:, (0,2)].flatten() + hbonds = hbonds[:, (0, 2)].flatten() # ``ndarray`` with one row for each base and the number of # bonded edge heteroatoms as in ``_edge`` as columns - matrix = np.zeros((2, 3), dtype='int32') + matrix = np.zeros((2, 3), dtype="int32") # Iterate through the atoms and corresponding atoms indices # that are part of the hydrogen bonds for atom, atom_index in zip(atom_array[hbonds], hbonds): - if atom.res_name not in _REFERENCE_NUCLEOTIDE_NAMES: continue @@ -460,8 +468,10 @@ def _get_edge_matrix(atom_array, base_masks): for base_index, base_mask in enumerate(base_masks): # If a donor/acceptor atom name matches a name in # the corresponding edge list increase the tally - if (base_mask[atom_index] and - atom.atom_name in edge_type[atom.res_name[-1]]): + if ( + base_mask[atom_index] + and atom.atom_name in edge_type[atom.res_name[-1]] + ): matrix[base_index, edge_type_index] += 1 return matrix @@ -521,26 +531,26 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): ``Enum``: >>> for orientation in orientations: - ... print(GlycosidicBond(orientation)) - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS - GlycosidicBond.CIS + ... print(GlycosidicBond(orientation).name) + CIS + CIS + CIS + CIS + CIS + CIS + CIS + CIS + CIS + CIS + CIS + CIS References ---------- .. footbibliography:: """ - results = np.zeros(len(base_pairs), dtype='uint8') + results = np.zeros(len(base_pairs), dtype="uint8") # Get the residue masks for each residue base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten()) @@ -552,7 +562,6 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): ) for i, pair_masks in enumerate(base_pairs_masks): - # position vectors of each bases geometric center geometric_centers = np.zeros((2, 3)) # direction vectors of the glycosidic bonds @@ -565,23 +574,22 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): # For Purines the glycosidic bond is between the C1' and the # N9 atoms, for pyrimidines it is between the C1' atom and # the N1 atom - if (base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES): - - geometric_centers[base_index] = ( - (ring_center[0] + ring_center[1]) / 2 - ) + if ( + base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES + ): + geometric_centers[base_index] = (ring_center[0] + ring_center[1]) / 2 base_atom = base[base.atom_name == "N9"][0] - elif (base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES or - base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES): - + elif ( + base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES + or base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES + ): geometric_centers[base_index] = ring_center[0] base_atom = base[base.atom_name == "N1"][0] else: - results[i] = GlycosidicBond.INVALID break @@ -596,15 +604,16 @@ def base_pairs_glycosidic_bond(atom_array, base_pairs): geometric_centers_dir = geometric_centers[1] - geometric_centers[0] # Check the orientation of the glycosidic bonds - if np.dot( - np.cross(geometric_centers_dir, glycosidic_bonds[0]), - np.cross(geometric_centers_dir, glycosidic_bonds[1]) - ) < 0: - + if ( + np.dot( + np.cross(geometric_centers_dir, glycosidic_bonds[0]), + np.cross(geometric_centers_dir, glycosidic_bonds[1]), + ) + < 0 + ): results[i] = GlycosidicBond.TRANS else: - results[i] = GlycosidicBond.CIS return results @@ -723,15 +732,18 @@ def base_stacking(atom_array, min_atoms_per_base=3): for i in range(2): base_tuple = _match_base(bases[i], min_atoms_per_base) - if(base_tuple is None): + if base_tuple is None: break transformed_std_vectors[i] = base_tuple - normal_vectors = np.vstack((transformed_std_vectors[0][1], - transformed_std_vectors[1][1])) - aromatic_ring_centers = [transformed_std_vectors[0][3:], - transformed_std_vectors[1][3:]] + normal_vectors = np.vstack( + (transformed_std_vectors[0][1], transformed_std_vectors[1][1]) + ) + aromatic_ring_centers = [ + transformed_std_vectors[0][3:], + transformed_std_vectors[1][3:], + ] # Check if the base pairs are stacked. stacked = _check_base_stacking(aromatic_ring_centers, normal_vectors) @@ -744,7 +756,7 @@ def base_stacking(atom_array, min_atoms_per_base=3): return np.array(stacked_bases) -def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): +def base_pairs(atom_array, min_atoms_per_base=3, unique=True): """ Use DSSR criteria to find the base pairs in an :class:`AtomArray`. @@ -854,11 +866,8 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): nucleotides_boolean = filter_nucleotides(atom_array) # Disregard the phosphate-backbone - non_phosphate_boolean = ( - ~ np.isin( - atom_array.atom_name, - ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"] - ) + non_phosphate_boolean = ~np.isin( + atom_array.atom_name, ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"] ) # Combine the two boolean masks @@ -867,7 +876,6 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): # Get only nucleosides nucleosides = atom_array[boolean_mask] - # Get the base pair candidates according to a N/O cutoff distance, # where each base is identified as the first index of its respective # residue @@ -896,9 +904,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): base1 = nucleosides[base1_mask] base2 = nucleosides[base2_mask] - hbonds = _check_dssr_criteria( - (base1, base2), min_atoms_per_base, unique - ) + hbonds = _check_dssr_criteria((base1, base2), min_atoms_per_base, unique) # If no hydrogens are present use the number N/O pairs to # decide between multiple pairing possibilities. @@ -906,7 +912,7 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): if hbonds is None: # Each N/O-pair is detected twice. Thus, the number of # matches must be divided by two. - hbonds = n_o_pairs/2 + hbonds = n_o_pairs / 2 if hbonds != -1: basepairs.append((base1_index, base2_index)) if unique: @@ -922,20 +928,16 @@ def base_pairs(atom_array, min_atoms_per_base = 3, unique = True): # Get all bases that have non-unique pairing interactions base_indices, occurrences = np.unique(basepairs, return_counts=True) for base_index, occurrence in zip(base_indices, occurrences): - if(occurrence > 1): + if occurrence > 1: # Write the non-unique base pairs to a dictionary as # 'index: number of hydrogen bonds' remove_candidates = {} - for i, row in enumerate( - np.asarray(basepair_array == base_index) - ): - if(np.any(row)): + for i, row in enumerate(np.asarray(basepair_array == base_index)): + if np.any(row): remove_candidates[i] = basepairs_hbonds[i] # Flag all non-unique base pairs for removal except the # one that has the most hydrogen bonds - del remove_candidates[ - max(remove_candidates, key=remove_candidates.get) - ] + del remove_candidates[max(remove_candidates, key=remove_candidates.get)] to_remove += list(remove_candidates.keys()) # Remove all flagged base pairs from the output `ndarray` basepair_array = np.delete(basepair_array, to_remove, axis=0) @@ -984,21 +986,22 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Generate the data necessary for analysis of each base. for i in range(2): - transformed_std_vectors[i] = _match_base( - basepair[i], min_atoms_per_base - ) + transformed_std_vectors[i] = _match_base(basepair[i], min_atoms_per_base) - if(transformed_std_vectors[i] is None): + if transformed_std_vectors[i] is None: return -1 - origins = np.vstack((transformed_std_vectors[0][0], - transformed_std_vectors[1][0])) - normal_vectors = np.vstack((transformed_std_vectors[0][1], - transformed_std_vectors[1][1])) - schnaap_origins = np.vstack((transformed_std_vectors[0][2], - transformed_std_vectors[1][2])) - aromatic_ring_centers = [transformed_std_vectors[0][3:], - transformed_std_vectors[1][3:]] + origins = np.vstack((transformed_std_vectors[0][0], transformed_std_vectors[1][0])) + normal_vectors = np.vstack( + (transformed_std_vectors[0][1], transformed_std_vectors[1][1]) + ) + schnaap_origins = np.vstack( + (transformed_std_vectors[0][2], transformed_std_vectors[1][2]) + ) + aromatic_ring_centers = [ + transformed_std_vectors[0][3:], + transformed_std_vectors[1][3:], + ] # Criterion 1: Distance between orgins <=15 Å if not (distance(origins[0], origins[1]) <= 15): @@ -1009,9 +1012,8 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Average the base normal vectors. If the angle between the vectors # is >=90°, flip one vector before averaging mean_normal_vector = ( - normal_vectors[0] + (normal_vectors[1] * np.sign(np.dot( - normal_vectors[0], normal_vectors[1] - ))) + normal_vectors[0] + + (normal_vectors[1] * np.sign(np.dot(normal_vectors[0], normal_vectors[1]))) ) / 2 norm_vector(mean_normal_vector) # Calculate the distance vector between the two SCHNAaP origins @@ -1024,8 +1026,9 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): return -1 # Criterion 3: Angle between normal vectors <=65° - if not (np.arccos(np.dot(normal_vectors[0], normal_vectors[1])) - >= ((115*np.pi)/180)): + if not ( + np.arccos(np.dot(normal_vectors[0], normal_vectors[1])) >= ((115 * np.pi) / 180) + ): return -1 # Criterion 4: Absence of stacking @@ -1035,8 +1038,7 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): # Criterion 5: Presence of at least one hydrogen bond # # Check if both bases came with hydrogens. - if (("H" in basepair[0].element) - and ("H" in basepair[1].element)): + if ("H" in basepair[0].element) and ("H" in basepair[1].element): # For Structures that contain hydrogens, check for their # presence directly. # @@ -1044,11 +1046,13 @@ def _check_dssr_criteria(basepair, min_atoms_per_base, unique): potential_basepair = basepair[0] + basepair[1] # Get the number of hydrogen bonds - bonds = len(hbond( - potential_basepair, - np.ones_like(potential_basepair, dtype=bool), - np.ones_like(potential_basepair, dtype=bool) - )) + bonds = len( + hbond( + potential_basepair, + np.ones_like(potential_basepair, dtype=bool), + np.ones_like(potential_basepair, dtype=bool), + ) + ) if bonds > 0: return bonds @@ -1085,7 +1089,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors): wrong_distance = True for ring_center1 in aromatic_ring_centers[0]: for ring_center2 in aromatic_ring_centers[1]: - if (distance(ring_center1, ring_center2) <= 4.5): + if distance(ring_center1, ring_center2) <= 4.5: wrong_distance = False normalized_distance_vectors.append(ring_center2 - ring_center1) norm_vector(normalized_distance_vectors[-1]) @@ -1106,8 +1110,7 @@ def _check_base_stacking(aromatic_ring_centers, normal_vectors): dist_normal_vector_angle = np.rad2deg( np.arccos(np.dot(normal_vector, normalized_dist_vector)) ) - if ((dist_normal_vector_angle >= 40) and - (dist_normal_vector_angle <= 140)): + if (dist_normal_vector_angle >= 40) and (dist_normal_vector_angle <= 140): return False return True @@ -1142,19 +1145,19 @@ def _match_base(nucleotide, min_atoms_per_base): if one_letter_code is None: return None - if (one_letter_code == 'A'): + if one_letter_code == "A": std_base = _STD_ADENINE std_ring_centers = _STD_ADENINE_RING_CENTERS - elif (one_letter_code == 'T'): + elif one_letter_code == "T": std_base = _STD_THYMINE std_ring_centers = _STD_THYMINE_RING_CENTERS - elif (one_letter_code == 'C'): + elif one_letter_code == "C": std_base = _STD_CYTOSINE std_ring_centers = _STD_CYTOSINE_RING_CENTERS - elif (one_letter_code == 'G'): + elif one_letter_code == "G": std_base = _STD_GUANINE std_ring_centers = _STD_GUANINE_RING_CENTERS - elif (one_letter_code == 'U'): + elif one_letter_code == "U": std_base = _STD_URACIL std_ring_centers = _STD_URACIL_RING_CENTERS @@ -1162,16 +1165,10 @@ def _match_base(nucleotide, min_atoms_per_base): vectors = np.vstack((vectors, std_ring_centers)) # Select the matching atoms of the nucleotide and the standard base - nucleotide_matched = nucleotide[ - np.isin(nucleotide.atom_name, std_base.atom_name) - ] - std_base_matched = std_base[ - np.isin(std_base.atom_name, nucleotide.atom_name) - ] + nucleotide_matched = nucleotide[np.isin(nucleotide.atom_name, std_base.atom_name)] + std_base_matched = std_base[np.isin(std_base.atom_name, nucleotide.atom_name)] # Ensure the nucleotide does not contain duplicate atom names - _, unique_indices = np.unique( - nucleotide_matched.atom_name, return_index=True - ) + _, unique_indices = np.unique(nucleotide_matched.atom_name, return_index=True) nucleotide_matched = nucleotide_matched[unique_indices] # Only continue if minimum number of matching atoms is reached if len(nucleotide_matched) < min_atoms_per_base: @@ -1179,21 +1176,19 @@ def _match_base(nucleotide, min_atoms_per_base): f"Nucleotide with res_id {nucleotide.res_id[0]} and " f"chain_id {nucleotide.chain_id[0]} has less than 3 base " f"atoms, unable to check for base pair.", - IncompleteStructureWarning + IncompleteStructureWarning, ) return None # Reorder the atoms of the nucleotide to obtain the standard RCSB # PDB atom order. - nucleotide_matched = nucleotide_matched[ - standardize_order(nucleotide_matched) - ] + nucleotide_matched = nucleotide_matched[standardize_order(nucleotide_matched)] # Match the selected std_base to the base. _, transformation = superimpose(nucleotide_matched, std_base_matched) vectors = transformation.apply(vectors) # Normalize the base-normal-vector - vectors[1,:] = vectors[1,:]-vectors[0,:] - norm_vector(vectors[1,:]) + vectors[1, :] = vectors[1, :] - vectors[0, :] + norm_vector(vectors[1, :]) return vectors @@ -1259,8 +1254,11 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # List of the standard bases for easy iteration std_base_list = [ - _STD_ADENINE, _STD_THYMINE, _STD_CYTOSINE, _STD_GUANINE, - _STD_URACIL + _STD_ADENINE, + _STD_THYMINE, + _STD_CYTOSINE, + _STD_GUANINE, + _STD_URACIL, ] # The number of matched atoms for each 'standard' base @@ -1275,7 +1273,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): f"{residue.chain_id[0]} has an overlap with the reference " f"bases which is less than {min_atoms_per_base} atoms. " f"Unable to map nucleotide.", - IncompleteStructureWarning + IncompleteStructureWarning, ) return None, False @@ -1284,7 +1282,7 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # Iterate through the reference bases with the maximum number of # matching atoms - for ref_base in np.array(std_base_list, dtype='object')[ + for ref_base in np.array(std_base_list, dtype="object")[ np.array(matched_atom_no) == np.max(matched_atom_no) ]: # Copy the residue as the res_name property of the ``AtomArray`` @@ -1293,12 +1291,8 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # Select the matching atoms of the nucleotide and the reference # base - nuc = nuc[ - np.isin(nuc.atom_name, ref_base.atom_name) - ] - ref_base_matched = ref_base[ - np.isin(ref_base.atom_name, nuc.atom_name) - ] + nuc = nuc[np.isin(nuc.atom_name, ref_base.atom_name)] + ref_base_matched = ref_base[np.isin(ref_base.atom_name, nuc.atom_name)] # Set the res_name property to the same as the reference base. # This is a requirement for ``standardize_order`` @@ -1319,14 +1313,14 @@ def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28): # If the RMSD is lower than the specified cutoff or better than # a previous found reference, the current reference is selected # as best base - if(rmsd(fitted, ref_base_matched) < rmsd_cutoff): + if rmsd(fitted, ref_base_matched) < rmsd_cutoff: rmsd_cutoff = rmsd(fitted, ref_base_matched) best_base = ref_base_matched.res_name[0][-1] if best_base is None: warnings.warn( f"Base Type {residue.res_name[0]} not supported. ", - UnexpectedStructureWarning + UnexpectedStructureWarning, ) return None @@ -1360,9 +1354,9 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff): # Get the indices of the atoms that are within the maximum cutoff # of each other - indices = CellList( - atom_array, cutoff, selection=boolean_mask - ).get_atoms(atom_array.coord[boolean_mask], cutoff) + indices = CellList(atom_array, cutoff, selection=boolean_mask).get_atoms( + atom_array.coord[boolean_mask], cutoff + ) # Loop through the indices of potential partners pairs = [] @@ -1375,16 +1369,12 @@ def _get_proximate_residues(atom_array, boolean_mask, cutoff): # indices. pairs = np.array(pairs) basepair_candidates_shape = pairs.shape - pairs = get_residue_starts_for( - atom_array, pairs.flatten() - ).reshape(basepair_candidates_shape) + pairs = get_residue_starts_for(atom_array, pairs.flatten()).reshape( + basepair_candidates_shape + ) # Remove candidates where the pairs are from the same residue - pairs = np.delete( - pairs, np.where( - pairs[:,0] == pairs[:,1] - ), axis=0 - ) + pairs = np.delete(pairs, np.where(pairs[:, 0] == pairs[:, 1]), axis=0) # Sort the residue starts for each pair for i, candidate in enumerate(pairs): pairs[i] = sorted(candidate) @@ -1411,5 +1401,4 @@ def _filter_atom_type(atom_array, atom_names): This array is ``True`` for all indices in the :class:`AtomArray` , where the atom has the desired atom names. """ - return (np.isin(atom_array.atom_name, atom_names) - & (atom_array.res_id != -1)) + return np.isin(atom_array.atom_name, atom_names) & (atom_array.res_id != -1) diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx index 8dc6e3b04..bd0a7d402 100644 --- a/src/biotite/structure/bonds.pyx +++ b/src/biotite/structure/bonds.pyx @@ -85,8 +85,8 @@ class BondType(IntEnum): Examples -------- - >>> print(BondType.AROMATIC_DOUBLE.without_aromaticity()) - BondType.DOUBLE + >>> print(BondType.AROMATIC_DOUBLE.without_aromaticity().name) + DOUBLE """ difference = BondType.AROMATIC_SINGLE - BondType.SINGLE if self >= BondType.AROMATIC_SINGLE: @@ -212,21 +212,21 @@ class BondList(Copyable): ... ) >>> for i, j, bond_type in benzene.bonds.as_array(): ... print( - ... f"{str(BondType(bond_type))} bond between " + ... f"{BondType(bond_type).name} bond between " ... f"{benzene.atom_name[i]} and {benzene.atom_name[j]}" ... ) - BondType.AROMATIC_SINGLE bond between C1 and C2 - BondType.AROMATIC_DOUBLE bond between C2 and C3 - BondType.AROMATIC_SINGLE bond between C3 and C4 - BondType.AROMATIC_DOUBLE bond between C4 and C5 - BondType.AROMATIC_SINGLE bond between C5 and C6 - BondType.AROMATIC_DOUBLE bond between C1 and C6 - BondType.SINGLE bond between C1 and H1 - BondType.SINGLE bond between C2 and H2 - BondType.SINGLE bond between C3 and H3 - BondType.SINGLE bond between C4 and H4 - BondType.SINGLE bond between C5 and H5 - BondType.SINGLE bond between C6 and H6 + AROMATIC_SINGLE bond between C1 and C2 + AROMATIC_DOUBLE bond between C2 and C3 + AROMATIC_SINGLE bond between C3 and C4 + AROMATIC_DOUBLE bond between C4 and C5 + AROMATIC_SINGLE bond between C5 and C6 + AROMATIC_DOUBLE bond between C1 and C6 + SINGLE bond between C1 and H1 + SINGLE bond between C2 and H2 + SINGLE bond between C3 and H3 + SINGLE bond between C4 and H4 + SINGLE bond between C5 and H5 + SINGLE bond between C6 and H6 Obtain the bonded atoms for the :math:`C_1`: @@ -248,14 +248,14 @@ class BondList(Copyable): ... ] >>> for i, j, bond_type in half_benzene.bonds.as_array(): ... print( - ... f"{str(BondType(bond_type))} bond between " + ... f"{BondType(bond_type).name} bond between " ... f"{half_benzene.atom_name[i]} and {half_benzene.atom_name[j]}" ... ) - BondType.AROMATIC_DOUBLE bond between C4 and C5 - BondType.AROMATIC_SINGLE bond between C5 and C6 - BondType.SINGLE bond between C4 and H4 - BondType.SINGLE bond between C5 and H5 - BondType.SINGLE bond between C6 and H6 + AROMATIC_DOUBLE bond between C4 and C5 + AROMATIC_SINGLE bond between C5 and C6 + SINGLE bond between C4 and H4 + SINGLE bond between C5 and H5 + SINGLE bond between C6 and H6 """ def __init__(self, uint32 atom_count, np.ndarray bonds=None): @@ -449,9 +449,9 @@ class BondList(Copyable): >>> bond_list.add_bond(1, 2, BondType.AROMATIC_DOUBLE) >>> bond_list.remove_aromaticity() >>> for i, j, bond_type in bond_list.as_array(): - ... print(i, j, BondType(bond_type)) - 0 1 BondType.SINGLE - 1 2 BondType.DOUBLE + ... print(i, j, BondType(bond_type).name) + 0 1 SINGLE + 1 2 DOUBLE """ bonds = self._bonds difference = BondType.AROMATIC_SINGLE - BondType.SINGLE @@ -1330,6 +1330,7 @@ def _invert_index(IndexType[:] index_v, uint32 length): +# fmt: off _DEFAULT_DISTANCE_RANGE = { # Taken from Allen et al. # min - 2*std max + 2*std @@ -1376,9 +1377,9 @@ _DEFAULT_DISTANCE_RANGE = { ("SE", "SE") : (2.340 - 2*0.024, 2.340 + 2*0.024), ("SI", "SE") : (2.359 - 2*0.012, 2.359 + 2*0.012), } +# fmt: on -def connect_via_distances(atoms, dict distance_range=None, atom_mask=None, - bint inter_residue=True, +def connect_via_distances(atoms, dict distance_range=None, bint inter_residue=True, default_bond_type=BondType.ANY, bint periodic=False): """ connect_via_distances(atoms, distance_range=None, atom_mask=None, @@ -1410,8 +1411,6 @@ def connect_via_distances(atoms, dict distance_range=None, atom_mask=None, Hence, the default bond distances for missing element pairs are still taken from the default dictionary. The default bond distances are taken from :footcite:`Allen1987`. - atom_mask : ndarray, dtype=bool, shape=(n,), optional - DEPRECATED: This option has no effect. inter_residue : bool, optional If true, connections between consecutive amino acids and nucleotides are also added. @@ -1532,7 +1531,7 @@ def connect_via_distances(atoms, dict distance_range=None, atom_mask=None, -def connect_via_residue_names(atoms, atom_mask=None, bint inter_residue=True, +def connect_via_residue_names(atoms, bint inter_residue=True, dict custom_bond_dict=None): """ connect_via_residue_names(atoms, atom_mask=None, inter_residue=True) @@ -1549,8 +1548,6 @@ def connect_via_residue_names(atoms, atom_mask=None, bint inter_residue=True, ---------- atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) The structure to create the :class:`BondList` for. - atom_mask : ndarray, dtype=bool, shape=(n,), optional - DEPRECATED: This option has no effect. inter_residue : bool, optional If true, connections between consecutive amino acids and nucleotides are also added. diff --git a/src/biotite/structure/box.py b/src/biotite/structure/box.py index ae4918add..41349bb9d 100644 --- a/src/biotite/structure/box.py +++ b/src/biotite/structure/box.py @@ -4,25 +4,33 @@ """ Functions related to working with the simulation box or unit cell -of a structure +of a structure """ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["vectors_from_unitcell", "unitcell_from_vectors", "box_volume", - "repeat_box", "repeat_box_coord", "move_inside_box", - "remove_pbc", "remove_pbc_from_coord", - "coord_to_fraction", "fraction_to_coord", "is_orthogonal"] +__all__ = [ + "vectors_from_unitcell", + "unitcell_from_vectors", + "box_volume", + "repeat_box", + "repeat_box_coord", + "move_inside_box", + "remove_pbc", + "remove_pbc_from_coord", + "coord_to_fraction", + "fraction_to_coord", + "is_orthogonal", +] -from collections.abc import Iterable from numbers import Integral import numpy as np import numpy.linalg as linalg -from .util import vector_dot -from .atoms import repeat -from .molecules import get_molecule_masks -from .chains import get_chain_masks, get_chain_starts -from .error import BadStructureError +from biotite.structure.atoms import repeat +from biotite.structure.chains import get_chain_masks, get_chain_starts +from biotite.structure.error import BadStructureError +from biotite.structure.molecules import get_molecule_masks +from biotite.structure.util import vector_dot def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): @@ -41,7 +49,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): The angles between the box vectors in radians. *alpha* is the angle between *b* and *c*, *beta* between *a* and *c*, *gamma* between *a* and *b* - + Returns ------- box : ndarray, dtype=float, shape=(3,3) @@ -49,7 +57,7 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): The vector components are in the last dimension. The value can be directly used as :attr:`box` attribute in an atom array. - + See also -------- unitcell_from_vectors @@ -58,19 +66,15 @@ def vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma): b_x = len_b * np.cos(gamma) b_y = len_b * np.sin(gamma) c_x = len_c * np.cos(beta) - c_y = len_c * (np.cos(alpha) - np.cos(beta)*np.cos(gamma)) / np.sin(gamma) - c_z = np.sqrt(len_c*len_c - c_x*c_x - c_y*c_y) - box = np.array([ - [a_x, 0, 0], - [b_x, b_y, 0], - [c_x, c_y, c_z] - ], dtype=np.float32) - + c_y = len_c * (np.cos(alpha) - np.cos(beta) * np.cos(gamma)) / np.sin(gamma) + c_z = np.sqrt(len_c * len_c - c_x * c_x - c_y * c_y) + box = np.array([[a_x, 0, 0], [b_x, b_y, 0], [c_x, c_y, c_z]], dtype=np.float32) + # Fix numerical errors, as values, that are actually 0, # might not be calculated as such tol = 1e-4 * (len_a + len_b + len_c) box[np.abs(box) < tol] = 0 - + return box @@ -84,7 +88,7 @@ def unitcell_from_vectors(box): ---------- box : ndarray, shape=(3,3) The box vectors - + Returns ------- len_a, len_b, len_c : float @@ -103,7 +107,7 @@ def unitcell_from_vectors(box): len_b = linalg.norm(b) len_c = linalg.norm(c) alpha = np.arccos(np.dot(b, c) / (len_b * len_c)) - beta = np.arccos(np.dot(a, c) / (len_a * len_c)) + beta = np.arccos(np.dot(a, c) / (len_a * len_c)) gamma = np.arccos(np.dot(a, b) / (len_a * len_b)) return len_a, len_b, len_c, alpha, beta, gamma @@ -116,7 +120,7 @@ def box_volume(box): ---------- box : ndarray, shape=(3,3) or shape=(m,3,3) One or multiple boxes to get the volume for. - + Returns ------- volume : float or ndarray, shape=(m,) @@ -159,7 +163,7 @@ def repeat_box(atoms, amount=1): Indices to the atoms in the original atom array (stack). Equal to ``numpy.tile(np.arange(atoms.array_length()), (1 + 2 * amount) ** 3)``. - + See also -------- repeat_box_coord @@ -232,12 +236,12 @@ def repeat_box(atoms, amount=1): """ if atoms.box is None: raise BadStructureError("Structure has no box") - + repeat_coord, indices = repeat_box_coord(atoms.coord, atoms.box) # Unroll repeated coordinates for input to 'repeat()' if repeat_coord.ndim == 2: repeat_coord = repeat_coord.reshape(-1, atoms.array_length(), 3) - else: # ndim == 3 + else: # ndim == 3 repeat_coord = repeat_coord.reshape( atoms.stack_depth(), -1, atoms.array_length(), 3 ) @@ -283,16 +287,15 @@ def repeat_box_coord(coord, box, amount=1): raise TypeError("The amount must be an integer") # List of numpy arrays for each box repeat coords_for_boxes = [coord] - for i in range(-amount, amount+1): - for j in range(-amount, amount+1): - for k in range(-amount, amount+1): + for i in range(-amount, amount + 1): + for j in range(-amount, amount + 1): + for k in range(-amount, amount + 1): # Omit the central box if i != 0 or j != 0 or k != 0: temp_coord = coord.copy() # Shift coordinates to adjacent box/unit cell translation_vec = np.sum( - box * np.array([i,j,k])[:, np.newaxis], - axis=-2 + box * np.array([i, j, k])[:, np.newaxis], axis=-2 ) # 'newaxis' to perform same translation on all # atoms for each model @@ -300,7 +303,7 @@ def repeat_box_coord(coord, box, amount=1): coords_for_boxes.append(temp_coord) return ( np.concatenate(coords_for_boxes, axis=-2), - np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3) + np.tile(np.arange(coord.shape[-2]), (1 + 2 * amount) ** 3), ) @@ -323,16 +326,16 @@ def move_inside_box(coord, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- moved_coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The moved coordinates. Has the same shape is the input `coord`. - + Examples -------- - + >>> box = np.array([[10,0,0], [0,10,0], [0,0,10]], dtype=float) >>> inside_coord = [ 1, 2, 3] >>> outside_coord = [ 1, 22, 54] @@ -363,7 +366,7 @@ def remove_pbc(atoms, selection=None): To determine the molecules the structure is required to have an associated `BondList`. Otherwise segmentation removal is performed on a per-chain basis. - + Parameters ---------- atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n) @@ -373,13 +376,13 @@ def remove_pbc(atoms, selection=None): selection : ndarray, dtype=bool, shape=(n,) Specifies which parts of `atoms` are sanitized, i.e the segmentation is removed. - + Returns ------- sanitized_atoms : AtomArray or AtomArrayStack The input structure with removed segmentation over periodic boundaries. - + See also -------- remove_pbc_from_coord @@ -392,12 +395,10 @@ def remove_pbc(atoms, selection=None): half box size. """ # Avoid circular import - from .geometry import centroid - + from biotite.structure.geometry import centroid + if atoms.box is None: - raise BadStructureError( - "The 'box' attribute must be set in the structure" - ) + raise BadStructureError("The 'box' attribute must be set in the structure") new_atoms = atoms.copy() if atoms.bonds is not None: @@ -414,10 +415,8 @@ def remove_pbc(atoms, selection=None): ) # Put center of molecule into box center = centroid(new_atoms.coord[..., mask, :])[..., np.newaxis, :] - center_in_box = move_inside_box( - center, new_atoms.box - ) - new_atoms.coord[..., mask, :] += (center_in_box - center) + center_in_box = move_inside_box(center, new_atoms.box) + new_atoms.coord[..., mask, :] += center_in_box - center return new_atoms @@ -433,11 +432,11 @@ def remove_pbc_from_coord(coord, box): the displacement coordinates in adjacent array positions. Basically, this function performs the reverse action of :func:`move_inside_box()`. - + Parameters ---------- coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3) - The coordinates of the potentially segmented structure. + The coordinates of the potentially segmented structure. box : ndarray, dtype=float, shape=(m,3,3) or shape=(3,3) The simulation box or unit cell that is used as periodic boundary. @@ -447,7 +446,7 @@ def remove_pbc_from_coord(coord, box): ------- sanitized_coord : ndarray, dtype=float, shape=(m,n,3) or shape=(n,3) The reassembled coordinates. - + See also -------- remove_pbc_from_coord @@ -463,19 +462,14 @@ def remove_pbc_from_coord(coord, box): """ # Import in function to avoid circular import - from .geometry import index_displacement + from biotite.structure.geometry import index_displacement + # Get the PBC-sanitized displacements of all coordinates # to the respective next coordinate index_pairs = np.stack( - [ - np.arange(0, coord.shape[-2] - 1), - np.arange(1, coord.shape[-2] ) - ], - axis=1 - ) - neighbour_disp = index_displacement( - coord, index_pairs, box=box, periodic=True + [np.arange(0, coord.shape[-2] - 1), np.arange(1, coord.shape[-2])], axis=1 ) + neighbour_disp = index_displacement(coord, index_pairs, box=box, periodic=True) # Get the PBC-sanitized displacements of all but the first # coordinates to (0,0,0) absolute_disp = np.cumsum(neighbour_disp, axis=-2) @@ -501,19 +495,19 @@ def coord_to_fraction(coord, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- fraction : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The fractions of the box vectors. - + See also -------- fraction_to_coord Examples -------- - + >>> box = np.array([[5,0,0], [0,5,0], [0,5,5]], dtype=float) >>> coord = np.array( ... [[1,1,1], [10,0,0], [0,0,10], [-5,2,1]], @@ -548,12 +542,12 @@ def fraction_to_coord(fraction, box): The box(es) for one or multiple models. When `coord` is given for multiple models, :attr:`box` must be given for multiple models as well. - + Returns ------- coord : ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) The coordinates. - + See also -------- coord_to_fraction @@ -572,12 +566,12 @@ def is_orthogonal(box): ---------- box : ndarray, dtype=float, shape=(3,3) or shape=(m,3,3) A single box or multiple boxes. - + Returns ------- is_orthgonal : bool or ndarray, shape=(m,), dtype=bool True, if the box vectors are orthogonal, false otherwise - + Notes ----- Due to possible numerical errors, this function also evaluates two @@ -587,6 +581,8 @@ def is_orthogonal(box): # Fix numerical errors, as values, that are actually 0, # might not be calculated as such tol = 1e-6 - return (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol) & \ - (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol) & \ - (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol) \ No newline at end of file + return ( + (np.abs(vector_dot(box[..., 0, :], box[..., 1, :])) < tol) + & (np.abs(vector_dot(box[..., 0, :], box[..., 2, :])) < tol) + & (np.abs(vector_dot(box[..., 1, :], box[..., 2, :])) < tol) + ) diff --git a/src/biotite/structure/chains.py b/src/biotite/structure/chains.py index df3134267..c4bbd4996 100644 --- a/src/biotite/structure/chains.py +++ b/src/biotite/structure/chains.py @@ -9,22 +9,38 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["get_chain_starts", "apply_chain_wise", "spread_chain_wise", - "get_chain_masks", "get_chain_starts_for", "get_chain_positions", - "chain_iter", "get_chains", "get_chain_count", "chain_iter"] +__all__ = [ + "get_chain_starts", + "apply_chain_wise", + "spread_chain_wise", + "get_chain_masks", + "get_chain_starts_for", + "get_chain_positions", + "chain_iter", + "get_chains", + "get_chain_count", + "chain_iter", +] import numpy as np -from .resutil import * +from biotite.structure.segments import ( + apply_segment_wise, + get_segment_masks, + get_segment_positions, + get_segment_starts_for, + segment_iter, + spread_segment_wise, +) def get_chain_starts(array, add_exclusive_stop=False): """ Get the indices in an atom array, which indicates the beginning of a new chain. - + A new chain starts, when the chain ID changes or when the residue ID decreases. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -33,17 +49,17 @@ def get_chain_starts(array, add_exclusive_stop=False): If true, the exclusive stop of the input atom array, i.e. ``array.array_length()``, is added to the returned array of start indices as last element. - + Returns ------- starts : ndarray, dtype=int The start indices of new chains in `array`. - + Notes ----- This method is internally used by all other chain-related functions. - + See also -------- get_residue_starts @@ -51,13 +67,13 @@ def get_chain_starts(array, add_exclusive_stop=False): diff = np.diff(array.res_id) res_id_decrement = diff < 0 # This mask is 'true' at indices where the value changes - chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1]) - + chain_id_changes = array.chain_id[1:] != array.chain_id[:-1] + # Convert mask to indices # Add 1, to shift the indices from the end of a chain # to the start of a new chain chain_starts = np.where(res_id_decrement | chain_id_changes)[0] + 1 - + # The first chain is not included yet -> Insert '[0]' if add_exclusive_stop: return np.concatenate(([0], chain_starts, [array.array_length()])) @@ -69,7 +85,7 @@ def apply_chain_wise(array, data, function, axis=None): """ Apply a function to intervals of data, where each interval corresponds to one chain. - + The function takes an atom array (stack) and an data array (`ndarray`) of the same length. The function iterates through the chain IDs of the atom array (stack) and identifies intervals of @@ -77,8 +93,8 @@ def apply_chain_wise(array, data, function, axis=None): partitioned into the same intervals, and each interval (also an :class:`ndarray`) is put as parameter into `function`. Each return value is stored as element in the resulting :class:`ndarray`, therefore each element - corresponds to one chain. - + corresponds to one chain. + Parameters ---------- array : AtomArray or AtomArrayStack @@ -92,14 +108,14 @@ def apply_chain_wise(array, data, function, axis=None): must return a value with the same shape and data type. axis : int, optional This value is given to the `axis` parameter of `function`. - + Returns ------- processed_data : ndarray Chain-wise evaluation of `data` by `function`. The size of the first dimension of this array is equal to the amount of chains. - + See also -------- apply_residue_wise @@ -114,11 +130,11 @@ def spread_chain_wise(array, input_data): Each value in the chain-wise input is assigned to all atoms of this chain: - + ``output_data[i] = input_data[j]``, *i* is incremented from atom to atom, *j* is incremented every chain change. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -126,13 +142,13 @@ def spread_chain_wise(array, input_data): input_data : ndarray The data to be spread. The length of axis=0 must be equal to the amount of different chain IDs in `array`. - + Returns ------- output_data : ndarray Chain-wise spread `input_data`. Length is the same as `array_length()` of `array`. - + See also -------- spread_residue_wise @@ -154,14 +170,14 @@ def get_chain_masks(array, indices): These indices indicate the atoms to get the corresponding chains for. Negative indices are not allowed. - + Returns ------- chains_masks : ndarray, dtype=bool, shape=(k,n) Multiple boolean masks, one for each given index in `indices`. Each array masks the atoms that belong to the same chain as the atom at the given index. - + See also -------- get_residue_masks @@ -183,13 +199,13 @@ def get_chain_starts_for(array, indices): These indices point to the atoms to get the corresponding chain starts for. Negative indices are not allowed. - + Returns ------- start_indices : ndarray, dtype=int, shape=(k,) The indices that point to the chain starts for the input `indices`. - + See also -------- get_residue_starts_for @@ -214,12 +230,12 @@ def get_chain_positions(array, indices): These indices point to the atoms to get the corresponding chain positions for. Negative indices are not allowed. - + Returns ------- start_indices : ndarray, dtype=int, shape=(k,) The indices that point to the position of the chains. - + See also -------- get_residue_positions @@ -231,20 +247,20 @@ def get_chain_positions(array, indices): def get_chains(array): """ Get the chain IDs of an atom array (stack). - + The chains are listed in the same order they occur in the array (stack). - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack), where the chains are determined. - + Returns ------- ids : ndarray, dtype=str List of chain IDs. - + See also -------- get_residues @@ -255,20 +271,20 @@ def get_chains(array): def get_chain_count(array): """ Get the amount of chains in an atom array (stack). - + The count is determined from the `chain_id` annotation. Each time the chain ID changes, the count is incremented. - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack), where the chains are counted. - + Returns ------- count : int Amount of chains. - + See also -------- get_residue_count @@ -279,20 +295,20 @@ def get_chain_count(array): def chain_iter(array): """ Iterate over all chains in an atom array (stack). - + Parameters ---------- array : AtomArray or AtomArrayStack The atom array (stack) to iterate over. - + Yields ------ chain : AtomArray or AtomArrayStack A single chain of the input `array`. - + See also -------- residue_iter """ starts = get_chain_starts(array, add_exclusive_stop=True) - return segment_iter(array, starts) \ No newline at end of file + return segment_iter(array, starts) diff --git a/src/biotite/structure/compare.py b/src/biotite/structure/compare.py index abb6b7e9f..bdce1d7a0 100644 --- a/src/biotite/structure/compare.py +++ b/src/biotite/structure/compare.py @@ -12,21 +12,21 @@ __all__ = ["rmsd", "rmspd", "rmsf", "average"] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .geometry import index_distance -from .util import vector_dot +from biotite.structure.atoms import AtomArrayStack, coord +from biotite.structure.geometry import index_distance +from biotite.structure.util import vector_dot def rmsd(reference, subject): r""" Calculate the RMSD between two structures. - + The *root-mean-square-deviation* (RMSD) indicates the overall deviation of each model of a structure to a reference structure. It is defined as: - + .. math:: RMSD = \sqrt{ \frac{1}{n} \sum\limits_{i=1}^n (x_i - x_{ref,i})^2} - + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -37,7 +37,7 @@ def rmsd(reference, subject): Structure(s) to be compared with `reference`. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- rmsd : float or ndarray, dtype=float, shape=(m,) @@ -45,7 +45,7 @@ def rmsd(reference, subject): If subject is an :class:`AtomArray` a float is returned. If subject is an :class:`AtomArrayStack` a :class:`ndarray` containing the RMSD for each model is returned. - + See Also -------- rmsf @@ -71,16 +71,17 @@ def rmsd(reference, subject): """ return np.sqrt(np.mean(_sq_euclidian(reference, subject), axis=-1)) + def rmspd(reference, subject, periodic=False, box=None): r""" - Calculate the RMSD of atom pair distances for given structures + Calculate the RMSD of atom pair distances for given structures relative to those found in a reference structure. - Unlike the standard RMSD, the *root-mean-square-pairwise-deviation* - (RMSPD) is a fit-free method to determine deviations between + Unlike the standard RMSD, the *root-mean-square-pairwise-deviation* + (RMSPD) is a fit-free method to determine deviations between a structure and a preset reference. - .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2} + .. math:: RMSPD = \sqrt{ \frac{1}{n^2} \sum\limits_{i=1}^n \sum\limits_{j \neq i}^n (d_{ij} - d_{ref,ij})^2} Parameters ---------- @@ -102,7 +103,7 @@ def rmspd(reference, subject, periodic=False, box=None): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- rmspd : float or ndarray, dtype=float, shape=(m,) @@ -110,7 +111,7 @@ def rmspd(reference, subject, periodic=False, box=None): If subject is an :class:`AtomArray` a float is returned. If subject is an :class:`AtomArrayStack` a :class:`ndarray` containing the RMSD for each model is returned. - + Warnings -------- Internally, this function uses :func:`index_distance()`. @@ -119,7 +120,7 @@ def rmspd(reference, subject, periodic=False, box=None): prior to the computation of RMSPDs with `periodic` set to false to ensure correct results. (e.g. with :func:`remove_pbc()`). - + See also -------- index_distance @@ -134,9 +135,10 @@ def rmspd(reference, subject, periodic=False, box=None): refdist = index_distance(reference, pairs, periodic=periodic, box=box) subjdist = index_distance(subject, pairs, periodic=periodic, box=box) - rmspd = np.sqrt(np.sum((subjdist - refdist)**2, axis = -1))/reflen + rmspd = np.sqrt(np.sum((subjdist - refdist) ** 2, axis=-1)) / reflen return rmspd + def rmsf(reference, subject): r""" Calculate the RMSF between two structures. @@ -146,9 +148,9 @@ def rmsf(reference, subject): models. Usually the reference structure, is the average over all models. The RMSF is defined as: - + .. math:: RMSF(i) = \sqrt{ \frac{1}{T} \sum\limits_{t=1}^T (x_i(t) - x_{ref,i}(t))^2} - + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -161,14 +163,14 @@ def rmsf(reference, subject): :class:`AtomArrayStack`. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- rmsf : ndarray, dtype=float, shape=(n,) RMSF between subject and reference structure. Each element gives the RMSF for the atom at the respective index. - + See Also -------- rmsd @@ -198,41 +200,39 @@ def rmsf(reference, subject): def average(atoms): """ Calculate an average structure. - + The average structure has the average coordinates of the input models. - + Parameters ---------- atoms : AtomArrayStack or ndarray, dtype=float, shape=(m,n,3) The structure models to be averaged. Alternatively, coordinates can be provided directly as :class:`ndarray`. - + Returns ------- average : AtomArray or ndarray, dtype=float, shape=(n,3) Structure with averaged atom coordinates. If `atoms` is a :class:`ndarray` and :class:`ndarray` is also returned. - + See Also -------- rmsd, rmsf - + Notes ----- The calculated average structure is not suitable for visualization or geometric calculations, since bond lengths and angles will deviate from meaningful values. This method is rather useful to provide a reference structure for - calculation of e.g. the RMSD or RMSF. + calculation of e.g. the RMSD or RMSF. """ coords = coord(atoms) if coords.ndim != 3: - raise TypeError( - "Expected an AtomArrayStack or an ndarray with shape (m,n,3)" - ) + raise TypeError("Expected an AtomArrayStack or an ndarray with shape (m,n,3)") mean_coords = np.mean(coords, axis=0) if isinstance(atoms, AtomArrayStack): mean_array = atoms[0].copy() @@ -246,7 +246,7 @@ def _sq_euclidian(reference, subject): """ Calculate squared euclidian distance between atoms in two structures. - + Parameters ---------- reference : AtomArray or ndarray, dtype=float, shape=(n,3) @@ -254,7 +254,7 @@ def _sq_euclidian(reference, subject): subject : AtomArray or AtomArrayStack or ndarray, dtype=float, shape=(n,3) or shape=(m,n,3) Structure(s) whose atoms squared euclidian distance to `reference` is measured. - + Returns ------- ndarray, dtype=float, shape=(n,) or shape=(m,n) @@ -271,4 +271,4 @@ def _sq_euclidian(reference, subject): "Expected an AtomArray or an ndarray with shape (n,3) as reference" ) dif = subject_coord - reference_coord - return vector_dot(dif, dif) \ No newline at end of file + return vector_dot(dif, dif) diff --git a/src/biotite/structure/density.py b/src/biotite/structure/density.py index 5f6043412..86f24d53e 100644 --- a/src/biotite/structure/density.py +++ b/src/biotite/structure/density.py @@ -11,11 +11,10 @@ __all__ = ["density"] import numpy as np -from .atoms import coord +from biotite.structure.atoms import coord -def density(atoms, selection=None, delta=1.0, bins=None, - density=False, weights=None): +def density(atoms, selection=None, delta=1.0, bins=None, density=False, weights=None): r""" Compute the density of the selected atoms. @@ -51,13 +50,13 @@ def density(atoms, selection=None, delta=1.0, bins=None, Otherwise, returns the probability density function of each bin. See :func:`numpy.histogramdd()` for further details. weights: ndarray, shape=(n,) or shape=(m,n), optional - An array of values to weight the contribution of *n* atoms in + An array of values to weight the contribution of *n* atoms in *m* models. If the shape is *(n,)*, the weights will be interpreted as *per atom*. A shape of *(m,n)* allows to additionally weight atoms on a *per model* basis. - + Returns ------- H : ndarray, dtype=float @@ -69,12 +68,12 @@ def density(atoms, selection=None, delta=1.0, bins=None, A list containing the 3 arrays describing the bin edges. """ coords = coord(atoms) - + is_stack = coords.ndim == 3 # Define the grid for coordinate binning based on coordinates of # supplied atoms - # This makes the binning independent of a supplied box vector and + # This makes the binning independent of a supplied box vector and # fluctuating box dimensions are not a problem # However, this means that the user has to make sure the region of # interest is in the center of the box, i.e. by centering the @@ -84,19 +83,17 @@ def density(atoms, selection=None, delta=1.0, bins=None, axis = (0, 1) else: axis = 0 - grid_min, grid_max = np.min( - coords, axis=axis), np.max(coords, axis=axis - ) + grid_min, grid_max = np.min(coords, axis=axis), np.max(coords, axis=axis) bins = [ - np.arange(grid_min[0], grid_max[0]+delta, delta), - np.arange(grid_min[1], grid_max[1]+delta, delta), - np.arange(grid_min[2], grid_max[2]+delta, delta), + np.arange(grid_min[0], grid_max[0] + delta, delta), + np.arange(grid_min[1], grid_max[1] + delta, delta), + np.arange(grid_min[2], grid_max[2] + delta, delta), ] if selection is None: selected_coords = coords else: - selected_coords = coords[...,selection, :] + selected_coords = coords[..., selection, :] # Reshape the coords into Nx3 coords = selected_coords.reshape((np.prod(selected_coords.shape[:-1]), 3)) @@ -106,9 +103,7 @@ def density(atoms, selection=None, delta=1.0, bins=None, if is_stack and len(weights.shape) < 2: weights = np.tile(weights, len(selected_coords)) weights = weights.reshape(coords.shape[0]) - + # Calculate the histogram - hist = np.histogramdd( - coords, bins=bins, density=density, weights=weights - ) + hist = np.histogramdd(coords, bins=bins, density=density, weights=weights) return hist diff --git a/src/biotite/structure/dotbracket.py b/src/biotite/structure/dotbracket.py index ebfc3cf7f..66d8af441 100644 --- a/src/biotite/structure/dotbracket.py +++ b/src/biotite/structure/dotbracket.py @@ -9,13 +9,12 @@ __name__ = "biotite.structure" __author__ = "Tom David Müller" -__all__ = ["dot_bracket_from_structure", "dot_bracket", - "base_pairs_from_dot_bracket"] +__all__ = ["dot_bracket_from_structure", "dot_bracket", "base_pairs_from_dot_bracket"] import numpy as np -from .basepairs import base_pairs -from .pseudoknots import pseudoknots -from .residues import get_residue_count, get_residue_positions +from biotite.structure.basepairs import base_pairs +from biotite.structure.pseudoknots import pseudoknots +from biotite.structure.residues import get_residue_count, get_residue_positions _OPENING_BRACKETS = "([{ 0: diff --git a/src/biotite/structure/error.py b/src/biotite/structure/error.py index 269ee2276..1fe632e97 100644 --- a/src/biotite/structure/error.py +++ b/src/biotite/structure/error.py @@ -8,24 +8,32 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["BadStructureError", "IncompleteStructureWarning", - "UnexpectedStructureWarning"] +__all__ = [ + "BadStructureError", + "IncompleteStructureWarning", + "UnexpectedStructureWarning", +] class BadStructureError(Exception): """ Indicates that a structure is not suitable for a certain operation. """ + pass + class IncompleteStructureWarning(Warning): """ Indicates that a structure is not complete. """ + pass + class UnexpectedStructureWarning(Warning): """ Indicates that a structure was not expected. """ + pass diff --git a/src/biotite/structure/filter.py b/src/biotite/structure/filter.py index d32bce085..c6e4aefd6 100644 --- a/src/biotite/structure/filter.py +++ b/src/biotite/structure/filter.py @@ -9,32 +9,64 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Tom David Müller" -__all__ = ["filter_solvent", "filter_monoatomic_ions", "filter_nucleotides", - "filter_canonical_nucleotides", "filter_amino_acids", - "filter_canonical_amino_acids", "filter_carbohydrates", - "filter_backbone", "filter_intersection", "filter_first_altloc", - "filter_highest_occupancy_altloc", "filter_peptide_backbone", - "filter_phosphate_backbone", "filter_linear_bond_continuity", - "filter_polymer"] +__all__ = [ + "filter_solvent", + "filter_monoatomic_ions", + "filter_nucleotides", + "filter_canonical_nucleotides", + "filter_amino_acids", + "filter_canonical_amino_acids", + "filter_carbohydrates", + "filter_intersection", + "filter_first_altloc", + "filter_highest_occupancy_altloc", + "filter_peptide_backbone", + "filter_phosphate_backbone", + "filter_linear_bond_continuity", + "filter_polymer", +] -import warnings -import numpy as np from functools import partial -from .atoms import array as atom_array -from .residues import get_residue_starts, get_residue_count -from .info.groups import amino_acid_names, carbohydrate_names, nucleotide_names - - -_canonical_aa_list = ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS", - "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR", - "TRP","TYR","VAL", "SEC"] +import numpy as np +from biotite.structure.atoms import array as atom_array +from biotite.structure.info.groups import ( + amino_acid_names, + carbohydrate_names, + nucleotide_names, +) +from biotite.structure.residues import get_residue_count, get_residue_starts + +_canonical_aa_list = [ + "ALA", + "ARG", + "ASN", + "ASP", + "CYS", + "GLN", + "GLU", + "GLY", + "HIS", + "ILE", + "LEU", + "LYS", + "MET", + "PHE", + "PRO", + "PYL", + "SER", + "THR", + "TRP", + "TYR", + "VAL", + "SEC", +] _canonical_nucleotide_list = ["A", "DA", "G", "DG", "C", "DC", "U", "DT"] -_solvent_list = ["HOH","SOL"] +_solvent_list = ["HOH", "SOL"] -_peptide_backbone_atoms = ['N', 'CA', 'C'] -_phosphate_backbone_atoms = ['P', 'O5\'', 'C5\'', 'C4\'', 'C3\'', 'O3\''] +_peptide_backbone_atoms = ["N", "CA", "C"] +_phosphate_backbone_atoms = ["P", "O5'", "C5'", "C4'", "C3'", "O3'"] def filter_monoatomic_ions(array): @@ -55,7 +87,7 @@ def filter_monoatomic_ions(array): """ # Exclusively in monoatomic ions, # the element name is equal to the residue name - return (array.res_name == array.element) + return array.res_name == array.element def filter_solvent(array): @@ -206,37 +238,6 @@ def filter_carbohydrates(array): return np.isin(array.res_name, carbohydrate_names()) -def filter_backbone(array): - """ - Filter all peptide backbone atoms of one array. - - This includes the "N", "CA" and "C" atoms of amino acids. - - DEPRECATED: Please use :func:`filter_peptide_backbone` to filter - for protein backbone atoms. - - Parameters - ---------- - array : AtomArray or AtomArrayStack - The array to be filtered. - - Returns - ------- - filter : ndarray, dtype=bool - This array is `True` for all indices in `array`, where the atom - as an backbone atom. - """ - warnings.warn( - "Please use `filter_peptide_backbone()` to filter " - "for protein backbone atoms.", - DeprecationWarning - ) - return ( ((array.atom_name == "N") | - (array.atom_name == "CA") | - (array.atom_name == "C")) & - filter_amino_acids(array) ) - - def _filter_atom_names(array, atom_names): return np.isin(array.atom_name, atom_names) @@ -259,8 +260,9 @@ def filter_peptide_backbone(array): is a part of the peptide backbone. """ - return (_filter_atom_names(array, _peptide_backbone_atoms) & - filter_amino_acids(array)) + return _filter_atom_names(array, _peptide_backbone_atoms) & filter_amino_acids( + array + ) def filter_phosphate_backbone(array): @@ -281,8 +283,9 @@ def filter_phosphate_backbone(array): is a part of the phosphate backbone. """ - return (_filter_atom_names(array, _phosphate_backbone_atoms) & - filter_nucleotides(array)) + return _filter_atom_names(array, _phosphate_backbone_atoms) & filter_nucleotides( + array + ) def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8): @@ -328,21 +331,20 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8): def _is_polymer(array, min_size, pol_type): - - if pol_type.startswith('p'): + if pol_type.startswith("p"): filt_fn = filter_amino_acids - elif pol_type.startswith('n'): + elif pol_type.startswith("n"): filt_fn = filter_nucleotides - elif pol_type.startswith('c'): + elif pol_type.startswith("c"): filt_fn = filter_carbohydrates else: - raise ValueError(f'Unsupported polymer type {pol_type}') + raise ValueError(f"Unsupported polymer type {pol_type}") mask = filt_fn(array) return get_residue_count(array[mask]) >= min_size -def filter_polymer(array, min_size=2, pol_type='peptide'): +def filter_polymer(array, min_size=2, pol_type="peptide"): """ Filter for atoms that are a part of a consecutive standard macromolecular polymer entity. @@ -365,13 +367,14 @@ def filter_polymer(array, min_size=2, pol_type='peptide'): """ # Import `check_res_id_continuity` here to avoid circular imports - from .integrity import check_res_id_continuity + from biotite.structure.integrity import check_res_id_continuity + split_idx = check_res_id_continuity(array) check_pol = partial(_is_polymer, min_size=min_size, pol_type=pol_type) bool_idx = map( lambda a: np.full(len(a), check_pol(atom_array(a)), dtype=bool), - np.split(array, split_idx) + np.split(array, split_idx), ) return np.concatenate(list(bool_idx)) @@ -415,13 +418,17 @@ def filter_intersection(array, intersect): intersect_categories = intersect.get_annotation_categories() # Check atom equality only for categories, # which exist in both arrays - categories = [category for category in array.get_annotation_categories() - if category in intersect_categories] + categories = [ + category + for category in array.get_annotation_categories() + if category in intersect_categories + ] for i in range(array.array_length()): subfilter = np.full(intersect.array_length(), True, dtype=bool) for category in categories: - subfilter &= (intersect.get_annotation(category) - == array.get_annotation(category)[i]) + subfilter &= ( + intersect.get_annotation(category) == array.get_annotation(category)[i] + ) filter[i] = subfilter.any() return filter @@ -479,15 +486,15 @@ def filter_first_altloc(atoms, altloc_ids): 1 CB 4.000 5.000 6.000 """ # Filter all atoms without altloc code - altloc_filter = np.in1d(altloc_ids, [".", "?", " ", ""]) + altloc_filter = np.isin(altloc_ids, [".", "?", " ", ""]) # And filter all atoms for each residue with the first altloc ID residue_starts = get_residue_starts(atoms, add_exclusive_stop=True) for start, stop in zip(residue_starts[:-1], residue_starts[1:]): - letter_altloc_ids = [l for l in altloc_ids[start:stop] if l.isalpha()] + letter_altloc_ids = [loc for loc in altloc_ids[start:stop] if loc.isalpha()] if len(letter_altloc_ids) > 0: first_id = letter_altloc_ids[0] - altloc_filter[start:stop] |= (altloc_ids[start:stop] == first_id) + altloc_filter[start:stop] |= altloc_ids[start:stop] == first_id else: # No altloc ID in this residue -> Nothing to do pass @@ -556,7 +563,7 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies): 1 CB 6.000 5.000 4.000 """ # Filter all atoms without altloc code - altloc_filter = np.in1d(altloc_ids, [".", "?", " ", ""]) + altloc_filter = np.isin(altloc_ids, [".", "?", " ", ""]) # And filter all atoms for each residue with the highest sum of # occupancies @@ -565,19 +572,17 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies): occupancies_in_res = occupancies[start:stop] altloc_ids_in_res = altloc_ids[start:stop] - letter_altloc_ids = [l for l in altloc_ids_in_res if l.isalpha()] + letter_altloc_ids = [loc for loc in altloc_ids_in_res if loc.isalpha()] if len(letter_altloc_ids) > 0: highest = -1.0 highest_id = None for id in set(letter_altloc_ids): - occupancy_sum = np.sum( - occupancies_in_res[altloc_ids_in_res == id] - ) + occupancy_sum = np.sum(occupancies_in_res[altloc_ids_in_res == id]) if occupancy_sum > highest: highest = occupancy_sum highest_id = id - altloc_filter[start:stop] |= (altloc_ids[start:stop] == highest_id) + altloc_filter[start:stop] |= altloc_ids[start:stop] == highest_id else: # No altloc ID in this residue -> Nothing to do pass diff --git a/src/biotite/structure/geometry.py b/src/biotite/structure/geometry.py index ce39d1e82..cc5c59f4e 100644 --- a/src/biotite/structure/geometry.py +++ b/src/biotite/structure/geometry.py @@ -9,25 +9,33 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["displacement", "index_displacement", "distance", "index_distance", - "angle", "index_angle", "dihedral", "index_dihedral", - "dihedral_backbone", "centroid"] +__all__ = [ + "displacement", + "index_displacement", + "distance", + "index_distance", + "angle", + "index_angle", + "dihedral", + "index_dihedral", + "dihedral_backbone", + "centroid", +] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import vector_dot, norm_vector -from .filter import filter_peptide_backbone -from .chains import chain_iter -from .box import (coord_to_fraction, fraction_to_coord, - move_inside_box, is_orthogonal) -from .error import BadStructureError +from biotite.structure.atoms import AtomArray, AtomArrayStack, coord +from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal +from biotite.structure.chains import chain_iter +from biotite.structure.error import BadStructureError +from biotite.structure.filter import filter_peptide_backbone +from biotite.structure.util import norm_vector, vector_dot def displacement(atoms1, atoms2, box=None): """ Measure the displacement vector, i.e. the vector difference, from one array of atom coordinates to another array of coordinates. - + Parameters ---------- atoms1, atoms2 : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,) or Atom or AtomArray or AtomArrayStack @@ -43,13 +51,13 @@ def displacement(atoms1, atoms2, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- disp : ndarray, shape=(m,n,3) or ndarray, shape=(n,3) or ndarray, shape=(3,) The displacement vector(s). The shape is equal to the shape of the input `atoms` with the highest dimensionality. - + See also -------- index_displacement @@ -62,7 +70,7 @@ def displacement(atoms1, atoms2, box=None): diff = v2 - v1 else: diff = -(v1 - v2) - + # Use minimum-image convention if box is given if box is not None: # Transform difference vector @@ -81,28 +89,24 @@ def displacement(atoms1, atoms2, box=None): fractions = fractions[np.newaxis, :] disp = disp[np.newaxis, :] if orthogonality: - _displacement_orthogonal_box( - fractions, box, disp - ) + _displacement_orthogonal_box(fractions, box, disp) else: _displacement_triclinic_box( fractions.astype(diff.dtype, copy=False), box.astype(diff.dtype, copy=False), - disp + disp, ) # Transform back disp = disp[0] if fractions.ndim == 2: # Single model if orthogonality: - _displacement_orthogonal_box( - fractions, box, disp - ) + _displacement_orthogonal_box(fractions, box, disp) else: _displacement_triclinic_box( fractions.astype(diff.dtype, copy=False), box.astype(diff.dtype, copy=False), - disp + disp, ) elif fractions.ndim == 3: # Multiple models @@ -117,21 +121,17 @@ def displacement(atoms1, atoms2, box=None): else: raise ValueError(f"{box.ndim} are to many box dimensions") if orthogonality_for_model: - _displacement_orthogonal_box( - fractions[i], box_for_model, disp[i] - ) + _displacement_orthogonal_box(fractions[i], box_for_model, disp[i]) else: _displacement_triclinic_box( fractions[i].astype(diff.dtype, copy=False), box_for_model.astype(diff.dtype, copy=False), - disp[i] + disp[i], ) else: - raise ValueError( - f"{diff.shape} is an invalid shape for atom coordinates" - ) + raise ValueError(f"{diff.shape} is an invalid shape for atom coordinates") return disp - + else: return diff @@ -139,7 +139,7 @@ def displacement(atoms1, atoms2, box=None): def index_displacement(*args, **kwargs): """ index_displacement(atoms, indices, periodic=False, box=None) - + Measure the displacement, i.e. the vector difference, between pairs of atoms. @@ -159,7 +159,7 @@ def index_displacement(*args, **kwargs): :class:`ndarray`. indices : ndarray, shape=(k,2) Pairs of indices that point to `atoms`. - The displacement is measured from ``indices[x,0]`` to + The displacement is measured from ``indices[x,0]`` to ``indices[x,1]``. periodic : bool, optional If set to true, periodic boundary conditions are taken into @@ -171,14 +171,14 @@ def index_displacement(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- disp : ndarray, shape=(k,) or shape=(m,k) The pairwise displacements. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -199,7 +199,7 @@ def index_displacement(*args, **kwargs): def distance(atoms1, atoms2, box=None): """ Measure the euclidian distance between atoms. - + Parameters ---------- atoms1, atoms2 : ndarray or Atom or AtomArray or AtomArrayStack @@ -214,14 +214,14 @@ def distance(atoms1, atoms2, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- dist : float or ndarray The atom distances. The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See also -------- index_distance @@ -233,7 +233,7 @@ def distance(atoms1, atoms2, box=None): def index_distance(*args, **kwargs): """ index_distance(atoms, indices, periodic=False, box=None) - + Measure the euclidian distance between pairs of atoms. The pairs refer to indices of a given atom array, whose pairwise @@ -262,14 +262,14 @@ def index_distance(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- dist : ndarray, shape=(k,) or shape=(m,k) The pairwise distances. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -290,7 +290,7 @@ def index_distance(*args, **kwargs): def angle(atoms1, atoms2, atoms3, box=None): """ Measure the angle between 3 atoms. - + Parameters ---------- atoms1, atoms2, atoms3 : ndarray or Atom or AtomArray or AtomArrayStack @@ -302,14 +302,14 @@ def angle(atoms1, atoms2, atoms3, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- angle : float or ndarray The angle(s) between the atoms. The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See also -------- index_angle @@ -318,13 +318,13 @@ def angle(atoms1, atoms2, atoms3, box=None): v2 = displacement(atoms3, atoms2, box) norm_vector(v1) norm_vector(v2) - return np.arccos(vector_dot(v1,v2)) + return np.arccos(vector_dot(v1, v2)) def index_angle(*args, **kwargs): """ index_angle(atoms, indices, periodic=False, box=None) - + Measure the angle between triples of atoms. The triples refer to indices of a given atom array, whose triplewise @@ -351,14 +351,14 @@ def index_angle(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- angle : ndarray, shape=(k,) or shape=(m,k) The triplewise angles. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -379,7 +379,7 @@ def index_angle(*args, **kwargs): def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): """ Measure the dihedral angle between 4 atoms. - + Parameters ---------- atoms1, atoms2, atoms3, atoms4 : ndarray or Atom or AtomArray or AtomArrayStack @@ -392,14 +392,14 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): the box vectors given with this parameter. The shape *(m,3,3)* is only allowed, when the input coordinates comprise multiple models. - + Returns ------- dihed : float or ndarray The dihedral angle(s) between the atoms. The shape is equal to the shape of the input `atoms` with the highest dimensionality minus the last axis. - + See Also -------- index_dihedral @@ -411,20 +411,20 @@ def dihedral(atoms1, atoms2, atoms3, atoms4, box=None): norm_vector(v1) norm_vector(v2) norm_vector(v3) - + n1 = np.cross(v1, v2) n2 = np.cross(v2, v3) - - # Calculation using atan2, to ensure the correct sign of the angle - x = vector_dot(n1,n2) - y = vector_dot(np.cross(n1,n2), v2) - return np.arctan2(y,x) + + # Calculation using atan2, to ensure the correct sign of the angle + x = vector_dot(n1, n2) + y = vector_dot(np.cross(n1, n2), v2) + return np.arctan2(y, x) def index_dihedral(*args, **kwargs): """ index_dihedral(atoms, indices, periodic=False, box=None) - + Measure the dihedral angle between quadruples of atoms. The triples refer to indices of a given atom array, whose @@ -452,14 +452,14 @@ def index_dihedral(*args, **kwargs): box : ndarray, shape=(3,3) or shape=(m,3,3), optional If this parameter is set, the given box is used instead of the `box` attribute of `atoms`. - + Returns ------- dihedral : ndarray, shape=(k,) or shape=(m,k) The quadruplewise dihedral angles. If `atoms` is an atom array stack, The distances are calculated for each model. - + Warnings -------- In case `periodic` is set to true and if the box is not orthorhombic @@ -482,7 +482,7 @@ def dihedral_backbone(atom_array): """ Measure the characteristic backbone dihedral angles of a protein structure. - + Parameters ---------- atom_array: AtomArray or AtomArrayStack @@ -492,7 +492,7 @@ def dihedral_backbone(atom_array): `NaN`. The order of the backbone atoms for each residue must be (N, CA, C). - + Returns ------- phi, psi, omega : ndarray @@ -502,20 +502,20 @@ def dihedral_backbone(atom_array): have *NaN* values. If an :class:`AtomArrayStack` is given, the output angles are 2-dimensional, the first dimension corresponds to the model number. - + Raises ------ BadStructureError If the amount of backbone atoms is not equal to amount of residues times 3 (for N, CA and C). - + See Also -------- dihedral - + Examples -------- - + >>> phi, psi, omega = dihedral_backbone(atom_array) >>> print(np.stack([np.rad2deg(phi), np.rad2deg(psi)]).T) [[ nan -56.145] @@ -541,15 +541,17 @@ def dihedral_backbone(atom_array): """ bb_filter = filter_peptide_backbone(atom_array) backbone = atom_array[..., bb_filter] - - if backbone.array_length() % 3 != 0 \ - or (backbone.atom_name[0::3] != "N" ).any() \ - or (backbone.atom_name[1::3] != "CA").any() \ - or (backbone.atom_name[2::3] != "C" ).any(): - raise BadStructureError( - "The backbone is invalid, must be repeats of (N, CA, C), " - "maybe a backbone atom is missing" - ) + + if ( + backbone.array_length() % 3 != 0 + or (backbone.atom_name[0::3] != "N").any() + or (backbone.atom_name[1::3] != "CA").any() + or (backbone.atom_name[2::3] != "C").any() + ): + raise BadStructureError( + "The backbone is invalid, must be repeats of (N, CA, C), " + "maybe a backbone atom is missing" + ) phis = [] psis = [] omegas = [] @@ -558,9 +560,11 @@ def dihedral_backbone(atom_array): phis.append(phi) psis.append(psi) omegas.append(omega) - return np.concatenate(phis, axis=-1), np.concatenate(psis, axis=-1), \ - np.concatenate(omegas, axis=-1) - + return ( + np.concatenate(phis, axis=-1), + np.concatenate(psis, axis=-1), + np.concatenate(omegas, axis=-1), + ) def _dihedral_backbone(chain_bb): @@ -571,49 +575,57 @@ def _dihedral_backbone(chain_bb): # Dim 2: X, Y, Z coordinates # Dim 3: Atoms involved in dihedral angle if isinstance(chain_bb, AtomArray): - angle_coord_shape = (len(bb_coord)//3, 3, 4) + angle_coord_shape = (len(bb_coord) // 3, 3, 4) elif isinstance(chain_bb, AtomArrayStack): - angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1]//3, 3, 4) - phi_coord = np.full(angle_coord_shape, np.nan) - psi_coord = np.full(angle_coord_shape, np.nan) + angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1] // 3, 3, 4) + phi_coord = np.full(angle_coord_shape, np.nan) + psi_coord = np.full(angle_coord_shape, np.nan) omega_coord = np.full(angle_coord_shape, np.nan) - - # Indices for coordinates of CA atoms - ca_i = np.arange(bb_coord.shape[-2]//3) * 3 + 1 - phi_coord [..., 1: , :, 0] = bb_coord[..., ca_i[1: ]-2 ,:] - phi_coord [..., 1: , :, 1] = bb_coord[..., ca_i[1: ]-1 ,:] - phi_coord [..., 1: , :, 2] = bb_coord[..., ca_i[1: ] ,:] - phi_coord [..., 1: , :, 3] = bb_coord[..., ca_i[1: ]+1 ,:] - psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1 ,:] - psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1] ,:] - psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1 ,:] - psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2 ,:] - omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1] ,:] - omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1 ,:] - omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2 ,:] - omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3 ,:] - - phi = dihedral(phi_coord[...,0], phi_coord[...,1], - phi_coord[...,2], phi_coord[...,3]) - psi = dihedral(psi_coord[...,0], psi_coord[...,1], - psi_coord[...,2], psi_coord[...,3]) - omega = dihedral(omega_coord[...,0], omega_coord[...,1], - omega_coord[...,2], omega_coord[...,3]) - + + # Indices for coordinates of CA atoms + ca_i = np.arange(bb_coord.shape[-2] // 3) * 3 + 1 + # fmt: off + phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :] + phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :] + phi_coord [..., 1:, :, 2] = bb_coord[..., ca_i[1: ], :] + phi_coord [..., 1:, :, 3] = bb_coord[..., ca_i[1: ]+1, :] + psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1, :] + psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1], :] + psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1, :] + psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2, :] + omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1], :] + omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1, :] + omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2, :] + omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :] + # fmt: on + + phi = dihedral( + phi_coord[..., 0], phi_coord[..., 1], phi_coord[..., 2], phi_coord[..., 3] + ) + psi = dihedral( + psi_coord[..., 0], psi_coord[..., 1], psi_coord[..., 2], psi_coord[..., 3] + ) + omega = dihedral( + omega_coord[..., 0], + omega_coord[..., 1], + omega_coord[..., 2], + omega_coord[..., 3], + ) + return phi, psi, omega def centroid(atoms): """ Measure the centroid of a structure. - + Parameters ---------- atoms: ndarray or AtomArray or AtomArrayStack The structures to determine the centroid from. Alternatively an ndarray containing the coordinates can be provided. - + Returns ------- centroid : float or ndarray @@ -623,8 +635,9 @@ def centroid(atoms): return np.mean(coord(atoms), axis=-2) -def _call_non_index_function(function, expected_amount, - atoms, indices, periodic=False, box=None): +def _call_non_index_function( + function, expected_amount, atoms, indices, periodic=False, box=None +): """ Call an `xxx()` function based on the parameters given to a `index_xxx()` function. @@ -636,15 +649,14 @@ def _call_non_index_function(function, expected_amount, ) coord_list = [] for i in range(expected_amount): - coord_list.append(coord(atoms)[..., indices[:,i], :]) + coord_list.append(coord(atoms)[..., indices[:, i], :]) if periodic: if box is None: if isinstance(atoms, (AtomArray, AtomArrayStack)): box = atoms.box else: raise ValueError( - "If `atoms` are coordinates, " - "the box must be set explicitly" + "If `atoms` are coordinates, " "the box must be set explicitly" ) else: box = None @@ -656,7 +668,7 @@ def _displacement_orthogonal_box(fractions, box, disp): Fill in the PBC-aware displacement vector for non-PBC-aware displacements given as fractions of given box vectors. """ - # Fraction components are guaranteed to be positive + # Fraction components are guaranteed to be positive # Use fraction vector components with lower absolute # -> new_vec[i] = vec[i] - 1 if vec[i] > 0.5 else vec[i] fractions[fractions > 0.5] -= 1 @@ -669,7 +681,7 @@ def _displacement_triclinic_box(fractions, box, disp): displacements given as fractions of given box vectors. """ diffs = fraction_to_coord(fractions, box) - # Fraction components are guaranteed to be positive + # Fraction components are guaranteed to be positive # Test all 3 fraction vector components # with positive and negative sign # (i,j,k in {-1, 0}) @@ -678,10 +690,10 @@ def _displacement_triclinic_box(fractions, box, disp): for i in range(-1, 1): for j in range(-1, 1): for k in range(-1, 1): - x = i*box[0,0] + j*box[1,0] + k*box[2,0] - y = i*box[0,1] + j*box[1,1] + k*box[2,1] - z = i*box[0,2] + j*box[1,2] + k*box[2,2] - periodic_shift.append([x,y,z]) + x = i * box[0, 0] + j * box[1, 0] + k * box[2, 0] + y = i * box[0, 1] + j * box[1, 1] + k * box[2, 1] + z = i * box[0, 2] + j * box[1, 2] + k * box[2, 2] + periodic_shift.append([x, y, z]) periodic_shift = np.array(periodic_shift, dtype=disp.dtype) # Create 8 periodically shifted variants for each atom shifted_diffs = diffs[:, np.newaxis, :] + periodic_shift[np.newaxis, :, :] @@ -692,6 +704,5 @@ def _displacement_triclinic_box(fractions, box, disp): # for each given non-PBC-aware displacement find the PBC-aware # displacement with the lowest distance disp[:] = shifted_diffs[ - np.arange(len(shifted_diffs)), - np.argmin(sq_distance, axis=1) + np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1) ] diff --git a/src/biotite/structure/graphics/atoms.py b/src/biotite/structure/graphics/atoms.py index bc91492d9..dec54f1fa 100644 --- a/src/biotite/structure/graphics/atoms.py +++ b/src/biotite/structure/graphics/atoms.py @@ -7,18 +7,25 @@ __all__ = ["plot_atoms", "plot_ball_and_stick_model"] import numpy as np -import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d.art3d import Line3DCollection -def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, - center=None, size=None, zoom=1.0): +def plot_atoms( + axes, + atoms, + colors, + line_width=1.0, + background_color=None, + center=None, + size=None, + zoom=1.0, +): """ Plot an :class:`AtomArray` as lines between bonded atoms. The z-axis points into the screen plane. - + Parameters ---------- axes : Axes3D @@ -49,7 +56,7 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, - ``> 1.0``: Zoom in. - ``< 1.0``: Zoom out. - + Notes ----- This is a very simple visualization tools for quick visual analysis @@ -61,38 +68,37 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, raise ValueError("The given axes mut be an 'Axes3D'") if atoms.bonds is None: raise ValueError("The atom array must have an associated bond list") - + # Calculating connections between atoms line_coord = [] line_colors = [] - for index1, index2 in atoms.bonds.as_array()[:,:2]: + for index1, index2 in atoms.bonds.as_array()[:, :2]: # Every connection consist of two lines: # One from the first atom to the center # and from from the second atom to the center line_start = atoms.coord[index1] line_end = atoms.coord[index2] line_center = (line_start + line_end) / 2 - + # Add line from first atom - line_coord.append(( - line_start, line_center - )) + line_coord.append((line_start, line_center)) line_colors.append(colors[index1]) - + # Add line from second atom - line_coord.append(( - line_end, line_center - )) + line_coord.append((line_end, line_center)) line_colors.append(colors[index2]) # Plot computed line coordinates and colors # Use 'Line3DCollection' for higher efficiency lines = Line3DCollection( - line_coord, color=line_colors, linewidths=line_width, - capstyle="round", joinstyle="round" + line_coord, + color=line_colors, + linewidths=line_width, + capstyle="round", + joinstyle="round", ) axes.add_collection(lines) - + # Set viewing angle axes.azim = -90 axes.elev = 90 @@ -105,17 +111,25 @@ def plot_atoms(axes, atoms, colors, line_width=1.0, background_color=None, _set_box(axes, atoms.coord, center, size, zoom) -def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, - line_color="black", line_width=1.0, - background_color=None, center=None, - size=None, zoom=1.0): +def plot_ball_and_stick_model( + axes, + atoms, + colors, + ball_size=200, + line_color="black", + line_width=1.0, + background_color=None, + center=None, + size=None, + zoom=1.0, +): """ Plot an :class:`AtomArray` as *ball-and-stick* model. The z-axis points into the screen plane. UNSTABLE: This function is probably subject to future changes. - + Parameters ---------- axes : Axes3D @@ -154,7 +168,7 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, - ``> 1.0``: Zoom in. - ``< 1.0``: Zoom out. - + Notes ----- This is a very simple visualization tools for quick visual analysis @@ -166,26 +180,27 @@ def plot_ball_and_stick_model(axes, atoms, colors, ball_size=200, raise ValueError("The given axes mut be an 'Axes3D'") if atoms.bonds is None: raise ValueError("The atom array must have an associated bond list") - + # Calculating connections between atoms line_coord = [ (atoms.coord[index1], atoms.coord[index2]) - for index1, index2 in atoms.bonds.as_array()[:,:2] + for index1, index2 in atoms.bonds.as_array()[:, :2] ] # Plot sticks # Use 'Line3DCollection' for higher efficiency sticks = Line3DCollection( - line_coord, color=line_color, linewidths=line_width, - capstyle="round", joinstyle="round" + line_coord, + color=line_color, + linewidths=line_width, + capstyle="round", + joinstyle="round", ) axes.add_collection(sticks) # Plot balls - axes.scatter( - *atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1 - ) - + axes.scatter(*atoms.coord.T, s=ball_size, c=colors, linewidth=0, alpha=1) + # Set viewing angle axes.azim = -90 axes.elev = 90 @@ -211,16 +226,18 @@ def _set_box(axes, coord, center, size, zoom): ) if size is None: - size = np.array([ - coord[:, 0].max() - coord[:, 0].min(), - coord[:, 1].max() - coord[:, 1].min(), - coord[:, 2].max() - coord[:, 2].min() - ]).max() - - axes.set_xlim(center[0] - size/(2*zoom), center[0] + size/(2*zoom)) - axes.set_ylim(center[1] - size/(2*zoom), center[1] + size/(2*zoom)) - axes.set_zlim(center[2] - size/(2*zoom), center[2] + size/(2*zoom)) - + size = np.array( + [ + coord[:, 0].max() - coord[:, 0].min(), + coord[:, 1].max() - coord[:, 1].min(), + coord[:, 2].max() - coord[:, 2].min(), + ] + ).max() + + axes.set_xlim(center[0] - size / (2 * zoom), center[0] + size / (2 * zoom)) + axes.set_ylim(center[1] - size / (2 * zoom), center[1] + size / (2 * zoom)) + axes.set_zlim(center[2] - size / (2 * zoom), center[2] + size / (2 * zoom)) + # Make the axis lengths of the 'plot box' equal # The 'plot box' is not visible due to 'axes.axis("off")' - axes.set_box_aspect([1,1,1]) \ No newline at end of file + axes.set_box_aspect([1, 1, 1]) diff --git a/src/biotite/structure/graphics/rna.py b/src/biotite/structure/graphics/rna.py index b2cf6d198..49648728a 100644 --- a/src/biotite/structure/graphics/rna.py +++ b/src/biotite/structure/graphics/rna.py @@ -7,29 +7,43 @@ __all__ = ["plot_nucleotide_secondary_structure"] import shutil -import numpy as np from itertools import repeat -from .. import pseudoknots -from ...application.viennarna import RNAplotApp +import numpy as np +from biotite.application.viennarna import RNAplotApp +from biotite.structure import pseudoknots def plot_nucleotide_secondary_structure( - axes, base_labels, base_pairs, length, - layout_type=RNAplotApp.Layout.NAVIEW, draw_pseudoknots=True, - pseudoknot_order=None, angle=0, bond_linewidth=1, bond_linestyle=None, - bond_color='black', backbone_linewidth=1, backbone_linestyle='solid', - backbone_color='grey', base_text=None, base_box=None, - annotation_positions=None, annotation_offset=8.5, annotation_text=None, - border=0.03, bin_path="RNAplot" - ): + axes, + base_labels, + base_pairs, + length, + layout_type=RNAplotApp.Layout.NAVIEW, + draw_pseudoknots=True, + pseudoknot_order=None, + angle=0, + bond_linewidth=1, + bond_linestyle=None, + bond_color="black", + backbone_linewidth=1, + backbone_linestyle="solid", + backbone_color="grey", + base_text=None, + base_box=None, + annotation_positions=None, + annotation_offset=8.5, + annotation_text=None, + border=0.03, + bin_path="RNAplot", +): """ Generate 2D plots of nucleic acid secondary structures using the interface to *RNAplot*, which is part of the *ViennaRNA* software package. - Internally a :class:`biotite.application.viennarna.RNAplotApp` - instance is created to generate coordinates for each individual base - on a 2D plane. *ViennaRNA* must be installed in order to use this + Internally a :class:`biotite.application.viennarna.RNAplotApp` + instance is created to generate coordinates for each individual base + on a 2D plane. *ViennaRNA* must be installed in order to use this function. Parameters @@ -49,7 +63,7 @@ def plot_nucleotide_secondary_structure( Whether pseudoknotted bonds should be drawn. pseudoknot_order : iterable, optional (default: None) The pseudoknot order of each pair in the input `base_pairs`. - If no pseudoknot order is given, a solution determined by + If no pseudoknot order is given, a solution determined by :func:`biotite.structure.pseudoknots` is picked at random. angle : int or float, optional (default: 0) The angle the plot should be rotated. @@ -74,9 +88,9 @@ def plot_nucleotide_secondary_structure( backbone_color : str or ndarray, shape=(3,) or shape=(4,), dtype=float, optional (default: 'grey') The *Matplotlib* compatible color of the backbone. base_text : dict or iterable, optional (default: {'size': 'small'}) - The keyword parameters for the *Matplotlib* ``Text`` objects - denoting the type of each base. Provide a single value to set - the parameters for all labels or an iterable to set the + The keyword parameters for the *Matplotlib* ``Text`` objects + denoting the type of each base. Provide a single value to set + the parameters for all labels or an iterable to set the parameters for each individual label. base_box : dict or iterable, optional (default: {'pad'=0, 'color'='white'}) The *Matplotlib* compatible properties of the ``FancyBboxPatch`` @@ -91,9 +105,9 @@ def plot_nucleotide_secondary_structure( annotation_offset : int or float, optional (default: 8.5) The offset of the annotations from the base labels. annotation_text : dict or iterable, optional (default: {'size': 'small'}) - The keyword parameters for the *Matplotlib* ``Text`` objects - annotating the sequence. Provide a single value to set the - parameters for all annotations or an iterable to set the + The keyword parameters for the *Matplotlib* ``Text`` objects + annotating the sequence. Provide a single value to set the + parameters for all annotations or an iterable to set the parameters for each individual annotation. border : float, optional (default: 0.03) The percentage of the coordinate range to be left as whitespace @@ -105,8 +119,8 @@ def plot_nucleotide_secondary_structure( # Check if RNAplot is installed if shutil.which(bin_path) is None: raise FileNotFoundError( - 'RNAplot is not installed at the specified location, unable to ' - 'plot secondary structure.' + "RNAplot is not installed at the specified location, unable to " + "plot secondary structure." ) # Get the unknotted base pairs @@ -127,7 +141,7 @@ def plot_nucleotide_secondary_structure( # Set the default properties of the Matplotlib `bbox` surrounding # the base labels if base_box is None: - base_box=np.full(length, {'pad': 0, 'color': 'white'}) + base_box = np.full(length, {"pad": 0, "color": "white"}) # if `base_box` is a dictionary, extrapolate elif isinstance(base_box, dict): base_box = np.full(length, base_box) @@ -135,25 +149,23 @@ def plot_nucleotide_secondary_structure( # By default pseudoknotted bonds are denoted as dashed lines, while # unknotted bonds are denoted as solid lines if bond_linestyle is None: - bond_linestyle = np.full(base_pairs.shape[0], 'solid', dtype='object') - bond_linestyle[pseudoknot_order != 0] = 'dashed' + bond_linestyle = np.full(base_pairs.shape[0], "solid", dtype="object") + bond_linestyle[pseudoknot_order != 0] = "dashed" # If `bond_linestyle` is a string, extrapolate elif isinstance(bond_linestyle, str): - bond_linestyle = np.full( - base_pairs.shape[0], bond_linestyle, dtype='object' - ) + bond_linestyle = np.full(base_pairs.shape[0], bond_linestyle, dtype="object") # If pseudoknots are not to be drawn, remove pseudoknotted bonds, # regardless of the given linestyles if not draw_pseudoknots: # Ensure that the array can hold the 'None' value # (not possible with 'U1' dtype for example) - bond_linestyle = np.asarray(bond_linestyle, dtype='object') - bond_linestyle[pseudoknot_order != 0] = 'None' + bond_linestyle = np.asarray(bond_linestyle, dtype="object") + bond_linestyle[pseudoknot_order != 0] = "None" # Set the default properties of the base labels if base_text is None: - base_text = np.full(length, {'size': 'small'}) + base_text = np.full(length, {"size": "small"}) elif isinstance(base_text, dict): base_text = np.full(length, base_text) @@ -164,7 +176,7 @@ def plot_nucleotide_secondary_structure( # Set the default font properties of the base annotations if annotation_text is None: - annotation_text = repeat({'size': 'small'}) + annotation_text = repeat({"size": "small"}) elif isinstance(annotation_text, dict): annotation_text = repeat(annotation_text) @@ -173,15 +185,14 @@ def plot_nucleotide_secondary_structure( base_pairs=unknotted_base_pairs, length=length, bin_path=bin_path, - layout_type=layout_type + layout_type=layout_type, ) # Rotate Coordinates if angle != 0: angle = np.deg2rad(angle) rot_matrix = np.array( - [[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]] + [[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]] ) for i, coord in enumerate(coordinates): coordinates[i] = np.dot(rot_matrix, coord) @@ -197,31 +208,32 @@ def plot_nucleotide_secondary_structure( ) axes.set_frame_on(False) - # Define buffer area (Border) coord_range = abs(np.max(coordinates)) + abs(np.min(coordinates)) - buffer = border*coord_range + buffer = border * coord_range # Adjust display axes.set_xlim( - np.min(coordinates[:,0])-buffer, np.max(coordinates[:,0])+buffer + np.min(coordinates[:, 0]) - buffer, np.max(coordinates[:, 0]) + buffer ) axes.set_ylim( - np.min(coordinates[:,1])-buffer, np.max(coordinates[:,1])+buffer + np.min(coordinates[:, 1]) - buffer, np.max(coordinates[:, 1]) + buffer ) - axes.set_aspect(aspect='equal') + axes.set_aspect(aspect="equal") # Draw backbone - axes.plot(coordinates[:,0], coordinates[:,1], color=backbone_color, - linestyle=backbone_linestyle, linewidth=backbone_linewidth) + axes.plot( + coordinates[:, 0], + coordinates[:, 1], + color=backbone_color, + linestyle=backbone_linestyle, + linewidth=backbone_linewidth, + ) # Draw base labels - for coords, label, box, text in zip( - coordinates, base_labels, base_box, base_text - ): + for coords, label, box, text in zip(coordinates, base_labels, base_box, base_text): t = axes.text( - x=coords[0], y=coords[1], s=label, - ha='center', va='center', **text + x=coords[0], y=coords[1], s=label, ha="center", va="center", **text ) t.set_bbox(box) @@ -237,37 +249,41 @@ def plot_nucleotide_secondary_structure( # Draw annotations for i, text in zip(annotation_positions, annotation_text): - if (i > 0) and ((i+1) < length): + if (i > 0) and ((i + 1) < length): # Get the average of the direction vectors to the next and # previous base vector_to_previous = np.array( - [coordinates[i-1][0] - coordinates[i][0], - coordinates[i-1][1] - coordinates[i][1]] - ) - vector_to_previous = vector_to_previous / np.linalg.norm( - vector_to_previous + [ + coordinates[i - 1][0] - coordinates[i][0], + coordinates[i - 1][1] - coordinates[i][1], + ] ) + vector_to_previous = vector_to_previous / np.linalg.norm(vector_to_previous) vector_to_next = np.array( - [coordinates[i][0] - coordinates[i+1][0], - coordinates[i][1] - coordinates[i+1][1]] - ) - vector_to_next = vector_to_next / np.linalg.norm( - vector_to_next + [ + coordinates[i][0] - coordinates[i + 1][0], + coordinates[i][1] - coordinates[i + 1][1], + ] ) + vector_to_next = vector_to_next / np.linalg.norm(vector_to_next) vector = (vector_to_next + vector_to_previous) / 2 elif i > 0: # For the last base get the direction vector to the previous # base vector = np.array( - [coordinates[i-1][0] - coordinates[i][0], - coordinates[i-1][1] - coordinates[i][1]] + [ + coordinates[i - 1][0] - coordinates[i][0], + coordinates[i - 1][1] - coordinates[i][1], + ] ) else: # For the first base get the direction vector to the next # base vector = np.array( - [coordinates[i][0] - coordinates[i+1][0], - coordinates[i][1] - coordinates[i+1][1]] + [ + coordinates[i][0] - coordinates[i + 1][0], + coordinates[i][1] - coordinates[i + 1][1], + ] ) # Normalize the vector vector = vector / np.linalg.norm(vector) @@ -275,8 +291,5 @@ def plot_nucleotide_secondary_structure( vector = np.array([vector[1], -vector[0]]) # The annotations are offset in the direction of the # perpendicular vector - x, y = coordinates[i] + (annotation_offset*vector) - axes.text( - x=x, y=y, s=i+1, - ha='center', va='center', **text - ) \ No newline at end of file + x, y = coordinates[i] + (annotation_offset * vector) + axes.text(x=x, y=y, s=i + 1, ha="center", va="center", **text) diff --git a/src/biotite/structure/hbond.py b/src/biotite/structure/hbond.py index a23c5cdcd..96d0d87f8 100644 --- a/src/biotite/structure/hbond.py +++ b/src/biotite/structure/hbond.py @@ -11,16 +11,23 @@ __all__ = ["hbond", "hbond_frequency"] import warnings -from .geometry import distance, angle import numpy as np -from .atoms import AtomArrayStack, stack -from .celllist import CellList - - -def hbond(atoms, selection1=None, selection2=None, selection1_type='both', - cutoff_dist=2.5, cutoff_angle=120, - donor_elements=('O', 'N', 'S'), acceptor_elements=('O', 'N', 'S'), - periodic=False): +from biotite.structure.atoms import AtomArrayStack, stack +from biotite.structure.celllist import CellList +from biotite.structure.geometry import angle, distance + + +def hbond( + atoms, + selection1=None, + selection2=None, + selection1_type="both", + cutoff_dist=2.5, + cutoff_angle=120, + donor_elements=("O", "N", "S"), + acceptor_elements=("O", "N", "S"), + periodic=False, +): r""" Find hydrogen bonds in a structure using the Baker-Hubbard algorithm. :footcite:`Baker1984` @@ -31,7 +38,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', and :math:`d_{H,A} \le 2.5 \mathring{A}`. Consequently, the given structure must contain hydrogen atoms. Otherwise, no hydrogen bonds will be found. - + Parameters ---------- atoms : AtomArray or AtomArrayStack @@ -60,7 +67,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', boundary conditions. The `box` attribute of `atoms` is required in this case. (Default: False). - + Returns ------- triplets : ndarray, dtype=int, shape=(n,3) @@ -74,7 +81,7 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', *m x n* matrix that shows if an interaction with index *n* in `triplets` is present in the model *m* of the input `atoms`. Only returned if `atoms` is an :class:`AtomArrayStack`. - + Notes ----- The result of this function may include false positives: @@ -84,19 +91,19 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', For example, a nitrogen atom with positive charge could be considered as acceptor atom by this method, although this does make sense from a chemical perspective. - + Examples -------- Calculate the total number of hydrogen bonds found in each model: - + >>> triplets, mask = hbond(atom_array_stack) >>> hbonds_per_model = np.count_nonzero(mask, axis=1) >>> print(hbonds_per_model) [14 14 14 12 11 12 9 13 9 14 13 13 14 11 11 12 11 14 14 13 14 13 15 17 14 12 15 12 12 13 13 13 12 12 11 14 10 11] - + Get hydrogen bond donors of third model: - + >>> # Third model -> index 2 >>> triplets = triplets[mask[2,:]] >>> # First column contains donors @@ -137,12 +144,12 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', single_model = True else: single_model = False - + if periodic: box = atoms.box else: box = None - + # Mask for donor/acceptor elements donor_element_mask = np.isin(atoms.element, donor_elements) acceptor_element_mask = np.isin(atoms.element, acceptor_elements) @@ -152,69 +159,81 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', if selection2 is None: selection2 = np.ones(atoms.array_length(), dtype=bool) - if selection1_type == 'both': + if selection1_type == "both": # The two selections are separated into three selections: # the original ones without the overlaping part # and one containing the overlap - # This prevents redundant triplets and unnecessary computation + # This prevents redundant triplets and unnecessary computation overlap_selection = selection1 & selection2 # Original selections without overlaping part exclusive_selection1 = selection1 & (~overlap_selection) exclusive_selection2 = selection2 & (~overlap_selection) - + # Put selections to list for cleaner iteration - selections = [ - exclusive_selection1, exclusive_selection2, overlap_selection - ] + selections = [exclusive_selection1, exclusive_selection2, overlap_selection] selection_combinations = [ - #(0,0), is not included, would be same selection + # (0,0), is not included, would be same selection # as donor and acceptor simultaneously - (0,1), - (0,2), - (1,0), - #(1,1), # same reason above - (1,2), - (2,0), - (2,1), - (2,2) # overlaping part, combination is necessary + (0, 1), + (0, 2), + (1, 0), + # (1,1), # same reason above + (1, 2), + (2, 0), + (2, 1), + (2, 2), # overlaping part, combination is necessary ] - + all_comb_triplets = [] all_comb_mask = [] for selection_index1, selection_index2 in selection_combinations: donor_mask = selections[selection_index1] acceptor_mask = selections[selection_index2] - if np.count_nonzero(donor_mask) != 0 and \ - np.count_nonzero(acceptor_mask) != 0: - # Calculate triplets and mask - triplets, mask = _hbond( - atoms, donor_mask, acceptor_mask, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box - ) - all_comb_triplets.append(triplets) - all_comb_mask.append(mask) + if ( + np.count_nonzero(donor_mask) != 0 + and np.count_nonzero(acceptor_mask) != 0 + ): + # Calculate triplets and mask + triplets, mask = _hbond( + atoms, + donor_mask, + acceptor_mask, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, + ) + all_comb_triplets.append(triplets) + all_comb_mask.append(mask) # Merge results from all combinations triplets = np.concatenate(all_comb_triplets, axis=0) mask = np.concatenate(all_comb_mask, axis=1) - elif selection1_type == 'donor': + elif selection1_type == "donor": triplets, mask = _hbond( - atoms, selection1, selection2, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box + atoms, + selection1, + selection2, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, ) - - elif selection1_type == 'acceptor': + + elif selection1_type == "acceptor": triplets, mask = _hbond( - atoms, selection2, selection1, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, - box + atoms, + selection2, + selection1, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, ) - + else: raise ValueError(f"Unkown selection type '{selection1_type}'") @@ -228,12 +247,18 @@ def hbond(atoms, selection1=None, selection2=None, selection1_type='both', return triplets, mask -def _hbond(atoms, donor_mask, acceptor_mask, - donor_element_mask, acceptor_element_mask, - cutoff_dist, cutoff_angle, box): - +def _hbond( + atoms, + donor_mask, + acceptor_mask, + donor_element_mask, + acceptor_element_mask, + cutoff_dist, + cutoff_angle, + box, +): # Filter donor/acceptor elements - donor_mask &= donor_element_mask + donor_mask &= donor_element_mask acceptor_mask &= acceptor_element_mask first_model_box = box[0] if box is not None else None @@ -254,47 +279,43 @@ def _hbond(atoms, donor_mask, acceptor_mask, if len(donor_h_i) == 0 or len(acceptor_i) == 0: # Return empty triplets and mask return ( - np.zeros((0,3), dtype=int), - np.zeros((atoms.stack_depth(),0), dtype=bool) + np.zeros((0, 3), dtype=int), + np.zeros((atoms.stack_depth(), 0), dtype=bool), ) - + # Narrow the amount of possible acceptor to donor-H connections # down via the distance cutoff parameter using a cell list # Save in acceptor-to-hydrogen matrix # (true when distance smaller than cutoff) coord = atoms.coord - possible_bonds = np.zeros( - (len(acceptor_i), len(donor_h_i)), - dtype=bool - ) + possible_bonds = np.zeros((len(acceptor_i), len(donor_h_i)), dtype=bool) periodic = False if box is None else True for model_i in range(atoms.stack_depth()): donor_h_coord = coord[model_i, donor_h_mask] acceptor_coord = coord[model_i, acceptor_mask] box_for_model = box[model_i] if box is not None else None cell_list = CellList( - donor_h_coord, cell_size=cutoff_dist, - periodic=periodic, box=box_for_model - ) - possible_bonds |= cell_list.get_atoms_in_cells( - acceptor_coord, as_mask=True + donor_h_coord, cell_size=cutoff_dist, periodic=periodic, box=box_for_model ) + possible_bonds |= cell_list.get_atoms_in_cells(acceptor_coord, as_mask=True) possible_bonds_i = np.where(possible_bonds) # Narrow down acceptor_i = acceptor_i[possible_bonds_i[0]] donor_h_i = donor_h_i[possible_bonds_i[1]] - + # Build D-H..A triplets donor_i = associated_donor_indices[donor_h_i] triplets = np.stack((donor_i, donor_h_i, acceptor_i), axis=1) # Remove entries where donor and acceptor are the same triplets = triplets[donor_i != acceptor_i] - + hbond_mask = _is_hbond( - coord[:, triplets[:,0]], # donors - coord[:, triplets[:,1]], # donor hydrogens - coord[:, triplets[:,2]], # acceptors - box, cutoff_dist=cutoff_dist, cutoff_angle=cutoff_angle + coord[:, triplets[:, 0]], # donors + coord[:, triplets[:, 1]], # donor hydrogens + coord[:, triplets[:, 2]], # acceptors + box, + cutoff_dist=cutoff_dist, + cutoff_angle=cutoff_angle, ) # Reduce output to contain only triplets counted at least once @@ -311,14 +332,14 @@ def _get_bonded_h(array, donor_mask, bonds): all donors in atoms[donor_mask]. A `BondsList` is used for detecting bonded hydrogen atoms. """ - hydrogen_mask = (array.element == "H") - + hydrogen_mask = array.element == "H" + donor_hydrogen_mask = np.zeros(len(array), dtype=bool) associated_donor_indices = np.full(len(array), -1, dtype=int) all_bond_indices, _ = bonds.get_all_bonds() donor_indices = np.where(donor_mask)[0] - + for donor_i in donor_indices: bonded_indices = all_bond_indices[donor_i] # Remove padding values @@ -327,7 +348,7 @@ def _get_bonded_h(array, donor_mask, bonds): bonded_indices = bonded_indices[hydrogen_mask[bonded_indices]] donor_hydrogen_mask[bonded_indices] = True associated_donor_indices[bonded_indices] = donor_i - + return donor_hydrogen_mask, associated_donor_indices @@ -342,22 +363,20 @@ def _get_bonded_h_via_distance(array, donor_mask, box): coord = array.coord res_id = array.res_id - hydrogen_mask = (array.element == "H") - + hydrogen_mask = array.element == "H" + donor_hydrogen_mask = np.zeros(len(array), dtype=bool) associated_donor_indices = np.full(len(array), -1, dtype=int) donor_indices = np.where(donor_mask)[0] for donor_i in donor_indices: candidate_mask = hydrogen_mask & (res_id == res_id[donor_i]) - distances = distance( - coord[donor_i], coord[candidate_mask], box=box - ) + distances = distance(coord[donor_i], coord[candidate_mask], box=box) donor_h_indices = np.where(candidate_mask)[0][distances <= CUTOFF] for i in donor_h_indices: associated_donor_indices[i] = donor_i donor_hydrogen_mask[i] = True - + return donor_hydrogen_mask, associated_donor_indices @@ -378,12 +397,12 @@ def hbond_frequency(mask): The frequency is the amount of models, where the respective bond exists divided by the total amount of models. - + Parameters ---------- mask: ndarray, dtype=bool, shape=(m,n) Input mask obtained from `hbond` function. - + Returns ------- ndarray, dtype=Float @@ -406,4 +425,4 @@ def hbond_frequency(mask): 0.132 0.053 0.026 0.158 0.026 0.868 0.211 0.026 0.921 0.316 0.079 0.237 0.105 0.421 0.079 0.026 1.000 0.053 0.132 0.026 0.184] """ - return mask.sum(axis=0)/len(mask) + return mask.sum(axis=0) / len(mask) diff --git a/src/biotite/structure/info/__init__.py b/src/biotite/structure/info/__init__.py index 4d754a9b8..3c7078ff7 100644 --- a/src/biotite/structure/info/__init__.py +++ b/src/biotite/structure/info/__init__.py @@ -14,8 +14,6 @@ __name__ = "biotite.structure.info" __author__ = "Patrick Kunzmann, Tom David Müller" -from .groups import * - from .atoms import * from .bonds import * from .groups import * diff --git a/src/biotite/structure/info/atoms.py b/src/biotite/structure/info/atoms.py index 6ab063a99..349fb40e4 100644 --- a/src/biotite/structure/info/atoms.py +++ b/src/biotite/structure/info/atoms.py @@ -6,15 +6,16 @@ __author__ = "Patrick Kunzmann" __all__ = ["residue"] -from .ccd import get_ccd +from biotite.structure.info.ccd import get_ccd - -non_hetero_residues = set([ - "ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS", - "ILE","LEU","LYS","MET","PHE","PRO","PYL","SER","THR", - "TRP","TYR","VAL", "SEC", +# fmt: off +NON_HETERO_RESIDUES = set([ + "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", + "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "THR", + "TRP", "TYR", "VAL", "SEC", "A", "DA", "G", "DG", "C", "DC", "U", "DT", ]) +# fmt: on def residue(res_name): @@ -70,13 +71,11 @@ def residue(res_name): ['OXT' 'HXT']] """ # Avoid circular import - from ..io.pdbx import get_component + from biotite.structure.io.pdbx import get_component try: component = get_component(get_ccd(), res_name=res_name) except KeyError: - raise KeyError( - f"No atom information found for residue '{res_name}' in CCD" - ) - component.hetero[:] = res_name not in non_hetero_residues + raise KeyError(f"No atom information found for residue '{res_name}' in CCD") + component.hetero[:] = res_name not in NON_HETERO_RESIDUES return component diff --git a/src/biotite/structure/info/bonds.py b/src/biotite/structure/info/bonds.py index 421058162..fb851d294 100644 --- a/src/biotite/structure/info/bonds.py +++ b/src/biotite/structure/info/bonds.py @@ -6,18 +6,17 @@ __author__ = "Patrick Kunzmann" __all__ = ["bond_type", "bonds_in_residue"] -from ..bonds import BondType -from .ccd import get_from_ccd - +from biotite.structure.bonds import BondType +from biotite.structure.info.ccd import get_from_ccd BOND_TYPES = { - ("SING", "N") : BondType.SINGLE, - ("DOUB", "N") : BondType.DOUBLE, - ("TRIP", "N") : BondType.TRIPLE, - ("QUAD", "N") : BondType.QUADRUPLE, - ("SING", "Y") : BondType.AROMATIC_SINGLE, - ("DOUB", "Y") : BondType.AROMATIC_DOUBLE, - ("TRIP", "Y") : BondType.AROMATIC_TRIPLE, + ("SING", "N"): BondType.SINGLE, + ("DOUB", "N"): BondType.DOUBLE, + ("TRIP", "N"): BondType.TRIPLE, + ("QUAD", "N"): BondType.QUADRUPLE, + ("SING", "Y"): BondType.AROMATIC_SINGLE, + ("DOUB", "Y"): BondType.AROMATIC_DOUBLE, + ("TRIP", "Y"): BondType.AROMATIC_TRIPLE, } _intra_bonds = {} @@ -48,13 +47,13 @@ def bond_type(res_name, atom_name1, atom_name2): Examples -------- - >>> print(bond_type("PHE", "CA", "CB")) - BondType.SINGLE - >>> print(bond_type("PHE", "CG", "CD1")) - BondType.AROMATIC_DOUBLE - >>> print(bond_type("PHE", "CA", "CG")) + >>> print(repr(bond_type("PHE", "CA", "CB"))) + + >>> print(repr(bond_type("PHE", "CG", "CD1"))) + + >>> print(repr(bond_type("PHE", "CA", "CG"))) None - >>> print(bond_type("PHE", "FOO", "BAR")) + >>> print(repr(bond_type("PHE", "FOO", "BAR"))) None """ bonds_for_residue = bonds_in_residue(res_name) @@ -62,8 +61,7 @@ def bond_type(res_name, atom_name1, atom_name2): return None # Try both atom orders bond_type_int = bonds_for_residue.get( - (atom_name1, atom_name2), - bonds_for_residue.get((atom_name2, atom_name1)) + (atom_name1, atom_name2), bonds_for_residue.get((atom_name2, atom_name1)) ) if bond_type_int is not None: return BondType(bond_type_int) @@ -101,30 +99,30 @@ def bonds_in_residue(res_name): >>> bonds = bonds_in_residue("PHE") >>> for atoms, bond_type_int in sorted(bonds.items()): ... atom1, atom2 = sorted(atoms) - ... print(f"{atom1:3} + {atom2:3} -> {str(BondType(bond_type_int))}") - C + O -> BondType.DOUBLE - C + OXT -> BondType.SINGLE - C + CA -> BondType.SINGLE - CA + CB -> BondType.SINGLE - CA + HA -> BondType.SINGLE - CB + CG -> BondType.SINGLE - CB + HB2 -> BondType.SINGLE - CB + HB3 -> BondType.SINGLE - CD1 + CE1 -> BondType.AROMATIC_SINGLE - CD1 + HD1 -> BondType.SINGLE - CD2 + CE2 -> BondType.AROMATIC_DOUBLE - CD2 + HD2 -> BondType.SINGLE - CE1 + CZ -> BondType.AROMATIC_DOUBLE - CE1 + HE1 -> BondType.SINGLE - CE2 + CZ -> BondType.AROMATIC_SINGLE - CE2 + HE2 -> BondType.SINGLE - CD1 + CG -> BondType.AROMATIC_DOUBLE - CD2 + CG -> BondType.AROMATIC_SINGLE - CZ + HZ -> BondType.SINGLE - CA + N -> BondType.SINGLE - H + N -> BondType.SINGLE - H2 + N -> BondType.SINGLE - HXT + OXT -> BondType.SINGLE + ... print(f"{atom1:3} + {atom2:3} -> {BondType(bond_type_int).name}") + C + O -> DOUBLE + C + OXT -> SINGLE + C + CA -> SINGLE + CA + CB -> SINGLE + CA + HA -> SINGLE + CB + CG -> SINGLE + CB + HB2 -> SINGLE + CB + HB3 -> SINGLE + CD1 + CE1 -> AROMATIC_SINGLE + CD1 + HD1 -> SINGLE + CD2 + CE2 -> AROMATIC_DOUBLE + CD2 + HD2 -> SINGLE + CE1 + CZ -> AROMATIC_DOUBLE + CE1 + HE1 -> SINGLE + CE2 + CZ -> AROMATIC_SINGLE + CE2 + HE2 -> SINGLE + CD1 + CG -> AROMATIC_DOUBLE + CD2 + CG -> AROMATIC_SINGLE + CZ + HZ -> SINGLE + CA + N -> SINGLE + H + N -> SINGLE + H2 + N -> SINGLE + HXT + OXT -> SINGLE """ global _intra_bonds if res_name not in _intra_bonds: @@ -137,7 +135,7 @@ def bonds_in_residue(res_name): chem_comp_bond_dict["atom_id_1"], chem_comp_bond_dict["atom_id_2"], chem_comp_bond_dict["value_order"], - chem_comp_bond_dict["pdbx_aromatic_flag"] + chem_comp_bond_dict["pdbx_aromatic_flag"], ): bond_type = BOND_TYPES[order, aromatic_flag] bonds_for_residue[atom1.item(), atom2.item()] = bond_type diff --git a/src/biotite/structure/info/ccd.py b/src/biotite/structure/info/ccd.py index 8942f59ba..d48ab0f68 100644 --- a/src/biotite/structure/info/ccd.py +++ b/src/biotite/structure/info/ccd.py @@ -9,7 +9,6 @@ from pathlib import Path import numpy as np - CCD_DIR = Path(__file__).parent / "ccd" INDEX_COLUMN_NAME = { "chem_comp": "id", @@ -40,7 +39,7 @@ def get_ccd(): """ # Avoid circular import - from ..io.pdbx.bcif import BinaryCIFFile + from biotite.structure.io.pdbx.bcif import BinaryCIFFile global _ccd_block if _ccd_block is None: @@ -104,7 +103,7 @@ def _index_residues(id_column): # The final start is the exclusive stop of last residue residue_starts = np.concatenate(([0], residue_starts, [len(id_column)])) index = {} - for i in range(len(residue_starts)-1): + for i in range(len(residue_starts) - 1): comp_id = id_column[residue_starts[i]].item() - index[comp_id] = (residue_starts[i], residue_starts[i+1]) - return index \ No newline at end of file + index[comp_id] = (residue_starts[i], residue_starts[i + 1]) + return index diff --git a/src/biotite/structure/info/groups.py b/src/biotite/structure/info/groups.py index c719acd3f..781f9c587 100644 --- a/src/biotite/structure/info/groups.py +++ b/src/biotite/structure/info/groups.py @@ -7,8 +7,6 @@ __all__ = ["amino_acid_names", "nucleotide_names", "carbohydrate_names"] from pathlib import Path -import copy - CCD_DIR = Path(__file__).parent / "ccd" @@ -84,4 +82,4 @@ def _get_group_members(group_name): if group_name not in group_lists: with open(CCD_DIR / f"{group_name}.txt", "r") as file: group_lists[group_name] = tuple(file.read().split()) - return group_lists[group_name] \ No newline at end of file + return group_lists[group_name] diff --git a/src/biotite/structure/info/masses.py b/src/biotite/structure/info/masses.py index 73c0b6828..e0ac8cd8d 100644 --- a/src/biotite/structure/info/masses.py +++ b/src/biotite/structure/info/masses.py @@ -8,9 +8,8 @@ import json from pathlib import Path -from ..atoms import Atom, AtomArray, AtomArrayStack -from .ccd import get_from_ccd - +from biotite.structure.atoms import Atom, AtomArray, AtomArrayStack +from biotite.structure.info.ccd import get_from_ccd # Masses are taken from http://www.sbcs.qmul.ac.uk/iupac/AtWt/ (2018/03/01) ATOM_MASSES_FILE = Path(__file__).parent / "atom_masses.json" @@ -109,15 +108,11 @@ def mass(item, is_residue=None): elif isinstance(item, Atom): result_mass = mass(item.element, is_residue=False) elif isinstance(item, AtomArray) or isinstance(item, AtomArrayStack): - result_mass = sum( - (mass(element, is_residue=False) for element in item.element) - ) + result_mass = sum((mass(element, is_residue=False) for element in item.element)) else: - raise TypeError( - f"Cannot calculate mass for {type(item).__name__} objects" - ) + raise TypeError(f"Cannot calculate mass for {type(item).__name__} objects") if result_mass is None: raise KeyError(f"{item} is not known") - return result_mass \ No newline at end of file + return result_mass diff --git a/src/biotite/structure/info/misc.py b/src/biotite/structure/info/misc.py index 2fb9de55a..57e270568 100644 --- a/src/biotite/structure/info/misc.py +++ b/src/biotite/structure/info/misc.py @@ -6,7 +6,7 @@ __author__ = "Patrick Kunzmann" __all__ = ["all_residues", "full_name", "link_type", "one_letter_code"] -from .ccd import get_ccd, get_from_ccd +from biotite.structure.info.ccd import get_ccd, get_from_ccd def all_residues(): diff --git a/src/biotite/structure/info/radii.py b/src/biotite/structure/info/radii.py index 392dd0c00..64ef734bc 100644 --- a/src/biotite/structure/info/radii.py +++ b/src/biotite/structure/info/radii.py @@ -6,9 +6,9 @@ __author__ = "Patrick Kunzmann" __all__ = ["vdw_radius_protor", "vdw_radius_single"] -from .bonds import bonds_in_residue - +from biotite.structure.info.bonds import bonds_in_residue +# fmt: off # Contains tuples for the different ProtOr groups: # Tuple contains: element, valency, H count _PROTOR_RADII = { @@ -35,28 +35,29 @@ _SINGLE_RADII = { "H": 1.20, "HE": 1.40, - + "C": 1.70, "N": 1.55, "O": 1.52, "F": 1.47, "NE": 1.54, - + "SI": 2.10, "P": 1.80, "S": 1.80, "CL": 1.75, "AR": 1.88, - + "AS": 1.85, "SE": 1.90, "BR": 1.85, "KR": 2.02, - + "TE": 2.06, "I": 1.98, "XE": 2.16, } +# fmt: on # A dictionary that caches radii for each residue _protor_radii = {} @@ -82,7 +83,7 @@ def vdw_radius_protor(res_name, atom_name): to. atom_name : str The name of the non-hydrogen atom. - + Returns ------- The Van-der-Waals radius of the given atom. @@ -91,12 +92,12 @@ def vdw_radius_protor(res_name, atom_name): See also -------- vdw_radius_single - + References ---------- - + .. footbibliography:: - + Examples -------- @@ -113,8 +114,7 @@ def vdw_radius_protor(res_name, atom_name): # Use cached radii for the residue, if already calculated if atom_name not in _protor_radii[res_name]: raise KeyError( - f"Residue '{res_name}' does not contain an atom named " - f"'{atom_name}'" + f"Residue '{res_name}' does not contain an atom named " f"'{atom_name}'" ) return _protor_radii[res_name].get(atom_name) else: @@ -124,6 +124,7 @@ def vdw_radius_protor(res_name, atom_name): # are cached return vdw_radius_protor(res_name, atom_name) + def _calculate_protor_radii(res_name): """ Calculate the ProtOr VdW radii for all atoms (atom names) in @@ -159,8 +160,7 @@ def _calculate_protor_radii(res_name): group[2] += 1 groups[main_atom] = group # Get radii based on ProtOr groups - radii = {atom : _PROTOR_RADII.get(tuple(group)) - for atom, group in groups.items()} + radii = {atom: _PROTOR_RADII.get(tuple(group)) for atom, group in groups.items()} return radii @@ -173,25 +173,25 @@ def vdw_radius_single(element): ---------- element : str The chemical element of the atoms. - + Returns ------- The Van-der-Waals radius of the atom. If the radius is unknown for the element, `None` is returned. - + See also -------- vdw_radius_protor - + References ---------- - + .. footbibliography:: - + Examples -------- >>> print(vdw_radius_single("C")) 1.7 """ - return _SINGLE_RADII.get(element.upper()) \ No newline at end of file + return _SINGLE_RADII.get(element.upper()) diff --git a/src/biotite/structure/info/standardize.py b/src/biotite/structure/info/standardize.py index 2b1000265..558b81f41 100644 --- a/src/biotite/structure/info/standardize.py +++ b/src/biotite/structure/info/standardize.py @@ -8,9 +8,9 @@ import warnings import numpy as np -from .ccd import get_from_ccd -from ..residues import get_residue_starts -from ..error import BadStructureError +from biotite.structure.error import BadStructureError +from biotite.structure.info.ccd import get_from_ccd +from biotite.structure.residues import get_residue_starts def standardize_order(atoms): @@ -116,26 +116,24 @@ def standardize_order(atoms): reordered_indices = np.zeros(atoms.array_length(), dtype=int) starts = get_residue_starts(atoms, add_exclusive_stop=True) - for i in range(len(starts)-1): + for i in range(len(starts) - 1): start = starts[i] - stop = starts[i+1] + stop = starts[i + 1] res_name = atoms.res_name[start] - standard_atom_names = get_from_ccd( - "chem_comp_atom", res_name, "atom_id" - ) + standard_atom_names = get_from_ccd("chem_comp_atom", res_name, "atom_id") if standard_atom_names is None: # If the residue is not in the CCD, keep the current order warnings.warn( f"Residue '{res_name}' is not in the CCD, " f"keeping current atom order" ) - reordered_indices[start : stop] = np.arange(start, stop) + reordered_indices[start:stop] = np.arange(start, stop) continue - reordered_indices[start : stop] = _reorder( - atoms.atom_name[start : stop], standard_atom_names - ) + start + reordered_indices[start:stop] = ( + _reorder(atoms.atom_name[start:stop], standard_atom_names) + start + ) return reordered_indices @@ -164,17 +162,13 @@ def _reorder(origin, target): Indices for `origin` that that changes the order of `origin` to the order of `target`. """ - target_hits, origin_hits = np.where( - target[:, np.newaxis] == origin[np.newaxis, :] - ) + target_hits, origin_hits = np.where(target[:, np.newaxis] == origin[np.newaxis, :]) counts = np.bincount(target_hits, minlength=len(target)) if (counts > 1).any(): counts = np.bincount(target_hits, minlength=len(target)) # Identify which atom is duplicate - duplicate_i = np.where( - counts > 1 - )[0][0] + duplicate_i = np.where(counts > 1)[0][0] duplicate_name = target[duplicate_i] raise BadStructureError( f"Input structure has duplicate atom '{duplicate_name}'" @@ -185,12 +179,7 @@ def _reorder(origin, target): # to the target structure # -> Identify which atoms are missing in the target structure # and append these to the end of the residue - missing_atom_mask = np.bincount( - origin_hits, minlength=len(origin) - ).astype(bool) - return np.concatenate([ - origin_hits, - np.where(~missing_atom_mask)[0] - ]) + missing_atom_mask = np.bincount(origin_hits, minlength=len(origin)).astype(bool) + return np.concatenate([origin_hits, np.where(~missing_atom_mask)[0]]) else: - return origin_hits \ No newline at end of file + return origin_hits diff --git a/src/biotite/structure/integrity.py b/src/biotite/structure/integrity.py index 30ebac610..b3e867666 100644 --- a/src/biotite/structure/integrity.py +++ b/src/biotite/structure/integrity.py @@ -9,53 +9,29 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Daniel Bauer" -__all__ = ["check_id_continuity", "check_atom_id_continuity", - "check_res_id_continuity", "check_backbone_continuity", - "check_duplicate_atoms", "check_bond_continuity", - "check_linear_continuity"] +__all__ = [ + "check_atom_id_continuity", + "check_res_id_continuity", + "check_backbone_continuity", + "check_duplicate_atoms", + "check_linear_continuity", +] import numpy as np -import warnings -from .atoms import AtomArray, AtomArrayStack -from .filter import ( - filter_peptide_backbone, filter_phosphate_backbone, filter_linear_bond_continuity) -from .box import coord_to_fraction +from biotite.structure.box import coord_to_fraction +from biotite.structure.filter import ( + filter_linear_bond_continuity, + filter_peptide_backbone, + filter_phosphate_backbone, +) def _check_continuity(array): diff = np.diff(array) - discontinuity = np.where( ((diff != 0) & (diff != 1)) ) + discontinuity = np.where(((diff != 0) & (diff != 1))) return discontinuity[0] + 1 -def check_id_continuity(array): - """ - Check if the residue IDs are incremented by more than 1 or - decremented, from one atom to the next one. - - An increment by more than 1 is as strong clue for missing residues, - a decrement means probably a start of a new chain. - - DEPRECATED: Use :func:`check_res_id_continuity()` instead. - - Parameters - ---------- - array : AtomArray or AtomArrayStack - The array to be checked. - - Returns - ------- - discontinuity : ndarray, dtype=int - Contains the indices of atoms after a discontinuity - """ - warnings.warn( - "'check_id_continuity()' is deprecated, " - "use 'check_res_id_continuity()' instead", - DeprecationWarning - ) - return check_res_id_continuity(array) - - def check_atom_id_continuity(array): """ Check if the atom IDs are incremented by more than 1 or @@ -99,36 +75,6 @@ def check_res_id_continuity(array): return _check_continuity(ids) -def check_bond_continuity(array, min_len=1.2, max_len=1.8): - """ - Check if the peptide or phosphate backbone atoms have a - non-reasonable distance to the next residue. - - A large or very small distance is a very strong clue, that there is - no bond between those atoms, therefore the chain is discontinued. - - DEPRECATED: Please use :func:`check_backbone_continuity` for the same functionality. - - Parameters - ---------- - array : AtomArray - The array to be checked. - min_len, max_len : float, optional - The interval in which the atom-atom distance is evaluated as - bond. - - Returns - ------- - discontinuity : ndarray, dtype=int - Contains the indices of atoms after a discontinuity. - """ - warnings.warn( - "Reimplemented into `check_backbone_continuity()`", - DeprecationWarning - ) - return check_backbone_continuity(array, min_len, max_len) - - def check_linear_continuity(array, min_len=1.2, max_len=1.8): """ Check linear (consecutive) bond continuity of atoms in atom array. @@ -223,8 +169,9 @@ def check_duplicate_atoms(array): The first occurence of an atom is not counted as duplicate. """ duplicates = [] - annots = [array.get_annotation(category) for category - in array.get_annotation_categories()] + annots = [ + array.get_annotation(category) for category in array.get_annotation_categories() + ] for i in range(1, array.array_length()): # Start with assumption that all atoms in the array # until index i are duplicates of the atom at index i @@ -233,7 +180,7 @@ def check_duplicate_atoms(array): # For each annotation array filter out the atoms until # index i that have an unequal annotation # to the atom at index i - is_duplicate &= (annot[:i] == annot[i]) + is_duplicate &= annot[:i] == annot[i] # After checking all annotation arrays, # if there still is any duplicate to the atom at index i, # add i the the list of duplicate atom indices diff --git a/src/biotite/structure/io/__init__.py b/src/biotite/structure/io/__init__.py index 5a7190f3e..610bb23db 100644 --- a/src/biotite/structure/io/__init__.py +++ b/src/biotite/structure/io/__init__.py @@ -18,13 +18,12 @@ The recommended format for reading structure files is *BinaryCIF*. It has by far the shortest parsing time and file size. -Besides the mentioned structure formats, Gromacs trajectory files can be -loaded, if `mdtraj` is installed. +Besides the mentioned structure formats, common trajectory formats can be +loaded as well. """ __name__ = "biotite.structure.io" __author__ = "Patrick Kunzmann" -from .ctab import * from .general import * -from .trajfile import * \ No newline at end of file +from .trajfile import * diff --git a/src/biotite/structure/io/ctab.py b/src/biotite/structure/io/ctab.py deleted file mode 100644 index f2dc61982..000000000 --- a/src/biotite/structure/io/ctab.py +++ /dev/null @@ -1,72 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io" -__author__ = "Patrick Kunzmann" -__all__ = ["read_structure_from_ctab", "write_structure_to_ctab"] - -import warnings -from ..bonds import BondType - - -def read_structure_from_ctab(ctab_lines): - """ - Parse a *MDL* connection table (Ctab) to obtain an - :class:`AtomArray`. :footcite:`Dalby1992`. - - DEPRECATED: Moved to :mod:`biotite.structure.io.mol.ctab`. - - Parameters - ---------- - ctab_lines : lines of str - The lines containing the *ctab*. - Must begin with the *counts* line and end with the `M END` line - - Returns - ------- - atoms : AtomArray - This :class:`AtomArray` contains the optional ``charge`` - annotation and has an associated :class:`BondList`. - - References - ---------- - - .. footbibliography:: - """ - warnings.warn("Moved to biotite.structure.io.mol.ctab", DeprecationWarning) - from biotite.structure.io.mol.ctab import read_structure_from_ctab - return read_structure_from_ctab(ctab_lines) - - -def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY): - """ - Convert an :class:`AtomArray` into a - *MDL* connection table (Ctab). :footcite:`Dalby1992` - - DEPRECATED: Moved to :mod:`biotite.structure.io.mol.ctab`. - - Parameters - ---------- - atoms : AtomArray - The array must have an associated :class:`BondList`. - - Returns - ------- - ctab_lines : lines of str - The lines containing the *ctab*. - The lines begin with the *counts* line and end with the `M END` - .line - default_bond_type : BondType - Bond type fallback in the *Bond block* if a bond has no bond_type - defined in *atoms* array. By default, each bond is treated as - :attr:`BondType.ANY`. - - References - ---------- - - .. footbibliography:: - """ - warnings.warn("Moved to biotite.structure.io.mol.ctab", DeprecationWarning) - from biotite.structure.io.mol.ctab import write_structure_to_ctab - return write_structure_to_ctab(atoms, default_bond_type) diff --git a/src/biotite/structure/io/dcd/__init__.py b/src/biotite/structure/io/dcd/__init__.py index aa5e79366..1145f2376 100644 --- a/src/biotite/structure/io/dcd/__init__.py +++ b/src/biotite/structure/io/dcd/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.dcd" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/dcd/file.py b/src/biotite/structure/io/dcd/file.py index 5aa1071f4..7590b10e4 100644 --- a/src/biotite/structure/io/dcd/file.py +++ b/src/biotite/structure/io/dcd/file.py @@ -6,21 +6,21 @@ __author__ = "Patrick Kunzmann" __all__ = ["DCDFile"] +import biotraj import numpy as np -from ..trajfile import TrajectoryFile -from ...box import vectors_from_unitcell, unitcell_from_vectors +from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell +from biotite.structure.io.trajfile import TrajectoryFile class DCDFile(TrajectoryFile): """ This file class represents a DCD trajectory file. """ - + @classmethod def traj_type(cls): - import mdtraj.formats as traj - return traj.DCDTrajectoryFile - + return biotraj.DCDTrajectoryFile + @classmethod def process_read_values(cls, read_values): # .netcdf files use Angstrom @@ -28,38 +28,40 @@ def process_read_values(cls, read_values): cell_lengths = read_values[1] cell_angles = read_values[2] if cell_lengths is None or cell_angles is None: - box = None + box = None else: box = np.stack( - [vectors_from_unitcell(a, b, c, alpha, beta, gamma) - for (a, b, c), (alpha, beta, gamma) - in zip(cell_lengths, np.deg2rad(cell_angles))], - axis=0 + [ + vectors_from_unitcell(a, b, c, alpha, beta, gamma) + for (a, b, c), (alpha, beta, gamma) in zip( + cell_lengths, np.deg2rad(cell_angles) + ) + ], + axis=0, ) return coord, box, None - + @classmethod def prepare_write_values(cls, coord, box, time): - xyz = coord.astype(np.float32, copy=False) \ - if coord is not None else None + xyz = coord.astype(np.float32, copy=False) if coord is not None else None if box is None: cell_lengths = None - cell_angles = None + cell_angles = None else: cell_lengths = np.zeros((len(box), 3), dtype=np.float32) - cell_angles = np.zeros((len(box), 3), dtype=np.float32) + cell_angles = np.zeros((len(box), 3), dtype=np.float32) for i, model_box in enumerate(box): a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box) cell_lengths[i] = np.array((a, b, c)) cell_angles[i] = np.rad2deg((alpha, beta, gamma)) return { - "xyz" : xyz, - "cell_lengths" : cell_lengths, - "cell_angles" : cell_angles, + "xyz": xyz, + "cell_lengths": cell_lengths, + "cell_angles": cell_angles, } def set_time(self, time): if time is not None: raise NotImplementedError( "This trajectory file does not support writing simulation time" - ) \ No newline at end of file + ) diff --git a/src/biotite/structure/io/general.py b/src/biotite/structure/io/general.py index 1e4c5201c..dd76c99de 100644 --- a/src/biotite/structure/io/general.py +++ b/src/biotite/structure/io/general.py @@ -12,9 +12,9 @@ __all__ = ["load_structure", "save_structure"] import datetime -import os.path import io -from ..atoms import AtomArrayStack +import os.path +from biotite.structure.atoms import AtomArrayStack def load_structure(file_path, template=None, **kwargs): @@ -64,73 +64,63 @@ def load_structure(file_path, template=None, **kwargs): _, suffix = os.path.splitext(file_path) match suffix: case ".pdb": - from .pdb import PDBFile + from biotite.structure.io.pdb import PDBFile + file = PDBFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) case ".pdbqt": - from .pdbqt import PDBQTFile + from biotite.structure.io.pdbqt import PDBQTFile + file = PDBQTFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) case ".cif" | ".pdbx": - from .pdbx import CIFFile, get_structure + from biotite.structure.io.pdbx import CIFFile, get_structure + file = CIFFile.read(file_path) array = get_structure(file, **kwargs) return _as_single_model_if_possible(array) case ".bcif": - from .pdbx import BinaryCIFFile, get_structure + from biotite.structure.io.pdbx import BinaryCIFFile, get_structure + file = BinaryCIFFile.read(file_path) array = get_structure(file, **kwargs) return _as_single_model_if_possible(array) case ".gro": - from .gro import GROFile + from biotite.structure.io.gro import GROFile + file = GROFile.read(file_path) array = file.get_structure(**kwargs) return _as_single_model_if_possible(array) - case ".mmtf": - from .mmtf import MMTFFile, get_structure - file = MMTFFile.read(file_path) - array = get_structure(file, **kwargs) - return _as_single_model_if_possible(array) - case ".npz": - from .npz import NpzFile - file = NpzFile.read(file_path) - array = file.get_structure(**kwargs) - return _as_single_model_if_possible(array) case ".mol": - from .mol import MOLFile + from biotite.structure.io.mol import MOLFile + file = MOLFile.read(file_path) array = file.get_structure(**kwargs) # MOL and SDF files only contain a single model return array case ".sdf" | ".sd": - from .mol import SDFile, get_structure + from biotite.structure.io.mol import SDFile, get_structure + file = SDFile.read(file_path) array = get_structure(file, **kwargs) return array - case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf": + case ".trr" | ".xtc" | ".dcd" | ".netcdf": if template is None: - raise TypeError( - "Template must be specified for trajectory files" - ) + raise TypeError("Template must be specified for trajectory files") # Filter template for atom ids, if an unfiltered template - if ( - "atom_i" in kwargs - and template.shape[-1] != len(kwargs["atom_i"]) - ): + if "atom_i" in kwargs and template.shape[-1] != len(kwargs["atom_i"]): template = template[..., kwargs["atom_i"]] - from .trr import TRRFile - from .xtc import XTCFile - from .tng import TNGFile - from .dcd import DCDFile - from .netcdf import NetCDFFile + from biotite.structure.io.dcd import DCDFile + from biotite.structure.io.netcdf import NetCDFFile + from biotite.structure.io.trr import TRRFile + from biotite.structure.io.xtc import XTCFile + if suffix == ".trr": traj_file_cls = TRRFile if suffix == ".xtc": traj_file_cls = XTCFile - if suffix == ".tng": - traj_file_cls = TNGFile if suffix == ".dcd": traj_file_cls = DCDFile if suffix == ".netcdf": @@ -169,65 +159,60 @@ def save_structure(file_path, array, **kwargs): _, suffix = os.path.splitext(file_path) match suffix: case ".pdb": - from .pdb import PDBFile + from biotite.structure.io.pdb import PDBFile + file = PDBFile() file.set_structure(array, **kwargs) file.write(file_path) case ".pdbqt": - from .pdbqt import PDBQTFile + from biotite.structure.io.pdbqt import PDBQTFile + file = PDBQTFile() file.set_structure(array, **kwargs) file.write(file_path) case ".cif" | ".pdbx": - from .pdbx import CIFFile, set_structure + from biotite.structure.io.pdbx import CIFFile, set_structure + file = CIFFile() set_structure(file, array, **kwargs) file.write(file_path) case ".bcif": - from .pdbx import BinaryCIFFile, set_structure + from biotite.structure.io.pdbx import BinaryCIFFile, set_structure + file = BinaryCIFFile() set_structure(file, array, **kwargs) file.write(file_path) case ".gro": - from .gro import GROFile + from biotite.structure.io.gro import GROFile + file = GROFile() file.set_structure(array, **kwargs) file.write(file_path) - case ".mmtf": - from .mmtf import MMTFFile, set_structure - file = MMTFFile() - set_structure(file, array, **kwargs) - file.write(file_path) - case ".npz": - from .npz import NpzFile - file = NpzFile() - file.set_structure(array, **kwargs) - file.write(file_path) case ".mol": - from .mol import MOLFile + from biotite.structure.io.mol import MOLFile + file = MOLFile() file.set_structure(array, **kwargs) file.header = _mol_header() file.write(file_path) case ".sdf" | ".sd": - from .mol import SDFile, SDRecord, set_structure + from biotite.structure.io.mol import SDFile, SDRecord, set_structure + record = SDRecord() record.set_structure(array, **kwargs) record.header = _mol_header() file = SDFile({"Molecule": record}) file.write(file_path) - case ".trr" | ".xtc" | ".tng" | ".dcd" | ".netcdf": - from .trr import TRRFile - from .xtc import XTCFile - from .tng import TNGFile - from .dcd import DCDFile - from .netcdf import NetCDFFile + case ".trr" | ".xtc" | ".dcd" | ".netcdf": + from biotite.structure.io.dcd import DCDFile + from biotite.structure.io.netcdf import NetCDFFile + from biotite.structure.io.trr import TRRFile + from biotite.structure.io.xtc import XTCFile + if suffix == ".trr": traj_file_cls = TRRFile if suffix == ".xtc": traj_file_cls = XTCFile - if suffix == ".tng": - traj_file_cls = TNGFile if suffix == ".dcd": traj_file_cls = DCDFile if suffix == ".netcdf": @@ -248,10 +233,11 @@ def _as_single_model_if_possible(atoms): def _mol_header(): - from .mol import Header + from biotite.structure.io.mol import Header + return Header( mol_name="Molecule", program="Biotite", time=datetime.datetime.now(), dimensions="3D", - ) \ No newline at end of file + ) diff --git a/src/biotite/structure/io/gro/__init__.py b/src/biotite/structure/io/gro/__init__.py index 8d10671b5..e58ccff55 100644 --- a/src/biotite/structure/io/gro/__init__.py +++ b/src/biotite/structure/io/gro/__init__.py @@ -11,4 +11,4 @@ __name__ = "biotite.structure.io.gro" __author__ = "Daniel Bauer" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/gro/file.py b/src/biotite/structure/io/gro/file.py index 188338e50..8279b639d 100644 --- a/src/biotite/structure/io/gro/file.py +++ b/src/biotite/structure/io/gro/file.py @@ -6,25 +6,27 @@ __author__ = "Daniel Bauer, Patrick Kunzmann" __all__ = ["GROFile"] -import numpy as np -from ...atoms import AtomArray, AtomArrayStack -from ...box import is_orthogonal -from ....file import TextFile, InvalidFileError -from ...repair import infer_elements -from ...error import BadStructureError import copy from datetime import datetime - -_atom_records = {"res_id" : (0, 5), - "res_name" : (5,10), - "atom_name" : (10,15), - "atom_id" : (15,20), - "coord_x" : (20, 28), - "coord_y" : (28, 36), - "coord_z" : (36, 44), - "v_x" : (44, 52), - "v_y" : (52, 60), - "v_z" : (60, 68)} +import numpy as np +from biotite.file import InvalidFileError, TextFile +from biotite.structure.atoms import AtomArray, AtomArrayStack +from biotite.structure.box import is_orthogonal +from biotite.structure.error import BadStructureError +from biotite.structure.repair import infer_elements + +_atom_records = { + "res_id": (0, 5), + "res_name": (5, 10), + "atom_name": (10, 15), + "atom_id": (15, 20), + "coord_x": (20, 28), + "coord_y": (28, 36), + "coord_z": (36, 44), + "v_x": (44, 52), + "v_y": (52, 60), + "v_z": (60, 68), +} class GROFile(TextFile): @@ -48,6 +50,7 @@ class GROFile(TextFile): >>> file.write(os.path.join(path_to_directory, "1l2y_mod.gro")) """ + def get_model_count(self): """ Get the number of models contained in this GRO file. @@ -63,7 +66,6 @@ def get_model_count(self): model_count += 1 return model_count - def get_structure(self, model=None): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the @@ -91,9 +93,7 @@ def get_atom_line_i(model_start_i, model_atom_counts): """ Helper function to get the indices of all atoms for a model """ - return np.arange( - model_start_i+1, model_start_i+1+model_atom_counts - ) + return np.arange(model_start_i + 1, model_start_i + 1 + model_atom_counts) def set_box_dimen(box_param): """ @@ -114,33 +114,31 @@ def set_box_dimen(box_param): return None if len(box_param) == 3: x, y, z = box_param - return np.array([[x,0,0], [0,y,0], [0,0,z]], dtype=float) + return np.array([[x, 0, 0], [0, y, 0], [0, 0, z]], dtype=float) elif len(box_param) == 9: x1, y2, z3, x2, x3, y1, y3, z1, z2 = box_param - return np.array( - [[x1,x2,x3], [y1,y2,y3], [z1,z2,z3]], dtype=float - ) + return np.array([[x1, x2, x3], [y1, y2, y3], [z1, z2, z3]], dtype=float) else: raise InvalidFileError( f"Invalid amount of box parameters: {len(box_param)}" ) # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if _is_int(self.lines[i])], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if _is_int(self.lines[i])], dtype=int + ) # Number of atoms in each model - model_atom_counts = np.array( - [int(self.lines[i]) for i in model_start_i] - ) + model_atom_counts = np.array([int(self.lines[i]) for i in model_start_i]) if model is None: # Check if all models have the same length if np.all(model_atom_counts != model_atom_counts[0]): - raise BadStructureError("The models in the file have unequal " - "amount of atoms, give an explicit " - "model instead") + raise BadStructureError( + "The models in the file have unequal " + "amount of atoms, give an explicit " + "model instead" + ) depth = len(model_start_i) length = model_atom_counts[0] array = AtomArrayStack(depth, length) @@ -159,10 +157,10 @@ def set_box_dimen(box_param): f"the given model {model} does not exist" ) - length = model_atom_counts[model-1] + length = model_atom_counts[model - 1] array = AtomArray(length) - annot_i = get_atom_line_i(model_start_i[model-1], length) + annot_i = get_atom_line_i(model_start_i[model - 1], length) # Replace empty strings for elements with guessed types # i is index in array, line_i is line index @@ -179,27 +177,25 @@ def set_box_dimen(box_param): for i, line_i in enumerate(atom_i): line = self.lines[line_i] # gro files use nm instead of A - array.coord[i,0] = float(line[20:28])*10 - array.coord[i,1] = float(line[28:36])*10 - array.coord[i,2] = float(line[36:44])*10 + array.coord[i, 0] = float(line[20:28]) * 10 + array.coord[i, 1] = float(line[28:36]) * 10 + array.coord[i, 2] = float(line[36:44]) * 10 # Box is stored in last line (after coordinates) box_i = atom_i[-1] + 1 - box_param = [float(e)*10 for e in self.lines[box_i].split()] + box_param = [float(e) * 10 for e in self.lines[box_i].split()] array.box = set_box_dimen(box_param) elif isinstance(array, AtomArrayStack): for m in range(len(model_start_i)): - atom_i = get_atom_line_i( - model_start_i[m], model_atom_counts[m] - ) + atom_i = get_atom_line_i(model_start_i[m], model_atom_counts[m]) for i, line_i in enumerate(atom_i): line = self.lines[line_i] - array.coord[m,i,0] = float(line[20:28])*10 - array.coord[m,i,1] = float(line[28:36])*10 - array.coord[m,i,2] = float(line[36:44])*10 + array.coord[m, i, 0] = float(line[20:28]) * 10 + array.coord[m, i, 1] = float(line[28:36]) * 10 + array.coord[m, i, 2] = float(line[36:44]) * 10 # Box is stored in last line (after coordinates) box_i = atom_i[-1] + 1 - box_param = [float(e)*10 for e in self.lines[box_i].split()] + box_param = [float(e) * 10 for e in self.lines[box_i].split()] box = set_box_dimen(box_param) # Create a box in the stack if not already existing # and the box is not a dummy @@ -210,7 +206,6 @@ def set_box_dimen(box_param): return array - def set_structure(self, array): """ Set the :class:`AtomArray` or :class:`AtomArrayStack` for the @@ -223,6 +218,7 @@ def set_structure(self, array): is given, each array in the stack is saved as separate model. """ + def get_box_dimen(array): """ GRO files have the box dimensions as last line for each @@ -253,10 +249,15 @@ def get_box_dimen(array): else: box = box / 10 box_elements = ( - box[0,0], box[1,1], box[2,2], - box[0,1], box[0,2], - box[1,0], box[1,2], - box[2,0], box[2,1], + box[0, 0], + box[1, 1], + box[2, 2], + box[0, 1], + box[0, 2], + box[1, 0], + box[1, 2], + box[2, 0], + box[2, 1], ) return " ".join([f"{e:>9.5f}" for e in box_elements]) @@ -266,17 +267,11 @@ def get_box_dimen(array): atom_id = np.arange(1, array.array_length() + 1) # Atom IDs are supported up to 99999, # but negative IDs are also possible - gro_atom_id = np.where( - atom_id > 0, - ((atom_id - 1) % 99999) + 1, - atom_id - ) + gro_atom_id = np.where(atom_id > 0, ((atom_id - 1) % 99999) + 1, atom_id) # Residue IDs are supported up to 9999, # but negative IDs are also possible gro_res_id = np.where( - array.res_id > 0, - ((array.res_id - 1) % 99999) + 1, - array.res_id + array.res_id > 0, ((array.res_id - 1) % 99999) + 1, array.res_id ) if isinstance(array, AtomArray): @@ -290,10 +285,14 @@ def get_box_dimen(array): fmt = "{:>5d}{:5s}{:>5s}{:>5d}{:>8.3f}{:>8.3f}{:>8.3f}" for i in range(array.array_length()): # gro format is in nm -> multiply coords by 10 - self.lines[i+2] = fmt.format( - gro_res_id[i], array.res_name[i], array.atom_name[i], - gro_atom_id[i], array.coord[i,0]/10, array.coord[i,1]/10, - array.coord[i,2]/10 + self.lines[i + 2] = fmt.format( + gro_res_id[i], + array.res_name[i], + array.atom_name[i], + gro_atom_id[i], + array.coord[i, 0] / 10, + array.coord[i, 1] / 10, + array.coord[i, 2] / 10, ) # Write box lines self.lines[-1] = get_box_dimen(array) @@ -304,10 +303,11 @@ def get_box_dimen(array): # Therefore template lines are created # which are afterwards applied for each model templines = [None] * array.array_length() - fmt = '{:>5d}{:5s}{:>5s}{:5d}' + fmt = "{:>5d}{:5s}{:>5s}{:5d}" for i in range(array.array_length()): - templines[i] = fmt.format(gro_res_id[i], array.res_name[i], - array.atom_name[i], gro_atom_id[i]) + templines[i] = fmt.format( + gro_res_id[i], array.res_name[i], array.atom_name[i], gro_atom_id[i] + ) for i in range(array.stack_depth()): self.lines.append( @@ -319,10 +319,11 @@ def get_box_dimen(array): modellines = copy.copy(templines) for j, line in enumerate(modellines): # Insert coordinates - line = (line + "{:>8.3f}{:>8.3f}{:>8.3f}".format( - array.coord[i,j,0]/10, - array.coord[i,j,1]/10, - array.coord[i,j,2]/10)) + line = line + "{:>8.3f}{:>8.3f}{:>8.3f}".format( + array.coord[i, j, 0] / 10, + array.coord[i, j, 1] / 10, + array.coord[i, j, 2] / 10, + ) modellines[j] = line self.lines.extend(modellines) self.lines.append(get_box_dimen(array[i])) @@ -340,4 +341,4 @@ def _is_int(string): int(string) return True except ValueError: - return False \ No newline at end of file + return False diff --git a/src/biotite/structure/io/mmtf/__init__.py b/src/biotite/structure/io/mmtf/__init__.py deleted file mode 100644 index e34c923b4..000000000 --- a/src/biotite/structure/io/mmtf/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -""" -This subpackage is used for reading and writing an :class:`AtomArray` or -:class:`AtomArrayStack` using the binary MMTF format. This format -features a smaller file size and a highly increased I/O operation -performance, than the text based file formats. - -DEPRECATED: Use :class:`biotite.structure.io.pdbx.BinaryCIFFile` -instead. -""" - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" - -from .assembly import * -from .file import * -from .convertfile import * -from .convertarray import * \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/assembly.py b/src/biotite/structure/io/mmtf/assembly.py deleted file mode 100644 index 67c391aa0..000000000 --- a/src/biotite/structure/io/mmtf/assembly.py +++ /dev/null @@ -1,214 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["list_assemblies", "get_assembly"] - - -import numpy as np -from .convertfile import get_structure -from ...chains import get_chain_starts -from ...util import matrix_rotate -from ....file import InvalidFileError - - -def list_assemblies(file): - """ - List the biological assemblies that are available for the - structure in the given file. - - This function receives the data from the ``"bioAssemblyList"`` field - in the file. - Consequently, this field must be present in the file. - - Parameters - ---------- - file : MMTFFile - The file object. - - Returns - ------- - assemblies : list of str - A list that contains the available assembly IDs. - - Examples - -------- - >>> import os.path - >>> file = MMTFFile.read(os.path.join(path_to_structures, "1f2n.mmtf")) - >>> print(list_assemblies(file)) - ['1', '2', '3', '4', '5', '6'] - """ - return [assembly["name"] for assembly in file["bioAssemblyList"]] - - -def get_assembly(file, assembly_id=None, model=None, altloc="first", - extra_fields=[], include_bonds=False): - """ - Build the given biological assembly. - - This function receives the data from ``bioAssemblyList`` field in - the file. - Consequently, this field must be present in the file. - - Parameters - ---------- - file : MMTFFile - The file object. - assembly_id : str - The assembly to build. - Available assembly IDs can be obtained via - :func:`list_assemblies()`. - model : int, optional - If this parameter is given, the function will return an - :class:`AtomArray` from the atoms corresponding to the given - model number (starting at 1). - Negative values are used to index models starting from the - last model instead of the first model. - If this parameter is omitted, an :class:`AtomArrayStack` - containing all models will be returned, even if the - structure contains only one model. - altloc : {'first', 'occupancy', 'all'} - This parameter defines how *altloc* IDs are handled: - - ``'first'`` - Use atoms that have the first - *altloc* ID appearing in a residue. - - ``'occupancy'`` - Use atoms that have the *altloc* ID - with the highest occupancy for a residue. - - ``'all'`` - Use all atoms. - Note that this leads to duplicate atoms. - When this option is chosen, the ``altloc_id`` - annotation array is added to the returned structure. - extra_fields : list of str, optional - The strings in the list are optional annotation categories - that should be stored in the output array or stack. - These are valid values: - ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and - ``'charge'``. - include_bonds : bool, optional - If set to true, a :class:`BondList` will be created for the - resulting :class:`AtomArray` containing the bond information - from the file. - All bonds have :attr:`BondType.ANY`, since the PDB format - does not support bond orders. - - Raises - ------ - NotImplementedError - If any transformation required by the assembly only affects a - part of the atoms (not every chain) and the number of chains - as detected by :func:`get_chain_count()` is different from - the ``chainNameList`` field. - This limitation of this function exists, as the - :class:`AtomArray` of the asymmetric unit used for constructing - the assembly has not the chain index information required by the - ``bioAssemblyList`` field. - In short, :func:`get_assembly()` does not work for a significant - portion of the PDB. - If you require reliable assembly building for any PDB entry, - you should use the analogous function for PDB or mmCIF files - instead. - - Returns - ------- - assembly : AtomArray or AtomArrayStack - The assembly. - The return type depends on the `model` parameter. - - Examples - -------- - - >>> import os.path - >>> file = MMTFFile.read(os.path.join(path_to_structures, "1f2n.mmtf")) - >>> assembly = get_assembly(file, model=1) - """ - structure = get_structure( - file, model, altloc, extra_fields, include_bonds - ) - - # Get transformations for chosen assembly - selected_assembly = None - if not "bioAssemblyList" in file: - raise InvalidFileError( - "File does not contain assembly information " - "(missing 'bioAssemblyList')" - ) - for assembly in file["bioAssemblyList"]: - current_assembly_id = assembly["name"] - transform_list = assembly["transformList"] - if assembly_id is None or current_assembly_id == assembly_id: - selected_assembly = transform_list - break - if selected_assembly is None: - raise KeyError( - f"The assembly ID '{assembly_id}' is not found" - ) - - # In most cases the transformations in an assembly applies to all - # atoms equally ('apply_to_all == True') - # If this is the case, the selection of atoms for each - # transformation can be omitted, improving the performance - chain_index_count = len(file["chainNameList"]) - apply_to_all = True - for transformation in selected_assembly: - # If the number of affected chains matches the number of total - # chains, all atoms are affected - if len(transformation["chainIndexList"]) != chain_index_count: - apply_to_all = False - # If the transformations in the assembly do not apply to all atoms, - # but only to certain chains we need the ranges of these chains - # in the base structure (the asymmetric unit) - if not apply_to_all: - chains_starts = get_chain_starts( - structure, add_exclusive_stop=True - ) - # Furthermore the number of chains determined by Biotite via - # 'get_chain_starts()' must corresponds to the number of chains - # in the MMTF file - # If this is not the case the assembly cannot be read using - # this function due to the shortcoming in 'get_structure()' - if len(chains_starts) != chain_index_count: - raise NotImplementedError( - "The structure file is not suitable for this function, as the " - "number of chains in the file do not match the automatically " - "detected number of chains" - ) - - # Apply transformations for set of chains (or all chains) and add - # the transformed atoms to assembly - assembly = None - for transformation in selected_assembly: - if apply_to_all: - affected_coord = structure.coord - else: - # Mask atoms affected by this transformation - affected_mask = np.zeros(structure.array_length(), dtype=bool) - for chain_i in transformation["chainIndexList"]: - chain_start = chains_starts[chain_i] - chain_stop = chains_starts[chain_i+1] - affected_mask[chain_start : chain_stop] = True - affected_coord = structure.coord[..., affected_mask, :] - # Apply the transformation - transformed_coord = _apply_transformation( - affected_coord, transformation["matrix"] - ) - sub_assembly = structure.copy() - sub_assembly.coord = transformed_coord - # Add transformed coordinates to assembly - if assembly is None: - assembly = sub_assembly - else: - assembly += sub_assembly - - return assembly - - -def _apply_transformation(coord, mmtf_matrix): - # Obtain matrix from flattened form - matrix = np.array(mmtf_matrix).reshape(4, 4) - # Separate rotation and translation part - rotation = matrix[:3, :3] - translation = matrix[:3, 3] - coord = matrix_rotate(coord, rotation) - coord += translation - return coord \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/convertarray.pyx b/src/biotite/structure/io/mmtf/convertarray.pyx deleted file mode 100644 index 6202701e1..000000000 --- a/src/biotite/structure/io/mmtf/convertarray.pyx +++ /dev/null @@ -1,341 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["set_structure"] - -cimport cython -cimport numpy as np - -import numpy as np -from .file import MMTFFile -from ...atoms import Atom, AtomArray, AtomArrayStack -from ...bonds import BondList -from ...error import BadStructureError -from ...residues import get_residue_starts -from ...box import unitcell_from_vectors -from ...info.misc import link_type - -ctypedef np.int8_t int8 -ctypedef np.int16_t int16 -ctypedef np.int32_t int32 -ctypedef np.uint8_t uint8 -ctypedef np.uint16_t uint16 -ctypedef np.uint32_t uint32 -ctypedef np.uint64_t uint64 -ctypedef np.float32_t float32 - - -def set_structure(file, array): - """ - set_structure(file, array) - - Set the relevant fields of an MMTF file with the content of an - :class:`AtomArray` or :class:`AtomArrayStack`. - - All required and some optional fields of the MMTF file will be set - or overriden if the field does already exist. Fields are removed - when they are optional and when setting the structure information - could invalidate its content (e.g. altLocList). - - Parameters - ---------- - file : MMTFFile - The file object. - array : AtomArray or AtomArrayStack - The structure to be written. If a stack is given, each array in - the stack will be in a separate model. - - Notes - ----- - As the MMTF format only supports one unit cell, individual unit - cells for each model are not supported. - Instead only the first box in an :class:`AtomArrayStack` is written - into the file. - - Examples - -------- - - >>> import os.path - >>> file = MMTFFile() - >>> set_structure(file, atom_array) - >>> file.write(os.path.join(path_to_directory, "structure.mmtf")) - - """ - cdef bint include_bonds = (array.bonds is not None) - - cdef int i=0, j=0 - cdef array_length = array.array_length() - - - # Get annotation arrays from atom array (stack) - cdef np.ndarray arr_chain_id = array.chain_id - cdef np.ndarray arr_res_id = array.res_id - cdef np.ndarray arr_ins_code = array.ins_code - cdef np.ndarray arr_res_name = array.res_name - cdef np.ndarray arr_hetero = array.hetero - cdef np.ndarray arr_atom_name = array.atom_name - cdef np.ndarray arr_element = array.element - cdef np.ndarray arr_charge = None - if "charge" in array.get_annotation_categories(): - arr_charge = array.charge - - - # Residue start indices - # Since the stop of i is the start of i+1, - # The exclusive end of the atom array is appended - # to enable convenient usage in the following loops - cdef np.ndarray starts = np.append(get_residue_starts(array), - [array_length]) - - - ### Preparing the group list ### - # List of 'groupType' dictsfor setting the file's 'groupList' - cdef list residues - # Maps 'groupType' values (not the keys) to the index in 'residues' - # Necessary a 'groupType' are dictionaries, which are not hashable - cdef dict residue_dict - # An entry in 'residues' - cdef dict group_type - # An entry in 'residue_dict' - cdef tuple hashable_group_type - # Index to list of residues - cdef int residue_i - # List of indices to list of residues - cdef np.ndarray res_types - # Start and exclusive stop of on residue interval - cdef int start - cdef int stop - # Amount of atoms in a residue - cdef int res_length - # Name of a residue - cdef res_name - # BondList for inter-residue bonds - # intra-residue bonds are successively removed - if include_bonds: - inter_bonds = array.bonds.copy() - # 'len(starts)-1' since 'starts' has the end - # of the atom array appended - res_types = np.zeros(len(starts)-1, dtype=np.int32) - residues = [] - residue_dict = {} - for i in range(len(starts)-1): - start = starts[i] - stop = starts[i+1] - res_length = stop - start - res_name = arr_res_name[start] - # Get intra-residue bonds of this residue - if include_bonds: - intra_bonds = array.bonds[start:stop] - - # Create 'groupType' dictionary for current residue - group_type = {} - group_type["atomNameList"] = tuple( - arr_atom_name[start:stop].tolist() - ) - group_type["elementList"] = tuple( - [e.capitalize() for e in arr_element[start:stop]] - ) - if arr_charge is not None: - group_type["formalChargeList"] = tuple( - arr_charge[start:stop].tolist() - ) - else: - group_type["formalChargeList"] = (0,) * (stop-start) - group_type["groupName"] = res_name - link = link_type(res_name) - # Use 'NON-POLYMER' as default - if link is None: - link = "NON-POLYMER" - group_type["chemCompType"] = link - # Add intra-residue bonds - if include_bonds: - intra_bonds = array.bonds[start:stop] - bond_array = intra_bonds.as_array() - group_type["bondAtomList"] = tuple( - bond_array[:,:2].flatten().tolist() - ) - group_type["bondOrderList"] = tuple( - bond_array[:,2].tolist() - ) - else: - group_type["bondAtomList"] = () - group_type["bondOrderList"] = () - - # Find index of current residue in later 'groupList' - hashable_group_type = tuple(group_type.values()) - residue_i = residue_dict.get(hashable_group_type, -1) - if residue_i == -1: - # Add new residue if not yet existing in 'groupList' - residue_i = len(residues) - residues.append(group_type) - residue_dict[hashable_group_type] = residue_i - - # Remove intra-residue bonds from all bonds - # to obtain inter-residue bonds - # If the residue is already known is irrelevant for this case - if include_bonds: - # Offset is required to obtain original indices - # for bond removal - intra_bonds.offset_indices(start) - inter_bonds.remove_bonds(intra_bonds) - # Put new or already known residue to sequence of residue types - res_types[i] = residue_i - - - ### Convert annotation arrays into MMTF arrays ### - # Pessimistic assumption on length of arrays - # -> At maximum as large as atom array - cdef np.ndarray chain_names = np.zeros(array_length, dtype="U4") - cdef np.ndarray res_per_chain = np.zeros(array_length, dtype=np.int32) - # Variables for storing last and current chain ID - cdef last_chain_id = arr_chain_id[0] - cdef curr_chain_id - # Counter for chain length - cdef int res_counter = 0 - i = 0 - j = 0 - for i in range(len(starts)-1): - start = starts[i] - curr_chain_id = arr_chain_id[start] - if curr_chain_id != last_chain_id: - # New chain - chain_names[j] = last_chain_id - res_per_chain[j] = res_counter - last_chain_id = curr_chain_id - # Reset residue-per-chain counter - res_counter = 1 - j += 1 - else: - res_counter += 1 - # Add last element - chain_names[j] = last_chain_id - res_per_chain[j] = res_counter - j += 1 - # Trim to correct size - chain_names = chain_names[:j] - res_per_chain = res_per_chain[:j] - # Residue IDs from residue starts - cdef np.ndarray res_ids = arr_res_id[starts[:-1]].astype(np.int32) - cdef np.ndarray res_inscodes - res_inscodes = arr_ins_code[starts[:-1]] - - ### Adapt arrays for multiple models - cdef int model_count = 1 - cdef int chains_per_model = len(chain_names) - if isinstance(array, AtomArrayStack): - # Multi-model - model_count = array.stack_depth() - chain_names = np.tile(chain_names, model_count) - res_per_chain = np.tile(res_per_chain, model_count) - res_ids = np.tile(res_ids, model_count) - res_inscodes = np.tile(res_inscodes, model_count) - res_types = np.tile(res_types, model_count) - - - ### Remove arrays from file ### - # Arrays are removed if they are optional - # and if setting the structure information invalidates its content - _delete_record(file, "bondAtomList") - _delete_record(file, "bondOrderList") - _delete_record(file, "bFactorList") - _delete_record(file, "atomIdList") - _delete_record(file, "altLocList") - _delete_record(file, "occupancyList") - _delete_record(file, "secStructList") - _delete_record(file, "insCodeList") - - - ### Put prepared arrays into file ### - cdef np.ndarray coord - if isinstance(array, AtomArrayStack): - coord = array.coord.reshape( - (array.stack_depth() * array.array_length(), 3) - ).astype(np.float32, copy=False) - else: - coord = array.coord.astype(np.float32, copy=False) - file.set_array("xCoordList", coord[:,0], codec=10, param=1000) - file.set_array("yCoordList", coord[:,1], codec=10, param=1000) - file.set_array("zCoordList", coord[:,2], codec=10, param=1000) - - file["numModels"] = model_count - file["chainsPerModel"] = [chains_per_model] * model_count - file["numChains"] = len(chain_names) - file.set_array("chainNameList", chain_names, codec=5, param=4) - file.set_array("chainIdList", chain_names, codec=5, param=4) - file["groupsPerChain"] = res_per_chain.tolist() - file["numGroups"] = len(res_ids) - file.set_array("groupIdList", res_ids, codec=8) - file.set_array("insCodeList", res_inscodes, codec=6) - file.set_array("groupTypeList", res_types, codec=4) - file["groupList"] = residues - file["numAtoms"] = model_count * array_length - - # Optional annotation arrays - categories = array.get_annotation_categories() - if "atom_id" in categories: - file.set_array("atomIdList", - np.tile(array.atom_id.astype(np.int32), model_count), - codec=8) - if "b_factor" in categories: - file.set_array("bFactorList", - np.tile(array.b_factor.astype(np.float32), model_count), - codec=10, param=100) - if "occupancy" in categories: - file.set_array("occupancyList", - np.tile(array.occupancy.astype(np.float32), model_count), - codec=9, param=100) - - - ### Add inter-residue bonds ### - if include_bonds: - all_inter_bonds = inter_bonds - # Repeat the inter-residue bonds for each additional model - for i in range(model_count-1): - all_inter_bonds += inter_bonds - bond_array = all_inter_bonds.as_array() - file.set_array("bondAtomList", - bond_array[:,:2].flatten().astype(np.int32), - codec=4) - file.set_array("bondOrderList", - bond_array[:,2].astype(np.int8), - codec=2) - file["numBonds"] = array.bonds.get_bond_count() * model_count - else: - file["numBonds"] = 0 - - - ### Add unit cell ### - if array.box is not None: - if isinstance(array, AtomArray): - box = array.box - elif isinstance(array, AtomArrayStack): - # Use box of first model, since MMTF does not support - # multiple boxes - box = array.box[0] - len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box) - file["unitCell"] = [ - len_a, len_b, len_c, - np.rad2deg(alpha), np.rad2deg(beta), np.rad2deg(gamma) - ] - - - ### Add additional information ### - # Only set additional information, if not already set - try: - val = file["mmtfVersion"] - except KeyError: - file["mmtfVersion"] = "1.0.0" - try: - val = file["mmtfProducer"] - except KeyError: - file["mmtfProducer"] = "UNKNOWN" - - -def _delete_record(file, record): - try: - del file[record] - except: - pass \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/convertfile.pyx b/src/biotite/structure/io/mmtf/convertfile.pyx deleted file mode 100644 index b3d33251d..000000000 --- a/src/biotite/structure/io/mmtf/convertfile.pyx +++ /dev/null @@ -1,501 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["get_model_count", "get_structure"] - -cimport cython -cimport numpy as np - -import numpy as np -from .file import MMTFFile -from ...atoms import Atom, AtomArray, AtomArrayStack -from ...bonds import BondList -from ...error import BadStructureError -from ...filter import filter_first_altloc, filter_highest_occupancy_altloc -from ...residues import get_residue_starts -from ...box import vectors_from_unitcell -from ....file import InvalidFileError - -ctypedef np.int8_t int8 -ctypedef np.int16_t int16 -ctypedef np.int32_t int32 -ctypedef np.uint8_t uint8 -ctypedef np.uint16_t uint16 -ctypedef np.uint32_t uint32 -ctypedef np.uint64_t uint64 -ctypedef np.float32_t float32 - - -def get_model_count(file): - """ - Get the number of models contained in a MMTF file. - - Parameters - ---------- - file : MMTFFile - The file object. - - Returns - ------- - model_count : int - The number of models. - """ - return file["numModels"] - - -def get_structure(file, model=None, altloc="first", - extra_fields=[], include_bonds=False): - """ - get_structure(file, model=None, altloc=[], extra_fields=[], - include_bonds=False) - - Get an :class:`AtomArray` or :class:`AtomArrayStack` from the MMTF file. - - Parameters - ---------- - file : MMTFFile - The file object. - model : int, optional - If this parameter is given, the function will return an - :class:`AtomArray` from the atoms corresponding to the given - model number (starting at 1). - Negative values are used to index models starting from the last - model insted of the first model. - If this parameter is omitted, an :class:`AtomArrayStack` - containing all models will be returned, even if the structure - contains only one model. - altloc : {'first', 'occupancy', 'all'} - This parameter defines how *altloc* IDs are handled: - - ``'first'`` - Use atoms that have the first *altloc* ID - appearing in a residue. - - ``'occupancy'`` - Use atoms that have the *altloc* ID - with the highest occupancy for a residue. - - ``'all'`` - Use all atoms. - Note that this leads to duplicate atoms. - When this option is chosen, the ``altloc_id`` annotation - array is added to the returned structure. - extra_fields : list of str, optional - The strings in the list are optional annotation categories - that should be stored in the output array or stack. - These are valid values: - ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``. - include_bonds : bool, optional - If set to true, a :class:`BondList` will be created for the - resulting :class:`AtomArray` containing the bond information - from the file. - - Returns - ------- - array : AtomArray or AtomArrayStack - The return type depends on the `model` parameter. - - Examples - -------- - - >>> import os.path - >>> file = MMTFFile.read(os.path.join(path_to_structures, "1l2y.mmtf")) - >>> array = get_structure(file, model=1) - >>> print(array.array_length()) - 304 - >>> stack = get_structure(file) - >>> print(stack.stack_depth(), stack.array_length()) - 38 304 - """ - cdef int i, j, m - - - # Obtain (and potentially decode) required arrays/values from file - cdef int atom_count = file["numAtoms"] - cdef int model_count = file["numModels"] - cdef np.ndarray chain_names = file["chainNameList"] - cdef int32[:] chains_per_model = np.array(file["chainsPerModel"], np.int32) - cdef int32[:] res_per_chain = np.array(file["groupsPerChain"], np.int32) - cdef int32[:] res_type_i = file["groupTypeList"] - cdef np.ndarray index_list = file["groupIdList"] - cdef int32[:] res_ids = index_list - cdef np.ndarray x_coord = file["xCoordList"] - cdef np.ndarray y_coord = file["yCoordList"] - cdef np.ndarray z_coord = file["zCoordList"] - cdef np.ndarray occupancy = file.get("occupancyList") - cdef np.ndarray b_factor - if "b_factor" in extra_fields: - b_factor = file["bFactorList"] - cdef np.ndarray atom_ids - if "atom_id" in extra_fields: - atom_ids = file["atomIdList"] - cdef np.ndarray all_altloc_ids - cdef np.ndarray inscode - all_altloc_ids = file.get("altLocList") - inscode = file.get("insCodeList") - - - # Create arrays from 'groupList' list of dictionaries - cdef list group_list = file["groupList"] - cdef list non_hetero_list = ["L-PEPTIDE LINKING", "PEPTIDE LINKING", - "DNA LINKING", "RNA LINKING"] - # Determine per-residue-count and maximum count - # of atoms in each residue - cdef np.ndarray atoms_per_res = np.zeros(len(group_list), dtype=np.int32) - for i in range(len(group_list)): - atoms_per_res[i] = len(group_list[i]["atomNameList"]) - cdef int32 max_atoms_per_res = np.max(atoms_per_res) - # Create the arrays - cdef np.ndarray res_names = np.zeros(len(group_list), dtype="U5") - cdef np.ndarray hetero_res = np.zeros(len(group_list), dtype=bool) - cdef np.ndarray atom_names = np.zeros((len(group_list), max_atoms_per_res), - dtype="U6") - cdef np.ndarray elements = np.zeros((len(group_list), max_atoms_per_res), - dtype="U2") - cdef np.ndarray charges = np.zeros((len(group_list), max_atoms_per_res), - dtype=np.int32) - # Fill the arrays - for i in range(len(group_list)): - residue = group_list[i] - res_names[i] = residue["groupName"] - hetero_res[i] = (residue["chemCompType"] not in non_hetero_list) - atom_names[i, :atoms_per_res[i]] = residue["atomNameList"] - elements[i, :atoms_per_res[i]] = residue["elementList"] - charges[i, :atoms_per_res[i]] = residue["formalChargeList"] - - - # Create the atom array (stack) - cdef int depth, length - cdef int start_i, stop_i - cdef bint extra_charge - cdef np.ndarray altloc_ids - cdef np.ndarray inscode_array - - - if model == None: - lengths = _get_model_lengths(res_type_i, chains_per_model, - res_per_chain, atoms_per_res) - # Check if each model has the same amount of atoms - # If not, raise exception - if (lengths != lengths[0]).any(): - raise InvalidFileError("The models in the file have unequal " - "amount of atoms, give an explicit " - "model instead") - length = lengths[0] - - depth = model_count - - - array = AtomArrayStack(depth, length) - array.coord = np.stack( - [x_coord, - y_coord, - z_coord], - axis=1 - ).reshape(depth, length, 3) - - # Create altloc array for the final filtering - if all_altloc_ids is not None: - altloc_ids = all_altloc_ids[:length] - else: - altloc_ids = None - - extra_charge = False - if "ins_code" in extra_fields: - extra_inscode = True - array.add_annotation("ins_code", "U1") - if "charge" in extra_fields: - extra_charge = True - array.add_annotation("charge", int) - if "atom_id" in extra_fields: - array.set_annotation("atom_id", atom_ids[:length]) - if "b_factor" in extra_fields: - array.set_annotation("b_factor", b_factor[:length]) - if "occupancy" in extra_fields: - array.set_annotation("occupancy", occupancy[:length]) - - _fill_annotations(1, array, extra_charge, - chain_names, chains_per_model, res_per_chain, - res_type_i, res_ids, inscode, atoms_per_res, - res_names, hetero_res, atom_names, elements, charges) - - if include_bonds: - array.bonds = _create_bond_list( - 1, file["bondAtomList"], file["bondOrderList"], - 0, length, file["numAtoms"], group_list, res_type_i, - atoms_per_res, res_per_chain, chains_per_model - ) - - - else: - lengths = _get_model_lengths(res_type_i, chains_per_model, - res_per_chain, atoms_per_res) - if model == 0: - raise ValueError("The model index must not be 0") - # Negative models mean model index starting from last model - model = len(lengths) + model + 1 if model < 0 else model - if model > len(lengths): - raise ValueError( - f"The file has {len(lengths)} models, " - f"the given model {model} does not exist" - ) - - length = lengths[model-1] - # Indices to filter coords and some annotations - # for the specified model - start_i = np.sum(lengths[:model-1]) - stop_i = start_i + length - - array = AtomArray(length) - array.coord[:,0] = x_coord[start_i : stop_i] - array.coord[:,1] = y_coord[start_i : stop_i] - array.coord[:,2] = z_coord[start_i : stop_i] - - # Create altloc array for the final filtering - if all_altloc_ids is not None: - altloc_ids = np.array(all_altloc_ids[start_i : stop_i], dtype="U1") - else: - altloc_ids = None - - extra_charge = False - if "charge" in extra_fields: - extra_charge = True - array.add_annotation("charge", int) - if "atom_id" in extra_fields: - array.set_annotation("atom_id", atom_ids[start_i : stop_i]) - if "b_factor" in extra_fields: - array.set_annotation("b_factor", b_factor[start_i : stop_i]) - if "occupancy" in extra_fields: - array.set_annotation("occupancy", occupancy[start_i : stop_i]) - - _fill_annotations(model, array, extra_charge, - chain_names, chains_per_model, res_per_chain, - res_type_i, res_ids, inscode, atoms_per_res, - res_names, hetero_res, atom_names, elements, charges) - - if include_bonds: - array.bonds = _create_bond_list( - model, file["bondAtomList"], file["bondOrderList"], - start_i, stop_i, file["numAtoms"], group_list, res_type_i, - atoms_per_res, res_per_chain, chains_per_model - ) - - # Get box - if "unitCell" in file: - a_len, b_len, c_len, alpha, beta, gamma = file["unitCell"] - alpha = np.deg2rad(alpha) - beta = np.deg2rad(beta ) - gamma = np.deg2rad(gamma) - box = vectors_from_unitcell( - a_len, b_len, c_len, alpha, beta, gamma - ) - if isinstance(array, AtomArrayStack): - array.box = np.repeat( - box[np.newaxis, ...], array.stack_depth(), axis=0 - ) - else: - # AtomArray - array.box = box - - - # Filter altloc IDs and return - if altloc_ids is None: - return array - elif altloc == "occupancy" and occupancy is not None: - return array[ - ..., - filter_highest_occupancy_altloc(array, altloc_ids, occupancy) - ] - # 'first' is also fallback if file has no occupancy information - elif altloc == "first": - return array[..., filter_first_altloc(array, altloc_ids)] - elif altloc == "all": - array.set_annotation("altloc_id", altloc_ids) - return array - else: - raise ValueError(f"'{altloc}' is not a valid 'altloc' option") - - -def _get_model_lengths(int32[:] res_type_i, - int32[:] chains_per_model, - int32[:] res_per_chain, - int32[:] atoms_per_res): - cdef int[:] model_lengths = np.zeros(len(chains_per_model), np.int32) - cdef int atom_count = 0 - cdef int model_i = 0 - cdef int chain_i = 0 - cdef int res_i - cdef int res_count_in_chain = 0 - cdef int chain_count_in_model = 0 - # The length of 'res_type_i' - # is equal to the total number of residues - for res_i in range(res_type_i.shape[0]): - atom_count += atoms_per_res[res_type_i[res_i]] - res_count_in_chain += 1 - if res_count_in_chain == res_per_chain[chain_i]: - # Chain is full -> Bump chain index and reset residue count - res_count_in_chain = 0 - chain_i += 1 - chain_count_in_model += 1 - if chain_count_in_model == chains_per_model[model_i]: - # Model is full -> Bump model index and reset chain count - chain_count_in_model = 0 - model_lengths[model_i] = atom_count - # Restart counting for the next model - atom_count = 0 - model_i += 1 - return np.asarray(model_lengths) - - -def _fill_annotations(int model, array, - bint extra_charge, - np.ndarray chain_names, - int32[:] chains_per_model, - int32[:] res_per_chain, - int32[:] res_type_i, - int32[:] res_ids, - np.ndarray res_inscodes, - np.ndarray atoms_per_res, - np.ndarray res_names, - np.ndarray hetero_res, - np.ndarray atom_names, - np.ndarray elements, - np.ndarray charges): - # Get annotation arrays from atom array (stack) - cdef np.ndarray chain_id = array.chain_id - cdef np.ndarray res_id = array.res_id - cdef np.ndarray ins_code = array.ins_code - cdef np.ndarray res_name = array.res_name - cdef np.ndarray hetero = array.hetero - cdef np.ndarray atom_name = array.atom_name - cdef np.ndarray element = array.element - if extra_charge: - charge = array.charge - - cdef int model_i = 0 - cdef int chain_i = 0 - cdef int res_i - cdef int atom_i = 0 - cdef int res_count_in_chain = 0 - cdef int chain_count_in_model = 0 - cdef int atom_index_in_res - - cdef chain_id_for_chain - cdef res_name_for_res - cdef inscode_for_res - cdef bint hetero_for_res - cdef int res_id_for_res - cdef int type_i - - # The length of 'res_type_i' - # is equal to the total number of residues - for res_i in range(res_type_i.shape[0]): - # Wait for the data of the given model - if model_i == model-1: - chain_id_for_chain = chain_names[chain_i] - res_id_for_res = res_ids[res_i] - if res_inscodes is not None: - inscode_for_res = res_inscodes[res_i] - type_i = res_type_i[res_i] - res_name_for_res = res_names[type_i] - hetero_for_res = hetero_res[type_i] - - for atom_index_in_res in range(atoms_per_res[type_i]): - chain_id[atom_i] = chain_id_for_chain - res_id[atom_i] = res_id_for_res - ins_code[atom_i] = inscode_for_res - hetero[atom_i] = hetero_for_res - res_name[atom_i] = res_name_for_res - atom_name[atom_i] = atom_names[type_i][atom_index_in_res] - element[atom_i] = elements[type_i][atom_index_in_res].upper() - if extra_charge: - charge[atom_i] = charges[type_i][atom_index_in_res] - atom_i += 1 - - elif model_i > model-1: - # The given model has already been parsed - # -> parsing is finished - break - - res_count_in_chain += 1 - if res_count_in_chain == res_per_chain[chain_i]: - # Chain is full -> Bump chain index and reset residue count - res_count_in_chain = 0 - chain_i += 1 - chain_count_in_model += 1 - if chain_count_in_model == chains_per_model[model_i]: - # Model is full -> Bump model index and reset chain count - chain_count_in_model = 0 - model_i += 1 - - -def _create_bond_list(int model, np.ndarray bonds, np.ndarray bond_types, - int model_start, int model_stop, int atom_count, - list group_list, int32[:] res_type_i, - int32[:] atoms_per_res, - int32[:] res_per_chain, int32[:] chains_per_model): - cdef int i=0, j=0 - - # Determine per-residue-count and maximum count - # of bonds in each residue - cdef int32[:] bonds_per_res = np.zeros(len(group_list), dtype=np.int32) - for i in range(len(group_list)): - bonds_per_res[i] = len(group_list[i]["bondOrderList"]) - cdef int32 max_bonds_per_res = np.max(bonds_per_res) - - # Create arrays for intra-residue bonds and bond types - cdef np.ndarray intra_bonds = np.zeros( - (len(group_list), max_bonds_per_res, 3), dtype=np.uint32 - ) - # Dictionary for groupList entry - cdef dict residue - # Fill the array - for i in range(len(group_list)): - residue = group_list[i] - bonds_in_residue = np.array(residue["bondAtomList"], dtype=np.uint32) - intra_bonds[i, :bonds_per_res[i], :2] = \ - np.array(residue["bondAtomList"], dtype=np.uint32).reshape((-1, 2)) - intra_bonds[i, :bonds_per_res[i], 2] = residue["bondOrderList"] - - # Unify intra-residue bonds to one BondList - cdef int model_i = 0 - cdef int chain_i = 0 - cdef int res_i - cdef int res_count_in_chain = 0 - cdef int chain_count_in_model = 0 - cdef int type_i - intra_bond_list = BondList(0) - # The length of 'res_type_i' - # is equal to the total number of residues - for res_i in range(res_type_i.shape[0]): - # Wait for the data of the given model - if model_i == model-1: - type_i = res_type_i[res_i] - bond_list_per_res = BondList( - atoms_per_res[type_i], - intra_bonds[type_i, :bonds_per_res[type_i]] - ) - intra_bond_list += bond_list_per_res - - elif model_i > model-1: - # The given model has already been parsed - # -> parsing is finished - break - - res_count_in_chain += 1 - if res_count_in_chain == res_per_chain[chain_i]: - # Chain is full -> Bump chain index and reset residue count - res_count_in_chain = 0 - chain_i += 1 - chain_count_in_model += 1 - if chain_count_in_model == chains_per_model[model_i]: - # Model is full -> Bump model index and reset chain count - chain_count_in_model = 0 - model_i += 1 - - # Add inter-residue bonds to BondList - cdef np.ndarray inter_bonds = np.zeros((len(bond_types), 3), - dtype=np.uint32) - inter_bonds[:,:2] = bonds.reshape((len(bond_types), 2)) - inter_bonds[:,2] = bond_types - inter_bond_list = BondList(atom_count, inter_bonds) - inter_bond_list = inter_bond_list[model_start : model_stop] - global_bond_list = inter_bond_list.merge(intra_bond_list) - return global_bond_list \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/decode.pyx b/src/biotite/structure/io/mmtf/decode.pyx deleted file mode 100644 index 3649a947a..000000000 --- a/src/biotite/structure/io/mmtf/decode.pyx +++ /dev/null @@ -1,152 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["decode_array"] - -cimport cython -cimport numpy as np - -import numpy as np - -ctypedef np.int8_t int8 -ctypedef np.int16_t int16 -ctypedef np.int32_t int32 -ctypedef np.uint8_t uint8 -ctypedef np.uint16_t uint16 -ctypedef np.uint32_t uint32 -ctypedef np.uint64_t uint64 -ctypedef np.float32_t float32 - - -def decode_array(int codec, bytes raw_bytes, int param): - cdef np.ndarray array - # Pass-through: 32-bit floating-point number array - if codec == 1: - array = np.frombuffer(raw_bytes, dtype=">f4").astype(np.float32) - return array - # Pass-through: 8-bit signed integer array - elif codec == 2: - array = np.frombuffer(raw_bytes, dtype=">i1").astype(np.int8) - return array - # Pass-through: 16-bit signed integer array - elif codec == 3: - array = np.frombuffer(raw_bytes, dtype=">i2").astype(np.int16) - return array - # Pass-through: 32-bit signed integer array - elif codec == 4: - array = np.frombuffer(raw_bytes, dtype=">i4").astype(np.int32) - return array - # UTF8/ASCII fixed-length string array - elif codec == 5: - array = np.frombuffer(raw_bytes, np.dtype("S" + str(param))) - return array.astype(np.dtype("U" + str(param))) - # Run-length encoded character array - elif codec == 6: - array = np.frombuffer(raw_bytes, dtype=">i4").astype(np.int32) - return np.frombuffer(_decode_run_length(array), dtype="U1") - # Run-length encoded 32-bit signed integer array - elif codec == 7: - array = np.frombuffer(raw_bytes, dtype=">i4").astype(np.int32) - return _decode_run_length(array) - # Delta & run-length encoded 32-bit signed integer array - elif codec == 8: - array = np.frombuffer(raw_bytes, dtype=">i4").astype(np.int32) - return _decode_delta( - _decode_run_length(array)) - # Integer & run-length encoded 32-bit floating-point number array - elif codec == 9: - array = np.frombuffer(raw_bytes, dtype=">i4").astype(np.int32) - return _decode_integer(param, - _decode_run_length(array)) - # Integer & delta encoded - # & two-byte-packed 32-bit floating-point number array - elif codec == 10: - array = np.frombuffer(raw_bytes, dtype=">i2").astype(np.int16) - return _decode_integer(param, - _decode_delta( - _decode_packed(array))) - # Integer encoded 32-bit floating-point number array - elif codec == 11: - array = np.frombuffer(raw_bytes, dtype=">i2").astype(np.int16) - return _decode_integer(param, array) - # Integer & two-byte-packed 32-bit floating-point number array - elif codec == 12: - array = np.frombuffer(raw_bytes, dtype=">i2").astype(np.int16) - return _decode_integer(param, - _decode_packed(array)) - # Integer & one-byte-packed 32-bit floating-point number array - elif codec == 13: - array = np.frombuffer(raw_bytes, dtype=">i1").astype(np.int8) - return _decode_integer(param, - _decode_packed(array)) - # Two-byte-packed 32-bit signed integer array - elif codec == 14: - array = np.frombuffer(raw_bytes, dtype=">i2").astype(np.int16) - return _decode_packed(array) - # One-byte-packed 32-bit signed integer array - elif codec == 15: - array = np.frombuffer(raw_bytes, dtype=">i1").astype(np.int8) - return _decode_packed(array) - else: - raise ValueError("Unknown codec with ID {codec}") - - -def _decode_delta(np.ndarray array): - return np.cumsum(array, dtype=np.int32) - - -def _decode_run_length(int32[:] array): - cdef int length = 0 - cdef int i, j - cdef int value, repeat - # Determine length of output array by summing the run lengths - for i in range(1, array.shape[0], 2): - length += array[i] - cdef int32[:] output = np.zeros(length, dtype=np.int32) - # Fill output array - j = 0 - for i in range(0, array.shape[0], 2): - value = array[i] - repeat = array[i+1] - output[j : j+repeat] = value - j += repeat - return np.asarray(output) - - -ctypedef fused PackedType: - int8 - int16 -def _decode_packed(PackedType[:] array): - cdef int min_val, max_val - if PackedType is int8: - min_val = np.iinfo(np.int8).min - max_val = np.iinfo(np.int8).max - else: - min_val = np.iinfo(np.int16).min - max_val = np.iinfo(np.int16).max - cdef int i, j - cdef int packed_val, unpacked_val - # Pessimistic size assumption: - # The maximum output array length is the input array length - # in case all values are within the type limits - cdef int32[:] output = np.zeros(array.shape[0], dtype=np.int32) - j = 0 - unpacked_val = 0 - for i in range(array.shape[0]): - packed_val = array[i] - if packed_val == max_val or packed_val == min_val: - unpacked_val += packed_val - else: - unpacked_val += packed_val - output[j] = unpacked_val - unpacked_val = 0 - j += 1 - # Trim to correct size and return - return np.asarray(output[:j]) - - -def _decode_integer(int divisor, np.ndarray array): - return np.divide(array, divisor, dtype=np.float32) \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/encode.pyx b/src/biotite/structure/io/mmtf/encode.pyx deleted file mode 100644 index 4b7d591ec..000000000 --- a/src/biotite/structure/io/mmtf/encode.pyx +++ /dev/null @@ -1,183 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["encode_array"] - -cimport cython -cimport numpy as np - -import numpy as np - -ctypedef np.int8_t int8 -ctypedef np.int16_t int16 -ctypedef np.int32_t int32 -ctypedef np.uint8_t uint8 -ctypedef np.uint16_t uint16 -ctypedef np.uint32_t uint32 -ctypedef np.uint64_t uint64 -ctypedef np.float32_t float32 - - -def encode_array(np.ndarray array, int codec, int param): - # Pass-through: 32-bit floating-point number array - if codec == 1: - array = array.astype(np.float32, copy=False) - return array.astype(">f4").tobytes() - # Pass-through: 8-bit signed integer array - elif codec == 2: - array = array.astype(np.int8, copy=False) - return array.astype(">i1").tobytes() - # Pass-through: 16-bit signed integer array - elif codec == 3: - array = array.astype(np.int16, copy=False) - return array.astype(">i2").tobytes() - # Pass-through: 32-bit signed integer array - elif codec == 4: - array = array.astype(np.int32, copy=False) - return array.astype(">i4").tobytes() - # UTF8/ASCII fixed-length string array - elif codec == 5: - dtype = np.dtype("U" + str(param)) - array = array.astype(dtype, copy=False) - return array.astype(np.dtype("S" + str(param))).tobytes() - # Run-length encoded character array - elif codec == 6: - array = array.astype("U1", copy=False) - array = _encode_run_length(np.frombuffer(array, dtype=np.int32)) - return array.astype(">i4").tobytes() - # Run-length encoded 32-bit signed integer array - elif codec == 7: - array = array.astype(np.int32, copy=False) - return _encode_run_length(array).astype(">i4").tobytes() - # Delta & run-length encoded 32-bit signed integer array - elif codec == 8: - array = array.astype(np.int32, copy=False) - return _encode_run_length(_encode_delta(array)).astype(">i4").tobytes() - # Integer & run-length encoded 32-bit floating-point number array - elif codec == 9: - array = array.astype(np.float32, copy=False) - return _encode_run_length( - _encode_integer(param, array).astype(np.int32) - ).astype(">i4").tobytes() - # Integer & delta encoded - # & two-byte-packed 32-bit floating-point number array - elif codec == 10: - array = array.astype(np.float32, copy=False) - return _encode_packed( - True, _encode_delta( - _encode_integer(param, array).astype(np.int32) - ) - ).astype(">i2").tobytes() - # Integer encoded 32-bit floating-point number array - elif codec == 11: - array = array.astype(np.float32, copy=False) - return _encode_integer(param, array).astype(">i2").tobytes() - # Integer & two-byte-packed 32-bit floating-point number array - elif codec == 12: - array = array.astype(np.float32, copy=False) - return _encode_packed( - True, _encode_integer(param, array).astype(np.int32) - ).astype(">i2").tobytes() - # Integer & one-byte-packed 32-bit floating-point number array - elif codec == 13: - array = array.astype(np.float32, copy=False) - return _encode_packed( - False, _encode_integer(param, array).astype(np.int32) - ).astype(">i1").tobytes() - # Two-byte-packed 32-bit signed integer array - elif codec == 14: - array = array.astype(np.int32, copy=False) - return _encode_packed(True, array).astype(">i2").tobytes() - # One-byte-packed 32-bit signed integer array - elif codec == 15: - array = array.astype(np.int32, copy=False) - return _encode_packed(False, array).astype(">i1").tobytes() - else: - raise ValueError(f"Unknown codec with ID {codec}") - - -def _encode_delta(int32[:] array): - cdef int32[:] output = np.zeros(array.shape[0], np.int32) - output[0] = array[0] - cdef int i = 0 - for i in range(1, array.shape[0]): - output[i] = array[i] - array[i-1] - return np.asarray(output) - - -def _encode_run_length(int32[:] array): - # Pessimistic allocation of output array - # -> Run length is 1 for every element - cdef int32[:] output = np.zeros(array.shape[0] * 2, dtype=np.int32) - cdef int i=0, j=0 - cdef int val = array[0] - cdef int run_length = 0 - cdef int curr_val - for i in range(array.shape[0]): - curr_val = array[i] - if curr_val == val: - run_length += 1 - else: - # New element -> Write element with run-length - output[j] = val - output[j+1] = run_length - j += 2 - val = curr_val - run_length = 1 - # Write last element - output[j] = val - output[j+1] = run_length - j += 2 - # Trim to correct size - return np.asarray(output)[:j] - - -@cython.cdivision(True) -def _encode_packed(bint two_byte, int32[:] array): - cdef int min_val, max_val - cdef int i=0, j=0 - if two_byte: - min_val = np.iinfo(np.int16).min - max_val = np.iinfo(np.int16).max - else: - min_val = np.iinfo(np.int8).min - max_val = np.iinfo(np.int8).max - # Get length of output array - # by summing up required length of each element - cdef int number - cdef int length = 0 - for i in range(array.shape[0]): - number = array[i] - if number < 0: - length += number // min_val +1 - elif number > 0: - length += number // max_val +1 - else: - # e = 0 - length += 1 - # Fill output - cdef int16[:] output = np.zeros(length, dtype=np.int16) - cdef int remainder - j = 0 - for i in range(array.shape[0]): - remainder = array[i] - if remainder < 0: - while remainder <= min_val: - remainder -= min_val - output[j] = min_val - j += 1 - elif remainder > 0: - while remainder >= max_val: - remainder -= max_val - output[j] = max_val - j += 1 - output[j] = remainder - j += 1 - return np.asarray(output) - - -def _encode_integer(int divisor, np.ndarray array): - return np.multiply(array, divisor) \ No newline at end of file diff --git a/src/biotite/structure/io/mmtf/file.py b/src/biotite/structure/io/mmtf/file.py deleted file mode 100644 index f734c45ea..000000000 --- a/src/biotite/structure/io/mmtf/file.py +++ /dev/null @@ -1,233 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.mmtf" -__author__ = "Patrick Kunzmann" -__all__ = ["MMTFFile"] - -import io -from collections.abc import MutableMapping -import struct -import copy -import numpy as np -import msgpack -import warnings -from ....file import File, is_binary, is_open_compatible -from ...error import BadStructureError -from .decode import decode_array -from .encode import encode_array - - -class MMTFFile(File, MutableMapping): - """ - This class represents a MMTF file. - - When reading a file, the *MessagePack* unpacker is used to create - a dictionary of the file content. - This dictionary is accessed by indexing the :class:`MMTFFile` - instance directly with the dictionary keys. - If the dictionary value is an encoded array, the value automatically - decoded. - Decoded arrays are always returned as :class:`ndarray` instances. - - DEPRECATED: Use :class:`biotite.structure.io.pdbx.BinaryCIFFile` - instead. - - Examples - -------- - - >>> import os.path - >>> mmtf_file = MMTFFile.read(os.path.join(path_to_structures, "1l2y.mmtf")) - >>> print(mmtf_file["title"]) - NMR Structure of Trp-Cage Miniprotein Construct TC5b - >>> print(mmtf_file["chainNameList"]) - ['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' - 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' - 'A' 'A'] - """ - - def __init__(self): - warnings.warn( - "'MMTFFile' is deprecated, use 'BinaryCIFFile' instead", - DeprecationWarning - ) - super().__init__() - self._content = {} - self._content["mmtfVersion"] = "1.0.0" - self._content["mmtfProducer"] = "UNKNOWN" - - @classmethod - def read(self, file): - """ - Read a MMTF file. - - Parameters - ---------- - file : file-like object or str - The file to be read. - Alternatively a file path can be supplied. - - Returns - ------- - file_object : MMTFFile - The parsed file. - """ - mmtf_file = MMTFFile() - # File name - if is_open_compatible(file): - with open(file, "rb") as f: - mmtf_file._content = msgpack.unpackb( - f.read(), use_list=True, raw=False - ) - # File object - else: - if not is_binary(file): - raise TypeError("A file opened in 'binary' mode is required") - mmtf_file._content = msgpack.unpackb( - file.read(), use_list=True, raw=False - ) - return mmtf_file - - def write(self, file): - """ - Write contents into a MMTF file. - - Parameters - ---------- - file : file-like object or str - The file to be written to. - Alternatively, a file path can be supplied. - """ - packed_bytes = msgpack.packb( - self._content, use_bin_type=True, default=_encode_numpy - ) - if is_open_compatible(file): - with open(file, "wb") as f: - f.write(packed_bytes) - else: - if not is_binary(file): - raise TypeError("A file opened in 'binary' mode is required") - file.write(packed_bytes) - - def __copy_fill__(self, clone): - super().__copy_fill__(clone) - clone._content = copy.deepcopy(self._content) - - def get_codec(self, key): - """ - Obtain the codec ID of an MMTF encoded value. - - Parameters - ---------- - key : str - The key for the potentially encoded value. - - Returns - ------- - codec : int or None - The codec ID. `None` if the value is not encoded. - """ - data = self._content[key] - if isinstance(data, bytes) and data[0] == 0: - codec = struct.unpack(">i", data[0:4])[0] - return codec - else: - return None - - def get_length(self, key): - """ - Obtain the length of an MMTF encoded value. - - Parameters - ---------- - key : str - The key for the potentially encoded value. - - Returns - ------- - codec : int or None - The length of the `bytes` array. - `None` if the value is not encoded. - """ - data = self._content[key] - if isinstance(data, bytes) and data[0] == 0: - length = struct.unpack(">i", data[4:8])[0] - return length - else: - return None - - def get_param(self, key): - """ - Obtain the parameter of an MMTF encoded value. - - Parameters - ---------- - key : str - The key for the potentially encoded value. - - Returns - ------- - codec : int or None - The parameter of the encoded value. - `None` if the value is not encoded. - """ - data = self._content[key] - if isinstance(data, bytes) and data[0] == 0: - param = struct.unpack(">i", data[8:12])[0] - return param - else: - return None - - def set_array(self, key, array, codec, param=0): - length = len(array) - raw_bytes = encode_array(array, codec, param) - data = struct.pack(">i", codec) \ - + struct.pack(">i", length) \ - + struct.pack(">i", param) \ - + raw_bytes - self._content[key] = data - - def __getitem__(self, key): - data = self._content[key] - if isinstance(data, bytes) and data[0] == 0: - # MMTF specific format -> requires decoding - codec = struct.unpack(">i", data[0:4 ])[0] - length = struct.unpack(">i", data[4:8 ])[0] - param = struct.unpack(">i", data[8:12])[0] - raw_bytes = data[12:] - return decode_array(codec, raw_bytes, param) - else: - return data - - def __setitem__(self, key, item): - if isinstance(item, np.ndarray): - raise TypeError("Arrays that need to be encoded must be addeed " - "via 'set_array()'") - self._content[key] = item - - def __delitem__(self, key): - del self._content[key] - - def __iter__(self): - return self._content.__iter__() - - def __len__(self): - return len(self._content) - - def __contains__(self, item): - return item in self._content - - -def _encode_numpy(item): - """ - Convert NumPy scalar types to native Python types, - as *Msgpack* cannot handle NumPy types (e.g. float32). - - The function is given to the Msgpack packer as value for the - `default` parameter. - """ - if isinstance(item, np.generic): - return item.item() - else: - raise TypeError(f"can not serialize '{type(item).__name__}' object") diff --git a/src/biotite/structure/io/mol/__init__.py b/src/biotite/structure/io/mol/__init__.py index 9e8ee2097..ba71d85a2 100644 --- a/src/biotite/structure/io/mol/__init__.py +++ b/src/biotite/structure/io/mol/__init__.py @@ -17,4 +17,4 @@ from .convert import * from .header import * from .mol import * -from .sdf import * \ No newline at end of file +from .sdf import * diff --git a/src/biotite/structure/io/mol/convert.py b/src/biotite/structure/io/mol/convert.py index 2961c79c9..64cae7ff3 100644 --- a/src/biotite/structure/io/mol/convert.py +++ b/src/biotite/structure/io/mol/convert.py @@ -6,9 +6,9 @@ __author__ = "Patrick Kunzmann" __all__ = ["get_structure", "set_structure"] -from .mol import MOLFile -from .sdf import SDFile, SDRecord -from ...bonds import BondType +from biotite.structure.bonds import BondType +from biotite.structure.io.mol.mol import MOLFile +from biotite.structure.io.mol.sdf import SDFile, SDRecord def get_structure(mol_file, record_name=None): @@ -39,8 +39,9 @@ def get_structure(mol_file, record_name=None): return record.get_structure() -def set_structure(mol_file, atoms, default_bond_type=BondType.ANY, - version=None, record_name=None): +def set_structure( + mol_file, atoms, default_bond_type=BondType.ANY, version=None, record_name=None +): """ Set the :class:`AtomArray` for the MOL file. @@ -88,9 +89,7 @@ def _get_record(file, record_name): else: return file[record_name] else: - raise TypeError( - f"Unsupported file type '{type(file).__name__}'" - ) + raise TypeError(f"Unsupported file type '{type(file).__name__}'") def _get_or_create_record(file, record_name): @@ -110,6 +109,4 @@ def _get_or_create_record(file, record_name): file[record_name] = record return file[record_name] else: - raise TypeError( - f"Unsupported file type '{type(file).__name__}'" - ) \ No newline at end of file + raise TypeError(f"Unsupported file type '{type(file).__name__}'") diff --git a/src/biotite/structure/io/mol/ctab.py b/src/biotite/structure/io/mol/ctab.py index e8fff5d10..d4577e382 100644 --- a/src/biotite/structure/io/mol/ctab.py +++ b/src/biotite/structure/io/mol/ctab.py @@ -12,13 +12,13 @@ __all__ = ["read_structure_from_ctab", "write_structure_to_ctab"] import itertools -import warnings import shlex +import warnings import numpy as np -from ....file import InvalidFileError -from ...error import BadStructureError -from ...atoms import AtomArray, AtomArrayStack -from ...bonds import BondList, BondType +from biotite.file import InvalidFileError +from biotite.structure.atoms import AtomArray, AtomArrayStack +from biotite.structure.bonds import BondList, BondType +from biotite.structure.error import BadStructureError BOND_TYPE_MAPPING = { 1: BondType.SINGLE, @@ -84,8 +84,7 @@ def read_structure_from_ctab(ctab_lines): raise InvalidFileError(f"Unknown CTAB version '{unkown_version}'") -def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, - version=None): +def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, version=None): """ Convert an :class:`AtomArray` into a *MDL* connection table (Ctab). @@ -124,8 +123,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, """ if isinstance(atoms, AtomArrayStack): raise TypeError( - "An 'AtomArrayStack' was given, " - "but only a single model can be written" + "An 'AtomArrayStack' was given, " "but only a single model can be written" ) if atoms.bonds is None: raise BadStructureError("Input AtomArray has no associated BondList") @@ -134,9 +132,7 @@ def write_structure_to_ctab(atoms, default_bond_type=BondType.ANY, match version: case None: - if _is_v2000_compatible( - atoms.array_length(), atoms.bonds.get_bond_count() - ): + if _is_v2000_compatible(atoms.array_length(), atoms.bonds.get_bond_count()): return _write_structure_to_ctab_v2000(atoms, default_bond_type) else: return _write_structure_to_ctab_v3000(atoms, default_bond_type) @@ -160,7 +156,8 @@ def _read_structure_from_ctab_v2000(ctab_lines): atom_lines = ctab_lines[1 : 1 + n_atoms] bond_lines = ctab_lines[1 + n_atoms : 1 + n_atoms + n_bonds] charge_lines = [ - line for line in ctab_lines[1 + n_atoms + n_bonds:] + line + for line in ctab_lines[1 + n_atoms + n_bonds :] if line.startswith("M CHG") ] @@ -208,10 +205,9 @@ def _read_structure_from_ctab_v2000(ctab_lines): return atoms + def _read_structure_from_ctab_v3000(ctab_lines): - v30_lines = [ - line[6:].strip() for line in ctab_lines if line.startswith("M V30") - ] + v30_lines = [line[6:].strip() for line in ctab_lines if line.startswith("M V30")] atom_lines = _get_block_v3000(v30_lines, "ATOM") if len(atom_lines) == 0: @@ -262,16 +258,20 @@ def _read_structure_from_ctab_v3000(ctab_lines): return atoms + def _get_version(counts_line): return counts_line[33:39].strip() + def _is_v2000_compatible(n_atoms, n_bonds): # The format uses a maximum of 3 digits for the atom and bond count return n_atoms < 1000 and n_bonds < 1000 + def _get_counts_v2000(counts_line): return int(counts_line[0:3]), int(counts_line[3:6]) + def _get_block_v3000(v30_lines, block_name): block_lines = [] in_block = False @@ -282,13 +282,12 @@ def _get_block_v3000(v30_lines, block_name): if in_block: return block_lines else: - raise InvalidFileError( - f"Block '{block_name}' ended before it began" - ) + raise InvalidFileError(f"Block '{block_name}' ended before it began") elif in_block: block_lines.append(line) return block_lines + def create_property_dict_v3000(property_strings): properties = {} for prop in property_strings: @@ -315,7 +314,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): f" {atoms.element[i].capitalize():3}" f"{0:>2}" # Mass difference -> unused f"{CHARGE_MAPPING_REV.get(charge[i], 0):>3d}" - + f"{0:>3d}" * 10 # More unused fields + + f"{0:>3d}" + * 10 # More unused fields for i in range(atoms.array_length()) ] @@ -323,7 +323,8 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): bond_lines = [ f"{i+1:>3d}{j+1:>3d}" f"{BOND_TYPE_MAPPING_REV.get(bond_type, default_bond_value):>3d}" - + f"{0:>3d}" * 4 + + f"{0:>3d}" + * 4 for i, j, bond_type in atoms.bonds.as_array() ] @@ -332,8 +333,7 @@ def _write_structure_to_ctab_v2000(atoms, default_bond_type): charge_lines = [] # Each `M CHG` line can contain up to 8 charges for batch in _batched( - [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0], - N_CHARGES_PER_LINE + [(atom_i, c) for atom_i, c in enumerate(charge) if c != 0], N_CHARGES_PER_LINE ): charge_lines.append( f"M CHG{len(batch):>3d}" @@ -349,9 +349,7 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type): except AttributeError: charges = np.zeros(atoms.array_length(), dtype=int) - counts_line = ( - f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0" - ) + counts_line = f"COUNTS {atoms.array_length()} {atoms.bonds.get_bond_count()} 0 0 0" atom_lines = [ f"{i + 1}" @@ -375,32 +373,35 @@ def _write_structure_to_ctab_v3000(atoms, default_bond_type): ] lines = ( - ["BEGIN CTAB"] + - [counts_line] + - ["BEGIN ATOM"] + - atom_lines + - ["END ATOM"] + - ["BEGIN BOND"] + - bond_lines + - ["END BOND"] + - ["END CTAB"] + ["BEGIN CTAB"] + + [counts_line] + + ["BEGIN ATOM"] + + atom_lines + + ["END ATOM"] + + ["BEGIN BOND"] + + bond_lines + + ["END BOND"] + + ["END CTAB"] ) # Mark lines as V3000 CTAB lines = ["M V30 " + line for line in lines] return [V2000_COMPATIBILITY_LINE] + lines + ["M END"] + def _to_property(charge): if charge == 0: return "" else: return f"CHG={charge}" + def _quote(string): if " " in string or len(string) == 0: return f'"{string}"' else: return string + def _batched(iterable, n): """ Equivalent to :func:`itertools.batched()`. @@ -411,4 +412,4 @@ def _batched(iterable, n): """ iterator = iter(iterable) while batch := tuple(itertools.islice(iterator, n)): - yield batch \ No newline at end of file + yield batch diff --git a/src/biotite/structure/io/mol/header.py b/src/biotite/structure/io/mol/header.py index 3b4f1b48d..0c459acac 100644 --- a/src/biotite/structure/io/mol/header.py +++ b/src/biotite/structure/io/mol/header.py @@ -6,16 +6,15 @@ __author__ = "Patrick Kunzmann" __all__ = ["Header"] -import warnings import datetime +import warnings from dataclasses import dataclass - _DATE_FORMAT = "%m%d%y%H%M" @dataclass -class Header(): +class Header: """ The header for connection tables. @@ -70,20 +69,25 @@ def deserialize(text): try: time = datetime.datetime.strptime(time_string, _DATE_FORMAT) except ValueError: - warnings.warn( - f"Invalid time format '{time_string}' in file header" - ) + warnings.warn(f"Invalid time format '{time_string}' in file header") time = None dimensions = lines[1][20:22].strip() scaling_factors = lines[1][22:34].strip() - energy = lines[1][34:46].strip() + energy = lines[1][34:46].strip() registry_number = lines[1][46:52].strip() comments = lines[2].strip() return Header( - mol_name, initials, program, time, dimensions, - scaling_factors, energy, registry_number, comments + mol_name, + initials, + program, + time, + dimensions, + scaling_factors, + energy, + registry_number, + comments, ) def serialize(self): @@ -113,4 +117,4 @@ def serialize(self): return text def __str__(self): - return self.serialize() \ No newline at end of file + return self.serialize() diff --git a/src/biotite/structure/io/mol/mol.py b/src/biotite/structure/io/mol/mol.py index 25c9cc5ca..72122f0ef 100644 --- a/src/biotite/structure/io/mol/mol.py +++ b/src/biotite/structure/io/mol/mol.py @@ -6,11 +6,13 @@ __author__ = "Patrick Kunzmann" __all__ = ["MOLFile"] -from ....file import TextFile, InvalidFileError -from .ctab import read_structure_from_ctab, write_structure_to_ctab -from .header import Header -from ...bonds import BondType - +from biotite.file import InvalidFileError, TextFile +from biotite.structure.bonds import BondType +from biotite.structure.io.mol.ctab import ( + read_structure_from_ctab, + write_structure_to_ctab, +) +from biotite.structure.io.mol.header import Header # Number of header lines N_HEADER = 3 @@ -80,66 +82,23 @@ def __init__(self): self.lines = [""] * N_HEADER self._header = None - @classmethod def read(cls, file): mol_file = super().read(file) mol_file._header = None return mol_file - @property def header(self): if self._header is None: self._header = Header.deserialize("\n".join(self.lines[0:3]) + "\n") return self._header - @header.setter def header(self, header): self._header = header self.lines[0:3] = self._header.serialize().splitlines() - - def get_header(self): - """ - Get the header from the MOL file. - - DEPRECATED: Use the :attr:`header` property instead. - - Returns - ------- - header_attributes - See :class:`Header`. - """ - header = self.header - return ( - header.mol_name, - header.initials, - header.program, - header.time, - header.dimensions, - header.scaling_factors, - header.energy, - header.registry_number, - header.comments - ) - - - def set_header(self, *args, **kwargs): - """ - Set the header for the MOL file. - - DEPRECATED: Use the :attr:`header` property instead. - - Parameters - ---------- - **args, **kwars - See :class:`Header`. - """ - self.header = Header(*args, **kwargs) - - def get_structure(self): """ Get an :class:`AtomArray` from the MOL file. @@ -157,9 +116,7 @@ def get_structure(self): raise InvalidFileError("File does not contain structure data") return read_structure_from_ctab(ctab_lines) - - def set_structure(self, atoms, default_bond_type=BondType.ANY, - version=None): + def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None): """ Set the :class:`AtomArray` for the file. @@ -185,9 +142,8 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY, ) - def _get_ctab_lines(lines): for i, line in enumerate(lines): if line.startswith("M END"): - return lines[N_HEADER:i+1] + return lines[N_HEADER : i + 1] return lines[N_HEADER:] diff --git a/src/biotite/structure/io/mol/sdf.py b/src/biotite/structure/io/mol/sdf.py index a2b35096b..2048a482a 100644 --- a/src/biotite/structure/io/mol/sdf.py +++ b/src/biotite/structure/io/mol/sdf.py @@ -8,16 +8,24 @@ import re import warnings +from collections.abc import Mapping, MutableMapping from dataclasses import dataclass -from collections.abc import MutableMapping, Mapping import numpy as np -from ....file import File, InvalidFileError, is_open_compatible, is_text, \ - DeserializationError, SerializationError -from .ctab import read_structure_from_ctab, write_structure_to_ctab -from .header import Header -from ...atoms import AtomArray -from ...bonds import BondList, BondType - +from biotite.file import ( + DeserializationError, + File, + InvalidFileError, + SerializationError, + is_open_compatible, + is_text, +) +from biotite.structure.atoms import AtomArray +from biotite.structure.bonds import BondList, BondType +from biotite.structure.io.mol.ctab import ( + read_structure_from_ctab, + write_structure_to_ctab, +) +from biotite.structure.io.mol.header import Header _N_HEADER = 3 # Number of header lines @@ -96,6 +104,7 @@ class Key: number, name, registry_internal, registry_external The same as the parameters. """ + # The characters that can be given as input to `name` # First character must be alphanumeric, # following characters may include underscores and periods @@ -103,7 +112,7 @@ class Key: # they are still used in practice and therefore allowed here _NAME_INPUT_REGEX = re.compile(r"^[a-zA-Z0-9][\w.]*$") # These regexes are used to parse the key from a line - _COMPONENT_REGEX = { + _COMPONENT_REGEX = { "number": re.compile(r"^DT(\d+)$"), "name": re.compile(r"^<([a-zA-Z0-9][\w.]*)>$"), "registry_internal": re.compile(r"^(\d+)$"), @@ -162,9 +171,7 @@ def deserialize(text): break else: # There is no matching pattern - raise DeserializationError( - f"Invalid key component '{component}'" - ) + raise DeserializationError(f"Invalid key component '{component}'") return Metadata.Key(**parsed_component_dict) def serialize(self): @@ -190,7 +197,6 @@ def serialize(self): def __str__(self): return self.serialize() - def __init__(self, metadata=None): if metadata is None: metadata = {} @@ -222,9 +228,7 @@ def deserialize(text): current_value = None else: if current_key is None: - raise DeserializationError( - "Value found before metadata key" - ) + raise DeserializationError("Value found before metadata key") if current_value is None: current_value = line else: @@ -388,7 +392,7 @@ def header(self): if isinstance(self._header, str): try: self._header = Header.deserialize(self._header) - except: + except Exception: raise DeserializationError("Failed to deserialize header") return self._header @@ -406,7 +410,7 @@ def metadata(self): if isinstance(self._metadata, str): try: self._metadata = Metadata.deserialize(self._metadata) - except: + except Exception: raise DeserializationError("Failed to deserialize metadata") return self._metadata @@ -483,8 +487,7 @@ def get_structure(self): raise InvalidFileError("File does not contain structure data") return read_structure_from_ctab(ctab_lines) - def set_structure(self, atoms, default_bond_type=BondType.ANY, - version=None): + def set_structure(self, atoms, default_bond_type=BondType.ANY, version=None): """ Set the structural data in the SD record. @@ -505,9 +508,9 @@ def set_structure(self, atoms, default_bond_type=BondType.ANY, By default, ``"V2000"`` is used, unless the number of atoms or bonds exceeds 999, in which case ``"V3000"`` is used. """ - self._ctab = _join_with_terminal_newline(write_structure_to_ctab( - atoms, default_bond_type, version - )) + self._ctab = _join_with_terminal_newline( + write_structure_to_ctab(atoms, default_bond_type, version) + ) def __eq__(self, other): if not isinstance(other, type(self)): @@ -736,28 +739,29 @@ def deserialize(text): The content to be deserialized. """ lines = text.splitlines() - record_ends = np.array([ - i for i, line in enumerate(lines) - if line.startswith(_RECORD_DELIMITER) - ], dtype=int) + record_ends = np.array( + [i for i, line in enumerate(lines) if line.startswith(_RECORD_DELIMITER)], + dtype=int, + ) if len(record_ends) == 0: warnings.warn( "Final record delimiter missing, " "maybe this is a MOL file instead of a SD file" ) - record_ends = np.array([len(lines)-1], dtype=int) + record_ends = np.array([len(lines) - 1], dtype=int) # The first record starts at the first line and the last # delimiter is at the end of the file # Records in the middle start directly after the delimiter record_starts = np.concatenate(([0], record_ends[:-1] + 1), dtype=int) record_names = [lines[start].strip() for start in record_starts] - return SDFile({ - # Do not include the delimiter - # -> stop at end (instead of end + 1) - name: _join_with_terminal_newline(lines[start : end]) - for name, start, end - in zip(record_names, record_starts, record_ends) - }) + return SDFile( + { + # Do not include the delimiter + # -> stop at end (instead of end + 1) + name: _join_with_terminal_newline(lines[start:end]) + for name, start, end in zip(record_names, record_starts, record_ends) + } + ) def serialize(self): """ @@ -776,7 +780,7 @@ def serialize(self): else: try: text_blocks.append(record.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize record '{record_name}'" ) @@ -835,19 +839,15 @@ def __getitem__(self, key): # -> must be deserialized first try: record = SDRecord.deserialize(record) - except: - raise DeserializationError( - f"Failed to deserialize record '{key}'" - ) + except Exception: + raise DeserializationError(f"Failed to deserialize record '{key}'") # Update with deserialized object self._records[key] = record return record def __setitem__(self, key, record): if not isinstance(record, SDRecord): - raise TypeError( - f"Expected 'SDRecord', but got '{type(record).__name__}'" - ) + raise TypeError(f"Expected 'SDRecord', but got '{type(record).__name__}'") # The molecule name in the header is unique across the file record.header.mol_name = key self._records[key] = record @@ -895,22 +895,19 @@ def _to_metadata_key(key): return Metadata.Key(name=key) else: raise TypeError( - "Expected 'Metadata.Key' or str, " - f"but got '{type(key).__name__}'" + "Expected 'Metadata.Key' or str, " f"but got '{type(key).__name__}'" ) def _add_key_value_pair(metadata, key, value): if key is not None: if value is None: - raise DeserializationError( - f"No value found for metadata key {key}" - ) + raise DeserializationError(f"No value found for metadata key {key}") metadata[key] = value def _get_ctab_stop(lines): for i in range(_N_HEADER, len(lines)): if lines[i].startswith("M END"): - return i+1 - return len(lines) \ No newline at end of file + return i + 1 + return len(lines) diff --git a/src/biotite/structure/io/netcdf/__init__.py b/src/biotite/structure/io/netcdf/__init__.py index 9926d405c..085e0c080 100644 --- a/src/biotite/structure/io/netcdf/__init__.py +++ b/src/biotite/structure/io/netcdf/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.netcdf" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/netcdf/file.py b/src/biotite/structure/io/netcdf/file.py index c651657e1..c6e2a5a47 100644 --- a/src/biotite/structure/io/netcdf/file.py +++ b/src/biotite/structure/io/netcdf/file.py @@ -6,21 +6,21 @@ __author__ = "Patrick Kunzmann" __all__ = ["NetCDFFile"] +import biotraj import numpy as np -from ..trajfile import TrajectoryFile -from ...box import vectors_from_unitcell, unitcell_from_vectors +from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell +from biotite.structure.io.trajfile import TrajectoryFile class NetCDFFile(TrajectoryFile): """ This file class represents a NetCDF trajectory file. """ - + @classmethod def traj_type(cls): - import mdtraj.formats as traj - return traj.NetCDFTrajectoryFile - + return biotraj.NetCDFTrajectoryFile + @classmethod def process_read_values(cls, read_values): # .dcd files use Angstrom @@ -29,35 +29,36 @@ def process_read_values(cls, read_values): cell_lengths = read_values[2] cell_angles = read_values[3] if cell_lengths is None or cell_angles is None: - box = None + box = None else: box = np.stack( - [vectors_from_unitcell(a, b, c, alpha, beta, gamma) - for (a, b, c), (alpha, beta, gamma) - in zip(cell_lengths, np.deg2rad(cell_angles))], - axis=0 + [ + vectors_from_unitcell(a, b, c, alpha, beta, gamma) + for (a, b, c), (alpha, beta, gamma) in zip( + cell_lengths, np.deg2rad(cell_angles) + ) + ], + axis=0, ) return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): - coord = coord.astype(np.float32, copy=False) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None + coord = coord.astype(np.float32, copy=False) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None if box is None: cell_lengths = None - cell_angles = None + cell_angles = None else: cell_lengths = np.zeros((len(box), 3), dtype=np.float32) - cell_angles = np.zeros((len(box), 3), dtype=np.float32) + cell_angles = np.zeros((len(box), 3), dtype=np.float32) for i, model_box in enumerate(box): a, b, c, alpha, beta, gamma = unitcell_from_vectors(model_box) cell_lengths[i] = np.array((a, b, c)) cell_angles[i] = np.rad2deg((alpha, beta, gamma)) return { - "coordinates" : coord, - "time" : time, - "cell_lengths" : cell_lengths, - "cell_angles" : cell_angles, - } \ No newline at end of file + "coordinates": coord, + "time": time, + "cell_lengths": cell_lengths, + "cell_angles": cell_angles, + } diff --git a/src/biotite/structure/io/npz/__init__.py b/src/biotite/structure/io/npz/__init__.py deleted file mode 100644 index a84341f0a..000000000 --- a/src/biotite/structure/io/npz/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -""" -This subpackage is used for reading and writing an :class:`AtomArray` or -:class:`AtomArrayStack` using the internal NPZ file format. This binary -format is used to store `NumPy` arrays. Since atom arrays and stacks are -completely built on `NumPy` arrays, this format is preferable for -Biotite internal usage due to fast I/O operations and preservation -of all atom annotation arrays. - -DEPRECATED: Pickle data directly or use -:class:`biotite.structure.io.pdbx.BinaryCIFFile` instead. -""" - -__name__ = "biotite.structure.io.npz" -__author__ = "Patrick Kunzmann" - -from .file import * \ No newline at end of file diff --git a/src/biotite/structure/io/npz/file.py b/src/biotite/structure/io/npz/file.py deleted file mode 100644 index 52ca95f8f..000000000 --- a/src/biotite/structure/io/npz/file.py +++ /dev/null @@ -1,152 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.npz" -__author__ = "Patrick Kunzmann" -__all__ = ["NpzFile"] - -import numpy as np -from ...atoms import Atom, AtomArray, AtomArrayStack -from ...bonds import BondList -from ....file import File, is_binary - - -class NpzFile(File): - r""" - This class represents a NPZ file, the preferable format for - Biotite internal structure storage. - - Internally the this class writes/reads all attribute arrays of an - :class:`AtomArray` or :class:`AtomArrayStack` using the *NumPy* - :func:`save()`/:func:`load()` - method. This format offers the fastest I/O operations and completely - preserves the content all atom annotation arrays. - - DEPRECATED: Pickle data directly or use - :class:`biotite.structure.io.pdbx.BinaryCIFFile` instead. - - Examples - -------- - Load a \\*.npz file, modify the structure and save the new - structure into a new file: - - >>> import os.path - >>> file = NpzFile.read(os.path.join(path_to_structures, "1l2y.npz")) - >>> array_stack = file.get_structure() - >>> array_stack_mod = rotate(array_stack, [1,2,3]) - >>> file = NpzFile() - >>> file.set_structure(array_stack_mod) - >>> file.write(os.path.join(path_to_directory, "1l2y_mod.npz")) - - """ - - def __init__(self): - super().__init__() - self._data_dict = None - - def __copy_fill__(self, clone): - super().__copy_fill__(clone) - if self._data_dict is not None: - for key, value in self._data_dict.items(): - clone._data_dict[key] = np.copy(value) - - @classmethod - def read(cls, file): - """ - Read a NPZ file. - - Parameters - ---------- - file : file-like object or str - The file to be read. - Alternatively a file path can be supplied. - - Returns - ------- - file_object : NPZFile - The parsed file. - """ - npz_file = NpzFile() - # File name - if isinstance(file, str): - with open(file, "rb") as f: - npz_file._data_dict = dict(np.load(f, allow_pickle=False)) - # File object - else: - if not is_binary(file): - raise TypeError("A file opened in 'binary' mode is required") - npz_file._data_dict = dict(np.load(file, allow_pickle=False)) - return npz_file - - def write(self, file): - """ - Write a NPZ file. - - Parameters - ---------- - file : file-like object or str - The file to be read. - Alternatively, a file path can be supplied. - """ - if isinstance(file, str): - with open(file, "wb") as f: - np.savez(f, **self._data_dict) - else: - if not is_binary(file): - raise TypeError("A file opened in 'binary' mode is required") - np.savez(file, **self._data_dict) - - def get_structure(self): - """ - Get an :class:`AtomArray` or :class:`AtomArrayStack` from the - file. - - If this method returns an array or stack depends on which type - of object was used when the file was written. - - Returns - ------- - array : AtomArray or AtomArrayStack - The array or stack contained in this file. - """ - if self._data_dict is None: - raise ValueError("The structure of this file " - "has not been loaded or set yet") - coord = self._data_dict["coord"] - # The type of the structure is determined by the dimensionality - # of the 'coord' field - if len(coord.shape) == 3: - array = AtomArrayStack(coord.shape[0], coord.shape[1]) - else: - array = AtomArray(coord.shape[0]) - - for key, value in self._data_dict.items(): - if key == "coord": - array.coord = value - elif key == "bonds": - array.bonds = BondList(array.array_length(), value) - elif key == "box": - array.box = value - else: - array.set_annotation(key, value) - return array - - def set_structure(self, array): - """ - Set the :class:`AtomArray` or :class:`AtomArrayStack` for the - file. - - Parameters - ---------- - array : AtomArray or AtomArrayStack - The array or stack to be saved into this file. - """ - self._data_dict = {} - self._data_dict["coord"] = np.copy(array.coord) - if array.bonds is not None: - self._data_dict["bonds"] = array.bonds.as_array() - if array.box is not None: - self._data_dict["box"] = np.copy(array.box) - for annot in array.get_annotation_categories(): - self._data_dict[annot] = np.copy(array.get_annotation(annot)) \ No newline at end of file diff --git a/src/biotite/structure/io/pdb/__init__.py b/src/biotite/structure/io/pdb/__init__.py index 1dc97904b..687527d69 100644 --- a/src/biotite/structure/io/pdb/__init__.py +++ b/src/biotite/structure/io/pdb/__init__.py @@ -16,5 +16,5 @@ __name__ = "biotite.structure.io.pdb" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/structure/io/pdb/convert.py b/src/biotite/structure/io/pdb/convert.py index 7d4bc19dd..127e49fbb 100644 --- a/src/biotite/structure/io/pdb/convert.py +++ b/src/biotite/structure/io/pdb/convert.py @@ -9,8 +9,14 @@ __name__ = "biotite.structure.io.pdb" __author__ = "Patrick Kunzmann" -__all__ = ["get_model_count", "get_structure", "set_structure", - "list_assemblies", "get_assembly", "get_symmetry_mates"] +__all__ = [ + "get_model_count", + "get_structure", + "set_structure", + "list_assemblies", + "get_assembly", + "get_symmetry_mates", +] def get_model_count(pdb_file): @@ -30,8 +36,9 @@ def get_model_count(pdb_file): return pdb_file.get_model_count() -def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], - include_bonds=False): +def get_structure( + pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False +): """ Create an :class:`AtomArray` or :class:`AtomArrayStack` from a :class:`PDBFile`. @@ -39,7 +46,7 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], This function is a thin wrapper around the :class:`PDBFile` method :func:`get_structure()` for the sake of consistency with other ``structure.io`` subpackages. - + Parameters ---------- pdb_file : PDBFile @@ -77,12 +84,12 @@ def get_structure(pdb_file, model=None, altloc="first", extra_fields=[], (e.g. especially inter-residue bonds), have :attr:`BondType.ANY`, since the PDB format itself does not support bond orders. - + Returns ------- array : AtomArray or AtomArrayStack The return type depends on the `model` parameter. - + """ return pdb_file.get_structure(model, altloc, extra_fields, include_bonds) @@ -95,11 +102,11 @@ def set_structure(pdb_file, array, hybrid36=False): This function is a thin wrapper around the :class:`PDBFile` method :func:`set_structure()` for the sake of consistency with other ``structure.io`` subpackages. - + This will save the coordinates, the mandatory annotation categories and the optional annotation categories 'atom_id', 'b_factor', 'occupancy' and 'charge'. - + Parameters ---------- pdb_file : PDBFile @@ -137,7 +144,7 @@ def list_assemblies(pdb_file): ------- assemblies : list of str A list that contains the available assembly IDs. - + Examples -------- >>> import os.path @@ -148,8 +155,14 @@ def list_assemblies(pdb_file): return pdb_file.list_assemblies() -def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", - extra_fields=[], include_bonds=False): +def get_assembly( + pdb_file, + assembly_id=None, + model=None, + altloc="first", + extra_fields=[], + include_bonds=False, +): """ Build the given biological assembly. @@ -205,7 +218,7 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", assembly : AtomArray or AtomArrayStack The assembly. The return type depends on the `model` parameter. - + Examples -------- @@ -218,8 +231,9 @@ def get_assembly(pdb_file, assembly_id=None, model=None, altloc="first", ) -def get_symmetry_mates(pdb_file, model=None, altloc="first", - extra_fields=[], include_bonds=False): +def get_symmetry_mates( + pdb_file, model=None, altloc="first", extra_fields=[], include_bonds=False +): """ Build a structure model containing all symmetric copies of the structure within a single unit cell, given by the space @@ -274,13 +288,13 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first", symmetry_mates : AtomArray or AtomArrayStack All atoms within a single unit cell. The return type depends on the `model` parameter. - + Notes ----- To expand the structure beyond a single unit cell, use :func:`repeat_box()` with the return value as its input. - + Examples -------- @@ -288,6 +302,4 @@ def get_symmetry_mates(pdb_file, model=None, altloc="first", >>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb")) >>> atoms_in_unit_cell = get_symmetry_mates(file, model=1) """ - return pdb_file.get_symmetry_mates( - model, altloc, extra_fields, include_bonds - ) \ No newline at end of file + return pdb_file.get_symmetry_mates(model, altloc, extra_fields, include_bonds) diff --git a/src/biotite/structure/io/pdb/file.py b/src/biotite/structure/io/pdb/file.py index 208f6acfb..6d192dac6 100644 --- a/src/biotite/structure/io/pdb/file.py +++ b/src/biotite/structure/io/pdb/file.py @@ -8,20 +8,23 @@ import warnings import numpy as np -from ...atoms import AtomArray, AtomArrayStack, repeat -from ...bonds import BondList, connect_via_residue_names -from ...box import vectors_from_unitcell, unitcell_from_vectors -from ....file import TextFile, InvalidFileError -from ...repair import infer_elements -from ...error import BadStructureError -from ...filter import ( +from biotite.file import InvalidFileError, TextFile +from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat +from biotite.structure.bonds import BondList, connect_via_residue_names +from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell +from biotite.structure.error import BadStructureError +from biotite.structure.filter import ( filter_first_altloc, filter_highest_occupancy_altloc, filter_solvent, ) -from ...util import matrix_rotate -from .hybrid36 import encode_hybrid36, decode_hybrid36, max_hybrid36_number - +from biotite.structure.io.pdb.hybrid36 import ( + decode_hybrid36, + encode_hybrid36, + max_hybrid36_number, +) +from biotite.structure.repair import infer_elements +from biotite.structure.util import matrix_rotate _PDB_MAX_ATOMS = 99999 _PDB_MAX_RESIDUES = 9999 @@ -82,6 +85,7 @@ class PDBFile(TextFile): >>> file.set_structure(array_stack_mod) >>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb")) """ + @classmethod def read(cls, file): file = super().read(file) @@ -91,7 +95,6 @@ def read(cls, file): file._index_models_and_atoms() return file - def get_remark(self, number): r""" Get the lines containing the *REMARK* records with the given @@ -140,7 +143,8 @@ def get_remark(self, number): remark_string = f"REMARK {number:>3d}" # Find lines and omit ``REMARK XXX `` part remark_lines = [ - line[CONTENT_START_COLUMN:] for line in self.lines + line[CONTENT_START_COLUMN:] + for line in self.lines if line.startswith(remark_string) ] if len(remark_lines) == 0: @@ -149,7 +153,6 @@ def get_remark(self, number): remark_lines = remark_lines[1:] return remark_lines - def get_model_count(self): """ Get the number of models contained in the PDB file. @@ -161,7 +164,6 @@ def get_model_count(self): """ return len(self._model_start_i) - def get_coord(self, model=None): """ Get only the coordinates from the PDB file. @@ -239,21 +241,21 @@ def get_coord(self, model=None): if model is None: coord = np.zeros( (len(self._model_start_i), self._get_model_length(), 3), - dtype=np.float32 + dtype=np.float32, ) m = 0 i = 0 for line_i in self._atom_line_i: if ( - m < len(self._model_start_i)-1 - and line_i > self._model_start_i[m+1] + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] ): m += 1 i = 0 line = self.lines[line_i] - coord[m,i,0] = float(line[_coord_x]) - coord[m,i,1] = float(line[_coord_y]) - coord[m,i,2] = float(line[_coord_z]) + coord[m, i, 0] = float(line[_coord_x]) + coord[m, i, 1] = float(line[_coord_y]) + coord[m, i, 2] = float(line[_coord_z]) i += 1 return coord @@ -262,12 +264,11 @@ def get_coord(self, model=None): coord = np.zeros((len(coord_i), 3), dtype=np.float32) for i, line_i in enumerate(coord_i): line = self.lines[line_i] - coord[i,0] = float(line[_coord_x]) - coord[i,1] = float(line[_coord_y]) - coord[i,2] = float(line[_coord_z]) + coord[i, 0] = float(line[_coord_x]) + coord[i, 1] = float(line[_coord_y]) + coord[i, 2] = float(line[_coord_z]) return coord - def get_b_factor(self, model=None): """ Get only the B-factors from the PDB file. @@ -300,20 +301,19 @@ def get_b_factor(self, model=None): """ if model is None: b_factor = np.zeros( - (len(self._model_start_i), self._get_model_length()), - dtype=np.float32 + (len(self._model_start_i), self._get_model_length()), dtype=np.float32 ) m = 0 i = 0 for line_i in self._atom_line_i: if ( - m < len(self._model_start_i)-1 - and line_i > self._model_start_i[m+1] + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] ): m += 1 i = 0 line = self.lines[line_i] - b_factor[m,i] = float(line[_temp_f]) + b_factor[m, i] = float(line[_temp_f]) i += 1 return b_factor @@ -325,9 +325,9 @@ def get_b_factor(self, model=None): b_factor[i] = float(line[_temp_f]) return b_factor - - def get_structure(self, model=None, altloc="first", extra_fields=[], - include_bonds=False): + def get_structure( + self, model=None, altloc="first", extra_fields=[], include_bonds=False + ): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the PDB file. @@ -391,17 +391,17 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], array = AtomArray(len(coord_i)) # Create mandatory and optional annotation arrays - chain_id = np.zeros(array.array_length(), array.chain_id.dtype) - res_id = np.zeros(array.array_length(), array.res_id.dtype) - ins_code = np.zeros(array.array_length(), array.ins_code.dtype) - res_name = np.zeros(array.array_length(), array.res_name.dtype) - hetero = np.zeros(array.array_length(), array.hetero.dtype) + chain_id = np.zeros(array.array_length(), array.chain_id.dtype) + res_id = np.zeros(array.array_length(), array.res_id.dtype) + ins_code = np.zeros(array.array_length(), array.ins_code.dtype) + res_name = np.zeros(array.array_length(), array.res_name.dtype) + hetero = np.zeros(array.array_length(), array.hetero.dtype) atom_name = np.zeros(array.array_length(), array.atom_name.dtype) - element = np.zeros(array.array_length(), array.element.dtype) + element = np.zeros(array.array_length(), array.element.dtype) atom_id_raw = np.zeros(array.array_length(), "U5") - charge_raw = np.zeros(array.array_length(), "U2") + charge_raw = np.zeros(array.array_length(), "U2") occupancy = np.zeros(array.array_length(), float) - b_factor = np.zeros(array.array_length(), float) + b_factor = np.zeros(array.array_length(), float) altloc_id = np.zeros(array.array_length(), dtype="U1") # Fill annotation array @@ -425,13 +425,11 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], occupancy[i] = float(line[_occupancy].strip()) b_factor[i] = float(line[_temp_f].strip()) - if include_bonds or \ - (extra_fields is not None and "atom_id" in extra_fields): - # The atom IDs are only required in these two cases - atom_id = np.array( - [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], - dtype=int - ) + if include_bonds or (extra_fields is not None and "atom_id" in extra_fields): + # The atom IDs are only required in these two cases + atom_id = np.array( + [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw], dtype=int + ) else: atom_id = None @@ -444,16 +442,16 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], array.atom_name = atom_name array.element = element - for field in (extra_fields if extra_fields is not None else []): + for field in extra_fields if extra_fields is not None else []: if field == "atom_id": # Copy is necessary to avoid double masking in # later altloc ID filtering array.set_annotation("atom_id", atom_id.copy()) elif field == "charge": charge = np.array(charge_raw) - array.set_annotation("charge", np.where( - charge == " ", "0", charge - ).astype(int)) + array.set_annotation( + "charge", np.where(charge == " ", "0", charge).astype(int) + ) elif field == "occupancy": array.set_annotation("occupancy", occupancy) elif field == "b_factor": @@ -485,7 +483,10 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], m = 0 i = 0 for line_i in self._atom_line_i: - if m < len(self._model_start_i)-1 and line_i > self._model_start_i[m+1]: + if ( + m < len(self._model_start_i) - 1 + and line_i > self._model_start_i[m + 1] + ): m += 1 i = 0 line = self.lines[line_i] @@ -506,9 +507,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], alpha = np.deg2rad(float(line[_alpha])) beta = np.deg2rad(float(line[_beta])) gamma = np.deg2rad(float(line[_gamma])) - box = vectors_from_unitcell( - len_a, len_b, len_c, alpha, beta, gamma - ) + box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma) except ValueError: # File contains invalid 'CRYST1' record warnings.warn( @@ -526,9 +525,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], # Filter altloc IDs if altloc == "occupancy": - filter = filter_highest_occupancy_altloc( - array, altloc_id, occupancy - ) + filter = filter_highest_occupancy_altloc(array, altloc_id, occupancy) array = array[..., filter] atom_id = atom_id[filter] if atom_id is not None else None elif altloc == "first": @@ -548,7 +545,6 @@ def get_structure(self, model=None, altloc="first", extra_fields=[], return array - def set_structure(self, array, hybrid36=False): """ Set the :class:`AtomArray` or :class:`AtomArrayStack` for the @@ -596,39 +592,42 @@ def set_structure(self, array, hybrid36=False): occupancy = np.char.array(np.full(natoms, " 1.00", dtype="U6")) if "charge" in annot_categories: charge = np.char.array( - [str(np.abs(charge)) + "+" if charge > 0 else - (str(np.abs(charge)) + "-" if charge < 0 else "") - for charge in array.get_annotation("charge")] + [ + str(np.abs(charge)) + "+" + if charge > 0 + else (str(np.abs(charge)) + "-" if charge < 0 else "") + for charge in array.get_annotation("charge") + ] ) else: charge = np.char.array(np.full(natoms, " ", dtype="U2")) if hybrid36: - pdb_atom_id = np.char.array( - [encode_hybrid36(i, 5) for i in atom_id] - ) - pdb_res_id = np.char.array( - [encode_hybrid36(i, 4) for i in array.res_id] - ) + pdb_atom_id = np.char.array([encode_hybrid36(i, 5) for i in atom_id]) + pdb_res_id = np.char.array([encode_hybrid36(i, 4) for i in array.res_id]) else: # Atom IDs are supported up to 99999, # but negative IDs are also possible - pdb_atom_id = np.char.array(np.where( - atom_id > 0, - ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, - atom_id - ).astype(str)) + pdb_atom_id = np.char.array( + np.where( + atom_id > 0, ((atom_id - 1) % _PDB_MAX_ATOMS) + 1, atom_id + ).astype(str) + ) # Residue IDs are supported up to 9999, # but negative IDs are also possible - pdb_res_id = np.char.array(np.where( - array.res_id > 0, - ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1, - array.res_id - ).astype(str)) + pdb_res_id = np.char.array( + np.where( + array.res_id > 0, + ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1, + array.res_id, + ).astype(str) + ) names = np.char.array( - [f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm - for atm, elem in zip(array.atom_name, array.element)] + [ + f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm + for atm, elem in zip(array.atom_name, array.element) + ] ) res_names = np.char.array(array.res_name) chain_ids = np.char.array(array.chain_id) @@ -637,17 +636,20 @@ def set_structure(self, array, hybrid36=False): elements = np.char.array(array.element) first_half = ( - record.ljust(6) + - pdb_atom_id.rjust(5) + - spaces + - names.ljust(4) + - spaces + res_names.rjust(3) + spaces + chain_ids + - pdb_res_id.rjust(4) + ins_codes.rjust(1) + record.ljust(6) + + pdb_atom_id.rjust(5) + + spaces + + names.ljust(4) + + spaces + + res_names.rjust(3) + + spaces + + chain_ids + + pdb_res_id.rjust(4) + + ins_codes.rjust(1) ) second_half = ( - occupancy + b_factor + 10 * spaces + - elements.rjust(2) + charge.rjust(2) + occupancy + b_factor + 10 * spaces + elements.rjust(2) + charge.rjust(2) ) coords = array.coord @@ -674,9 +676,10 @@ def set_structure(self, array, hybrid36=False): self.lines.append(f"MODEL {model_num:4}") # Bundle non-coordinate data to simplify iteration self.lines.extend( - [f"{start:27} {x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}" - for start, (x, y, z), end in - zip(first_half, coord_i, second_half)] + [ + f"{start:27} {x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}" + for start, (x, y, z), end in zip(first_half, coord_i, second_half) + ] ) if is_stack: self.lines.append("ENDMDL") @@ -688,18 +691,15 @@ def set_structure(self, array, hybrid36=False): hetero_indices = np.where(array.hetero & ~filter_solvent(array))[0] bond_array = array.bonds.as_array() bond_array = bond_array[ - np.isin(bond_array[:,0], hetero_indices) | - np.isin(bond_array[:,1], hetero_indices) | - (array.res_id [bond_array[:,0]] != array.res_id [bond_array[:,1]]) | - (array.chain_id[bond_array[:,0]] != array.chain_id[bond_array[:,1]]) + np.isin(bond_array[:, 0], hetero_indices) + | np.isin(bond_array[:, 1], hetero_indices) + | (array.res_id[bond_array[:, 0]] != array.res_id[bond_array[:, 1]]) + | (array.chain_id[bond_array[:, 0]] != array.chain_id[bond_array[:, 1]]) ] - self._set_bonds( - BondList(array.array_length(), bond_array), pdb_atom_id - ) + self._set_bonds(BondList(array.array_length(), bond_array), pdb_atom_id) self._index_models_and_atoms() - def list_assemblies(self): """ List the biological assemblies that are available for the @@ -727,14 +727,16 @@ def list_assemblies(self): raise InvalidFileError( "File does not contain assembly information (REMARK 300)" ) - return [ - assembly_id.strip() - for assembly_id in remark_lines[0][12:].split(",") - ] - - - def get_assembly(self, assembly_id=None, model=None, altloc="first", - extra_fields=[], include_bonds=False): + return [assembly_id.strip() for assembly_id in remark_lines[0][12:].split(",")] + + def get_assembly( + self, + assembly_id=None, + model=None, + altloc="first", + extra_fields=[], + include_bonds=False, + ): """ Build the given biological assembly. @@ -829,18 +831,16 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", if assembly_start_i is None: if assembly_id is None: raise InvalidFileError( - "File does not contain transformation " - "expressions for assemblies" + "File does not contain transformation " "expressions for assemblies" ) else: - raise KeyError( - f"The assembly ID '{assembly_id}' is not found" - ) - assembly_lines = remark_lines[assembly_start_i : assembly_stop_i] + raise KeyError(f"The assembly ID '{assembly_id}' is not found") + assembly_lines = remark_lines[assembly_start_i:assembly_stop_i] # Get transformations for a set of chains chain_set_start_indices = [ - i for i, line in enumerate(assembly_lines) + i + for i, line in enumerate(assembly_lines) if line.startswith("APPLY THE FOLLOWING TO CHAINS") ] # Add exclusive stop at end of records @@ -848,17 +848,17 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", assembly = None for i in range(len(chain_set_start_indices) - 1): start = chain_set_start_indices[i] - stop = chain_set_start_indices[i+1] + stop = chain_set_start_indices[i + 1] # Read affected chain IDs from the following line(s) affected_chain_ids = [] transform_start = None - for j, line in enumerate(assembly_lines[start : stop]): - if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or \ - line.startswith(" AND CHAINS:"): - affected_chain_ids += [ - chain_id.strip() - for chain_id in line[30:].split(",") - ] + for j, line in enumerate(assembly_lines[start:stop]): + if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or line.startswith( + " AND CHAINS:" + ): + affected_chain_ids += [ + chain_id.strip() for chain_id in line[30:].split(",") + ] else: # Chain specification has finished # BIOMT lines start directly after chain specification @@ -866,11 +866,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", break # Parse transformations from BIOMT lines if transform_start is None: - raise InvalidFileError( - "No 'BIOMT' records found for chosen assembly" - ) + raise InvalidFileError("No 'BIOMT' records found for chosen assembly") rotations, translations = _parse_transformations( - assembly_lines[transform_start : stop] + assembly_lines[transform_start:stop] ) # Filter affected chains sub_structure = structure[ @@ -888,9 +886,9 @@ def get_assembly(self, assembly_id=None, model=None, altloc="first", return assembly - - def get_symmetry_mates(self, model=None, altloc="first", - extra_fields=[], include_bonds=False): + def get_symmetry_mates( + self, model=None, altloc="first", extra_fields=[], include_bonds=False + ): """ Build a structure model containing all symmetric copies of the structure within a single unit cell, given by the space @@ -971,27 +969,15 @@ def get_symmetry_mates(self, model=None, altloc="first", "File does not contain crystallographic symmetry " "information (REMARK 350)" ) - transform_lines = [ - line for line in remark_lines if line.startswith(" SMTRY") - ] - rotations, translations = _parse_transformations( - transform_lines - ) - return _apply_transformations( - structure, rotations, translations - ) - - - + transform_lines = [line for line in remark_lines if line.startswith(" SMTRY")] + rotations, translations = _parse_transformations(transform_lines) + return _apply_transformations(structure, rotations, translations) def _index_models_and_atoms(self): # Line indices where a new model starts self._model_start_i = np.array( - [ - i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL")) - ], - dtype=int + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, ) if len(self._model_start_i) == 0: # It could be an empty file or a file with a single model, @@ -1005,13 +991,13 @@ def _index_models_and_atoms(self): # Line indices with ATOM or HETATM records self._atom_line_i = np.array( [ - i for i in range(len(self.lines)) + i + for i in range(len(self.lines)) if self.lines[i].startswith(("ATOM", "HETATM")) ], - dtype=int + dtype=int, ) - def _get_atom_record_indices_for_model(self, model): last_model = len(self._model_start_i) if model == 0: @@ -1020,12 +1006,11 @@ def _get_atom_record_indices_for_model(self, model): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( - (self._atom_line_i >= self._model_start_i[model-1]) & - (self._atom_line_i < self._model_start_i[model ]) + line_filter = (self._atom_line_i >= self._model_start_i[model - 1]) & ( + self._atom_line_i < self._model_start_i[model] ) elif model == last_model: - line_filter = (self._atom_line_i >= self._model_start_i[model-1]) + line_filter = self._atom_line_i >= self._model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -1033,7 +1018,6 @@ def _get_atom_record_indices_for_model(self, model): ) return self._atom_line_i[line_filter] - def _get_model_length(self): """ Determine length of models and check that all models @@ -1043,11 +1027,13 @@ def _get_model_length(self): length = None for model_i in range(len(self._model_start_i)): model_start = self._model_start_i[model_i] - model_stop = self._model_start_i[model_i+1] \ - if model_i+1 < n_models else len(self.lines) + model_stop = ( + self._model_start_i[model_i + 1] + if model_i + 1 < n_models + else len(self.lines) + ) model_length = np.count_nonzero( - (self._atom_line_i >= model_start) & - (self._atom_line_i < model_stop) + (self._atom_line_i >= model_start) & (self._atom_line_i < model_stop) ) if length is None: length = model_length @@ -1058,26 +1044,22 @@ def _get_model_length(self): ) return length - def _get_bonds(self, atom_ids): - conect_lines = [line for line in self.lines - if line.startswith("CONECT")] + conect_lines = [line for line in self.lines if line.startswith("CONECT")] # Mapping from atom ids to indices in an AtomArray - atom_id_to_index = np.zeros(atom_ids[-1]+1, dtype=int) + atom_id_to_index = np.zeros(atom_ids[-1] + 1, dtype=int) try: for i, id in enumerate(atom_ids): atom_id_to_index[id] = i except IndexError as e: - raise InvalidFileError( - "Atom IDs are not strictly increasing" - ) from e + raise InvalidFileError("Atom IDs are not strictly increasing") from e bonds = [] for line in conect_lines: - center_id = atom_id_to_index[decode_hybrid36(line[6 : 11])] + center_id = atom_id_to_index[decode_hybrid36(line[6:11])] for i in range(11, 31, 5): - id_string = line[i : i+5] + id_string = line[i : i + 5] try: id = atom_id_to_index[decode_hybrid36(id_string)] except ValueError: @@ -1089,7 +1071,6 @@ def _get_bonds(self, atom_ids): # is equal to the length of the AtomArray return BondList(len(atom_ids), np.array(bonds, dtype=np.uint32)) - def _set_bonds(self, bond_list, atom_ids): # Bond type is unused since PDB does not support bond orders bonds, _ = bond_list.get_all_bonds() @@ -1136,9 +1117,7 @@ def _parse_transformations(lines): # transformation index) are not used transformations = [float(e) for e in line.split()[2:]] if len(transformations) != 4: - raise InvalidFileError( - "Invalid number of transformation vector elements" - ) + raise InvalidFileError("Invalid number of transformation vector elements") rotations[transformation_i, component_i, :] = transformations[:3] translations[transformation_i, component_i] = transformations[3] @@ -1237,4 +1216,4 @@ def _number_of_integer_digits(values): n_digits = 0 n_digits = max(n_digits, len(str(np.min(values)))) n_digits = max(n_digits, len(str(np.max(values)))) - return n_digits \ No newline at end of file + return n_digits diff --git a/src/biotite/structure/io/pdbqt/__init__.py b/src/biotite/structure/io/pdbqt/__init__.py index 6c406636a..ea81ca4fc 100644 --- a/src/biotite/structure/io/pdbqt/__init__.py +++ b/src/biotite/structure/io/pdbqt/__init__.py @@ -11,5 +11,5 @@ __name__ = "biotite.structure.io.pdbqt" __author__ = "Patrick Kunzmann" +from .convert import * from .file import * -from .convert import * \ No newline at end of file diff --git a/src/biotite/structure/io/pdbqt/convert.py b/src/biotite/structure/io/pdbqt/convert.py index ee335ccc6..051339c4f 100644 --- a/src/biotite/structure/io/pdbqt/convert.py +++ b/src/biotite/structure/io/pdbqt/convert.py @@ -18,7 +18,7 @@ def get_structure(pdbqt_file, model=None): PDBQT file. EXPERIMENTAL: Future API changes are probable. - + Parameters ---------- pdbqt_file : PDBQTFile @@ -32,7 +32,7 @@ def get_structure(pdbqt_file, model=None): If this parameter is omitted, an :class:`AtomArrayStack` containing all models will be returned, even if the structure contains only one model. - + Returns ------- array : AtomArray or AtomArrayStack @@ -41,13 +41,20 @@ def get_structure(pdbqt_file, model=None): return pdbqt_file.get_structure(model) -def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, - rotatable_bonds=None, root=None, include_torsdof=True): +def set_structure( + pdbqt_file, + atoms, + charges=None, + atom_types=None, + rotatable_bonds=None, + root=None, + include_torsdof=True, +): """ Write an :class:`AtomArray` into a PDBQT file. EXPERIMENTAL: Future API changes are probable. - + Parameters ---------- pdbqt_file : PDBQTFile @@ -71,7 +78,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, be written. - ``'rigid'`` - The molecule is handled as rigid ligand: Only a ``ROOT`` line will be written. - - ``'all'`` - The molecule is handled as flexible + - ``'all'`` - The molecule is handled as flexible ligand: A ``ROOT`` line will be written and all rotatable bonds are included using ``BRANCH`` and ``ENDBRANCH`` @@ -81,7 +88,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, A ``ROOT`` line will be written and all bonds in the given :class:`BondList` are considered flexible via ``BRANCH`` and ``ENDBRANCH`` lines. - + root : int, optional Specifies the index of the atom following the ``ROOT`` line. Setting the root atom is useful for specifying the *anchor* @@ -93,7 +100,7 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, By default, a ``TORSDOF`` (torsional degrees of freedom) record is written at the end of the file. By setting this parameter to false, the record is omitted. - + Returns ------- mask : ndarray, shape=(n,), dtype=bool @@ -102,6 +109,5 @@ def set_structure(pdbqt_file, atoms, charges=None, atom_types=None, hydrogen. """ return pdbqt_file.set_structure( - atoms, charges, atom_types, rotatable_bonds, root, - include_torsdof - ) \ No newline at end of file + atoms, charges, atom_types, rotatable_bonds, root, include_torsdof + ) diff --git a/src/biotite/structure/io/pdbqt/file.py b/src/biotite/structure/io/pdbqt/file.py index 271d4bc69..21f883c0a 100644 --- a/src/biotite/structure/io/pdbqt/file.py +++ b/src/biotite/structure/io/pdbqt/file.py @@ -8,17 +8,33 @@ import warnings import numpy as np -from ....file import TextFile, InvalidFileError -from ...error import BadStructureError -from ...atoms import AtomArray, AtomArrayStack -from ...charges import partial_charges -from ...bonds import BondList, BondType, find_connected, find_rotatable_bonds - +from biotite.file import InvalidFileError, TextFile +from biotite.structure.atoms import AtomArray, AtomArrayStack +from biotite.structure.bonds import ( + BondList, + BondType, + find_connected, + find_rotatable_bonds, +) +from biotite.structure.charges import partial_charges +from biotite.structure.error import BadStructureError PARAMETRIZED_ELEMENTS = [ - "H", "C", "N", "O", "P", "S", - "F", "Cl", "Br", "I", - "Mg", "Ca", "Mn", "Fe", "Zn" + "H", + "C", + "N", + "O", + "P", + "S", + "F", + "Cl", + "Br", + "I", + "Mg", + "Ca", + "Mn", + "Fe", + "Zn", ] @@ -116,13 +132,15 @@ def get_remarks(self, model=None): ``'REMARKS'``. """ # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL"))], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, + ) # Line indices with ATOM or HETATM records - remark_line_i = np.array([i for i in range(len(self.lines)) if - self.lines[i].startswith("REMARK")], - dtype=int) + remark_line_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith("REMARK")], + dtype=int, + ) # Structures containing only one model may omit MODEL record # In these cases model starting index is set to 0 if len(model_start_i) == 0: @@ -131,11 +149,10 @@ def get_remarks(self, model=None): if model is None: # Add exclusive end of file model_start_i = np.concatenate((model_start_i, [len(self.lines)])) - model_i = 0 remarks = [] for i in range(len(model_start_i) - 1): start = model_start_i[i] - stop = model_start_i[i+1] + stop = model_start_i[i + 1] model_remark_line_i = remark_line_i[ (remark_line_i >= start) & (remark_line_i < stop) ] @@ -152,10 +169,11 @@ def get_remarks(self, model=None): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( ( remark_line_i >= model_start_i[model-1] ) & - ( remark_line_i < model_start_i[model ] ) ) + line_filter = (remark_line_i >= model_start_i[model - 1]) & ( + remark_line_i < model_start_i[model] + ) elif model == last_model: - line_filter = (remark_line_i >= model_start_i[model-1]) + line_filter = remark_line_i >= model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -166,7 +184,6 @@ def get_remarks(self, model=None): # Do not include 'REMARK ' itself -> begin from pos 8 return "\n".join([self.lines[i][7:] for i in remark_line_i]) - def get_structure(self, model=None): """ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the @@ -190,13 +207,19 @@ def get_structure(self, model=None): The return type depends on the `model` parameter. """ # Line indices where a new model starts - model_start_i = np.array([i for i in range(len(self.lines)) - if self.lines[i].startswith(("MODEL"))], - dtype=int) + model_start_i = np.array( + [i for i in range(len(self.lines)) if self.lines[i].startswith(("MODEL"))], + dtype=int, + ) # Line indices with ATOM or HETATM records - atom_line_i = np.array([i for i in range(len(self.lines)) if - self.lines[i].startswith(("ATOM", "HETATM"))], - dtype=int) + atom_line_i = np.array( + [ + i + for i in range(len(self.lines)) + if self.lines[i].startswith(("ATOM", "HETATM")) + ], + dtype=int, + ) # Structures containing only one model may omit MODEL record # In these cases model starting index is set to 0 if len(model_start_i) == 0: @@ -224,10 +247,11 @@ def get_structure(self, model=None): model = last_model + model + 1 if model < 0 else model if model < last_model: - line_filter = ( ( atom_line_i >= model_start_i[model-1] ) & - ( atom_line_i < model_start_i[model ] ) ) + line_filter = (atom_line_i >= model_start_i[model - 1]) & ( + atom_line_i < model_start_i[model] + ) elif model == last_model: - line_filter = (atom_line_i >= model_start_i[model-1]) + line_filter = atom_line_i >= model_start_i[model - 1] else: raise ValueError( f"The file has {last_model} models, " @@ -237,16 +261,16 @@ def get_structure(self, model=None): array = AtomArray(len(coord_i)) # Save atom IDs for later sorting into the original atom order - atom_id = np.zeros(array.array_length(), int) + atom_id = np.zeros(array.array_length(), int) # Create annotation arrays - chain_id = np.zeros(array.array_length(), array.chain_id.dtype) - res_id = np.zeros(array.array_length(), array.res_id.dtype) - ins_code = np.zeros(array.array_length(), array.ins_code.dtype) - res_name = np.zeros(array.array_length(), array.res_name.dtype) - hetero = np.zeros(array.array_length(), array.hetero.dtype) + chain_id = np.zeros(array.array_length(), array.chain_id.dtype) + res_id = np.zeros(array.array_length(), array.res_id.dtype) + ins_code = np.zeros(array.array_length(), array.ins_code.dtype) + res_name = np.zeros(array.array_length(), array.res_name.dtype) + hetero = np.zeros(array.array_length(), array.hetero.dtype) atom_name = np.zeros(array.array_length(), array.atom_name.dtype) - element = np.zeros(array.array_length(), array.element.dtype) + element = np.zeros(array.array_length(), array.element.dtype) # Fill annotation array # i is index in array, line_i is line index @@ -258,7 +282,7 @@ def get_structure(self, model=None): res_id[i] = int(line[22:26]) ins_code[i] = line[26].strip() res_name[i] = line[17:20].strip() - hetero[i] = (False if line[0:4] == "ATOM" else True) + hetero[i] = False if line[0:4] == "ATOM" else True atom_name[i] = line[12:16].strip() element[i] = line[76:78].strip() @@ -275,21 +299,21 @@ def get_structure(self, model=None): if isinstance(array, AtomArray): for i, line_i in enumerate(coord_i): line = self.lines[line_i] - array.coord[i,0] = float(line[30:38]) - array.coord[i,1] = float(line[38:46]) - array.coord[i,2] = float(line[46:54]) + array.coord[i, 0] = float(line[30:38]) + array.coord[i, 1] = float(line[38:46]) + array.coord[i, 2] = float(line[46:54]) elif isinstance(array, AtomArrayStack): m = 0 i = 0 for line_i in atom_line_i: - if m < len(model_start_i)-1 and line_i > model_start_i[m+1]: + if m < len(model_start_i) - 1 and line_i > model_start_i[m + 1]: m += 1 i = 0 line = self.lines[line_i] - array.coord[m,i,0] = float(line[30:38]) - array.coord[m,i,1] = float(line[38:46]) - array.coord[m,i,2] = float(line[46:54]) + array.coord[m, i, 0] = float(line[30:38]) + array.coord[m, i, 1] = float(line[38:46]) + array.coord[m, i, 2] = float(line[46:54]) i += 1 # Sort into the original atom order @@ -297,9 +321,15 @@ def get_structure(self, model=None): return array - - def set_structure(self, atoms, charges=None, atom_types=None, - rotatable_bonds=None, root=None, include_torsdof=True): + def set_structure( + self, + atoms, + charges=None, + atom_types=None, + rotatable_bonds=None, + root=None, + include_torsdof=True, + ): """ Write an :class:`AtomArray` into the PDBQT file. @@ -394,12 +424,8 @@ def set_structure(self, atoms, charges=None, atom_types=None, use_root = True else: if rotatable_bonds.ndim != 2 or rotatable_bonds.shape[1] != 2: - raise ValueError( - "An (nx2) array is expected for rotatable bonds" - ) - rotatable_bonds = BondList( - len(mask), np.asarray(rotatable_bonds) - )[mask] + raise ValueError("An (nx2) array is expected for rotatable bonds") + rotatable_bonds = BondList(len(mask), np.asarray(rotatable_bonds))[mask] use_root = True if root is None: @@ -426,35 +452,51 @@ def set_structure(self, atoms, charges=None, atom_types=None, # for simple branch determination in '_write_atoms()' atoms.bonds.remove_bonds(rotatable_bonds) - hetero = ["ATOM" if e == False else "HETATM" for e in atoms.hetero] + hetero = ["HETATM" if e else "ATOM" for e in atoms.hetero] if "atom_id" in atoms.get_annotation_categories(): atom_id = atoms.atom_id else: - atom_id = np.arange(1, atoms.array_length()+1) + atom_id = np.arange(1, atoms.array_length() + 1) occupancy = np.ones(atoms.array_length()) b_factor = np.zeros(atoms.array_length()) # Convert rotatable bonds into array for easier handling # The bond type is irrelevant from this point on - rotatable_bonds = rotatable_bonds.as_array()[:,:2] + rotatable_bonds = rotatable_bonds.as_array()[:, :2] self.lines = [] self._write_atoms( - atoms, charges, types, - atom_id, hetero, occupancy, b_factor, - root_index, rotatable_bonds, - np.zeros(len(rotatable_bonds), dtype=bool), use_root + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, + root_index, + rotatable_bonds, + np.zeros(len(rotatable_bonds), dtype=bool), + use_root, ) if include_torsdof: self.lines.append(f"TORSDOF {len(rotatable_bonds)}") return mask - - def _write_atoms(self, atoms, charges, types, - atom_id, hetero, occupancy, b_factor, - root_atom, rotatable_bonds, visited_rotatable_bonds, - is_root): + def _write_atoms( + self, + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, + root_atom, + rotatable_bonds, + visited_rotatable_bonds, + is_root, + ): if len(rotatable_bonds) != 0: # Get the indices to atoms of this branch, i.e. a group of # atoms that are connected by non-rotatable bonds @@ -465,9 +507,7 @@ def _write_atoms(self, atoms, charges, types, # the rotatable bond should always be listed first # -> Remove root atom and insert it at the beginning this_branch_indices = np.insert( - this_branch_indices[this_branch_indices != root_atom], - 0, - root_atom + this_branch_indices[this_branch_indices != root_atom], 0, root_atom ) else: # No rotatable bonds @@ -525,18 +565,24 @@ def _write_atoms(self, atoms, charges, types, f"BRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}" ) self._write_atoms( - atoms, charges, types, - atom_id, hetero, occupancy, b_factor, + atoms, + charges, + types, + atom_id, + hetero, + occupancy, + b_factor, # The root atom of the branch - #is the other atom of the rotatable bond - new_br_i, rotatable_bonds, visited_rotatable_bonds, - False + # is the other atom of the rotatable bond + new_br_i, + rotatable_bonds, + visited_rotatable_bonds, + False, ) self.lines.append( f"ENDBRANCH {atom_id[this_br_i]:>3d} {atom_id[new_br_i]:>3d}" ) - def _get_model_length(self, model_start_i, atom_line_i): """ Determine length of models and check that all models @@ -546,8 +592,11 @@ def _get_model_length(self, model_start_i, atom_line_i): length = None for model_i in range(len(model_start_i)): model_start = model_start_i[model_i] - model_stop = model_start_i[model_i+1] if model_i+1 < n_models \ - else len(self.lines) + model_stop = ( + model_start_i[model_i + 1] + if model_i + 1 < n_models + else len(self.lines) + ) model_length = np.count_nonzero( (atom_line_i >= model_start) & (atom_line_i < model_stop) ) @@ -613,8 +662,7 @@ def convert_atoms(atoms, charges): ) elif element == "C": if np.isin( - all_bond_types[i], - [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE] + all_bond_types[i], [BondType.AROMATIC_SINGLE, BondType.AROMATIC_DOUBLE] ).any(): # Aromatic carbon atom_types[i] = "A" @@ -637,4 +685,4 @@ def convert_atoms(atoms, charges): atom_types[i] = "H" mask = ~hydrogen_removal_mask - return atoms[mask], charges[mask], atom_types[mask], mask \ No newline at end of file + return atoms[mask], charges[mask], atom_types[mask], mask diff --git a/src/biotite/structure/io/pdbx/__init__.py b/src/biotite/structure/io/pdbx/__init__.py index 289081f5d..0b3714b48 100644 --- a/src/biotite/structure/io/pdbx/__init__.py +++ b/src/biotite/structure/io/pdbx/__init__.py @@ -15,9 +15,8 @@ __name__ = "biotite.structure.io.pdbx" __author__ = "Patrick Kunzmann" -from .convert import * from .bcif import * from .cif import * from .component import * +from .convert import * from .encoding import * -from .legacy import * \ No newline at end of file diff --git a/src/biotite/structure/io/pdbx/bcif.py b/src/biotite/structure/io/pdbx/bcif.py index 4b9331ff6..4f3aef3a5 100644 --- a/src/biotite/structure/io/pdbx/bcif.py +++ b/src/biotite/structure/io/pdbx/bcif.py @@ -4,16 +4,29 @@ __name__ = "biotite.structure.io.pdbx" __author__ = "Patrick Kunzmann" -__all__ = ["BinaryCIFFile", "BinaryCIFBlock", "BinaryCIFCategory", - "BinaryCIFColumn", "BinaryCIFData"] +__all__ = [ + "BinaryCIFFile", + "BinaryCIFBlock", + "BinaryCIFCategory", + "BinaryCIFColumn", + "BinaryCIFData", +] from collections.abc import Sequence -import numpy as np import msgpack -from .component import _Component, _HierarchicalContainer, MaskValue -from .encoding import decode_stepwise, encode_stepwise, deserialize_encoding, \ - create_uncompressed_encoding -from ....file import File, is_binary, is_open_compatible, SerializationError +import numpy as np +from biotite.file import File, SerializationError, is_binary, is_open_compatible +from biotite.structure.io.pdbx.component import ( + MaskValue, + _Component, + _HierarchicalContainer, +) +from biotite.structure.io.pdbx.encoding import ( + create_uncompressed_encoding, + decode_stepwise, + deserialize_encoding, + encode_stepwise, +) class BinaryCIFData(_Component): @@ -74,10 +87,7 @@ class BinaryCIFData(_Component): """ def __init__(self, array, encoding=None): - if ( - not isinstance(array, (Sequence, np.ndarray)) - or isinstance(array, str) - ): + if not isinstance(array, (Sequence, np.ndarray)) or isinstance(array, str): array = [array] array = np.asarray(array) if np.issubdtype(array.dtype, np.object_): @@ -107,19 +117,13 @@ def supercomponent_class(): @staticmethod def deserialize(content): - encoding = [ - deserialize_encoding(enc) for enc in content["encoding"] - ] - return BinaryCIFData( - decode_stepwise(content["data"], encoding), encoding - ) + encoding = [deserialize_encoding(enc) for enc in content["encoding"]] + return BinaryCIFData(decode_stepwise(content["data"], encoding), encoding) def serialize(self): serialized_data = encode_stepwise(self._array, self._encoding) if not isinstance(serialized_data, bytes): - raise SerializationError( - "Final encoding must return 'bytes'" - ) + raise SerializationError("Final encoding must return 'bytes'") serialized_encoding = [enc.serialize() for enc in self._encoding] return {"data": serialized_data, "encoding": serialized_encoding} @@ -190,8 +194,7 @@ def __init__(self, data, mask=None): mask = BinaryCIFData(mask) if len(data) != len(mask): raise IndexError( - f"Data has length {len(data)}, " - f"but mask has length {len(mask)}" + f"Data has length {len(data)}, " f"but mask has length {len(mask)}" ) self._data = data self._mask = mask @@ -290,9 +293,7 @@ def as_array(self, dtype=None, masked_value=None): array = np.full(len(self._data), masked_value, dtype=dtype) present_mask = self._mask.array == MaskValue.PRESENT - array[present_mask] = ( - self._data.array[present_mask].astype(dtype) - ) + array[present_mask] = self._data.array[present_mask].astype(dtype) return array @staticmethod @@ -300,13 +301,14 @@ def deserialize(content): return BinaryCIFColumn( BinaryCIFData.deserialize(content["data"]), BinaryCIFData.deserialize(content["mask"]) - if content["mask"] is not None else None + if content["mask"] is not None + else None, ) def serialize(self): return { "data": self._data.serialize(), - "mask": self._mask.serialize() if self._mask is not None else None + "mask": self._mask.serialize() if self._mask is not None else None, } def __len__(self): @@ -392,10 +394,8 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFCategory( - BinaryCIFCategory._deserialize_elements( - content["columns"], "name" - ), - content["rowCount"] + BinaryCIFCategory._deserialize_elements(content["columns"], "name"), + content["rowCount"], ) def serialize(self): @@ -470,9 +470,7 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFBlock( - BinaryCIFBlock._deserialize_elements( - content["categories"], "name" - ) + BinaryCIFBlock._deserialize_elements(content["categories"], "name") ) def serialize(self): @@ -559,16 +557,14 @@ def supercomponent_class(): @staticmethod def deserialize(content): return BinaryCIFFile( - BinaryCIFFile._deserialize_elements( - content["dataBlocks"], "header" - ) + BinaryCIFFile._deserialize_elements(content["dataBlocks"], "header") ) def serialize(self): return {"dataBlocks": self._serialize_elements("header")} @classmethod - def read(self, file): + def read(cls, file): """ Read a *BinaryCIF* file. @@ -587,18 +583,14 @@ def read(self, file): if is_open_compatible(file): with open(file, "rb") as f: return BinaryCIFFile.deserialize( - msgpack.unpackb( - f.read(), use_list=True, raw=False - ) + msgpack.unpackb(f.read(), use_list=True, raw=False) ) # File object else: if not is_binary(file): raise TypeError("A file opened in 'binary' mode is required") return BinaryCIFFile.deserialize( - msgpack.unpackb( - file.read(), use_list=True, raw=False - ) + msgpack.unpackb(file.read(), use_list=True, raw=False) ) def write(self, file): diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py index b7f8780a4..db54f6138 100644 --- a/src/biotite/structure/io/pdbx/cif.py +++ b/src/biotite/structure/io/pdbx/cif.py @@ -6,14 +6,18 @@ __author__ = "Patrick Kunzmann" __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"] -import re import itertools +import re from collections.abc import MutableMapping, Sequence import numpy as np -from .component import _Component, MaskValue -from ....file import File, is_open_compatible, is_text, DeserializationError, \ - SerializationError - +from biotite.file import ( + DeserializationError, + File, + SerializationError, + is_open_compatible, + is_text, +) +from biotite.structure.io.pdbx.component import MaskValue, _Component UNICODE_CHAR_SIZE = 4 @@ -133,9 +137,7 @@ def __init__(self, data, mask=None): if not isinstance(data, CIFData): data = CIFData(data, str) if mask is None: - mask = np.full( - len(data), MaskValue.PRESENT, dtype=np.uint8 - ) + mask = np.full(len(data), MaskValue.PRESENT, dtype=np.uint8) mask[data.array == "."] = MaskValue.INAPPLICABLE mask[data.array == "?"] = MaskValue.MISSING if np.all(mask == MaskValue.PRESENT): @@ -148,8 +150,7 @@ def __init__(self, data, mask=None): mask = CIFData(mask, np.uint8) if len(mask) != len(data): raise IndexError( - f"Data has length {len(data)}, " - f"but mask has length {len(mask)}" + f"Data has length {len(data)}, " f"but mask has length {len(mask)}" ) self._data = data self._mask = mask @@ -222,9 +223,7 @@ def as_array(self, dtype=str, masked_value=None): elif np.issubdtype(dtype, np.str_): # Limit float precision to 3 decimals if np.issubdtype(self._data.array.dtype, np.floating): - array = np.array( - [f"{e:.3f}" for e in self._data.array], type=dtype - ) + array = np.array([f"{e:.3f}" for e in self._data.array], type=dtype) else: # Copy, as otherwise original data would be overwritten # with mask values @@ -247,9 +246,7 @@ def as_array(self, dtype=str, masked_value=None): array = np.full(len(self._data), masked_value, dtype=dtype) present_mask = self._mask.array == MaskValue.PRESENT - array[present_mask] = ( - self._data.array[present_mask].astype(dtype) - ) + array[present_mask] = self._data.array[present_mask].astype(dtype) return array def __len__(self): @@ -361,9 +358,7 @@ def supercomponent_class(): @staticmethod def deserialize(text, expect_whitespace=True): - lines = [ - line.strip() for line in text.splitlines() if not _is_empty(line) - ] + lines = [line.strip() for line in text.splitlines() if not _is_empty(line)] if _is_loop_start(lines[0]): is_looped = True @@ -373,15 +368,11 @@ def deserialize(text, expect_whitespace=True): category_name = _parse_category_name(lines[0]) if category_name is None: - raise DeserializationError( - "Failed to parse category name" - ) + raise DeserializationError("Failed to parse category name") lines = _to_single(lines, is_looped) if is_looped: - category_dict = CIFCategory._deserialize_looped( - lines, expect_whitespace - ) + category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace) else: category_dict = CIFCategory._deserialize_single(lines) return CIFCategory(category_dict, category_name) @@ -510,27 +501,21 @@ def _serialize_single(self): ] def _serialize_looped(self): - key_lines = [ - "_" + self._name + "." + key + " " - for key in self.keys() - ] + key_lines = ["_" + self._name + "." + key + " " for key in self.keys()] column_arrays = [] for column in self.values(): array = column.as_array(str) # Quote before measuring the number of chars, # as the quote characters modify the length - array = np.array( - [_multiline(_quote(element)) for element in array] - ) + array = np.array([_multiline(_quote(element)) for element in array]) column_arrays.append(array) # Number of characters the longest string in the column needs # This can be deduced from the dtype # The "+1" is for the small whitespace column column_n_chars = [ - array.dtype.itemsize // UNICODE_CHAR_SIZE + 1 - for array in column_arrays + array.dtype.itemsize // UNICODE_CHAR_SIZE + 1 for array in column_arrays ] value_lines = [""] * self._row_count for i in range(self._row_count): @@ -614,15 +599,11 @@ def deserialize(text): if is_loop_in_line: # In case of lines with "loop_" the category is # in the next line - category_name_in_line = _parse_category_name( - lines[i + 1] - ) + category_name_in_line = _parse_category_name(lines[i + 1]) current_category_name = category_name_in_line category_starts.append(i) category_names.append(current_category_name) - return CIFBlock(_create_element_dict( - lines, category_names, category_starts - )) + return CIFBlock(_create_element_dict(lines, category_names, category_starts)) def serialize(self): text_blocks = [] @@ -634,7 +615,7 @@ def serialize(self): try: category.name = category_name text_blocks.append(category.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize category '{category_name}'" ) @@ -657,10 +638,8 @@ def __getitem__(self, key): else: expect_whitespace = True category = CIFCategory.deserialize(category, expect_whitespace) - except: - raise DeserializationError( - f"Failed to deserialize category '{key}'" - ) + except Exception: + raise DeserializationError(f"Failed to deserialize category '{key}'") # Update with deserialized object self._categories[key] = category return category @@ -808,7 +787,7 @@ def serialize(self): else: try: text_blocks.append(block.serialize()) - except: + except Exception: raise SerializationError( f"Failed to serialize block '{block_name}'" ) @@ -868,19 +847,15 @@ def __getitem__(self, key): # -> must be deserialized first try: block = CIFBlock.deserialize(block) - except: - raise DeserializationError( - f"Failed to deserialize block '{key}'" - ) + except Exception: + raise DeserializationError(f"Failed to deserialize block '{key}'") # Update with deserialized object self._blocks[key] = block return block def __setitem__(self, key, block): if not isinstance(block, CIFBlock): - raise TypeError( - f"Expected 'CIFBlock', but got '{type(block).__name__}'" - ) + raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'") self._blocks[key] = block def __delitem__(self, key): @@ -918,7 +893,7 @@ def _create_element_dict(lines, element_names, element_starts): # Lazy deserialization # -> keep as text for now and deserialize later if needed return { - element_name: "\n".join(lines[element_starts[i] : element_starts[i+1]]) + element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) for i, element_name in enumerate(element_names) } diff --git a/src/biotite/structure/io/pdbx/component.py b/src/biotite/structure/io/pdbx/component.py index 76eb0c8da..fb2f228ed 100644 --- a/src/biotite/structure/io/pdbx/component.py +++ b/src/biotite/structure/io/pdbx/component.py @@ -11,10 +11,10 @@ __author__ = "Patrick Kunzmann" __all__ = ["MaskValue"] -from enum import IntEnum from abc import ABCMeta, abstractmethod from collections.abc import MutableMapping -from ....file import SerializationError, DeserializationError +from enum import IntEnum +from biotite.file import DeserializationError, SerializationError class MaskValue(IntEnum): @@ -29,6 +29,7 @@ class MaskValue(IntEnum): - `MISSING` : For this row the value is missing or unknown (``?`` in *CIF*). """ + PRESENT = 0 INAPPLICABLE = 1 MISSING = 2 @@ -109,8 +110,7 @@ def __str__(self): return str(self.serialize()) -class _HierarchicalContainer(_Component, MutableMapping, - metaclass=ABCMeta): +class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta): """ A container for hierarchical data in BinaryCIF files. For example, the file contains multiple blocks, each block contains @@ -181,10 +181,8 @@ def _serialize_elements(self, store_key_in=None): if isinstance(element, self.subcomponent_class()): try: serialized_element = element.serialize() - except: - raise SerializationError( - f"Failed to serialize element '{key}'" - ) + except Exception: + raise SerializationError(f"Failed to serialize element '{key}'") else: # Element is already stored in serialized form serialized_element = element @@ -200,10 +198,8 @@ def __getitem__(self, key): # -> must be deserialized first try: element = self.subcomponent_class().deserialize(element) - except: - raise DeserializationError( - f"Failed to deserialize element '{key}'" - ) + except Exception: + raise DeserializationError(f"Failed to deserialize element '{key}'") # Update container with deserialized object self._elements[key] = element return element @@ -220,10 +216,8 @@ def __setitem__(self, key, element): else: try: element = self.subcomponent_class().deserialize(element) - except: - raise DeserializationError( - f"Failed to deserialize given value" - ) + except Exception: + raise DeserializationError("Failed to deserialize given value") self._elements[key] = element def __delitem__(self, key): diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 06ca9c02e..33ba92171 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -18,30 +18,41 @@ import itertools import warnings import numpy as np -from ....file import InvalidFileError -from ....sequence.seqtypes import NucleotideSequence, ProteinSequence -from ...atoms import AtomArray, AtomArrayStack, repeat -from ...bonds import BondList, BondType, connect_via_residue_names -from ...box import unitcell_from_vectors, vectors_from_unitcell -from ...filter import filter_first_altloc, filter_highest_occupancy_altloc -from ...residues import get_residue_count, get_residue_starts_for -from ...error import BadStructureError -from ...util import matrix_rotate -from .legacy import PDBxFile -from .component import MaskValue -from .cif import CIFFile, CIFBlock -from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn -from .encoding import StringArrayEncoding - +from biotite.file import InvalidFileError +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence +from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat +from biotite.structure.bonds import BondList, BondType, connect_via_residue_names +from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell +from biotite.structure.error import BadStructureError +from biotite.structure.filter import ( + filter_first_altloc, + filter_highest_occupancy_altloc, +) +from biotite.structure.io.pdbx.bcif import ( + BinaryCIFBlock, + BinaryCIFColumn, + BinaryCIFFile, +) +from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile +from biotite.structure.io.pdbx.component import MaskValue +from biotite.structure.io.pdbx.encoding import StringArrayEncoding +from biotite.structure.residues import get_residue_count, get_residue_starts_for +from biotite.structure.util import matrix_rotate # Cond types in `struct_conn` category that refer to covalent bonds PDBX_COVALENT_TYPES = [ - "covale", "covale_base", "covale_phosphate", "covale_sugar", - "disulf", "modres", "modres_link", "metalc" + "covale", + "covale_base", + "covale_phosphate", + "covale_sugar", + "disulf", + "modres", + "modres_link", + "metalc", ] # Map 'struct_conn' bond orders to 'BondType'... PDBX_BOND_ORDER_TO_TYPE = { - "": BondType.ANY, + "": BondType.ANY, "sing": BondType.SINGLE, "doub": BondType.DOUBLE, "trip": BondType.TRIPLE, @@ -61,13 +72,13 @@ } # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'... COMP_BOND_ORDER_TO_TYPE = { - ("SING", "N") : BondType.SINGLE, - ("DOUB", "N") : BondType.DOUBLE, - ("TRIP", "N") : BondType.TRIPLE, - ("QUAD", "N") : BondType.QUADRUPLE, - ("SING", "Y") : BondType.AROMATIC_SINGLE, - ("DOUB", "Y") : BondType.AROMATIC_DOUBLE, - ("TRIP", "Y") : BondType.AROMATIC_TRIPLE, + ("SING", "N"): BondType.SINGLE, + ("DOUB", "N"): BondType.DOUBLE, + ("TRIP", "N"): BondType.TRIPLE, + ("QUAD", "N"): BondType.QUADRUPLE, + ("SING", "Y"): BondType.AROMATIC_SINGLE, + ("DOUB", "Y"): BondType.AROMATIC_DOUBLE, + ("TRIP", "Y"): BondType.AROMATIC_TRIPLE, } # ...and vice versa COMP_BOND_TYPE_TO_ORDER = { @@ -98,16 +109,15 @@ def _filter(category, index): Column = Category.subcomponent_class() Data = Column.subcomponent_class() - return Category({ - key: Column( - Data(column.data.array[index]), - ( - Data(column.mask.array[index]) - if column.mask is not None else None + return Category( + { + key: Column( + Data(column.data.array[index]), + (Data(column.mask.array[index]) if column.mask is not None else None), ) - ) - for key, column in category.items() - }) + for key, column in category.items() + } + ) def get_sequence(pdbx_file, data_block=None): @@ -134,22 +144,22 @@ def get_sequence(pdbx_file, data_block=None): Returns ------- - sequence_dict : Dictionary of Sequences + sequence_dict : Dictionary of Sequences Dictionary keys are derived from ``entity_poly.pdbx_strand_id`` (often equivalent to chain_id and atom_site.auth_asym_id in most cases). Dictionary values are sequences. - + Notes ----- - The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial - complete sequence. If the structure represents a truncated or spliced - version of this initial sequence, it will include only a subset of the - initial sequence. Use biotite.structure.get_residues to retrieve only + The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial + complete sequence. If the structure represents a truncated or spliced + version of this initial sequence, it will include only a subset of the + initial sequence. Use biotite.structure.get_residues to retrieve only the residues that are represented in the structure. """ - + block = _get_block(pdbx_file, data_block) - poly_category= block["entity_poly"] + poly_category = block["entity_poly"] seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str) seq_type = poly_category["type"].as_array(str) @@ -158,10 +168,10 @@ def get_sequence(pdbx_file, data_block=None): _convert_string_to_sequence(string, stype) for string, stype in zip(seq_string, seq_type) ] - - strand_ids = poly_category['pdbx_strand_id'].as_array(str) + + strand_ids = poly_category["pdbx_strand_id"].as_array(str) strand_ids = [strand_id.split(",") for strand_id in strand_ids] - + sequence_dict = { strand_id: sequence for sequence, strand_ids in zip(sequences, strand_ids) @@ -174,7 +184,7 @@ def get_sequence(pdbx_file, data_block=None): def get_model_count(pdbx_file, data_block=None): """ - Get the number of models contained in a :class:`PDBxFile`. + Get the number of models contained in a file. Parameters ---------- @@ -193,17 +203,23 @@ def get_model_count(pdbx_file, data_block=None): The number of models. """ block = _get_block(pdbx_file, data_block) - return len(_get_model_starts( - block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32) - )) + return len( + _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)) + ) -def get_structure(pdbx_file, model=None, data_block=None, altloc="first", - extra_fields=None, use_author_fields=True, - include_bonds=False): +def get_structure( + pdbx_file, + model=None, + data_block=None, + altloc="first", + extra_fields=None, + use_author_fields=True, + include_bonds=False, +): """ Create an :class:`AtomArray` or :class:`AtomArrayStack` from the - ``atom_site`` category in a :class:`PDBxFile`. + ``atom_site`` category in a file. Parameters ---------- @@ -249,7 +265,7 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", for example both, ``label_seq_id`` and ``auth_seq_id`` describe the ID of the residue. While, the ``label_xxx`` fields can be used as official pointers - to other categories in the :class:`PDBxFile`, the ``auth_xxx`` + to other categories in the file, the ``auth_xxx`` fields are set by the author(s) of the structure and are consistent with the corresponding values in PDB files. If `use_author_fields` is true, the annotation arrays will be @@ -311,12 +327,21 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", "instead" ) - atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \ - .reshape((model_count, model_length)) - atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \ - .reshape((model_count, model_length)) - atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \ - .reshape((model_count, model_length)) + atoms.coord[:, :, 0] = ( + atom_site["Cartn_x"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) + atoms.coord[:, :, 1] = ( + atom_site["Cartn_y"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) + atoms.coord[:, :, 2] = ( + atom_site["Cartn_z"] + .as_array(np.float32) + .reshape((model_count, model_length)) + ) box = _get_box(block) if box is not None: @@ -346,31 +371,25 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", atoms.box = _get_box(block) # The below part is the same for both, AtomArray and AtomArrayStack - _fill_annotations( - atoms, model_atom_site, extra_fields, use_author_fields - ) + _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields) if include_bonds: if "chem_comp_bond" in block: try: - custom_bond_dict = _parse_intra_residue_bonds( - block["chem_comp_bond"] - ) + custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"]) except KeyError: warnings.warn( "The 'chem_comp_bond' category has missing columns, " "falling back to using Chemical Component Dictionary", - UserWarning + UserWarning, ) custom_bond_dict = None - bonds = connect_via_residue_names( - atoms, custom_bond_dict=custom_bond_dict - ) + bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict) else: bonds = connect_via_residue_names(atoms) if "struct_conn" in block: - bonds = bonds.merge(_parse_inter_residue_bonds( - model_atom_site, block["struct_conn"] - )) + bonds = bonds.merge( + _parse_inter_residue_bonds(model_atom_site, block["struct_conn"]) + ) atoms.bonds = bonds atoms = _filter_altloc(atoms, model_atom_site, altloc) @@ -378,10 +397,6 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first", def _get_block(pdbx_component, block_name): - if isinstance(pdbx_component, PDBxFile): - # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile' - pdbx_component = pdbx_component.cif_file - if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)): # Determine block if block_name is None: @@ -393,24 +408,24 @@ def _get_block(pdbx_component, block_name): def _get_or_fallback(category, key, fallback_key): - """ - Return column related to key in category if it exists, - otherwise try to get the column related to fallback key. - """ - if key not in category: - warnings.warn( - f"Attribute '{key}' not found within 'atom_site' category. " - f"The fallback attribute '{fallback_key}' will be used instead", - UserWarning - ) - try: - return category[fallback_key] - except KeyError as key_exc: - raise InvalidFileError( - f"Fallback attribute '{fallback_key}' not found within " - "'atom_site' category" - ) from key_exc - return category[key] + """ + Return column related to key in category if it exists, + otherwise try to get the column related to fallback key. + """ + if key not in category: + warnings.warn( + f"Attribute '{key}' not found within 'atom_site' category. " + f"The fallback attribute '{fallback_key}' will be used instead", + UserWarning, + ) + try: + return category[fallback_key] + except KeyError as key_exc: + raise InvalidFileError( + f"Fallback attribute '{fallback_key}' not found within " + "'atom_site' category" + ) from key_exc + return category[key] def _fill_annotations(array, atom_site, extra_fields, use_author_fields): @@ -429,78 +444,52 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields): instead of ``label_``. """ - prefix, alt_prefix = ( - ("auth", "label") if use_author_fields else ("label", "auth") - ) + prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth") array.set_annotation( "chain_id", _get_or_fallback( atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id" - ).as_array("U4") + ).as_array("U4"), ) array.set_annotation( "res_id", _get_or_fallback( atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id" - ).as_array(int, -1) - ) - array.set_annotation( - "ins_code", - atom_site["pdbx_PDB_ins_code"].as_array("U1", "") + ).as_array(int, -1), ) + array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array("U1", "")) array.set_annotation( "res_name", _get_or_fallback( atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id" - ).as_array("U5") - ) - array.set_annotation( - "hetero", - atom_site["group_PDB"].as_array(str) == "HETATM" + ).as_array("U5"), ) + array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM") array.set_annotation( "atom_name", _get_or_fallback( atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id" - ).as_array("U6") - ) - array.set_annotation( - "element", - atom_site["type_symbol"].as_array("U2") + ).as_array("U6"), ) + array.set_annotation("element", atom_site["type_symbol"].as_array("U2")) if "atom_id" in extra_fields: - array.set_annotation( - "atom_id", - atom_site["id"].as_array(int) - ) + array.set_annotation("atom_id", atom_site["id"].as_array(int)) extra_fields.remove("atom_id") if "b_factor" in extra_fields: - array.set_annotation( - "b_factor", - atom_site["B_iso_or_equiv"].as_array(float) - ) + array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float)) extra_fields.remove("b_factor") if "occupancy" in extra_fields: - array.set_annotation( - "occupancy", - atom_site["occupancy"].as_array(float) - ) + array.set_annotation("occupancy", atom_site["occupancy"].as_array(float)) extra_fields.remove("occupancy") if "charge" in extra_fields: - array.set_annotation( - "charge", - atom_site["pdbx_formal_charge"].as_array(int, 0) - ) + array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0)) extra_fields.remove("charge") # Handle all remaining custom fields for field in extra_fields: - array.set_annotation( - field, - atom_site[field].as_array(str) - ) + array.set_annotation(field, atom_site[field].as_array(str)) def _parse_intra_residue_bonds(chem_comp_bond): @@ -514,7 +503,7 @@ def _parse_intra_residue_bonds(chem_comp_bond): chem_comp_bond["atom_id_1"].as_array(str), chem_comp_bond["atom_id_2"].as_array(str), chem_comp_bond["value_order"].as_array(str), - chem_comp_bond["pdbx_aromatic_flag"].as_array(str) + chem_comp_bond["pdbx_aromatic_flag"].as_array(str), ): if res_name not in custom_bond_dict: custom_bond_dict[res_name] = {} @@ -535,33 +524,32 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): IDENTITY = "1_555" # Columns in 'atom_site' that should be matched by 'struct_conn' COLUMNS = [ - "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id", - "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id", - "pdbx_PDB_ins_code" + "label_asym_id", + "label_comp_id", + "label_seq_id", + "label_atom_id", + "label_alt_id", + "auth_asym_id", + "auth_comp_id", + "auth_seq_id", + "pdbx_PDB_ins_code", ] covale_mask = np.isin( struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES ) if "ptnr1_symmetry" in struct_conn: - covale_mask &= ( - struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY - ) + covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY if "ptnr2_symmetry" in struct_conn: - covale_mask &= ( - struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY - ) + covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY atom_indices = [None] * 2 for i in range(2): reference_arrays = [] query_arrays = [] for col_name in COLUMNS: - struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1) - if ( - col_name not in atom_site - or struct_conn_col_name not in struct_conn - ): + struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1) + if col_name not in atom_site or struct_conn_col_name not in struct_conn: continue # Ensure both arrays have the same dtype to allow comparison reference = atom_site[col_name].as_array() @@ -598,7 +586,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): return BondList( atom_site.row_count, - np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1) + np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1), ) @@ -608,10 +596,13 @@ def _find_matches(query_arrays, reference_arrays): `reference_arrays` where all query values the reference counterpart. If no match is found for a query, the corresponding index is -1. """ - match_masks_for_all_columns = np.stack([ - query[:, np.newaxis] == reference[np.newaxis, :] - for query, reference in zip(query_arrays, reference_arrays) - ], axis=-1) + match_masks_for_all_columns = np.stack( + [ + query[:, np.newaxis] == reference[np.newaxis, :] + for query, reference in zip(query_arrays, reference_arrays) + ], + axis=-1, + ) match_masks = np.all(match_masks_for_all_columns, axis=-1) query_matches, reference_matches = np.where(match_masks) @@ -685,14 +676,8 @@ def _filter_model(atom_site, model_starts, model): Reduce the ``atom_site`` category to the values for the given model. """ - Category = type(atom_site) - Column = Category.subcomponent_class() - Data = Column.subcomponent_class() - # Append exclusive stop - model_starts = np.append( - model_starts, [atom_site.row_count] - ) + model_starts = np.append(model_starts, [atom_site.row_count]) # Indexing starts at 0, but model number starts at 1 model_index = model - 1 index = slice(model_starts[model_index], model_starts[model_index + 1]) @@ -778,9 +763,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): # Fill PDBx columns from information # in structures' attribute arrays as good as possible atom_site = Category() - atom_site["group_PDB"] = np.where( - array.hetero, "HETATM", "ATOM" - ) + atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM") atom_site["type_symbol"] = np.copy(array.element) atom_site["label_atom_id"] = np.copy(array.atom_name) atom_site["label_alt_id"] = Column( @@ -794,7 +777,7 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): atom_site["label_seq_id"] = np.copy(array.res_id) atom_site["pdbx_PDB_ins_code"] = Column( np.copy(array.ins_code), - np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT) + np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT), ) atom_site["auth_seq_id"] = atom_site["label_seq_id"] atom_site["auth_comp_id"] = atom_site["label_comp_id"] @@ -811,11 +794,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): if "charge" in annot_categories: atom_site["pdbx_formal_charge"] = Column( np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]), - np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT) + np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT), ) if array.bonds is not None: - struct_conn = _set_inter_residue_bonds(array, atom_site) + struct_conn = _set_inter_residue_bonds(array, atom_site) if struct_conn is not None: block["struct_conn"] = struct_conn if include_bonds: @@ -825,24 +808,20 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): # In case of a single model handle each coordinate # simply like a flattened array - if type(array) == AtomArray or ( - type(array) == AtomArrayStack and array.stack_depth() == 1 + if isinstance(array, AtomArray) or ( + isinstance(array, AtomArrayStack) and array.stack_depth() == 1 ): # 'ravel' flattens coord without copy # in case of stack with stack_depth = 1 atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0])) atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1])) atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2])) - atom_site["pdbx_PDB_model_num"] = np.ones( - array.array_length(), dtype=np.int32 - ) + atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32) # In case of multiple models repeat annotations # and use model specific coordinates else: atom_site = _repeat(atom_site, array.stack_depth()) - coord = np.reshape( - array.coord, (array.stack_depth() * array.array_length(), 3) - ) + coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3)) atom_site["Cartn_x"] = np.copy(coord[:, 0]) atom_site["Cartn_y"] = np.copy(coord[:, 1]) atom_site["Cartn_z"] = np.copy(coord[:, 2]) @@ -850,11 +829,9 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False): np.arange(1, array.stack_depth() + 1, dtype=np.int32), repeats=array.array_length(), ) - if not "atom_id" in annot_categories: + if "atom_id" not in annot_categories: # Count from 1 - atom_site["id"] = np.arange( - 1, len(atom_site["group_PDB"]) + 1 - ) + atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1) block["atom_site"] = atom_site # Write box into file @@ -891,10 +868,6 @@ def _check_non_empty(array): def _get_or_create_block(pdbx_component, block_name): - if isinstance(pdbx_component, PDBxFile): - # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile' - pdbx_component = pdbx_component.cif_file - Block = pdbx_component.subcomponent_class() if isinstance(pdbx_component, (CIFFile, BinaryCIFFile)): @@ -922,7 +895,7 @@ def _determine_entity_id(chain_id): for i in range(len(chain_id)): try: entity_id[i] = id_translation[chain_id[i]] - except: + except KeyError: # chain_id is not in dictionary -> new entry id_translation[chain_id[i]] = id entity_id[i] = id_translation[chain_id[i]] @@ -947,8 +920,11 @@ def _repeat(category, repetitions): data = Data(np.tile(column.data.array, repetitions), data_encoding) else: data = Data(np.tile(column.data.array, repetitions)) - mask = Data(np.tile(column.mask.array, repetitions)) \ - if column.mask is not None else None + mask = ( + Data(np.tile(column.mask.array, repetitions)) + if column.mask is not None + else None + ) category_dict[key] = Column(data, mask) return Category(category_dict) @@ -995,22 +971,18 @@ def _set_intra_residue_bonds(array, atom_site): chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]] chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]] chem_comp_bond["value_order"] = Column( - value_order, - np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + value_order, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) ) chem_comp_bond["pdbx_aromatic_flag"] = Column( - aromatic_flag, - np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) + aromatic_flag, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT) ) # BondList does not contain stereo information # -> all values are missing chem_comp_bond["pdbx_stereo_config"] = Column( np.zeros(len(bond_array), dtype="U1"), - np.full(len(bond_array), MaskValue.MISSING) - ) - chem_comp_bond["pdbx_ordinal"] = np.arange( - 1, len(bond_array) + 1, dtype=np.int32 + np.full(len(bond_array), MaskValue.MISSING), ) + chem_comp_bond["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1, dtype=np.int32) return chem_comp_bond @@ -1022,8 +994,11 @@ def _set_inter_residue_bonds(array, atom_site): ``atom_site`` category. """ COLUMNS = [ - "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id", - "pdbx_PDB_ins_code" + "label_asym_id", + "label_comp_id", + "label_seq_id", + "label_atom_id", + "pdbx_PDB_ins_code", ] Category = type(atom_site) @@ -1036,13 +1011,12 @@ def _set_inter_residue_bonds(array, atom_site): struct_conn["id"] = np.arange(1, len(bond_array) + 1) struct_conn["conn_type_id"] = np.full(len(bond_array), "covale") struct_conn["pdbx_value_order"] = Column( - np.array( - [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]] - ), + np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]), np.where( bond_array[:, 2] == BondType.ANY, - MaskValue.MISSING, MaskValue.PRESENT, - ) + MaskValue.MISSING, + MaskValue.PRESENT, + ), ) # Write the identifying annotation... for col_name in COLUMNS: @@ -1050,8 +1024,9 @@ def _set_inter_residue_bonds(array, atom_site): # ...for each bond partner for i in range(2): atom_indices = bond_array[:, i] - struct_conn[_get_struct_conn_col_name(col_name, i+1)] \ - = annot[atom_indices] + struct_conn[_get_struct_conn_col_name(col_name, i + 1)] = annot[ + atom_indices + ] return struct_conn @@ -1063,9 +1038,9 @@ def _filter_bonds(array, connection): bond_array = array.bonds.as_array() # To save computation time call 'get_residue_starts_for()' only once # with indices of the first and second atom of each bond - residue_starts_1, residue_starts_2 = get_residue_starts_for( - array, bond_array[:, :2].flatten() - ).reshape(-1, 2).T + residue_starts_1, residue_starts_2 = ( + get_residue_starts_for(array, bond_array[:, :2].flatten()).reshape(-1, 2).T + ) if connection == "intra": return bond_array[residue_starts_1 == residue_starts_2] elif connection == "inter": @@ -1074,12 +1049,11 @@ def _filter_bonds(array, connection): raise ValueError("Invalid 'connection' option") -def get_component(pdbx_file, data_block=None, use_ideal_coord=True, - res_name=None): +def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None): """ Create an :class:`AtomArray` for a chemical component from the ``chem_comp_atom`` and, if available, the ``chem_comp_bond`` - category in a :class:`PDBxFile`. + category in a file. Parameters ---------- @@ -1175,16 +1149,16 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, coord_fields, alt_coord_fields = alt_coord_fields, coord_fields try: for i, field in enumerate(coord_fields): - array.coord[:,i] = atom_category[field].as_array(np.float32) + array.coord[:, i] = atom_category[field].as_array(np.float32) except KeyError as err: key = err.args[0] warnings.warn( f"Attribute '{key}' not found within 'chem_comp_atom' category. " f"The fallback coordinates will be used instead", - UserWarning + UserWarning, ) for i, field in enumerate(alt_coord_fields): - array.coord[:,i] = atom_category[field].as_array(np.float32) + array.coord[:, i] = atom_category[field].as_array(np.float32) try: bond_category = block["chem_comp_bond"] @@ -1194,9 +1168,8 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, ) except KeyError: warnings.warn( - f"Category 'chem_comp_bond' not found. " - f"No bonds will be parsed", - UserWarning + "Category 'chem_comp_bond' not found. " "No bonds will be parsed", + UserWarning, ) else: bonds = BondList(array.array_length()) @@ -1204,7 +1177,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, bond_category["atom_id_1"].as_array(str), bond_category["atom_id_2"].as_array(str), bond_category["value_order"].as_array(str), - bond_category["pdbx_aromatic_flag"].as_array(str) + bond_category["pdbx_aromatic_flag"].as_array(str), ): atom_i = np.where(array.atom_name == atom1)[0][0] atom_j = np.where(array.atom_name == atom2)[0][0] @@ -1246,9 +1219,7 @@ def set_component(pdbx_file, array, data_block=None): Category = block.subcomponent_class() if get_residue_count(array) > 1: - raise BadStructureError( - "The input atom array must comprise only one residue" - ) + raise BadStructureError("The input atom array must comprise only one residue") res_name = array.res_name[0] annot_categories = array.get_annotation_categories() @@ -1271,31 +1242,28 @@ def set_component(pdbx_file, array, data_block=None): atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"] atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"] atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"] - atom_cat["pdbx_ordinal"] = np.arange( - 1, array.array_length() + 1 - ).astype(str) + atom_cat["pdbx_ordinal"] = np.arange(1, array.array_length() + 1).astype(str) block["chem_comp_atom"] = atom_cat if array.bonds is not None and array.bonds.get_bond_count() > 0: bond_array = array.bonds.as_array() order_flags = [] aromatic_flags = [] - for bond_type in bond_array[:,2]: + for bond_type in bond_array[:, 2]: order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type] order_flags.append(order_flag) aromatic_flags.append(aromatic_flag) bond_cat = Category() bond_cat["comp_id"] = np.full(len(bond_array), res_name) - bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]] - bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]] + bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]] + bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]] bond_cat["value_order"] = np.array(order_flags) bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags) - bond_cat["pdbx_ordinal"] = np.arange( - 1, len(bond_array) + 1 - ).astype(str) + bond_cat["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1).astype(str) block["chem_comp_bond"] = bond_cat + def list_assemblies(pdbx_file, data_block=None): """ List the biological assemblies that are available for the structure @@ -1346,14 +1314,21 @@ def list_assemblies(pdbx_file, data_block=None): id: details for id, details in zip( assembly_category["id"].as_array(str), - assembly_category["details"].as_array(str) + assembly_category["details"].as_array(str), ) } -def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, - altloc="first", extra_fields=None, use_author_fields=True, - include_bonds=False): +def get_assembly( + pdbx_file, + assembly_id=None, + model=None, + data_block=None, + altloc="first", + extra_fields=None, + use_author_fields=True, + include_bonds=False, +): """ Build the given biological assembly. @@ -1410,7 +1385,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, for example both, ``label_seq_id`` and ``auth_seq_id`` describe the ID of the residue. While, the ``label_xxx`` fields can be used as official pointers - to other categories in the :class:`PDBxFile`, the ``auth_xxx`` + to other categories in the file, the ``auth_xxx`` fields are set by the author(s) of the structure and are consistent with the corresponding values in PDB files. If `use_author_fields` is true, the annotation arrays will be @@ -1443,9 +1418,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, try: assembly_gen_category = block["pdbx_struct_assembly_gen"] except KeyError: - raise InvalidFileError( - "File has no 'pdbx_struct_assembly_gen' category" - ) + raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category") try: struct_oper_category = block["pdbx_struct_oper_list"] @@ -1478,7 +1451,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, altloc, extra_fields_and_asym, use_author_fields, - include_bonds + include_bonds, ) ### Get transformations and apply them to the affected asym IDs @@ -1494,9 +1467,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None, operations = _parse_operation_expression(op_expr) asym_ids = asym_id_expr.split(",") # Filter affected asym IDs - sub_structure = structure[ - ..., np.isin(structure.label_asym_id, asym_ids) - ] + sub_structure = structure[..., np.isin(structure.label_asym_id, asym_ids)] sub_assembly = _apply_transformations( sub_structure, transformations, operations ) @@ -1555,10 +1526,9 @@ def _get_transformations(struct_oper): for i in (1, 2, 3) ] ) - translation_vector = np.array([ - struct_oper[f"vector[{i}]"].as_array(float)[index] - for i in (1, 2, 3) - ]) + translation_vector = np.array( + [struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)] + ) transformation_dict[id] = (rotation_matrix, translation_vector) return transformation_dict @@ -1613,6 +1583,4 @@ def _convert_string_to_sequence(string, stype): elif stype in _other_type_list: return None else: - raise InvalidFileError( - "mmCIF _entity_poly.type unsupported" " type: " + stype - ) + raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype) diff --git a/src/biotite/structure/io/pdbx/legacy.py b/src/biotite/structure/io/pdbx/legacy.py deleted file mode 100644 index 63557addc..000000000 --- a/src/biotite/structure/io/pdbx/legacy.py +++ /dev/null @@ -1,267 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.pdbx" -__author__ = "Patrick Kunzmann" -__all__ = ["PDBxFile"] - -import copy -from collections.abc import MutableMapping -import warnings -from .cif import CIFFile, CIFBlock, CIFCategory, CIFColumn -from ....file import File, InvalidFileError - - -class PDBxFile(File, MutableMapping): - """ - This class represents the legacy interface to CIF files. - - The categories of the file can be accessed using the - :meth:`get_category()`/:meth:`set_category()` methods. - The content of each category is represented by a dictionary. - The dictionary contains the entry - (e.g. *label_entity_id* in *atom_site*) as key. - The corresponding values are either strings in *non-looped* - categories, or 1-D numpy arrays of string objects in case of - *looped* categories. - - A category can be changed or added using :meth:`set_category()`: - If a string-valued dictionary is provided, a *non-looped* category - will be created; if an array-valued dictionary is given, a - *looped* category will be created. In case of arrays, it is - important that all arrays have the same size. - - Alternatively, The content of this file can also be read/write - accessed using dictionary-like indexing: - You can either provide a data block and a category or only a - category, in which case the first data block is taken. - - DEPRECATED: Use :class:`CIFFile` instead. - - Notes - ----- - This class is also able to detect and parse multiline entries in the - file. However, when writing a category no multiline values are used. - This could lead to long lines. - - This class uses a lazy category dictionary creation: When reading - the file only the line positions of all categories are checked. The - time consuming task of dictionary creation is done when - :meth:`get_category()` is called. - - Examples - -------- - Read the file and get author names: - - >>> import os.path - >>> file = PDBxFile.read(os.path.join(path_to_structures, "1l2y.cif")) - >>> author_dict = file.get_category("citation_author", block="1L2Y") - >>> print(author_dict["name"]) - ['Neidigh, J.W.' 'Fesinmeyer, R.M.' 'Andersen, N.H.'] - - Dictionary style indexing, no specification of data block: - - >>> print(file["citation_author"]["name"]) - ['Neidigh, J.W.' 'Fesinmeyer, R.M.' 'Andersen, N.H.'] - - Get the structure from the file: - - >>> arr = get_structure(file) - >>> print(type(arr).__name__) - AtomArrayStack - >>> arr = get_structure(file, model=1) - >>> print(type(arr).__name__) - AtomArray - - Modify atom array and write it back into the file: - - >>> arr_mod = rotate(arr, [1,2,3]) - >>> set_structure(file, arr_mod) - >>> file.write(os.path.join(path_to_directory, "1l2y_mod.cif")) - """ - - def __init__(self): - warnings.warn( - "'PDBxFile' is deprecated, use 'CIFFile' instead", - DeprecationWarning - ) - super().__init__() - self._cif_file = CIFFile() - - @property - def cif_file(self): - return self._cif_file - - @property - def lines(self): - return self._cif_file.lines - - @classmethod - def read(cls, file): - """ - Read a PDBx/mmCIF file. - - Parameters - ---------- - file : file-like object or str - The file to be read. - Alternatively a file path can be supplied. - - Returns - ------- - file_object : PDBxFile - The parsed file. - """ - pdbx_file = PDBxFile() - pdbx_file._cif_file = CIFFile.read(file) - return pdbx_file - - def write(self, file): - self._cif_file.write(file) - - - def get_block_names(self): - """ - Get the names of all data blocks in the file. - - Returns - ------- - blocks : list - List of data block names. - """ - return sorted(self._cif_file.keys()) - - def get_category(self, category, block=None, expect_looped=False): - """ - Get the dictionary for a given category. - - Parameters - ---------- - category : string - The name of the category. The leading underscore is omitted. - block : string, optional - The name of the data block. Default is the first - (and most times only) data block of the file. - expect_looped : bool, optional - If set to true, the returned dictionary will always contain - arrays (only if the category exists): - If the category is *non-looped*, each array will contain - only one element. - - Returns - ------- - category_dict : dict of (str or ndarray, dtype=str) or None - A entry keyed dictionary. The corresponding values are - strings or array of strings for *non-looped* and - *looped* categories, respectively. - Returns None, if the data block does not contain the given - category. - """ - if block is None: - try: - block = self.get_block_names()[0] - except IndexError: - raise InvalidFileError("File is empty") - - if category not in self._cif_file[block]: - return None - - category_dict = {} - for column_name, column in self._cif_file[block][category].items(): - if not expect_looped and len(column) == 1: - category_dict[column_name] = column.as_item() - else: - category_dict[column_name] = column.as_array() - return category_dict - - def set_category(self, category, category_dict, block=None): - """ - Set the content of a category. - - If the category is already existing, all lines corresponding - to the category are replaced. Otherwise a new category is - created and the lines are appended at the end of the data block. - - Parameters - ---------- - category : string - The name of the category. The leading underscore is omitted. - category_dict : dict - The category content. The dictionary must have strings - (subcategories) as keys and strings or :class:`ndarray` - objects as values. - block : string, optional - The name of the data block. Default is the first - (and most times only) data block of the file. If the - block is not contained in the file yet, a new block is - appended at the end of the file. - """ - if block is None: - try: - block = self.get_block_names()[0] - except IndexError: - raise InvalidFileError( - "File is empty, give an explicit data block" - ) - - if block not in self._cif_file: - self._cif_file = CIFBlock() - self._cif_file[block][category] = CIFCategory({ - column_name: CIFColumn(array) - for column_name, array in category_dict.items() - }) - - def __copy_fill__(self, clone): - super().__copy_fill__(clone) - clone._cif_file = copy.deepcopy(self._cif_file) - - def __setitem__(self, index, item): - block, category_name = self._full_index(index) - self.set_category(category_name, item, block=block) - - def __getitem__(self, index): - block, category_name = self._full_index(index) - return self.get_category(category_name, block=block) - - def __delitem__(self, index): - block, category_name = self._full_index(index) - del self._cif_file[block][category_name] - - def __contains__(self, index): - block, category_name = self._full_index(index) - return (block, category_name) in self._categories - - def __iter__(self): - try: - block = self.get_block_names()[0] - except IndexError: - raise InvalidFileError( - "File is empty, give an explicit data block" - ) - - return iter(self._cif_file[block]) - - def __len__(self): - try: - block = self.get_block_names()[0] - except IndexError: - raise InvalidFileError( - "File is empty, give an explicit data block" - ) - - return len(self._cif_file[block]) - - def _full_index(self, index): - """ - Converts a an integer or tuple index into a block and a category - name. - """ - if isinstance(index, tuple): - return index[0], index[1] - elif isinstance(index, str): - return self.get_block_names()[0], index - else: - raise TypeError( - f"'{type(index).__name__}' is an invalid index type" - ) \ No newline at end of file diff --git a/src/biotite/structure/io/tng/__init__.py b/src/biotite/structure/io/tng/__init__.py deleted file mode 100644 index b344635fd..000000000 --- a/src/biotite/structure/io/tng/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -""" -This subpackage is used for reading and writing trajectories in the -compressed *Gromacs* TNG format. -""" - -__name__ = "biotite.structure.io.tng" -__author__ = "Patrick Kunzmann" - -from .file import * \ No newline at end of file diff --git a/src/biotite/structure/io/tng/file.py b/src/biotite/structure/io/tng/file.py deleted file mode 100644 index 8666ecc39..000000000 --- a/src/biotite/structure/io/tng/file.py +++ /dev/null @@ -1,46 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite.structure.io.tng" -__author__ = "Patrick Kunzmann" -__all__ = ["TNGFile"] - -import numpy as np -from ..trajfile import TrajectoryFile - - -class TNGFile(TrajectoryFile): - """ - This file class represents a TNG trajectory file. - """ - - @classmethod - def traj_type(cls): - import mdtraj.formats as traj - return traj.TNGTrajectoryFile - - @classmethod - def process_read_values(cls, read_values): - # nm to Angstrom - coord = read_values[0] * 10 - box = read_values[2] - if box is not None: - box *= 10 - time = read_values[1] - return coord, box, time - - @classmethod - def prepare_write_values(cls, coord, box, time): - # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None - return { - "xyz" : xyz, - "box" : box, - "time" : time, - } diff --git a/src/biotite/structure/io/trajfile.py b/src/biotite/structure/io/trajfile.py index 23842ea4e..a3b3d4cf6 100644 --- a/src/biotite/structure/io/trajfile.py +++ b/src/biotite/structure/io/trajfile.py @@ -6,25 +6,22 @@ __author__ = "Patrick Kunzmann" __all__ = ["TrajectoryFile"] -import itertools import abc +import itertools import numpy as np -from ..atoms import AtomArray, AtomArrayStack, stack, from_template -from ...file import File +from biotite.file import File +from biotite.structure.atoms import AtomArray, AtomArrayStack, from_template class TrajectoryFile(File, metaclass=abc.ABCMeta): """ This file class represents a trajectory file interfacing a - trajectory file class from `MDtraj`. - + trajectory file class from `biotraj`. + A trajectory file stores atom coordinates over multiple (time) frames. The file formats are usually binary and involve sometimes heavy compression, so that a large number of frames can be stored in relatively small space. - Since all :class:`TrajectoryFile` subclasses interface *MDtraj* - trajectory file classes, `MDtraj` must be installed to use any of - them. Notes ----- @@ -34,27 +31,27 @@ class TrajectoryFile(File, metaclass=abc.ABCMeta): Therefore, it is strongly recommended to make a copy of the respective array, if the array is modified. """ - + def __init__(self): super().__init__() self._coord = None self._time = None self._box = None self._model_count = None - @classmethod - def read(cls, file_name, start=None, stop=None, step=None, - atom_i=None, chunk_size=None): + def read( + cls, file_name, start=None, stop=None, step=None, atom_i=None, chunk_size=None + ): """ Read a trajectory file. - + A trajectory file can be seen as a file representation of an :class:`AtomArrayStack`. Therefore, `start`, `stop` and `step` represent slice parameters of the index of the first dimension and `atom_i` represents an index array for the second dimension. - + Parameters ---------- file_name : str @@ -85,7 +82,7 @@ def read(cls, file_name, start=None, stop=None, step=None, Although lower values can decrease the memory consumption of reading trajectories, they also increase the computation time. - + Returns ------- file_object : TrajectoryFile @@ -105,7 +102,6 @@ def read(cls, file_name, start=None, stop=None, step=None, traj_type = cls.traj_type() with traj_type(file_name, "r") as f: - if start is None: start = 0 # Discard atoms before start @@ -116,13 +112,13 @@ def read(cls, file_name, start=None, stop=None, step=None, TrajectoryFile._read_chunk_wise( f, start, None, atom_i, chunk_size, discard=True ) - + # The upcoming frames are saved # Calculate the amount of frames to be read if stop is None: n_frames = None else: - n_frames = stop-start + n_frames = stop - start if step is not None and n_frames is not None: # Divide number of frames by 'step' in order to convert # 'step' into 'stride' @@ -130,7 +126,7 @@ def read(cls, file_name, start=None, stop=None, step=None, # the number of frames is decremented before division # and incremented afterwards again n_frames = ((n_frames - 1) // step) + 1 - + # Read frames if chunk_size is None: result = f.read(n_frames, stride=step, atom_indices=atom_i) @@ -138,7 +134,7 @@ def read(cls, file_name, start=None, stop=None, step=None, result = TrajectoryFile._read_chunk_wise( f, n_frames, step, atom_i, chunk_size, discard=False ) - + # nm to Angstrom coord, box, time = cls.process_read_values(result) file.set_coord(coord) @@ -146,15 +142,15 @@ def read(cls, file_name, start=None, stop=None, step=None, file.set_time(time) return file - @classmethod - def read_iter(cls, file_name, start=None, stop=None, step=None, - atom_i=None, stack_size=None): + def read_iter( + cls, file_name, start=None, stop=None, step=None, atom_i=None, stack_size=None + ): """ Create an iterator over each frame of the given trajectory file in the selected range. - + Parameters ---------- file_name : str @@ -181,7 +177,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, values. If the number of frames is not a multiple of `stack_size`, the final stack is smaller than `stack_size`. - + Yields ------ coord : ndarray, dtype=float32, shape=(n,3) or shape=(m,n,3) @@ -190,30 +186,29 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, The box vectors of the current frame or stack. time : float or ndarray, dtype=float32, shape=(n,) or None The simulation time of the current frame or stack in *ps*. - + See also -------- read_iter_structure - + Notes ----- The `step` parameter does currently not work for *DCD* files. """ traj_type = cls.traj_type() with traj_type(file_name, "r") as f: - if start is None: start = 0 # Discard atoms before start if start != 0: f.read(n_frames=start, stride=None, atom_indices=atom_i) - + # The upcoming frames are read # Calculate the amount of frames to be read if stop is None: n_frames = None else: - n_frames = stop-start + n_frames = stop - start if step is not None and n_frames is not None: # Divide number of frames by 'step' in order to convert # 'step' into 'stride' @@ -221,7 +216,6 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, # the number of frames is decremented before division # and incremented afterwards again n_frames = ((n_frames - 1) // step) + 1 - # Read frames if stack_size is None: @@ -242,7 +236,7 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, yield coord, box, time if remaining_frames is not None: remaining_frames -= 1 - + else: remaining_frames = n_frames while remaining_frames is None or remaining_frames > 0: @@ -260,11 +254,18 @@ def read_iter(cls, file_name, start=None, stop=None, step=None, yield coord, box, time if remaining_frames is not None: remaining_frames -= stack_size - @classmethod - def read_iter_structure(cls, file_name, template, start=None, stop=None, - step=None, atom_i=None, stack_size=None): + def read_iter_structure( + cls, + file_name, + template, + start=None, + stop=None, + step=None, + atom_i=None, + stack_size=None, + ): """ Create an iterator over each frame of the given trajectory file in the selected range. @@ -275,8 +276,8 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, information and no topology information, this method requires a template atom array or stack. This template can be acquired for example from a PDB file, which is associated with the - trajectory file. - + trajectory file. + Parameters ---------- file_name : str @@ -306,18 +307,18 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, determined by this parameter. If the number of frames is not a multiple of `stack_size`, the final stack is smaller than `stack_size`. - + Yields ------ structure : AtomArray or AtomArrayStack The structure of the current frame as :class:`AtomArray`. If `stack_size` is set, multiple frames are returned as :class:`AtomArrayStack`. - + See also -------- read_iter - + Notes ----- This iterator creates a new copy of the given template for every @@ -335,7 +336,7 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, f"An 'AtomArray' or 'AtomArrayStack' is expected as template, " f"not '{type(template).__name__}'" ) - + for coord, box, _ in cls.read_iter( file_name, start, stop, step, atom_i, stack_size ): @@ -347,7 +348,6 @@ def read_iter_structure(cls, file_name, template, start=None, stop=None, else: yield from_template(template, coord, box) - def write(self, file_name): """ Write the content into a trajectory file. @@ -360,9 +360,8 @@ def write(self, file_name): """ traj_type = self.traj_type() param = self.prepare_write_values(self._coord, self._box, self._time) - with traj_type(file_name, 'w') as f: + with traj_type(file_name, "w") as f: f.write(**param) - @classmethod def write_iter(cls, file_name, coord, box=None, time=None): @@ -376,7 +375,7 @@ def write_iter(cls, file_name, coord, box=None, time=None): Hence, this class method may save a large amount of memory if a large file should be written, if `coord` are provided as generator. - + Parameters ---------- file_name : str @@ -391,7 +390,7 @@ def write_iter(cls, file_name, coord, box=None, time=None): Notes ----- - The `time` parameter has no effect for *TNG* and *DCD* files. + The `time` parameter has no effect for *DCD* files. """ if box is None: box = itertools.repeat(None) @@ -399,7 +398,7 @@ def write_iter(cls, file_name, coord, box=None, time=None): time = itertools.repeat(None) traj_type = cls.traj_type() - with traj_type(file_name, 'w') as f: + with traj_type(file_name, "w") as f: for c, b, t in zip(coord, box, time): if c.ndim != 2: raise IndexError( @@ -414,24 +413,22 @@ def write_iter(cls, file_name, coord, box=None, time=None): t = np.expand_dims(t, axis=0) param = cls.prepare_write_values(c, b, t) f.write(**param) - def get_coord(self): """ Extract only the atom coordinates from the trajectory file. - + Returns ------- coord : ndarray, dtype=float, shape=(m,n,3) The coordinates stored in the trajectory file. """ return self._coord - def get_time(self): """ Get the simlation time in *ps* values for each frame. - + Returns ------- time : ndarray, dtype=float, shape=(m,) @@ -439,12 +436,11 @@ def get_time(self): frames, that were read from the file. """ return self._time - def get_box(self): """ Get the box vectors for each frame. - + Returns ------- box : ndarray, dtype=float, shape=(m,3,3) @@ -452,12 +448,11 @@ def get_box(self): frames, that were read from the file. """ return self._box - def set_coord(self, coord): """ Set the atom coordinates in the trajectory file. - + Parameters ---------- coord : ndarray, dtype=float, shape=(m,n,3) @@ -465,12 +460,11 @@ def set_coord(self, coord): """ self._check_model_count(coord) self._coord = coord - def set_time(self, time): """ Set the simulation time of each frame in the trajectory file. - + Parameters ---------- time : ndarray, dtype=float, shape=(m,) @@ -478,13 +472,12 @@ def set_time(self, time): """ self._check_model_count(time) self._time = time - def set_box(self, box): """ Set the periodic box vectors of each frame in the trajectory file. - + Parameters ---------- time : ndarray, dtype=float, shape=(m,3,3) @@ -492,25 +485,24 @@ def set_box(self, box): """ self._check_model_count(box) self._box = box - def get_structure(self, template): """ Convert the trajectory file content into an :class:`AtomArrayStack`. - + Since trajectory files usually only contain atom coordinate information and no topology information, this method requires a template atom array or stack. This template can be acquired for example from a PDB file, which is associated with the - trajectory file. - + trajectory file. + Parameters ---------- template : AtomArray or AtomArrayStack The template array or stack, where the atom annotation data is taken from. - + Returns ------- array_stack : AtomArrayStack @@ -519,15 +511,14 @@ def get_structure(self, template): trajectory file. """ return from_template(template, self.get_coord(), self.get_box()) - def set_structure(self, structure, time=None): """ Write an atom array (stack) into the trajectory file object. - + The topology information (chain, residue, etc.) is not saved in the file. - + Parameters ---------- structure : AtomArray or AtomArrayStack @@ -547,51 +538,47 @@ def set_structure(self, structure, time=None): if time is not None: self.set_time(time) - def copy(self): """ This operation is not implemented for trajectory files. - + Raises ------ NotImplementedError """ - raise NotImplementedError("Copying is not implemented " - "for trajectory files") - + raise NotImplementedError("Copying is not implemented " "for trajectory files") @classmethod @abc.abstractmethod def traj_type(cls): """ - The `MDtraj` files class to be used. - + The ``biotraj`` files class to be used. + PROTECTED: Override when inheriting. - + Returns ------- class - An `MDtraj` subclass of :class:`TrajectoryFile`. + An ``biotraj`` subclass of :class:`TrajectoryFile`. """ pass - @classmethod @abc.abstractmethod def process_read_values(cls, read_values): """ Convert the return value of the `read()` method of the - respective :class:`mdtraj.TrajectoryFile` into coordinates, + respective :class:`biotraj.TrajectoryFile` into coordinates, simulation box and simulation time. - + PROTECTED: Override when inheriting. - + Parameters ---------- read_values : tuple The return value of the respective - :func:`mdtraj.TrajectoryFile.read()` method. - + :func:`biotraj.TrajectoryFile.read()` method. + Returns ------- coord : ndarray, dtype=float, shape=(m,n,3) @@ -602,7 +589,6 @@ def process_read_values(cls, read_values): The simulation time in ps for each frame. """ pass - @classmethod @abc.abstractmethod @@ -610,7 +596,7 @@ def prepare_write_values(cls, coord, box, time): """ Convert the `coord`, `box` and `time` attribute into a dictionary that is given as *kwargs* to the respective - :func:`mdtraj.TrajectoryFile.write()` method. + :func:`biotraj.TrajectoryFile.write()` method. PROTECTED: Override when inheriting. @@ -622,16 +608,15 @@ def prepare_write_values(cls, coord, box, time): The box vectors in Å for each frame. time : ndarray, dtype=float, shape=(m,) The simulation time in ps for each frame. - + Returns ------- parameters : dict This dictionary is given as *kwargs* parameter to the - respective :func:`mdtraj.TrajectoryFile.write()` method. + respective :func:`biotraj.TrajectoryFile.write()` method. """ pass - def _check_model_count(self, array): """ Check if the amount of models in the given array is equal to @@ -650,11 +635,9 @@ def _check_model_count(self, array): f"{len(array)} models were given, " f"but the file contains {self._model_count} models" ) - @staticmethod - def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, - discard=False): + def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, discard=False): """ Similar to :func:`read()`, just for chunk-wise reading of the trajectory. @@ -674,7 +657,7 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, try: chunk = file.read(n_frames=n, stride=step, atom_indices=atom_i) except ValueError as e: - # MDTraj raises exception because no coordinates can be + # biotraj raises exception because no coordinates can be # concatenated # -> all frames have been read # -> stop reading chunks @@ -691,7 +674,7 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, chunks.append(chunk) if remaining_frames is not None: remaining_frames -= n - + if not discard: # Assemble the chunks into contiguous arrays # for each value (coord, box, time) @@ -707,4 +690,4 @@ def _read_chunk_wise(file, n_frames, step, atom_i, chunk_size, result[i] = None return tuple(result) else: - return None \ No newline at end of file + return None diff --git a/src/biotite/structure/io/trr/__init__.py b/src/biotite/structure/io/trr/__init__.py index cf2f0510d..c7ed3f8d9 100644 --- a/src/biotite/structure/io/trr/__init__.py +++ b/src/biotite/structure/io/trr/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.trr" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/trr/file.py b/src/biotite/structure/io/trr/file.py index 435fd6f7a..befef08ab 100644 --- a/src/biotite/structure/io/trr/file.py +++ b/src/biotite/structure/io/trr/file.py @@ -6,20 +6,20 @@ __author__ = "Patrick Kunzmann" __all__ = ["TRRFile"] +import biotraj import numpy as np -from ..trajfile import TrajectoryFile +from biotite.structure.io.trajfile import TrajectoryFile class TRRFile(TrajectoryFile): """ This file class represents a TRR trajectory file. """ - + @classmethod def traj_type(cls): - import mdtraj.formats as traj - return traj.TRRTrajectoryFile - + return biotraj.TRRTrajectoryFile + @classmethod def process_read_values(cls, read_values): # nm to Angstrom @@ -29,18 +29,15 @@ def process_read_values(cls, read_values): box *= 10 time = read_values[1] return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None + xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None + box = np.divide(box, 10, dtype=np.float32) if box is not None else None return { - "xyz" : xyz, - "box" : box, - "time" : time, + "xyz": xyz, + "box": box, + "time": time, } diff --git a/src/biotite/structure/io/xtc/__init__.py b/src/biotite/structure/io/xtc/__init__.py index 5803ef784..5fe71216e 100644 --- a/src/biotite/structure/io/xtc/__init__.py +++ b/src/biotite/structure/io/xtc/__init__.py @@ -10,4 +10,4 @@ __name__ = "biotite.structure.io.xtc" __author__ = "Patrick Kunzmann" -from .file import * \ No newline at end of file +from .file import * diff --git a/src/biotite/structure/io/xtc/file.py b/src/biotite/structure/io/xtc/file.py index 62d9a977f..0540655a1 100644 --- a/src/biotite/structure/io/xtc/file.py +++ b/src/biotite/structure/io/xtc/file.py @@ -6,19 +6,19 @@ __author__ = "Patrick Kunzmann" __all__ = ["XTCFile"] +import biotraj import numpy as np -from ..trajfile import TrajectoryFile +from biotite.structure.io.trajfile import TrajectoryFile class XTCFile(TrajectoryFile): """ This file class represents a XTC trajectory file. """ - + @classmethod def traj_type(cls): - import mdtraj.formats as traj - return traj.XTCTrajectoryFile + return biotraj.XTCTrajectoryFile @classmethod def process_read_values(cls, read_values): @@ -29,18 +29,15 @@ def process_read_values(cls, read_values): box *= 10 time = read_values[1] return coord, box, time - + @classmethod def prepare_write_values(cls, coord, box, time): # Angstrom to nm - xyz = np.divide(coord, 10, dtype=np.float32) \ - if coord is not None else None - time = time.astype(np.float32, copy=False) \ - if time is not None else None - box = np.divide(box, 10, dtype=np.float32) \ - if box is not None else None + xyz = np.divide(coord, 10, dtype=np.float32) if coord is not None else None + time = time.astype(np.float32, copy=False) if time is not None else None + box = np.divide(box, 10, dtype=np.float32) if box is not None else None return { - "xyz" : xyz, - "box" : box, - "time" : time, + "xyz": xyz, + "box": box, + "time": time, } diff --git a/src/biotite/structure/mechanics.py b/src/biotite/structure/mechanics.py index d79e23908..6e6ffedcb 100644 --- a/src/biotite/structure/mechanics.py +++ b/src/biotite/structure/mechanics.py @@ -12,17 +12,14 @@ __all__ = ["mass_center", "gyration_radius"] import numpy as np -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import vector_dot, norm_vector -from .error import BadStructureError -from .geometry import distance -from .info.masses import mass +from biotite.structure.geometry import distance +from biotite.structure.info.masses import mass def gyration_radius(array, masses=None): """ Compute the radius/radii of gyration of an atom array or stack. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -33,7 +30,7 @@ def gyration_radius(array, masses=None): Must have the same length as `array`. By default, the standard atomic mass for each element is taken. - + Returns ------- masses : float or ndarray, dtype=float @@ -46,13 +43,14 @@ def gyration_radius(array, masses=None): masses = np.array([mass(element) for element in array.element]) center = mass_center(array, masses) radii = distance(array, center[..., np.newaxis, :]) - inertia_moment = np.sum(masses * radii*radii, axis=-1) + inertia_moment = np.sum(masses * radii * radii, axis=-1) return np.sqrt(inertia_moment / np.sum(masses)) + def mass_center(array, masses=None): """ Calculate the center(s) of mass of an atom array or stack. - + Parameters ---------- array : AtomArray or AtomArrayStack @@ -61,7 +59,7 @@ def mass_center(array, masses=None): The masses to use for each atom in the input `array`. Must have the same length as `array`. By default, the standard atomic mass for each element is taken. - + Returns ------- radius : ndarray, ndarray, dtype=float @@ -72,4 +70,4 @@ def mass_center(array, masses=None): """ if masses is None: masses = np.array([mass(element) for element in array.element]) - return np.sum(masses[:,np.newaxis] * array.coord, axis=-2) / np.sum(masses) \ No newline at end of file + return np.sum(masses[:, np.newaxis] * array.coord, axis=-2) / np.sum(masses) diff --git a/src/biotite/structure/molecules.py b/src/biotite/structure/molecules.py index d40920b18..f20a5a1b6 100644 --- a/src/biotite/structure/molecules.py +++ b/src/biotite/structure/molecules.py @@ -12,8 +12,8 @@ __all__ = ["get_molecule_indices", "get_molecule_masks", "molecule_iter"] import numpy as np -from .atoms import AtomArray, AtomArrayStack -from .bonds import BondList, find_connected +from biotite.structure.atoms import AtomArray, AtomArrayStack +from biotite.structure.bonds import BondList, find_connected def get_molecule_indices(array): @@ -244,8 +244,7 @@ def get_molecule_masks(array): molecule_indices = get_molecule_indices(bonds) molecule_masks = np.zeros( - (len(molecule_indices), bonds.get_atom_count()), - dtype=bool + (len(molecule_indices), bonds.get_atom_count()), dtype=bool ) for i in range(len(molecule_indices)): molecule_masks[i, molecule_indices[i]] = True diff --git a/src/biotite/structure/pseudoknots.py b/src/biotite/structure/pseudoknots.py index 36a877a84..2a065f16b 100644 --- a/src/biotite/structure/pseudoknots.py +++ b/src/biotite/structure/pseudoknots.py @@ -10,9 +10,10 @@ __author__ = "Tom David Müller" __all__ = ["pseudoknots"] -import numpy as np -import networkx as nx from itertools import chain, product +import networkx as nx +import numpy as np + def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): """ @@ -118,7 +119,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): return np.array([[]], dtype=np.int32) # List containing the results - results = [np.full(len(base_pairs), -1, dtype='int32')] + results = [np.full(len(base_pairs), -1, dtype="int32")] # if no score array is given, each base pairs' score is one if scores is None: @@ -126,9 +127,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): # Make sure `base_pairs` has the same length as the score array if len(base_pairs) != len(scores): - raise ValueError( - "'base_pair' and 'scores' must have the same shape" - ) + raise ValueError("'base_pair' and 'scores' must have the same shape") # Split the base pairs in regions regions = _find_regions(base_pairs, scores) @@ -139,7 +138,7 @@ def pseudoknots(base_pairs, scores=None, max_pseudoknot_order=None): return np.vstack(results) -class _Region(): +class _Region: """ This class represents a paired region. @@ -159,7 +158,7 @@ class _Region(): The score for each base pair. """ - def __init__ (self, base_pairs, region_pairs, scores): + def __init__(self, base_pairs, region_pairs, scores): # The Start and Stop indices for each Region self.start = np.min(base_pairs[region_pairs]) self.stop = np.max(base_pairs[region_pairs]) @@ -245,19 +244,18 @@ def _find_regions(base_pairs, scores): # Check if the current base pair belongs to the region that is # currently being defined - previous_upstream_rank = rank[i-1, 0] + previous_upstream_rank = rank[i - 1, 0] this_upstream_rank = rank[i, 0] - previous_downstream_rank = rank[i-1, 1] + previous_downstream_rank = rank[i - 1, 1] this_downstream_rank = rank[i, 1] # if the current base pair belongs to a new region, save the # current region and start a new region - if ((previous_downstream_rank - this_downstream_rank) != 1 or - (this_upstream_rank - previous_upstream_rank) != 1): - regions.add( - _Region(base_pairs, np.array(region_pairs), scores) - ) - region_pairs = [] + if (previous_downstream_rank - this_downstream_rank) != 1 or ( + this_upstream_rank - previous_upstream_rank + ) != 1: + regions.add(_Region(base_pairs, np.array(region_pairs), scores)) + region_pairs = [] # Append the current base pair to the region region_pairs.append(original_indices[i]) @@ -296,7 +294,7 @@ def _generate_graphical_representation(regions): # Get the region array and a boolean array, where the start of each # region is ``True``. region_array, (start_stops,) = _get_region_array_for( - regions, content=[lambda a : [True, False]], dtype=['bool'] + regions, content=[lambda a: [True, False]], dtype=["bool"] ) # Check each region for conflicts with other regions @@ -307,15 +305,15 @@ def _generate_graphical_representation(regions): # Find the index of the stopping of the region in the region # array - stop = _get_first_occurrence_for(region_array[start+1:], region) - stop += (start + 1) + stop = _get_first_occurrence_for(region_array[start + 1 :], region) + stop += start + 1 # Store regions the current region conflicts with conflicts = set() # Iterate over the regions between the starting and stopping # point of the current region - for other_region in region_array[start+1:stop]: + for other_region in region_array[start + 1 : stop]: # If the other region is not already a conflict, add it to # the conflict set if other_region not in conflicts: @@ -389,17 +387,17 @@ def _get_region_array_for(regions, content=[], dtype=[]): The custom output. """ # region_array and index array - region_array = np.empty(len(regions)*2, dtype=_Region) - index_array = np.empty(len(regions)*2, dtype='int32') + region_array = np.empty(len(regions) * 2, dtype=_Region) + index_array = np.empty(len(regions) * 2, dtype="int32") # Content array for custom return arrays - content_list = [None]*len(content) + content_list = [None] * len(content) for i in range(len(content)): - content_list[i] = np.empty(len(regions)*2, dtype=dtype[i]) + content_list[i] = np.empty(len(regions) * 2, dtype=dtype[i]) # Fill the arrays for i, reg in enumerate(regions): - indices = [2*i, 2*i+1] + indices = [2 * i, 2 * i + 1] region_array[indices] = reg for c in range(len(content_list)): content_list[c][indices] = content[c](reg) @@ -443,8 +441,8 @@ def _remove_pseudoknots(regions): represented as ``set`` of unknotted regions. """ # Create dynamic programming matrix - dp_matrix_shape = len(regions)*2, len(regions)*2 - dp_matrix = np.empty(dp_matrix_shape, dtype='object') + dp_matrix_shape = len(regions) * 2, len(regions) * 2 + dp_matrix = np.empty(dp_matrix_shape, dtype="object") dp_matrix_solutions_starts = np.zeros_like(dp_matrix) dp_matrix_solutions_stops = np.zeros_like(dp_matrix) @@ -452,9 +450,7 @@ def _remove_pseudoknots(regions): # ``region_array`` contains the region objects and ``start_stops`` # contains the lowest and highest positions of the regions region_array, (start_stops,) = _get_region_array_for( - regions, - [lambda a : (a.start, a.stop)], - ['int32'] + regions, [lambda a: (a.start, a.stop)], ["int32"] ) # Initialise the matrix diagonal with ndarrays of empty frozensets for i in range(len(dp_matrix)): @@ -462,11 +458,11 @@ def _remove_pseudoknots(regions): # Iterate through the top right half of the dynamic programming # matrix - for j in range(len(regions)*2): - for i in range(j-1, -1, -1): + for j in range(len(regions) * 2): + for i in range(j - 1, -1, -1): solution_candidates = set() - left = dp_matrix[i, j-1] - bottom = dp_matrix[i+1, j] + left = dp_matrix[i, j - 1] + bottom = dp_matrix[i + 1, j] # Add all solutions of the cell to the left for solution in left: @@ -474,24 +470,21 @@ def _remove_pseudoknots(regions): # Add all solutions of the cell to the bottom for solution in bottom: - solution_candidates.add(solution) + solution_candidates.add(solution) # Check if i and j are start/end-points of the same region if region_array[i] is region_array[j]: - # Add all solutions from the cell to the bottom left # plus this region - bottom_left = dp_matrix[i+1, j-1] + bottom_left = dp_matrix[i + 1, j - 1] for solution in bottom_left: solution_candidates.add(solution | set([region_array[i]])) # Perform additional tests if solution in the left cell and # bottom cell both differ from an empty solution - if (np.any(left != [frozenset()]) and - np.any(bottom != [frozenset()])): - - left_highest = dp_matrix_solutions_stops[i, j-1] - bottom_lowest = dp_matrix_solutions_starts[i+1, j] + if np.any(left != [frozenset()]) and np.any(bottom != [frozenset()]): + left_highest = dp_matrix_solutions_stops[i, j - 1] + bottom_lowest = dp_matrix_solutions_starts[i + 1, j] # For each pair of solutions check if solutions are # disjoint @@ -504,11 +497,11 @@ def _remove_pseudoknots(regions): # Both solutions are not disjoint # Add subsolutions for k in range( - np.where(start_stops==lowest)[0][0]-1, - np.where(start_stops==highest)[0][0]+1 + np.where(start_stops == lowest)[0][0] - 1, + np.where(start_stops == highest)[0][0] + 1, ): cell1 = dp_matrix[i, k] - cell2 = dp_matrix[k+1, j] + cell2 = dp_matrix[k + 1, j] for subsolution1 in cell1: for subsolution2 in cell2: solution_candidates.add( @@ -536,16 +529,12 @@ def _remove_pseudoknots(regions): # Add the solutions to the dynamic programming matrix dp_matrix[i, j] = solution_candidates - solution_starts = np.zeros_like(solution_candidates, dtype='int32') - solution_stops = np.zeros_like(solution_candidates, dtype='int32') + solution_starts = np.zeros_like(solution_candidates, dtype="int32") + solution_stops = np.zeros_like(solution_candidates, dtype="int32") for s, solution in enumerate(solution_candidates): - solution_starts[s] = min( - [reg.start for reg in solution], default=-1 - ) - solution_stops[s] = max( - [reg.stop for reg in solution], default=-1 - ) + solution_starts[s] = min([reg.start for reg in solution], default=-1) + solution_stops[s] = max([reg.stop for reg in solution], default=-1) dp_matrix_solutions_starts[i, j] = solution_starts dp_matrix_solutions_stops[i, j] = solution_stops @@ -586,14 +575,11 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Non-conflicting regions are of the current order: index_list_non_conflicting = list( - chain( - *[region.get_index_array() for region in non_conflicting] - ) - ) + chain(*[region.get_index_array() for region in non_conflicting]) + ) for result in results: result[index_list_non_conflicting] = order - # If no conflicts remain, the results are complete if len(regions) == 0: return results @@ -601,9 +587,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Get the optimal solutions for given regions. Evaluate each clique # of mutually conflicting regions seperately cliques = [component for component in nx.connected_components(regions)] - solutions = [set(chain(*e)) for e in product( - *[_remove_pseudoknots(clique) for clique in cliques] - )] + solutions = [ + set(chain(*e)) + for e in product(*[_remove_pseudoknots(clique) for clique in cliques]) + ] # Get a copy of the current results for each optimal solution results_list = [ @@ -612,16 +599,13 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Evaluate each optimal solution for i, solution in enumerate(solutions): - # Get the pseudoknotted regions pseudoknotted_regions = regions.copy() pseudoknotted_regions.remove_nodes_from(solution) # Get an index list of the unknotted base pairs index_list_unknotted = list( - chain( - *[region.get_index_array() for region in solution] - ) + chain(*[region.get_index_array() for region in solution]) ) # Write results for current solution @@ -634,8 +618,10 @@ def _get_results(regions, results, max_pseudoknot_order, order=0): # Evaluate the pseudoknotted region results_list[i] = _get_results( - pseudoknotted_regions, results_list[i], - max_pseudoknot_order, order=order+1 + pseudoknotted_regions, + results_list[i], + max_pseudoknot_order, + order=order + 1, ) # Flatten the results diff --git a/src/biotite/structure/rdf.py b/src/biotite/structure/rdf.py index 563cd0ae3..448a81ffa 100644 --- a/src/biotite/structure/rdf.py +++ b/src/biotite/structure/rdf.py @@ -12,15 +12,16 @@ from numbers import Integral import numpy as np -from .atoms import Atom, AtomArray, stack, array, coord, AtomArrayStack -from .box import box_volume -from .geometry import displacement -from .util import vector_dot -from .celllist import CellList +from biotite.structure.atoms import AtomArray, coord, stack +from biotite.structure.box import box_volume +from biotite.structure.celllist import CellList +from biotite.structure.geometry import displacement +from biotite.structure.util import vector_dot -def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, - periodic=False): +def rdf( + center, atoms, selection=None, interval=(0, 10), bins=100, box=None, periodic=False +): r""" Compute the radial distribution function *g(r)* (RDF) for one or multiple given central positions based on a given system of @@ -155,7 +156,7 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, Find the radius for the first solvation shell. In this simple case, the density peak is identified by finding the maximum of the function. - + >>> peak_position = np.argmax(g_r) >>> print(f"{bins[peak_position]/10:.2f} nm") 0.29 nm @@ -165,9 +166,9 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, atoms = stack([atoms]) if selection is not None: atoms = atoms[..., selection] - + atom_coord = atoms.coord - + if box is None: if atoms.box is None: raise ValueError("A box must be supplied") @@ -175,17 +176,15 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, box = atoms.box elif box.ndim == 2 and atoms.stack_depth() == 1: box = box[np.newaxis, :, :] - + center = coord(center) if center.ndim == 1: center = center.reshape((1, 1) + center.shape) elif center.ndim == 2: center = center.reshape((1,) + center.shape) - + if box.shape[0] != center.shape[0] or box.shape[0] != atom_coord.shape[0]: - raise ValueError( - "Center, box, and atoms must have the same model count" - ) + raise ValueError("Center, box, and atoms must have the same model count") # Calculate distance histogram edges = _calculate_edges(interval, bins) @@ -209,17 +208,20 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None, for j in range(center.shape[1]): dist_box = box[i] if periodic else None # Calculate squared distances - disp.append(displacement( - center[i,j], atom_coord[i, near_atom_mask[j]], box=dist_box - )) + disp.append( + displacement( + center[i, j], atom_coord[i, near_atom_mask[j]], box=dist_box + ) + ) # Make one array from multiple arrays with different length disp = np.concatenate(disp) sq_distances = vector_dot(disp, disp) hist, _ = np.histogram(sq_distances, bins=sq_edges) # Normalize with average particle density (N/V) in each bin - bin_volume = (4 / 3 * np.pi * np.power(edges[1: ], 3)) \ - - (4 / 3 * np.pi * np.power(edges[:-1], 3)) + bin_volume = (4 / 3 * np.pi * np.power(edges[1:], 3)) - ( + 4 / 3 * np.pi * np.power(edges[:-1], 3) + ) n_frames = len(atoms) volume = box_volume(box).mean() density = atoms.array_length() / volume @@ -237,7 +239,7 @@ def _calculate_edges(interval, bins): if isinstance(bins, Integral): if bins < 1: raise ValueError("At least one bin is required") - return np.linspace(*interval, bins+1) + return np.linspace(*interval, bins + 1) else: # 'bins' contains edges return np.array(bins, dtype=float) diff --git a/src/biotite/structure/repair.py b/src/biotite/structure/repair.py index 8a5e1b6c2..2a567ea4a 100644 --- a/src/biotite/structure/repair.py +++ b/src/biotite/structure/repair.py @@ -8,80 +8,14 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Daniel Bauer" -__all__ = ["renumber_atom_ids", "renumber_res_ids", - "create_continuous_res_ids", "infer_elements", "create_atom_names"] +__all__ = ["create_continuous_res_ids", "infer_elements", "create_atom_names"] -from collections import Counter import warnings +from collections import Counter import numpy as np -from .atoms import AtomArray, AtomArrayStack -from .residues import get_residue_starts -from .chains import get_chain_starts - - -def renumber_atom_ids(array, start=None): - """ - Renumber the atom IDs of the given array. - - DEPRECATED. - - Parameters - ---------- - array : AtomArray or AtomArrayStack - The array to be checked. - start : int, optional - The starting index for renumbering. - The first ID in the array is taken by default. - - Returns - ------- - array : AtomArray or AtomArrayStack - The renumbered array. - """ - warnings.warn( - "'renumber_atom_ids()' is deprecated", - DeprecationWarning - ) - if "atom_id" not in array.get_annotation_categories(): - raise ValueError("The atom array must have the 'atom_id' annotation") - if start is None: - start = array.atom_id[0] - array = array.copy() - array.atom_id = np.arange(start, array.shape[-1]+1) - return array - - -def renumber_res_ids(array, start=None): - """ - Renumber the residue IDs of the given array, so that are continuous. - - DEPRECATED: Use :func:`create_continuous_res_ids()`instead. - - Parameters - ---------- - array : AtomArray or AtomArrayStack - The array to be checked. - start : int, optional - The starting index for renumbering. - The first ID in the array is taken by default. - - Returns - ------- - array : AtomArray or AtomArrayStack - The renumbered array. - """ - warnings.warn( - "'renumber_res_ids()' is deprecated, use 'create_continuous_res_ids()'", - DeprecationWarning - ) - if start is None: - start = array.res_id[0] - diff = np.diff(array.res_id) - diff[diff != 0] = 1 - new_res_ids = np.concatenate(([start], diff)).cumsum() - array = array.copy() - array.res_id = new_res_ids - return array +from biotite.structure.atoms import AtomArray, AtomArrayStack +from biotite.structure.chains import get_chain_starts +from biotite.structure.residues import get_residue_starts def create_continuous_res_ids(atoms, restart_each_chain=True): @@ -217,18 +151,131 @@ def create_atom_names(atoms): return atom_names -_elements = [elem.upper() for elem in -["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", -"Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", -"Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", -"Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", -"I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", -"Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", -"Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", -"U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", -"Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", -"Og"] +_elements = [ + elem.upper() + for elem in [ + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + "Rf", + "Db", + "Sg", + "Bh", + "Hs", + "Mt", + "Ds", + "Rg", + "Cn", + "Nh", + "Fl", + "Mc", + "Lv", + "Ts", + "Og", + ] ] + + def _guess_element(atom_name): # remove digits (1H -> H) elem = "".join([i for i in atom_name if not i.isdigit()]) @@ -237,9 +284,13 @@ def _guess_element(atom_name): return "" # Some often used elements for biomolecules - if elem.startswith("C") or elem.startswith("N") or \ - elem.startswith("O") or elem.startswith("S") or \ - elem.startswith("H"): + if ( + elem.startswith("C") + or elem.startswith("N") + or elem.startswith("O") + or elem.startswith("S") + or elem.startswith("H") + ): return elem[0] # Exactly match element abbreviations @@ -250,4 +301,4 @@ def _guess_element(atom_name): return _elements[_elements.index(elem[0])] except ValueError: warnings.warn(f"Could not infer element for '{atom_name}'") - return "" \ No newline at end of file + return "" diff --git a/src/biotite/structure/residues.py b/src/biotite/structure/residues.py index e438e35ed..61ae1712a 100644 --- a/src/biotite/structure/residues.py +++ b/src/biotite/structure/residues.py @@ -9,14 +9,27 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["get_residue_starts", "apply_residue_wise", "spread_residue_wise", - "get_residue_masks", "get_residue_starts_for", - "get_residue_positions", "get_residues", "get_residue_count", - "residue_iter"] +__all__ = [ + "get_residue_starts", + "apply_residue_wise", + "spread_residue_wise", + "get_residue_masks", + "get_residue_starts_for", + "get_residue_positions", + "get_residues", + "get_residue_count", + "residue_iter", +] import numpy as np -from .atoms import AtomArray, AtomArrayStack -from .resutil import * +from biotite.structure.segments import ( + apply_segment_wise, + get_segment_masks, + get_segment_positions, + get_segment_starts_for, + segment_iter, + spread_segment_wise, +) def get_residue_starts(array, add_exclusive_stop=False): @@ -57,23 +70,20 @@ def get_residue_starts(array, add_exclusive_stop=False): 278 292 304] """ # These mask are 'true' at indices where the value changes - chain_id_changes = (array.chain_id[1:] != array.chain_id[:-1]) - res_id_changes = (array.res_id[1:] != array.res_id[:-1] ) - ins_code_changes = (array.ins_code[1:] != array.ins_code[:-1]) - res_name_changes = (array.res_name[1:] != array.res_name[:-1]) + chain_id_changes = array.chain_id[1:] != array.chain_id[:-1] + res_id_changes = array.res_id[1:] != array.res_id[:-1] + ins_code_changes = array.ins_code[1:] != array.ins_code[:-1] + res_name_changes = array.res_name[1:] != array.res_name[:-1] # If any of these annotation arrays change, a new residue starts residue_change_mask = ( - chain_id_changes | - res_id_changes | - ins_code_changes | - res_name_changes + chain_id_changes | res_id_changes | ins_code_changes | res_name_changes ) # Convert mask to indices # Add 1, to shift the indices from the end of a residue # to the start of a new residue - residue_starts = np.where(residue_change_mask)[0] +1 + residue_starts = np.where(residue_change_mask)[0] + 1 # The first residue is not included yet -> Insert '[0]' if add_exclusive_stop: @@ -197,7 +207,7 @@ def spread_residue_wise(array, input_data): Spread secondary structure annotation to every atom of a 20 residue peptide (with 304 atoms). - >>> sse = annotate_sse(atom_array, "A") + >>> sse = annotate_sse(atom_array) >>> print(len(sse)) 20 >>> print(sse) diff --git a/src/biotite/structure/resutil.py b/src/biotite/structure/segments.py similarity index 83% rename from src/biotite/structure/resutil.py rename to src/biotite/structure/segments.py index 64c5339e1..5841346b3 100644 --- a/src/biotite/structure/resutil.py +++ b/src/biotite/structure/segments.py @@ -4,8 +4,14 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann" -__all__ = ["apply_segment_wise", "spread_segment_wise", "get_segment_masks", - "get_segment_starts_for", "get_segment_positions", "segment_iter"] +__all__ = [ + "apply_segment_wise", + "spread_segment_wise", + "get_segment_masks", + "get_segment_starts_for", + "get_segment_positions", + "segment_iter", +] import numpy as np @@ -24,9 +30,9 @@ def apply_segment_wise(starts, data, function, axis): """ # The result array processed_data = None - for i in range(len(starts)-1): - segment = data[starts[i]:starts[i+1]] - if axis == None: + for i in range(len(starts) - 1): + segment = data[starts[i] : starts[i + 1]] + if axis is None: value = function(segment) else: value = function(segment, axis=axis) @@ -39,13 +45,11 @@ def apply_segment_wise(starts, data, function, axis): # is length of segment of size 1 -> length of all IDs # (equal to atom array length) processed_data = np.zeros( - (len(starts)-1,) + value.shape, dtype=value.dtype + (len(starts) - 1,) + value.shape, dtype=value.dtype ) else: # Scalar value -> one dimensional result array - processed_data = np.zeros( - len(starts)-1, dtype=type(value) - ) + processed_data = np.zeros(len(starts) - 1, dtype=type(value)) # Write values into result arrays processed_data[i] = value return processed_data @@ -64,7 +68,7 @@ def spread_segment_wise(starts, input_data): atom array. """ output_data = np.zeros(starts[-1], dtype=input_data.dtype) - for i in range(len(starts)-1): + for i in range(len(starts) - 1): start = starts[i] stop = starts[i + 1] output_data[start:stop] = input_data[i] @@ -92,14 +96,13 @@ def get_segment_masks(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + insertion_points = np.searchsorted(starts, indices, side="right") - 1 for i, point in enumerate(insertion_points): - masks[i, starts[point] : starts[point+1]] = True - + masks[i, starts[point] : starts[point + 1]] = True + return masks @@ -125,10 +128,9 @@ def get_segment_starts_for(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + insertion_points = np.searchsorted(starts, indices, side="right") - 1 return starts[insertion_points] @@ -155,10 +157,9 @@ def get_segment_positions(starts, indices): if (indices >= length).any(): index = np.min(np.where(indices >= length)[0]) raise ValueError( - f"Index {index} is out of range for " - f"an atom array with length {length}" + f"Index {index} is out of range for " f"an atom array with length {length}" ) - + return np.searchsorted(starts, indices, side="right") - 1 @@ -174,5 +175,5 @@ def segment_iter(array, starts): Includes exclusive stop, i.e. the length of the corresponding atom array. """ - for i in range(len(starts)-1): - yield array[..., starts[i] : starts[i+1]] + for i in range(len(starts) - 1): + yield array[..., starts[i] : starts[i + 1]] diff --git a/src/biotite/structure/sequence.py b/src/biotite/structure/sequence.py index 0cad79b73..a0538e314 100644 --- a/src/biotite/structure/sequence.py +++ b/src/biotite/structure/sequence.py @@ -11,13 +11,12 @@ __all__ = ["to_sequence"] import numpy as np -from .info.misc import one_letter_code -from .info.groups import amino_acid_names, nucleotide_names -from .residues import get_residues -from .chains import get_chain_starts -from .error import BadStructureError -from ..sequence.seqtypes import ProteinSequence, NucleotideSequence - +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence +from biotite.structure.chains import get_chain_starts +from biotite.structure.error import BadStructureError +from biotite.structure.info.groups import amino_acid_names, nucleotide_names +from biotite.structure.info.misc import one_letter_code +from biotite.structure.residues import get_residues HETERO_PLACEHOLDER = "." @@ -63,9 +62,9 @@ def to_sequence(atoms, allow_hetero=False): """ sequences = [] chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) - for i in range(len(chain_start_indices)-1): + for i in range(len(chain_start_indices) - 1): start = chain_start_indices[i] - stop = chain_start_indices[i+1] + stop = chain_start_indices[i + 1] chain = atoms[start:stop] _, residues = get_residues(chain) one_letter_symbols = np.array( @@ -73,7 +72,7 @@ def to_sequence(atoms, allow_hetero=False): ) hetero_mask = one_letter_symbols == HETERO_PLACEHOLDER - aa_count = np.count_nonzero(np.isin(residues, amino_acid_names())) + aa_count = np.count_nonzero(np.isin(residues, amino_acid_names())) nuc_count = np.count_nonzero(np.isin(residues, nucleotide_names())) if aa_count == 0 and nuc_count == 0: raise BadStructureError( @@ -109,4 +108,4 @@ def to_sequence(atoms, allow_hetero=False): sequences.append(NucleotideSequence("".join(one_letter_symbols))) # Remove exclusive stop - return sequences, chain_start_indices[:-1] \ No newline at end of file + return sequences, chain_start_indices[:-1] diff --git a/src/biotite/structure/sse.py b/src/biotite/structure/sse.py index 4e2017424..ee505716c 100644 --- a/src/biotite/structure/sse.py +++ b/src/biotite/structure/sse.py @@ -12,51 +12,43 @@ __all__ = ["annotate_sse"] import numpy as np -from .celllist import CellList -from .geometry import distance, angle, dihedral -from .filter import filter_amino_acids -from .residues import get_residue_starts -from .integrity import check_res_id_continuity +from biotite.structure.celllist import CellList +from biotite.structure.filter import filter_amino_acids +from biotite.structure.geometry import angle, dihedral, distance +from biotite.structure.integrity import check_res_id_continuity +from biotite.structure.residues import get_residue_starts +_r_helix = (np.deg2rad(89 - 12), np.deg2rad(89 + 12)) +_a_helix = (np.deg2rad(50 - 20), np.deg2rad(50 + 20)) +_d2_helix = ((5.5 - 0.5), (5.5 + 0.5)) # Not used in the algorithm description +_d3_helix = ((5.3 - 0.5), (5.3 + 0.5)) +_d4_helix = ((6.4 - 0.6), (6.4 + 0.6)) -_r_helix = (np.deg2rad(89-12), np.deg2rad(89+12)) -_a_helix = (np.deg2rad(50-20), np.deg2rad(50+20)) -_d2_helix = ((5.5-0.5), (5.5+0.5)) # Not used in the algorithm description -_d3_helix = ((5.3-0.5), (5.3+0.5)) -_d4_helix = ((6.4-0.6), (6.4+0.6)) +_r_strand = (np.deg2rad(124 - 14), np.deg2rad(124 + 14)) +_a_strand = (np.deg2rad(-180), np.deg2rad(-125), np.deg2rad(145), np.deg2rad(180)) +_d2_strand = ((6.7 - 0.6), (6.7 + 0.6)) +_d3_strand = ((9.9 - 0.9), (9.9 + 0.9)) +_d4_strand = ((12.4 - 1.1), (12.4 + 1.1)) -_r_strand = (np.deg2rad(124-14), np.deg2rad(124+14)) -_a_strand = (np.deg2rad(-180), np.deg2rad(-125), - np.deg2rad(145), np.deg2rad(180)) -_d2_strand = ((6.7-0.6), (6.7+0.6)) -_d3_strand = ((9.9-0.9), (9.9+0.9)) -_d4_strand = ((12.4-1.1), (12.4+1.1)) - -def annotate_sse(atom_array, chain_id=None): +def annotate_sse(atom_array): r""" Calculate the secondary structure elements (SSEs) of a peptide chain based on the `P-SEA` algorithm. :footcite:`Labesse1997` - + The annotation is based CA coordinates only, specifically distances and dihedral angles. Discontinuities between chains are detected by residue ID. - + Parameters ---------- atom_array : AtomArray The atom array to annotate for. Non-peptide residues are also allowed and obtain a ``''`` SSE. - chain_id : str, optional - The peptide atoms belonging to this chain are filtered and - annotated. - DEPRECATED: By now multiple chains can be annotated at once. - To annotate only a certain chain, filter the `atom_array` before - giving it as input to this function. - - + + Returns ------- sse : ndarray @@ -67,37 +59,30 @@ def annotate_sse(atom_array, chain_id=None): :math:`{\beta}`-strand/sheet, ``'c'`` means coil. ``''`` indicates that a residue is not an amino acid or it comprises no ``CA`` atom. - + Notes ----- Although this function is based on the original `P-SEA` algorithm, there are deviations compared to the official `P-SEA` software in some cases. Do not rely on getting the exact same results. - + References ---------- .. footbibliography:: - + Examples -------- - + SSE of PDB 1L2Y: - - >>> sse = annotate_sse(atom_array, "A") + + >>> sse = annotate_sse(atom_array) >>> print(sse) ['c' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c' 'c'] - - """ - if chain_id is not None: - # Filter all CA atoms in the relevant chain - atom_array = atom_array[ - (atom_array.chain_id == chain_id) & filter_amino_acids(atom_array) - ] - + """ residue_starts = get_residue_starts(atom_array) # Sort CA coord into the coord array at the respective residue index # If a residue has no CA, e.g. because it is not an amino acid, @@ -106,9 +91,9 @@ def annotate_sse(atom_array, chain_id=None): ca_indices = np.where( filter_amino_acids(atom_array) & (atom_array.atom_name == "CA") )[0] - ca_coord[ - np.searchsorted(residue_starts, ca_indices, "right") - 1 - ] = atom_array.coord[ca_indices] + ca_coord[np.searchsorted(residue_starts, ca_indices, "right") - 1] = ( + atom_array.coord[ca_indices] + ) if len(ca_coord) <= 5: # The number of atoms is too small # @@ -125,12 +110,12 @@ def annotate_sse(atom_array, chain_id=None): # purpose of geometric measurements # -> the distances/angles spanning discontinuities are NaN discont_indices = check_res_id_continuity(atom_array) - discont_res_indices = np.searchsorted( - residue_starts, discont_indices, "right" - ) - 1 + discont_res_indices = np.searchsorted(residue_starts, discont_indices, "right") - 1 ca_coord = np.insert( - ca_coord, discont_res_indices, - np.full((len(discont_res_indices),3), np.nan), axis=0 + ca_coord, + discont_res_indices, + np.full((len(discont_res_indices), 3), np.nan), + axis=0, ) # Later the SSE for virtual residues are removed again # via this mask @@ -139,73 +124,74 @@ def annotate_sse(atom_array, chain_id=None): length = len(ca_coord) - # The distances and angles are not defined for the entire interval, # therefore the indices do not have the full range # Values that are not defined are NaN d2i = np.full(length, np.nan) d3i = np.full(length, np.nan) d4i = np.full(length, np.nan) - ri = np.full(length, np.nan) - ai = np.full(length, np.nan) - - d2i[1 : length-1] = distance(ca_coord[0 : length-2], ca_coord[2 : length]) - d3i[1 : length-2] = distance(ca_coord[0 : length-3], ca_coord[3 : length]) - d4i[1 : length-3] = distance(ca_coord[0 : length-4], ca_coord[4 : length]) - ri[1 : length-1] = angle( - ca_coord[0 : length-2], - ca_coord[1 : length-1], - ca_coord[2 : length] + ri = np.full(length, np.nan) + ai = np.full(length, np.nan) + + d2i[1 : length - 1] = distance(ca_coord[0 : length - 2], ca_coord[2:length]) + d3i[1 : length - 2] = distance(ca_coord[0 : length - 3], ca_coord[3:length]) + d4i[1 : length - 3] = distance(ca_coord[0 : length - 4], ca_coord[4:length]) + ri[1 : length - 1] = angle( + ca_coord[0 : length - 2], ca_coord[1 : length - 1], ca_coord[2:length] ) - ai[1 : length-2] = dihedral( - ca_coord[0 : length-3], - ca_coord[1 : length-2], - ca_coord[2 : length-1], - ca_coord[3 : length-0] + ai[1 : length - 2] = dihedral( + ca_coord[0 : length - 3], + ca_coord[1 : length - 2], + ca_coord[2 : length - 1], + ca_coord[3 : length - 0], ) - + # Find CA that meet criteria for potential helices and strands - relaxed_helix = ( - (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1]) - ) | ( - (ri >= _r_helix[0] ) & ( ri <= _r_helix[1]) + relaxed_helix = ((d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1])) | ( + (ri >= _r_helix[0]) & (ri <= _r_helix[1]) ) strict_helix = ( - (d3i >= _d3_helix[0]) & (d3i <= _d3_helix[1]) & - (d4i >= _d4_helix[0]) & (d4i <= _d4_helix[1]) + (d3i >= _d3_helix[0]) + & (d3i <= _d3_helix[1]) + & (d4i >= _d4_helix[0]) + & (d4i <= _d4_helix[1]) ) | ( - (ri >= _r_helix[0] ) & ( ri <= _r_helix[1]) & - (ai >= _a_helix[0] ) & ( ai <= _a_helix[1]) + (ri >= _r_helix[0]) + & (ri <= _r_helix[1]) + & (ai >= _a_helix[0]) + & (ai <= _a_helix[1]) ) relaxed_strand = (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1]) strict_strand = ( - (d2i >= _d2_strand[0]) & (d2i <= _d2_strand[1]) & - (d3i >= _d3_strand[0]) & (d3i <= _d3_strand[1]) & - (d4i >= _d4_strand[0]) & (d4i <= _d4_strand[1]) + (d2i >= _d2_strand[0]) + & (d2i <= _d2_strand[1]) + & (d3i >= _d3_strand[0]) + & (d3i <= _d3_strand[1]) + & (d4i >= _d4_strand[0]) + & (d4i <= _d4_strand[1]) ) | ( - (ri >= _r_strand[0] ) & ( ri <= _r_strand[1]) & - ( + (ri >= _r_strand[0]) + & (ri <= _r_strand[1]) + & ( # Account for periodic boundary of dihedral angle - ((ai >= _a_strand[0] ) & ( ai <= _a_strand[1])) | - ((ai >= _a_strand[2] ) & ( ai <= _a_strand[3])) + ((ai >= _a_strand[0]) & (ai <= _a_strand[1])) + | ((ai >= _a_strand[2]) & (ai <= _a_strand[3])) ) ) - helix_mask = _mask_consecutive(strict_helix, 5) helix_mask = _extend_region(helix_mask, relaxed_helix) - + strand_mask = _mask_consecutive(strict_strand, 4) short_strand_mask = _mask_regions_with_contacts( ca_coord, _mask_consecutive(strict_strand, 3), - min_contacts=5, min_distance=4.2, max_distance=5.2 - ) - strand_mask = _extend_region( - strand_mask | short_strand_mask, relaxed_strand + min_contacts=5, + min_distance=4.2, + max_distance=5.2, ) - + strand_mask = _extend_region(strand_mask | short_strand_mask, relaxed_strand) sse = np.full(length, "c", dtype="U1") sse[helix_mask] = "a" @@ -215,7 +201,7 @@ def annotate_sse(atom_array, chain_id=None): sse[np.isnan(ca_coord).any(axis=-1)] = "" # Remove SSE for virtual atoms and return return sse[no_virtual_mask] - + def _mask_consecutive(mask, number): """ @@ -228,17 +214,17 @@ def _mask_consecutive(mask, number): # if it and the following `number-1` elements are True # The elements `mask[-(number-1):]` cannot have the sufficient count # by this definition, as they are at the end of the array - counts = np.zeros(len(mask) - (number-1), dtype=int) + counts = np.zeros(len(mask) - (number - 1), dtype=int) for i in range(number): counts[mask[i : i + len(counts)]] += 1 - consecutive_seed = (counts == number) - + consecutive_seed = counts == number + # Not only that element, but also the # following `number-1` elements are in a consecutive region consecutive_mask = np.zeros(len(mask), dtype=bool) for i in range(number): consecutive_mask[i : i + len(consecutive_seed)] |= consecutive_seed - + return consecutive_mask @@ -253,7 +239,7 @@ def _extend_region(base_condition_mask, extension_condition_mask): # Prepend absent region to the start to capture the event, # that the first element is already the start of a region region_change_mask = np.diff(np.append([False], base_condition_mask)) - + # These masks point to the first `False` element # left and right of a 'True' region # The left end is the element before the first element of a 'True' region @@ -262,7 +248,7 @@ def _extend_region(base_condition_mask, extension_condition_mask): left_end_mask = np.append(left_end_mask[1:], [False]) # The right end is first element of a 'False' region right_end_mask = region_change_mask & ~base_condition_mask - + # The 'base_condition_mask' gets additional 'True' elements # at left or right ends, which meet the extension criterion return base_condition_mask | ( @@ -270,8 +256,9 @@ def _extend_region(base_condition_mask, extension_condition_mask): ) -def _mask_regions_with_contacts(coord, candidate_mask, - min_contacts, min_distance, max_distance): +def _mask_regions_with_contacts( + coord, candidate_mask, min_contacts, min_distance, max_distance +): """ Mask regions of `candidate_mask` that have at least `min_contacts` contacts with `coord` in the range `min_distance` to `max_distance`. @@ -281,47 +268,41 @@ def _mask_regions_with_contacts(coord, candidate_mask, # No potential contacts -> no contacts # -> no residue can satisfy 'min_contacts' return np.zeros(len(candidate_mask), dtype=bool) - - cell_list = CellList( - potential_contact_coord, max_distance - ) + + cell_list = CellList(potential_contact_coord, max_distance) # For each candidate position, # get all contacts within maximum distance all_within_max_dist_indices = cell_list.get_atoms( coord[candidate_mask], max_distance ) - + contacts = np.zeros(len(coord), dtype=int) for i, atom_index in enumerate(np.where(candidate_mask)[0]): within_max_dist_indices = all_within_max_dist_indices[i] # Remove padding values - within_max_dist_indices = within_max_dist_indices[ - within_max_dist_indices != -1 - ] - # Now count all contacts within maximum distance + within_max_dist_indices = within_max_dist_indices[within_max_dist_indices != -1] + # Now count all contacts within maximum distance # that also satisfy the minimum distance contacts[atom_index] = np.count_nonzero( distance( - coord[atom_index], - potential_contact_coord[within_max_dist_indices] - ) > min_distance + coord[atom_index], potential_contact_coord[within_max_dist_indices] + ) + > min_distance ) - + # Count the number of contacts per region # These indices mark the start of either a 'True' or 'False' region # Prepend absent region to the start to capture the event, # that the first element is already the start of a region - region_change_indices = np.where( - np.diff(np.append([False], candidate_mask)) - )[0] + region_change_indices = np.where(np.diff(np.append([False], candidate_mask)))[0] # Add exclusive stop region_change_indices = np.append(region_change_indices, [len(coord)]) output_mask = np.zeros(len(candidate_mask), dtype=bool) for i in range(len(region_change_indices) - 1): start = region_change_indices[i] - stop = region_change_indices[i+1] - total_contacts = np.sum(contacts[start : stop]) + stop = region_change_indices[i + 1] + total_contacts = np.sum(contacts[start:stop]) if total_contacts >= min_contacts: - output_mask[start : stop] = True - - return output_mask \ No newline at end of file + output_mask[start:stop] = True + + return output_mask diff --git a/src/biotite/structure/superimpose.py b/src/biotite/structure/superimpose.py index 6c7449e25..d06d0abdb 100755 --- a/src/biotite/structure/superimpose.py +++ b/src/biotite/structure/superimpose.py @@ -8,19 +8,22 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann, Claude J. Rogers" -__all__ = ["superimpose", "superimpose_homologs", - "superimpose_without_outliers", - "AffineTransformation", "superimpose_apply"] +__all__ = [ + "superimpose", + "superimpose_homologs", + "superimpose_without_outliers", + "AffineTransformation", +] import numpy as np -from .atoms import coord -from .geometry import centroid, distance -from .filter import filter_amino_acids, filter_nucleotides -from .sequence import to_sequence -from ..sequence.alphabet import common_alphabet -from ..sequence.seqtypes import ProteinSequence -from ..sequence.align import SubstitutionMatrix, align_optimal, get_codes +from biotite.sequence.align import SubstitutionMatrix, align_optimal, get_codes +from biotite.sequence.alphabet import common_alphabet +from biotite.sequence.seqtypes import ProteinSequence +from biotite.structure.atoms import coord +from biotite.structure.filter import filter_amino_acids, filter_nucleotides +from biotite.structure.geometry import centroid, distance +from biotite.structure.sequence import to_sequence class AffineTransformation: @@ -45,12 +48,12 @@ class AffineTransformation: The dimensions are always expanded to *(m,3)* or *(m,3,3)*, respectively. """ + def __init__(self, center_translation, rotation, target_translation): self.center_translation = _expand_dims(center_translation, 2) self.rotation = _expand_dims(rotation, 3) self.target_translation = _expand_dims(target_translation, 2) - def apply(self, atoms): """ Apply this transformation on the given structure. @@ -118,7 +121,6 @@ def apply(self, atoms): superimposed.coord = superimposed_coord return superimposed - def as_matrix(self): """ Get the translations and rotation as a combined 4x4 @@ -316,16 +318,19 @@ def superimpose(fixed, mobile, atom_mask=None): mob_centered_filtered = mob_filtered - mob_centroid[:, np.newaxis, :] fix_centered_filtered = fix_filtered - fix_centroid[:, np.newaxis, :] - rotation = _get_rotation_matrices( - fix_centered_filtered, mob_centered_filtered - ) + rotation = _get_rotation_matrices(fix_centered_filtered, mob_centered_filtered) transform = AffineTransformation(-mob_centroid, rotation, fix_centroid) return transform.apply(mobile), transform -def superimpose_without_outliers(fixed, mobile, min_anchors=3, - max_iterations=10, quantiles=(0.25, 0.75), - outlier_threshold=1.5): +def superimpose_without_outliers( + fixed, + mobile, + min_anchors=3, + max_iterations=10, + quantiles=(0.25, 0.75), + outlier_threshold=1.5, +): r""" Superimpose structures onto a fixed structure, ignoring conformational outliers. @@ -458,8 +463,9 @@ def superimpose_without_outliers(fixed, mobile, min_anchors=3, return transform.apply(mobile), transform, anchor_indices -def superimpose_homologs(fixed, mobile, substitution_matrix=None, - gap_penalty=-10, min_anchors=3, **kwargs): +def superimpose_homologs( + fixed, mobile, substitution_matrix=None, gap_penalty=-10, min_anchors=3, **kwargs +): r""" Superimpose one protein or nucleotide chain onto another one, considering sequence differences and conformational outliers. @@ -530,8 +536,8 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None, fixed_anchor_indices = _get_backbone_anchor_indices(fixed) mobile_anchor_indices = _get_backbone_anchor_indices(mobile) if ( - len(fixed_anchor_indices) < min_anchors or - len(mobile_anchor_indices) < min_anchors + len(fixed_anchor_indices) < min_anchors + or len(mobile_anchor_indices) < min_anchors ): raise ValueError( "Structures have too few CA atoms for required number of anchors" @@ -562,7 +568,7 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None, fixed[..., fixed_anchor_indices], mobile[..., mobile_anchor_indices], min_anchors, - **kwargs + **kwargs, ) fixed_anchor_indices = fixed_anchor_indices[selected_anchor_indices] mobile_anchor_indices = mobile_anchor_indices[selected_anchor_indices] @@ -575,54 +581,18 @@ def superimpose_homologs(fixed, mobile, substitution_matrix=None, ) -def superimpose_apply(atoms, transformation): - """ - Superimpose structures using a given :class:`AffineTransformation`. - - The :class:`AffineTransformation` can be obtained by prior - superimposition. - - DEPRECATED: Use :func:`AffineTransformation.apply()` instead. - - Parameters - ---------- - atoms : AtomArray or ndarray, shape(n,), dtype=float - The structure to apply the transformation on. - Alternatively coordinates can be given. - transformation: AffineTransformation - The transformation, obtained by :func:`superimpose()`. - - Returns - ------- - fitted : AtomArray or AtomArrayStack - A copy of the `atoms` structure, - with transformations applied. - Only coordinates are returned, if coordinates were given in - `atoms`. - - See Also - -------- - superimpose - """ - return transformation.apply(atoms) - - def _reshape_to_3d(coord): """ Reshape the coordinate array to 3D, if it is 2D. """ if coord.ndim < 2: - raise ValueError( - "Coordinates must be at least two-dimensional" - ) + raise ValueError("Coordinates must be at least two-dimensional") if coord.ndim == 2: return coord[np.newaxis, ...] elif coord.ndim == 3: return coord else: - raise ValueError( - "Coordinates must be at most three-dimensional" - ) + raise ValueError("Coordinates must be at most three-dimensional") def _get_rotation_matrices(fixed, mobile): @@ -634,10 +604,10 @@ def _get_rotation_matrices(fixed, mobile): Both sets of coordinates must already be centered at origin. """ # Calculate cross-covariance matrices - cov = np.sum(fixed[:,:,:,np.newaxis] * mobile[:,:,np.newaxis,:], axis=1) + cov = np.sum(fixed[:, :, :, np.newaxis] * mobile[:, :, np.newaxis, :], axis=1) v, s, w = np.linalg.svd(cov) # Remove possibility of reflected atom coordinates - reflected_mask = (np.linalg.det(v) * np.linalg.det(w) < 0) + reflected_mask = np.linalg.det(v) * np.linalg.det(w) < 0 v[reflected_mask, :, -1] *= -1 matrices = np.matmul(v, w) return matrices @@ -649,11 +619,7 @@ def _multi_matmul(matrices, vectors): with m x n vectors. """ return np.transpose( - np.matmul( - matrices, - np.transpose(vectors, axes=(0, 2, 1)) - ), - axes=(0, 2, 1) + np.matmul(matrices, np.transpose(vectors, axes=(0, 2, 1))), axes=(0, 2, 1) ) @@ -663,8 +629,8 @@ def _get_backbone_anchor_indices(atoms): nucleotide and return their indices. """ return np.where( - ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA")) | - ((filter_nucleotides(atoms)) & (atoms.atom_name == "P")) + ((filter_amino_acids(atoms)) & (atoms.atom_name == "CA")) + | ((filter_nucleotides(atoms)) & (atoms.atom_name == "P")) )[0] @@ -717,11 +683,7 @@ def _find_matching_anchors( def _to_sequence(atoms): sequences, _ = to_sequence(atoms, allow_hetero=True) if len(sequences) == 0: - raise ValueError( - "Structure does not contain any amino acids or nucleotides" - ) + raise ValueError("Structure does not contain any amino acids or nucleotides") if len(sequences) > 1: - raise ValueError( - "Structure contains multiple chains, but only one is allowed" - ) - return sequences[0] \ No newline at end of file + raise ValueError("Structure contains multiple chains, but only one is allowed") + return sequences[0] diff --git a/src/biotite/structure/transform.py b/src/biotite/structure/transform.py index 0ab281c8d..c094b7730 100644 --- a/src/biotite/structure/transform.py +++ b/src/biotite/structure/transform.py @@ -9,20 +9,25 @@ __name__ = "biotite.structure" __author__ = "Patrick Kunzmann", "Claude J. Rogers" -__all__ = ["translate", "rotate", "rotate_centered", "rotate_about_axis", - "orient_principal_components", "align_vectors"] +__all__ = [ + "translate", + "rotate", + "rotate_centered", + "rotate_about_axis", + "orient_principal_components", + "align_vectors", +] import numpy as np -from .geometry import centroid -from .error import BadStructureError -from .atoms import Atom, AtomArray, AtomArrayStack, coord -from .util import norm_vector, vector_dot, matrix_rotate +from biotite.structure.atoms import Atom, AtomArray, AtomArrayStack, coord +from biotite.structure.geometry import centroid +from biotite.structure.util import matrix_rotate, norm_vector, vector_dot def translate(atoms, vector): """ Translate the given atoms or coordinates by a given vector. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -30,7 +35,7 @@ def translate(atoms, vector): The coordinates can be directly provided as :class:`ndarray`. vector: array-like, shape=(3,) or shape=(n,3) or shape=(m,n,3) The translation vector :math:`(x, y, z)`. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -39,7 +44,7 @@ def translate(atoms, vector): """ positions = coord(atoms).copy() vector = np.asarray(vector) - + if vector.shape[-1] != 3: raise ValueError("Translation vector must contain 3 coordinates") positions += vector @@ -50,10 +55,10 @@ def rotate(atoms, angles): """ Rotate the given atoms or coordinates about the *x*, *y* and *z* axes by given angles. - + The rotations are centered at the origin and are performed sequentially in the order *x*, *y*, *z*. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -61,13 +66,13 @@ def rotate(atoms, angles): The coordinates can be directly provided as :class:`ndarray`. angles: array-like, length=3 The rotation angles in radians around *x*, *y* and *z*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated by the given angles. - + See Also -------- rotate_centered @@ -82,27 +87,39 @@ def rotate(atoms, angles): >>> print(rotated) [1.225e-16 2.000e+00 0.000e+00] """ - from numpy import sin, cos + from numpy import cos, sin # Check if "angles" contains 3 angles for all dimensions if len(angles) != 3: raise ValueError("Translation vector must be container of length 3") # Create rotation matrices for all 3 dimensions - rot_x = np.array([[ 1, 0, 0 ], - [ 0, cos(angles[0]), -sin(angles[0]) ], - [ 0, sin(angles[0]), cos(angles[0]) ]]) - - rot_y = np.array([[ cos(angles[1]), 0, sin(angles[1]) ], - [ 0, 1, 0 ], - [ -sin(angles[1]), 0, cos(angles[1]) ]]) - - rot_z = np.array([[ cos(angles[2]), -sin(angles[2]), 0 ], - [ sin(angles[2]), cos(angles[2]), 0 ], - [ 0, 0, 1 ]]) - + rot_x = np.array( + [ + [1, 0, 0], + [0, cos(angles[0]), -sin(angles[0])], + [0, sin(angles[0]), cos(angles[0])], + ] + ) + + rot_y = np.array( + [ + [cos(angles[1]), 0, sin(angles[1])], + [0, 1, 0], + [-sin(angles[1]), 0, cos(angles[1])], + ] + ) + + rot_z = np.array( + [ + [cos(angles[2]), -sin(angles[2]), 0], + [sin(angles[2]), cos(angles[2]), 0], + [0, 0, 1], + ] + ) + positions = coord(atoms).copy() positions = matrix_rotate(positions, rot_z @ rot_y @ rot_x) - + return _put_back(atoms, positions) @@ -110,10 +127,10 @@ def rotate_centered(atoms, angles): """ Rotate the given atoms or coordinates about the *x*, *y* and *z* axes by given angles. - + The rotations are centered at the centroid of the corresponding structure and are performed sequentially in the order *x*, *y*, *z*. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -121,13 +138,13 @@ def rotate_centered(atoms, angles): The coordinates can be directly provided as :class:`ndarray`. angles: array-like, length=3 The rotation angles in radians around axes *x*, *y* and *z*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated by the given angles. - + See Also -------- rotate @@ -136,7 +153,7 @@ def rotate_centered(atoms, angles): if len(coord(atoms).shape) == 1: # Single value -> centered rotation does not change coordinates return atoms.copy() - + # Rotation around centroid requires moving centroid to origin center = coord(centroid(atoms)) # 'centroid()' removes the second last dimesion @@ -152,7 +169,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): """ Rotate the given atoms or coordinates about a given axis by a given angle. - + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -167,13 +184,13 @@ def rotate_about_axis(atoms, axis, angle, support=None): An optional support vector for the rotation axis, i.e. the center of the rotation. By default, the center of the rotation is at *(0,0,0)*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates, rotated about the given axis. - + See Also -------- rotate @@ -194,7 +211,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): # Transform coordinates # so that the axis support vector is at (0,0,0) positions -= np.asarray(support) - + # Normalize axis axis = np.asarray(axis, dtype=np.float32).copy() if np.linalg.norm(axis) == 0: @@ -205,16 +222,30 @@ def rotate_about_axis(atoms, axis, angle, support=None): sin_a = np.sin(angle) cos_a = np.cos(angle) icos_a = 1 - cos_a - x = axis[...,0] - y = axis[...,1] - z = axis[...,2] + x = axis[..., 0] + y = axis[..., 1] + z = axis[..., 2] # Rotation matrix is taken from # https://en.wikipedia.org/wiki/Rotation_matrix#Rotation_matrix_from_axis_and_angle - rot_matrix = np.array([ - [ cos_a + icos_a*x**2, icos_a*x*y - z*sin_a, icos_a*x*z + y*sin_a], - [icos_a*x*y + z*sin_a, cos_a + icos_a*y**2, icos_a*y*z - x*sin_a], - [icos_a*x*z - y*sin_a, icos_a*y*z + x*sin_a, cos_a + icos_a*z**2] - ]) + rot_matrix = np.array( + [ + [ + cos_a + icos_a * x**2, + icos_a * x * y - z * sin_a, + icos_a * x * z + y * sin_a, + ], + [ + icos_a * x * y + z * sin_a, + cos_a + icos_a * y**2, + icos_a * y * z - x * sin_a, + ], + [ + icos_a * x * z - y * sin_a, + icos_a * y * z + x * sin_a, + cos_a + icos_a * z**2, + ], + ] + ) # For proper rotation reshape into a maximum of 2 dimensions orig_ndim = positions.ndim @@ -230,7 +261,7 @@ def rotate_about_axis(atoms, axis, angle, support=None): if support is not None: # Transform coordinates back to original support vector position positions += np.asarray(support) - + return _put_back(atoms, positions) @@ -298,9 +329,7 @@ def orient_principal_components(atoms, order=None): else: order = np.asarray(order, dtype=int) if order.shape != (3,): - raise ValueError( - f"Expected order to have shape (3,), not {order.shape}" - ) + raise ValueError(f"Expected order to have shape (3,), not {order.shape}") if not (np.sort(order) == np.arange(3)).all(): raise ValueError("Expected order to contain [0, 1, 2].") @@ -333,8 +362,13 @@ def orient_principal_components(atoms, order=None): return _put_back(atoms, centered) -def align_vectors(atoms, origin_direction, target_direction, - origin_position=None, target_position=None): +def align_vectors( + atoms, + origin_direction, + target_direction, + origin_position=None, + target_position=None, +): """ Apply a transformation to atoms or coordinates, that would transfer a origin vector to a target vector. @@ -345,8 +379,8 @@ def align_vectors(atoms, origin_direction, target_direction, This means, that the application of the transformation on the origin vector would give the target vector. Then the same transformation is applied to the given - atoms/coordinates. - + atoms/coordinates. + Parameters ---------- atoms : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) @@ -359,13 +393,13 @@ def align_vectors(atoms, origin_direction, target_direction, origin_position, target_position : array-like, length=3, optional Optional support vectors for the origin or target, respectively. By default, origin and target start at *(0,0,0)*. - + Returns ------- transformed : Atom or AtomArray or AtomArrayStack or ndarray, shape=(3,) or shape=(n,3) or shape=(m,n,3) A copy of the input atoms or coordinates with the applied transformation. - + See Also -------- rotate @@ -428,12 +462,8 @@ def align_vectors(atoms, origin_direction, target_direction, A 2 LEU HD22 H -6.255 7.544 -2.657 A 2 LEU HD23 H -5.592 8.445 -1.281 """ - origin_direction = np.asarray( - origin_direction, dtype=np.float32 - ).squeeze() - target_direction = np.asarray( - target_direction, dtype=np.float32 - ).squeeze() + origin_direction = np.asarray(origin_direction, dtype=np.float32).squeeze() + target_direction = np.asarray(target_direction, dtype=np.float32).squeeze() # check that original and target direction are vectors of shape (3,) if origin_direction.shape != (3,): raise ValueError( @@ -449,9 +479,9 @@ def align_vectors(atoms, origin_direction, target_direction, raise ValueError("Length of the origin vector is 0") if np.linalg.norm(target_direction) == 0: raise ValueError("Length of the target vector is 0") - if origin_position is not None: + if origin_position is not None: origin_position = np.asarray(origin_position, dtype=np.float32) - if target_position is not None: + if target_position is not None: target_position = np.asarray(target_position, dtype=np.float32) positions = coord(atoms).copy() @@ -459,7 +489,7 @@ def align_vectors(atoms, origin_direction, target_direction, # Transform coordinates # so that the position of the origin vector is at (0,0,0) positions -= origin_position - + # Normalize direction vectors origin_direction = origin_direction.copy() norm_vector(origin_direction) @@ -468,11 +498,7 @@ def align_vectors(atoms, origin_direction, target_direction, # Formula is taken from # https://math.stackexchange.com/questions/180418/calculate-rotation-matrix-to-align-vector-a-to-vector-b-in-3d/476311#476311 vx, vy, vz = np.cross(origin_direction, target_direction) - v_c = np.array([ - [ 0, -vz, vy], - [ vz, 0, -vx], - [-vy, vx, 0] - ], dtype=float) + v_c = np.array([[0, -vz, vy], [vz, 0, -vx], [-vy, vx, 0]], dtype=float) cos_a = vector_dot(origin_direction, target_direction) if np.all(cos_a == -1): raise ValueError( @@ -480,9 +506,9 @@ def align_vectors(atoms, origin_direction, target_direction, "cannot calculate rotation matrix" ) rot_matrix = np.identity(3) + v_c + (v_c @ v_c) / (1 + cos_a) - + positions = matrix_rotate(positions, rot_matrix) - + if target_position is not None: # Transform coordinates to position of the target vector positions += target_position @@ -501,4 +527,4 @@ def _put_back(input_atoms, transformed): moved_atoms.coord = transformed return moved_atoms else: - return transformed \ No newline at end of file + return transformed diff --git a/src/biotite/structure/util.py b/src/biotite/structure/util.py index 68f13f20d..cabbdc8f5 100644 --- a/src/biotite/structure/util.py +++ b/src/biotite/structure/util.py @@ -11,31 +11,30 @@ __all__ = ["vector_dot", "norm_vector", "distance", "matrix_rotate"] import numpy as np -from .atoms import Atom, array -def vector_dot(v1,v2): +def vector_dot(v1, v2): """ Calculate vector dot product of two vectors. - + Parameters ---------- v1,v2 : ndarray The arrays to calculate the product from. The vectors are represented by the last axis. - + Returns ------- product : float or ndarray Scalar product over the last dimension of the arrays. """ - return (v1*v2).sum(axis=-1) + return (v1 * v2).sum(axis=-1) def norm_vector(v): """ Normalise a vector. - + Parameters ---------- v : ndarray @@ -47,25 +46,25 @@ def norm_vector(v): v /= factor[..., np.newaxis] else: v /= factor - -def distance(v1,v2): + +def distance(v1, v2): """ Calculate the distance between two position vectors. - + Parameters ---------- v1,v2 : ndarray The arrays to calculate the product from. The vectors are represented by the last axis. - + Returns ------- product : float or ndarray Vector distance over the last dimension of the array. """ dif = v1 - v2 - return np.sqrt((dif*dif).sum(axis=-1)) + return np.sqrt((dif * dif).sum(axis=-1)) def matrix_rotate(v, matrix): @@ -78,7 +77,7 @@ def matrix_rotate(v, matrix): The coordinates to rotate. matrix : ndarray The rotation matrix. - + Returns ------- rotated : ndarray @@ -95,4 +94,3 @@ def matrix_rotate(v, matrix): if orig_ndim > 2: v = v.reshape(*orig_shape) return v - diff --git a/src/biotite/temp.py b/src/biotite/temp.py deleted file mode 100644 index 2d40ab890..000000000 --- a/src/biotite/temp.py +++ /dev/null @@ -1,86 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__name__ = "biotite" -__author__ = "Patrick Kunzmann" -__all__ = ["temp_file", "temp_dir"] - -import shutil -import atexit -import os -import tempfile -import warnings - - -_temp_dir = "" - -def _create_temp_dir(): - global _temp_dir - if _temp_dir == "": - cwd = os.getcwd() - _temp_dir = os.path.join(cwd, ".biotitetemp") - if not os.path.isdir(_temp_dir): - os.makedirs(_temp_dir) - atexit.register(_delete_temp) - - -def _delete_temp(): - global _temp_dir - # Condition only for savety reasons - if ".biotitetemp" in _temp_dir: - shutil.rmtree(_temp_dir) - - -def temp_file(suffix=""): - """ - Get a file path to a temporary file. - - All temporary files will be deleted after script execution. - - DEPRECATED: Please use the :mod:`tempfile` module from the - standard library. - - Parameters - ---------- - suffix : str - Suffix of the file. - By default no suffix will be appended. - - Returns - ------- - temp_file_name : str - a file name in the temporary directory. - """ - global _temp_dir - warnings.warn( - "Please use the :mod:`tempfile` module from the standard library", - DeprecationWarning - ) - _create_temp_dir() - if suffix != "" and not suffix.startswith("."): - suffix = "." + suffix - return tempfile.mktemp(suffix=suffix, dir=_temp_dir) - - -def temp_dir(): - """ - Get the temporary directory path. - - The temporary directory will be deleted after script execution. - - DEPRECATED: Please use the :mod:`tempfile` module from the - standard library. - - Returns - ------- - temp_dir : str - Path of the temporary directory. - """ - global _temp_dir - warnings.warn( - "Please use the :mod:`tempfile` module from the standard library", - DeprecationWarning - ) - _create_temp_dir() - return _temp_dir \ No newline at end of file diff --git a/src/biotite/visualize.py b/src/biotite/visualize.py index a2839c6a6..eb7444c54 100644 --- a/src/biotite/visualize.py +++ b/src/biotite/visualize.py @@ -6,25 +6,25 @@ __author__ = "Patrick Kunzmann" __all__ = ["colors", "set_font_size_in_coord", "AdaptiveFancyArrow"] -import abc from collections import OrderedDict import numpy as np from numpy.linalg import norm - # Biotite themed colors -colors = OrderedDict([ - ("brightorange" , "#ffb569ff"), - ("lightorange" , "#ff982dff"), - ("orange" , "#ff8405ff"), - ("dimorange" , "#dc7000ff"), - ("darkorange" , "#b45c00ff"), - ("brightgreen" , "#98e97fff"), - ("lightgreen" , "#6fe04cff"), - ("green" , "#52da2aff"), - ("dimgreen" , "#45bc20ff"), - ("darkgreen" , "#389a1aff"), -]) +colors = OrderedDict( + [ + ("brightorange", "#ffb569ff"), + ("lightorange", "#ff982dff"), + ("orange", "#ff8405ff"), + ("dimorange", "#dc7000ff"), + ("darkorange", "#b45c00ff"), + ("brightgreen", "#98e97fff"), + ("lightgreen", "#6fe04cff"), + ("green", "#52da2aff"), + ("dimgreen", "#45bc20ff"), + ("darkgreen", "#389a1aff"), + ] +) def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"): @@ -75,8 +75,8 @@ def set_font_size_in_coord(text, width=None, height=None, mode="unlocked"): This behavior is not equal for all initial font sizes (in 'pt'), the boundaries for an initial size of 1 'pt' seem to be most exact. """ - from matplotlib.transforms import Bbox, Affine2D from matplotlib.patheffects import AbstractPathEffect + from matplotlib.transforms import Affine2D, Bbox class TextScaler(AbstractPathEffect): def __init__(self, text, width, height, mode): @@ -85,11 +85,11 @@ def __init__(self, text, width, height, mode): self._width = width self._height = height - def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): + def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): # noqa: N803 ax = self._text.axes try: renderer = ax.get_figure().canvas.get_renderer() - except: + except Exception: # Use cached renderer for backends, where # `get_renderer()` is not available # Based on the strategy from `Text.get_window_extent()` @@ -127,25 +127,21 @@ def draw_path(self, renderer, gc, tpath, affine, rgbFace=None): if mode in ["unlocked", "minimum", "maximum"]: if width is None or height is None: - raise TypeError( - f"Width and height must be set in '{mode}' mode" - ) + raise TypeError(f"Width and height must be set in '{mode}' mode") elif mode == "proportional": - if not (width is None and height is not None) or \ - not (height is None and width is not None): - raise TypeError( - f"Either width or height must be set in '{mode}' mode" - ) + if not (width is None and height is not None) or not ( + height is None and width is not None + ): + raise TypeError(f"Either width or height must be set in '{mode}' mode") else: - raise ValueError( - f"Unknown mode '{mode}'" - ) + raise ValueError(f"Unknown mode '{mode}'") text.set_path_effects([TextScaler(text, width, height, mode)]) + try: # Only create this class when matplotlib is installed - from matplotlib.transforms import Bbox from matplotlib.patches import FancyArrow + from matplotlib.transforms import Bbox class AdaptiveFancyArrow(FancyArrow): """ @@ -177,9 +173,19 @@ class AdaptiveFancyArrow(FancyArrow): `FancyArrow`. """ - def __init__(self, x, y, dx, dy, - tail_width, head_width, head_ratio, draw_head=True, - shape="full", **kwargs): + def __init__( + self, + x, + y, + dx, + dy, + tail_width, + head_width, + head_ratio, + draw_head=True, + shape="full", + **kwargs, + ): self._x = x self._y = y self._dx = dx @@ -193,23 +199,25 @@ def __init__(self, x, y, dx, dy, if not draw_head: head_width = tail_width super().__init__( - x, y, dx, dy, - width=tail_width, head_width=head_width, - overhang=0, shape=shape, - length_includes_head=True, **kwargs + x, + y, + dx, + dy, + width=tail_width, + head_width=head_width, + overhang=0, + shape=shape, + length_includes_head=True, + **kwargs, ) def draw(self, renderer): - arrow_box = Bbox([(0,0), (0,self._head_width)]) + arrow_box = Bbox([(0, 0), (0, self._head_width)]) arrow_box_display = self.axes.transData.transform_bbox(arrow_box) - head_length_display = np.abs( - arrow_box_display.height * self._head_ratio - ) + head_length_display = np.abs(arrow_box_display.height * self._head_ratio) arrow_box_display.x1 = arrow_box_display.x0 + head_length_display # Transfrom back to data coordinates for plotting - arrow_box = self.axes.transData.inverted().transform_bbox( - arrow_box_display - ) + arrow_box = self.axes.transData.inverted().transform_bbox(arrow_box_display) head_length = arrow_box.width arrow_length = norm((self._dx, self._dy)) if head_length > arrow_length: @@ -221,11 +229,19 @@ def draw(self, renderer): # Renew the arrow's properties super().__init__( - self._x, self._y, self._dx, self._dy, - width=self._tail_width, head_width=self._head_width, - overhang=0, shape=self._shape, - head_length=head_length, length_includes_head=True, - axes=self.axes, transform=self.get_transform(), **self._kwargs + self._x, + self._y, + self._dx, + self._dy, + width=self._tail_width, + head_width=self._head_width, + overhang=0, + shape=self._shape, + head_length=head_length, + length_includes_head=True, + axes=self.axes, + transform=self.get_transform(), + **self._kwargs, ) self.set_clip_path(self.axes.patch) super().draw(renderer) @@ -234,18 +250,16 @@ def draw(self, renderer): # Removes warning: # unknown document: /tutorials/intermediate/constrainedlayout_guide def get_in_layout(self): - """ - """ + """ """ return super().get_in_layout() + def set_in_layout(self, in_layout): - """ - """ + """ """ return super().set_in_layout(in_layout) except ImportError: - # Dummy class that propagates a meaningful error, # i.e. that Matplotlib is not installed - class AdaptiveFancyArrow(): + class AdaptiveFancyArrow: def __init__(*args, **kwargs): - raise ModuleNotFoundError(f"No module named 'matplotlib'") \ No newline at end of file + raise ModuleNotFoundError("No module named 'matplotlib'") diff --git a/tests/application/test_autodock.py b/tests/application/test_autodock.py index 126f424d2..846b88f29 100644 --- a/tests/application/test_autodock.py +++ b/tests/application/test_autodock.py @@ -9,12 +9,10 @@ import biotite.structure.info as info import biotite.structure.io.pdbx as pdbx from biotite.application.autodock import VinaApp -from ..util import data_dir, is_not_installed +from tests.util import data_dir, is_not_installed -@pytest.mark.skipif( - is_not_installed("vina"), reason="Autodock Vina is not installed" -) +@pytest.mark.skipif(is_not_installed("vina"), reason="Autodock Vina is not installed") @pytest.mark.parametrize("flexible", [False, True]) def test_docking(flexible): """ @@ -24,9 +22,7 @@ def test_docking(flexible): PDB structure. """ # A structure of a straptavidin-biotin complex - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("application"), "2rtg.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("application"), "2rtg.bcif")) structure = pdbx.get_structure( pdbx_file, model=1, extra_fields=["charge"], include_bonds=True ) @@ -46,8 +42,11 @@ def test_docking(flexible): flexible_mask = None app = VinaApp( - ligand, receptor, struc.centroid(ref_ligand), [20, 20, 20], - flexible=flexible_mask + ligand, + receptor, + struc.centroid(ref_ligand), + [20, 20, 20], + flexible=flexible_mask, ) app.set_seed(0) app.start() @@ -65,7 +64,7 @@ def test_docking(flexible): # Select best binding pose test_ligand_coord = test_ligand_coord[0] not_nan_mask = ~np.isnan(test_ligand_coord).any(axis=-1) - ref_ligand_coord = ref_ligand_coord[not_nan_mask] + ref_ligand_coord = ref_ligand_coord[not_nan_mask] test_ligand_coord = test_ligand_coord[not_nan_mask] # Check if it least one atom is preserved assert test_ligand_coord.shape[1] > 0 @@ -78,7 +77,7 @@ def test_docking(flexible): # Select best binding pose test_receptor_coord = test_receptor_coord[0] not_nan_mask = ~np.isnan(test_receptor_coord).any(axis=-1) - ref_receptor_coord = receptor[not_nan_mask] + ref_receptor_coord = receptor[not_nan_mask] test_receptor_coord = test_receptor_coord[not_nan_mask] # Check if it least one atom is preserved assert test_receptor_coord.shape[1] > 0 @@ -86,9 +85,7 @@ def test_docking(flexible): # from the original conformation # NOTE: Currently 1.0 Å is sufficient in local testing, # but not in the CI (1.6 Å) - assert np.max( - struc.distance(test_receptor_coord, ref_receptor_coord) - ) < 1.7 + assert np.max(struc.distance(test_receptor_coord, ref_receptor_coord)) < 1.7 else: ref_receptor_coord = receptor.coord for model_coord in test_receptor_coord: diff --git a/tests/application/test_blast.py b/tests/application/test_blast.py index 49bfed2b4..d9bb69f3a 100644 --- a/tests/application/test_blast.py +++ b/tests/application/test_blast.py @@ -2,15 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +import os.path +import pytest +import biotite.application.blast as blast import biotite.sequence as seq import biotite.sequence.io as seqio -import biotite.application.blast as blast -import numpy as np -from requests.exceptions import ConnectionError -import pytest -import os.path -from ..util import data_dir, cannot_connect_to - +from tests.util import cannot_connect_to, data_dir BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" @@ -22,10 +19,7 @@ prot_seq = seq.ProteinSequence("MTMITPSFPGNS") -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastn(): app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -36,10 +30,8 @@ def test_blastn(): assert dna_seq == alignments[0].sequences[0] assert dna_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastx(): app = blast.BlastWebApp("blastx", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -50,10 +42,8 @@ def test_blastx(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_tblastx(): app = blast.BlastWebApp("tblastx", dna_seq, obey_rules=False) app.set_max_expect_value(100) @@ -61,16 +51,14 @@ def test_tblastx(): app.join(timeout=300) alignments = app.get_alignments() # BLAST should find original sequence as best hit - print (alignments[0].sequences[0]) - print (alignments[0].sequences[1]) + print(alignments[0].sequences[0]) + print(alignments[0].sequences[1]) rev_prot_seq = dna_seq.reverse().complement().translate(complete=True) assert rev_prot_seq == alignments[0].sequences[0] assert rev_prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_blastp(): app = blast.BlastWebApp("blastp", prot_seq, obey_rules=False) app.set_max_expect_value(100) @@ -81,10 +69,8 @@ def test_blastp(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_tblastn(): app = blast.BlastWebApp("tblastn", prot_seq, obey_rules=False) app.set_max_expect_value(200) @@ -95,20 +81,20 @@ def test_tblastn(): assert prot_seq == alignments[0].sequences[0] assert prot_seq == alignments[0].sequences[1] + def test_file_input(): path = os.path.join(data_dir("sequence"), "prot.fasta") - app = blast.BlastWebApp("blastp", path, obey_rules=False) + blast.BlastWebApp("blastp", path, obey_rules=False) + def test_invalid_query(): with pytest.raises(ValueError): - app = blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False) + blast.BlastWebApp("blastn", "ABCDEFGHIJKLMNOP", obey_rules=False) with pytest.raises(ValueError): - app = blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False) - -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + blast.BlastWebApp("blastp", "ABCDEFGHIJKLMNOP", obey_rules=False) + + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_no_hit(): app = blast.BlastWebApp("blastn", "ACTGTACGAAACTCGGCGTA", obey_rules=False) app.set_word_size(20) @@ -118,10 +104,8 @@ def test_no_hit(): # BLAST should find original sequence as best hit assert len(alignments) == 0 -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) + +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_invalid_input(): app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False) # Set some invalid parameters @@ -132,18 +116,15 @@ def test_invalid_input(): app.join(timeout=300) -@pytest.mark.skipif( - cannot_connect_to(BLAST_URL), - reason="NCBI BLAST is not available" -) +@pytest.mark.skipif(cannot_connect_to(BLAST_URL), reason="NCBI BLAST is not available") def test_hit_with_selenocysteine(): # Sequence is taken from issue #344 query = seqio.load_sequence( os.path.join(data_dir("sequence"), "selenocysteine.fasta") ) - + # Expect hit containing selenocysteine when searching Swiss-Prot blast_app = blast.BlastWebApp("blastp", query, "swissprot") blast_app.start() # No AlphabetError should be raised here - blast_app.join() \ No newline at end of file + blast_app.join() diff --git a/tests/application/test_dssp.py b/tests/application/test_dssp.py index 197790236..0a201c922 100644 --- a/tests/application/test_dssp.py +++ b/tests/application/test_dssp.py @@ -10,14 +10,13 @@ import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx from biotite.application.dssp import DsspApp -from ..util import data_dir, is_not_installed +from tests.util import data_dir, is_not_installed @pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed") def test_multiple_chains(): atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1igy.bcif")), model=1 ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] sse = DsspApp.annotate_sse(atoms) diff --git a/tests/application/test_msa.py b/tests/application/test_msa.py index ca0554e1e..942a781e9 100644 --- a/tests/application/test_msa.py +++ b/tests/application/test_msa.py @@ -2,64 +2,70 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from distutils.version import Version +import numpy as np +import pytest import biotite.sequence as seq -import biotite.sequence.phylo as phylo import biotite.sequence.align as align +import biotite.sequence.phylo as phylo from biotite.application import VersionError -from biotite.application.muscle import MuscleApp, Muscle5App -from biotite.application.mafft import MafftApp from biotite.application.clustalo import ClustalOmegaApp -import numpy as np -import pytest -import shutil -from ..util import is_not_installed - +from biotite.application.mafft import MafftApp +from biotite.application.muscle import Muscle5App, MuscleApp +from tests.util import is_not_installed BIN_PATH = { - MuscleApp : "muscle", - Muscle5App : "muscle", - MafftApp : "mafft", - ClustalOmegaApp: "clustalo" + MuscleApp: "muscle", + Muscle5App: "muscle", + MafftApp: "mafft", + ClustalOmegaApp: "clustalo", } @pytest.fixture def sequences(): - return [seq.ProteinSequence(string) for string in [ - "BIQTITE", - "TITANITE", - "BISMITE", - "IQLITE" -]] - - -@pytest.mark.parametrize("app_cls, exp_ali, exp_order", - [(MuscleApp, - "BIQT-ITE\n" - "TITANITE\n" - "BISM-ITE\n" - "-IQL-ITE", - [1, 2, 0, 3]), - (Muscle5App, - "BI-QTITE\n" - "TITANITE\n" - "BI-SMITE\n" - "-I-QLITE", - [0, 3, 1, 2]), - (MafftApp, - "-BIQTITE\n" - "TITANITE\n" - "-BISMITE\n" - "--IQLITE", - [0, 3, 2, 1]), - (ClustalOmegaApp, - "-BIQTITE\n" - "TITANITE\n" - "-BISMITE\n" - "--IQLITE", - [1, 2, 0, 3])] -) + return [ + seq.ProteinSequence(string) + for string in ["BIQTITE", "TITANITE", "BISMITE", "IQLITE"] + ] + + +@pytest.mark.parametrize( + "app_cls, exp_ali, exp_order", + [ + ( + MuscleApp, + "BIQT-ITE\n" + "TITANITE\n" + "BISM-ITE\n" + "-IQL-ITE", + [1, 2, 0, 3] + ), + ( + Muscle5App, + "BI-QTITE\n" + "TITANITE\n" + "BI-SMITE\n" + "-I-QLITE", + [0, 3, 1, 2] + ), + ( + MafftApp, + "-BIQTITE\n" + "TITANITE\n" + "-BISMITE\n" + "--IQLITE", + [0, 3, 2, 1] + ), + ( + ClustalOmegaApp, + "-BIQTITE\n" + "TITANITE\n" + "-BISMITE\n" + "--IQLITE", + [1, 2, 0, 3] + ) + ] +) # fmt: skip def test_msa(sequences, app_cls, exp_ali, exp_order): """ Test MSA software on short toy sequences with known alignment @@ -72,7 +78,7 @@ def test_msa(sequences, app_cls, exp_ali, exp_order): try: app = app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -104,14 +110,13 @@ def test_large_sequence_number(app_cls): try: app = app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() # Expect completely matching sequences - assert alignment.trace.tolist() == [ - [i]*SEQ_NUMBER for i in range(SEQ_LENGTH) - ] + assert alignment.trace.tolist() == [[i] * SEQ_NUMBER for i in range(SEQ_LENGTH)] + def test_additional_options(sequences): bin_path = BIN_PATH[ClustalOmegaApp] @@ -120,15 +125,15 @@ def test_additional_options(sequences): app1 = ClustalOmegaApp(sequences) app1.start() - + app2 = ClustalOmegaApp(sequences) app2.add_additional_options(["--full"]) app2.start() - + app1.join() app2.join() assert "--full" not in app1.get_command() - assert "--full" in app2.get_command() + assert "--full" in app2.get_command() assert app1.get_alignment() == app2.get_alignment() @@ -137,7 +142,7 @@ def test_custom_substitution_matrix(sequences, app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.ProteinSequence.alphabet # Strong identity matrix score_matrix = np.identity(len(alph)) * 1000 @@ -147,11 +152,11 @@ def test_custom_substitution_matrix(sequences, app_cls): "TITANITE\n" "BI-SMITE\n" "-I-QLITE" - ) + ) # fmt: skip try: app = app_cls(sequences, matrix=matrix) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -165,21 +170,21 @@ def test_custom_sequence_type(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(("foo", "bar", 42)) sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] + ]] # fmt: skip exp_trace = [ - [ 0, 0], - [ 1, -1], - [ 2, 1], - [ 3, 2], - [-1, 3], - [ 4, 4], - [ 5, 5], - [ 6, 6], + [0, 0], + [1, -1], + [2, 1], + [3, 2], + [-1, 3], + [4, 4], + [5, 5], + [6, 6], ] # Strong identity matrix score_matrix = np.identity(len(alph)) @@ -189,7 +194,7 @@ def test_custom_sequence_type(app_cls): try: app = app_cls(sequences, matrix=matrix) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() alignment = app.get_alignment() @@ -206,17 +211,17 @@ def test_invalid_sequence_type_no_matrix(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(("foo", "bar", 42)) sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ ["foo", "bar", 42, "foo", "foo", 42, 42], ["foo", 42, "foo", "bar", "foo", 42, 42], - ]] + ]] # fmt: skip with pytest.raises(TypeError): try: app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") @pytest.mark.parametrize("app_cls", [MuscleApp, MafftApp, ClustalOmegaApp]) @@ -228,17 +233,20 @@ def test_invalid_sequence_type_unsuitable_alphabet(app_cls): bin_path = BIN_PATH[app_cls] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + alph = seq.Alphabet(range(50)) - sequences = [seq.GeneralSequence(alph, sequence) for sequence in [ - [1,2,3], - [1,2,3], - ]] + sequences = [ + seq.GeneralSequence(alph, sequence) + for sequence in [ + [1, 2, 3], + [1, 2, 3], + ] + ] with pytest.raises(TypeError): try: app_cls(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") def test_invalid_muscle_version(sequences): @@ -249,9 +257,9 @@ def test_invalid_muscle_version(sequences): bin_path = BIN_PATH[MuscleApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + if is_not_installed("muscle"): - pytest.skip(f"'muscle' is not installed") + pytest.skip("'muscle' is not installed") with pytest.raises(VersionError): MuscleApp(sequences) @@ -262,13 +270,13 @@ def test_clustalo_matrix(sequences): bin_path = BIN_PATH[ClustalOmegaApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + ref_matrix = [ [0, 1, 2, 3], [1, 0, 1, 2], [2, 1, 0, 1], [3, 2, 1, 0] - ] + ] # fmt: skip app = ClustalOmegaApp(sequences) app.full_matrix_calculation() app.set_distance_matrix(np.array(ref_matrix)) @@ -282,7 +290,7 @@ def test_clustalo_tree(sequences): bin_path = BIN_PATH[ClustalOmegaApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + leaves = [phylo.TreeNode(index=i) for i in range(len(sequences))] inter1 = phylo.TreeNode([leaves[0], leaves[1]], [1.0, 1.0]) inter2 = phylo.TreeNode([leaves[2], leaves[3]], [2.5, 2.5]) @@ -305,7 +313,7 @@ def test_mafft_tree(sequences): bin_path = BIN_PATH[MafftApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + app = MafftApp(sequences) app.start() app.join() @@ -317,11 +325,11 @@ def test_muscle_tree(sequences): bin_path = BIN_PATH[MuscleApp] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + try: app = MuscleApp(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.start() app.join() tree1 = app.get_guide_tree(iteration="kmer") @@ -334,11 +342,11 @@ def test_muscle5_options(sequences): bin_path = BIN_PATH[Muscle5App] if is_not_installed(bin_path): pytest.skip(f"'{bin_path}' is not installed") - + try: app = Muscle5App(sequences) except VersionError: - pytest.skip(f"Invalid software version") + pytest.skip("Invalid software version") app.use_super5() app.set_iterations(2, 100) app.set_thread_number(2) @@ -350,7 +358,9 @@ def test_muscle5_options(sequences): assert "-threads" in app.get_command() app.join() - assert str(app.get_alignment()) == "BI-QTITE\n" \ - "TITANITE\n" \ - "BI-SMITE\n" \ - "-I-QLITE" \ No newline at end of file + assert str(app.get_alignment()) == ( + "BI-QTITE\n" \ + "TITANITE\n" \ + "BI-SMITE\n" \ + "-I-QLITE" + ) # fmt: skip diff --git a/tests/application/test_rnaalifold.py b/tests/application/test_rnaalifold.py index f55b6bdb1..a432b65fe 100644 --- a/tests/application/test_rnaalifold.py +++ b/tests/application/test_rnaalifold.py @@ -7,7 +7,7 @@ import biotite.sequence as seq import biotite.sequence.align as align from biotite.application.viennarna import RNAalifoldApp -from ..util import is_not_installed +from tests.util import is_not_installed @pytest.fixture @@ -29,7 +29,7 @@ def sample_app(): is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) def test_get_dot_bracket(sample_app): - assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." + assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." @pytest.mark.skipif( @@ -38,19 +38,17 @@ def test_get_dot_bracket(sample_app): def test_get_free_energy(sample_app): assert sample_app.get_free_energy() == -1.3 + @pytest.mark.skipif( is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) def test_get_base_pairs(sample_app): - expected_basepairs = np.array([[ 0, 22], - [ 1, 21], - [ 2, 20], - [ 4, 19], - [ 5, 18], - [ 6, 16], - [ 7, 15]]) + expected_basepairs = np.array( + [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]] + ) assert np.all(sample_app.get_base_pairs() == expected_basepairs) + @pytest.mark.skipif( is_not_installed("RNAalifold"), reason="RNAalifold is not installed" ) @@ -63,7 +61,7 @@ def test_constraints(): sequence = seq.NucleotideSequence("A" * 20) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() alignment = align.align_ungapped(sequence, sequence, matrix) - + # An arbitrary secondary structure # The loop in the center must probably comprise at least 5 bases # due to the dynamic programming algorithm @@ -72,15 +70,18 @@ def test_constraints(): app = RNAalifoldApp(alignment) app.set_constraints( - pairs=np.stack([ - np.where(ref_dotbracket_array == "(")[0], - np.where(ref_dotbracket_array == ")")[0][::-1] - ], axis=-1), - unpaired = (ref_dotbracket_array == "x"), - enforce=True + pairs=np.stack( + [ + np.where(ref_dotbracket_array == "(")[0], + np.where(ref_dotbracket_array == ")")[0][::-1], + ], + axis=-1, + ), + unpaired=(ref_dotbracket_array == "x"), + enforce=True, ) app.start() app.join() test_dotbracket = app.get_dot_bracket() - assert test_dotbracket == ref_dotbracket.replace("x", ".") \ No newline at end of file + assert test_dotbracket == ref_dotbracket.replace("x", ".") diff --git a/tests/application/test_rnafold.py b/tests/application/test_rnafold.py index f8b0ccfd7..c40f16070 100644 --- a/tests/application/test_rnafold.py +++ b/tests/application/test_rnafold.py @@ -6,7 +6,7 @@ import pytest import biotite.sequence as seq from biotite.application.viennarna import RNAfoldApp -from ..util import is_not_installed +from tests.util import is_not_installed @pytest.fixture @@ -22,36 +22,25 @@ def sample_app(): return app -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_dot_bracket(sample_app): - assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." + assert sample_app.get_dot_bracket() == "(((.((((.......)).)))))...." -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_free_energy(sample_app): assert sample_app.get_free_energy() == -1.3 -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) + +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_get_base_pairs(sample_app): - expected_basepairs = np.array([[ 0, 22], - [ 1, 21], - [ 2, 20], - [ 4, 19], - [ 5, 18], - [ 6, 16], - [ 7, 15]]) + expected_basepairs = np.array( + [[0, 22], [1, 21], [2, 20], [4, 19], [5, 18], [6, 16], [7, 15]] + ) assert np.all(sample_app.get_base_pairs() == expected_basepairs) -@pytest.mark.skipif( - is_not_installed("RNAfold"), reason="RNAfold is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAfold"), reason="RNAfold is not installed") def test_constraints(): """ Constrain every position of the input sequence and expect that the @@ -59,7 +48,7 @@ def test_constraints(): """ # Sequence should not matter sequence = seq.NucleotideSequence("A" * 20) - + # An arbitrary secondary structure # The loop in the center must probably comprise at least 5 bases # due to the dynamic programming algorithm @@ -68,15 +57,18 @@ def test_constraints(): app = RNAfoldApp(sequence) app.set_constraints( - pairs=np.stack([ - np.where(ref_dotbracket_array == "(")[0], - np.where(ref_dotbracket_array == ")")[0][::-1] - ], axis=-1), - unpaired = (ref_dotbracket_array == "x"), - enforce=True + pairs=np.stack( + [ + np.where(ref_dotbracket_array == "(")[0], + np.where(ref_dotbracket_array == ")")[0][::-1], + ], + axis=-1, + ), + unpaired=(ref_dotbracket_array == "x"), + enforce=True, ) app.start() app.join() test_dotbracket = app.get_dot_bracket() - assert test_dotbracket == ref_dotbracket.replace("x", ".") \ No newline at end of file + assert test_dotbracket == ref_dotbracket.replace("x", ".") diff --git a/tests/application/test_rnaplot.py b/tests/application/test_rnaplot.py index 8810d1131..e0eb8649d 100644 --- a/tests/application/test_rnaplot.py +++ b/tests/application/test_rnaplot.py @@ -5,7 +5,7 @@ import numpy as np import pytest from biotite.application.viennarna import RNAplotApp -from ..util import is_not_installed +from tests.util import is_not_installed @pytest.fixture @@ -14,23 +14,24 @@ def sample_app(): Provide a `RNAplotApp` object, where *RNAplot* has been executed for a sample structure. """ - app = RNAplotApp('((..))') + app = RNAplotApp("((..))") app.start() app.join() return app -@pytest.mark.skipif( - is_not_installed("RNAplot"), reason="RNAplot is not installed" -) +@pytest.mark.skipif(is_not_installed("RNAplot"), reason="RNAplot is not installed") def test_get_cooordinates(sample_app): - assert ( - np.all( - sample_app.get_coordinates() == np.array([[ -92.5 , 92.5 ], - [ -92.5 , 77.5 ], - [ -90.31, 58.24], - [-109.69, 58.24], - [-107.5 , 77.5 ], - [-107.5 , 92.5 ]]) + assert np.all( + sample_app.get_coordinates() + == np.array( + [ + [-92.5, 92.5], + [-92.5, 77.5], + [-90.31, 58.24], + [-109.69, 58.24], + [-107.5, 77.5], + [-107.5, 92.5], + ] ) ) diff --git a/tests/application/test_sra.py b/tests/application/test_sra.py index 78b471538..7728ae33a 100644 --- a/tests/application/test_sra.py +++ b/tests/application/test_sra.py @@ -6,16 +6,14 @@ from os.path import join from tempfile import gettempdir import pytest -from biotite.application.sra import FastqDumpApp, FastaDumpApp -from biotite.sequence.io.fastq import FastqFile +from biotite.application.sra import FastaDumpApp, FastqDumpApp from biotite.sequence.io.fasta import FastaFile +from biotite.sequence.io.fastq import FastqFile @pytest.mark.parametrize( - "app_class, custom_prefix", itertools.product( - [FastqDumpApp, FastaDumpApp], - [False, True] - ) + "app_class, custom_prefix", + itertools.product([FastqDumpApp, FastaDumpApp], [False, True]), ) def test_objects(app_class, custom_prefix): """ @@ -45,10 +43,8 @@ def test_objects(app_class, custom_prefix): @pytest.mark.parametrize( - "app_class, custom_prefix", itertools.product( - [FastqDumpApp, FastaDumpApp], - [False, True] - ) + "app_class, custom_prefix", + itertools.product([FastqDumpApp, FastaDumpApp], [False, True]), ) def test_classmethod(app_class, custom_prefix): """ diff --git a/tests/application/test_tantan.py b/tests/application/test_tantan.py index dd88abd66..91bce56b8 100644 --- a/tests/application/test_tantan.py +++ b/tests/application/test_tantan.py @@ -7,24 +7,20 @@ import biotite.sequence as seq import biotite.sequence.align as align from biotite.application.tantan import TantanApp -from ..util import is_not_installed +from tests.util import is_not_installed + @pytest.fixture def simple_matrix(): alph = seq.NucleotideSequence.alphabet_unamb return align.SubstitutionMatrix( - alph, alph, np.array( - [[ 1, -1, -1, -1], - [-1, 1, -1, -1], - [-1, -1, 1, -1], - [-1, -1, -1, 1]] - ) + alph, + alph, + np.array([[1, -1, -1, -1], [-1, 1, -1, -1], [-1, -1, 1, -1], [-1, -1, -1, 1]]), ) -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed") @pytest.mark.parametrize("use_custom_matrix", [False, True]) def test_nucleotide(simple_matrix, use_custom_matrix): """ @@ -45,9 +41,7 @@ def test_nucleotide(simple_matrix, use_custom_matrix): assert test_mask.tolist() == ref_mask -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed") @pytest.mark.parametrize("use_custom_matrix", [False, True]) def test_protein(use_custom_matrix): """ @@ -68,16 +62,14 @@ def test_protein(use_custom_matrix): assert test_mask.tolist() == ref_mask -@pytest.mark.skipif( - is_not_installed("tantan"), reason="tantan is not installed" -) +@pytest.mark.skipif(is_not_installed("tantan"), reason="tantan is not installed") def test_multiple_sequences(): """ Test masking multiple sequences in a single run. """ seq_strings = [ "CANYQVcanacanasacannercancanACAN", - "NEARAnearanearerearanearlyeerieear" + "NEARAnearanearerearanearlyeerieear", ] sequences = [seq.ProteinSequence(seq_string) for seq_string in seq_strings] @@ -91,4 +83,4 @@ def test_multiple_sequences(): assert len(test_masks) == len(ref_masks) for test_mask, ref_mask in zip(test_masks, ref_masks): - assert test_mask.tolist() == ref_mask \ No newline at end of file + assert test_mask.tolist() == ref_mask diff --git a/tests/conftest.py b/tests/conftest.py index af3b9597b..7701902e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,8 +2,6 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest -import numpy as np def pytest_sessionstart(session): """ @@ -13,10 +11,11 @@ def pytest_sessionstart(session): try: import numpy as np import pyximport + pyximport.install( build_in_temp=False, - setup_args={"include_dirs":np.get_include()}, - language_level=3 + setup_args={"include_dirs": np.get_include()}, + language_level=3, ) except ImportError: - pass \ No newline at end of file + pass diff --git a/tests/database/test_entrez.py b/tests/database/test_entrez.py index a0c4dee44..bc1e94f34 100644 --- a/tests/database/test_entrez.py +++ b/tests/database/test_entrez.py @@ -4,40 +4,29 @@ import itertools import tempfile -import numpy as np -from requests.exceptions import ConnectionError import pytest import biotite.database.entrez as entrez import biotite.sequence.io.fasta as fasta from biotite.database import RequestError -from ..util import cannot_connect_to - +from tests.util import cannot_connect_to NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/" -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") @pytest.mark.parametrize( - "common_name, as_file_like", - itertools.product([False, True], [False, True]) + "common_name, as_file_like", itertools.product([False, True], [False, True]) ) def test_fetch(common_name, as_file_like): path = None if as_file_like else tempfile.gettempdir() db_name = "Protein" if common_name else "protein" - file = entrez.fetch( - "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True - ) + file = entrez.fetch("1L2Y_A", path, "fa", db_name, "fasta", overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 20 -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) + +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") @pytest.mark.parametrize("as_file_like", [False, True]) def test_fetch_single_file(as_file_like): if as_file_like: @@ -45,7 +34,7 @@ def test_fetch_single_file(as_file_like): else: file = tempfile.NamedTemporaryFile("r", suffix=".fa") file_name = file.name - + downloaded_file_name = entrez.fetch_single_file( ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta" ) @@ -56,17 +45,12 @@ def test_fetch_single_file(as_file_like): if not as_file_like: file.close() -@pytest.mark.skipif( - cannot_connect_to(NCBI_URL), - reason="NCBI Entrez is not available" -) + +@pytest.mark.skipif(cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available") def test_fetch_invalid(): with pytest.raises(RequestError): # Empty ID list - file = entrez.fetch_single_file( - [], None, "protein", "fasta", overwrite=True) + entrez.fetch_single_file([], None, "protein", "fasta", overwrite=True) with pytest.raises(RequestError): # Nonexisting ID - file = entrez.fetch( - "xxxx", None, "fa", "protein", "fasta", overwrite=True - ) \ No newline at end of file + entrez.fetch("xxxx", None, "fa", "protein", "fasta", overwrite=True) diff --git a/tests/database/test_pubchem.py b/tests/database/test_pubchem.py index 8c26a1ddc..ed84809e3 100644 --- a/tests/database/test_pubchem.py +++ b/tests/database/test_pubchem.py @@ -2,27 +2,22 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import re import itertools +import re import tempfile -import pytest import numpy as np +import pytest import biotite.database.pubchem as pubchem import biotite.structure.io.mol as mol from biotite.database import RequestError -from ..util import cannot_connect_to - +from tests.util import cannot_connect_to PUBCHEM_URL = "https://pubchem.ncbi.nlm.nih.gov/" -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="Pubchem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="Pubchem is not available") @pytest.mark.parametrize( - "format, as_file_like", - itertools.product(["sdf", "png"], [False, True]) + "format, as_file_like", itertools.product(["sdf", "png"], [False, True]) ) def test_fetch(format, as_file_like): """ @@ -39,10 +34,7 @@ def test_fetch(format, as_file_like): mol_file.get_structure() -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize("as_structural_formula", [False, True]) def test_fetch_structural_formula(as_structural_formula): """ @@ -52,9 +44,9 @@ def test_fetch_structural_formula(as_structural_formula): """ CID = 2244 - mol_file = mol.MOLFile.read(pubchem.fetch( - 2244, as_structural_formula=as_structural_formula - )) + mol_file = mol.MOLFile.read( + pubchem.fetch(CID, as_structural_formula=as_structural_formula) + ) atoms = mol_file.get_structure() if as_structural_formula: @@ -63,10 +55,7 @@ def test_fetch_structural_formula(as_structural_formula): assert np.any(atoms.coord[:, 2] != 0) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") def test_fetch_invalid(): """ An exception is expected when the CID is not available. @@ -77,10 +66,7 @@ def test_fetch_invalid(): pubchem.fetch(1234567890) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( "query, ref_ids", [ @@ -89,7 +75,7 @@ def test_fetch_invalid(): (pubchem.InchiQuery("InChI=1S/C4H10/c1-3-4-2/h3-4H2,1-2H3"), [7843]), (pubchem.InchiKeyQuery("IJDNQMDRQITEOD-UHFFFAOYSA-N"), [7843]), ], - ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"] + ids=["NameQuery", "SmilesQuery", "InchiQuery", "InchiKeyQuery"], ) def test_search_simple(query, ref_ids): """ @@ -102,10 +88,7 @@ def test_search_simple(query, ref_ids): assert set(ref_ids).issubset(pubchem.search(query)) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") def test_search_formula(): """ Download a structure and search for its molecular formula in @@ -115,23 +98,17 @@ def test_search_formula(): CID = 101608985 atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure() - test_cids = pubchem.search( - pubchem.FormulaQuery.from_atoms(atoms) - ) + test_cids = pubchem.search(pubchem.FormulaQuery.from_atoms(atoms)) assert CID in (test_cids) -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( - "cid, from_atoms, query_type", itertools.product( - [2244], - [False, True], - [pubchem.SuperstructureQuery, pubchem.SubstructureQuery] - ) + "cid, from_atoms, query_type", + itertools.product( + [2244], [False, True], [pubchem.SuperstructureQuery, pubchem.SubstructureQuery] + ), ) def test_search_super_and_substructure(cid, from_atoms, query_type): """ @@ -170,16 +147,9 @@ def test_search_super_and_substructure(cid, from_atoms, query_type): assert atoms.array_length() >= original_atoms.array_length() -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize( - "conformation_based, from_atoms", - itertools.product( - [False, True], - [False, True] - ) + "conformation_based, from_atoms", itertools.product([False, True], [False, True]) ) def test_search_similarity(conformation_based, from_atoms): """ @@ -192,8 +162,7 @@ def test_search_similarity(conformation_based, from_atoms): if from_atoms: original_atoms = mol.MOLFile.read(pubchem.fetch(CID)).get_structure() query = pubchem.SimilarityQuery.from_atoms( - original_atoms, threshold=1.0, - conformation_based=conformation_based + original_atoms, threshold=1.0, conformation_based=conformation_based ) else: query = pubchem.SimilarityQuery( @@ -204,10 +173,7 @@ def test_search_similarity(conformation_based, from_atoms): assert CID in cids -@pytest.mark.skipif( - cannot_connect_to(PUBCHEM_URL), - reason="PubChem is not available" -) +@pytest.mark.skipif(cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available") @pytest.mark.parametrize("from_atoms", [False, True]) def test_search_identity(from_atoms): """ @@ -222,4 +188,4 @@ def test_search_identity(from_atoms): query = pubchem.IdentityQuery(cid=CID) cids = pubchem.search(query) - assert cids == [CID] \ No newline at end of file + assert cids == [CID] diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 415b713a4..ae175529a 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -2,33 +2,28 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join import itertools import tempfile -import pytest +from os.path import join import numpy as np +import pytest import biotite.database.rcsb as rcsb +import biotite.sequence.align as align +import biotite.sequence.io.fasta as fasta import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx -import biotite.structure.io.mmtf as mmtf -import biotite.sequence.io.fasta as fasta -import biotite.sequence.align as align from biotite.database import RequestError -from ..util import cannot_connect_to, data_dir - +from tests.util import cannot_connect_to, data_dir RCSB_URL = "https://www.rcsb.org/" # Search term that should only find the entry 1L2Y TC5B_TERM = "Miniprotein Construct TC5b" -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize( "format, as_file_like", - itertools.product(["pdb", "cif", "bcif", "mmtf", "fasta"], [False, True]) + itertools.product(["pdb", "cif", "bcif", "fasta"], [False, True]), ) def test_fetch(format, as_file_like): path = None if as_file_like else tempfile.gettempdir() @@ -37,30 +32,22 @@ def test_fetch(format, as_file_like): file = pdb.PDBFile.read(file_path_or_obj) pdb.get_structure(file) elif format == "pdbx": - file = pdbx.PDBxFile.read(file_path_or_obj) + file = pdbx.CIFFile.read(file_path_or_obj) pdbx.get_structure(file) elif format == "bcif": file = pdbx.BinaryCIFFile.read(file_path_or_obj) pdbx.get_structure(file) - elif format == "mmtf": - file = mmtf.MMTFFile.read(file_path_or_obj) - mmtf.get_structure(file) elif format == "fasta": file = fasta.FastaFile.read(file_path_or_obj) # Test if the file contains any sequences assert len(fasta.get_sequences(file)) > 0 -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) -@pytest.mark.parametrize("format", ["pdb", "cif", "bcif", "mmtf", "fasta"]) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") +@pytest.mark.parametrize("format", ["pdb", "cif", "bcif", "fasta"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - rcsb.fetch( - "xxxx", format, tempfile.gettempdir(), overwrite=True - ) + rcsb.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) def test_search_basic(): @@ -76,58 +63,78 @@ def test_search_basic(): "pdbx_serial_crystallography_sample_delivery_injection.preparation", False, {}, - ["6IG7", "6IG6", "7JRI", "7JR5", "7QX4", "7QX5", "7QX6", "7QX7", - "8A2O", "8A2P"] + [ + "6IG7", + "6IG6", + "7JRI", + "7JR5", + "7QX4", + "7QX5", + "7QX6", + "7QX7", + "8A2O", + "8A2P", + ], ), ( "audit_author.name", False, {"is_in": ["Neidigh, J.W."]}, - ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"] + ["1JRJ", "1L2Y", "2O3P", "2O63", "2O64", "2O65"], ), ( "rcsb_entity_source_organism.rcsb_gene_name.value", False, {"exact_match": "lacA"}, - ["5JUV", "1KQA", "1KRV", "1KRU", "1KRR", "3U7V", "4IUG", "4LFK", - "4LFL", "4LFM", "4LFN", "5IFP", "5IFT", "5IHR", "4DUW", "5MGD", - "5MGC"] + [ + "5JUV", + "1KQA", + "1KRV", + "1KRU", + "1KRR", + "3U7V", + "4IUG", + "4LFK", + "4LFL", + "4LFM", + "4LFN", + "5IFP", + "5IFT", + "5IHR", + "4DUW", + "5MGD", + "5MGC", + ], ), ( "struct.title", False, {"contains_words": "tc5b"}, - ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI"] + ["1L2Y", "8ANH", "8ANM", "8ANG", "8ANI", "8QWW"], ), ( "reflns.d_resolution_high", False, {"less_or_equal": 0.6}, - ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG", - "7R0H"] + ["1EJG", "1I0T", "3NIR", "3P4J", "5D8V", "5NW3", "4JLJ", "7ATG", "7R0H"], ), ( "rcsb_entry_info.deposited_model_count", False, {"range_closed": (60, 61)}, - ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"] + ["1BBO", "1GB1", "1O5P", "1XU6", "2LUM", "2NO8"], ), ( "rcsb_id", True, {"exact_match": "AIN"}, - ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"] + ["1OXR", "1TGM", "3IAZ", "3GCL", "6MQF", "2QQT", "4NSB", "8J3W"], ), - ] -) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" + ], ) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_field(field, molecular_definition, params, ref_ids): - query = rcsb.FieldQuery( - field, molecular_definition, **params - ) + query = rcsb.FieldQuery(field, molecular_definition, **params) test_ids = rcsb.search(query) test_count = rcsb.count(query) @@ -135,17 +142,12 @@ def test_search_field(field, molecular_definition, params, ref_ids): assert test_count == len(ref_ids) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_sequence(): IDENTIY_CUTOFF = 0.9 - pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif")) - ref_sequence = pdbx.get_sequence(pdbx_file)['A'] - query = rcsb.SequenceQuery( - ref_sequence, "protein", min_identity=IDENTIY_CUTOFF - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) + ref_sequence = pdbx.get_sequence(pdbx_file)["A"] + query = rcsb.SequenceQuery(ref_sequence, "protein", min_identity=IDENTIY_CUTOFF) test_ids = rcsb.search(query) assert len(test_ids) >= 2 @@ -160,20 +162,14 @@ def test_search_sequence(): assert identity >= IDENTIY_CUTOFF -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_structure(): query = rcsb.StructureQuery("1L2Y", chain="A") test_ids = rcsb.search(query) assert "1L2Y" in test_ids -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_motif(): # motif is taken from official RCSB search API tutorial MOTIF = "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H." @@ -182,25 +178,18 @@ def test_search_motif(): assert test_count == pytest.approx(639, rel=0.1) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_composite(): query1 = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" - ) - query2 = rcsb.FieldQuery( - "exptl.method", - exact_match="SOLUTION NMR" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) + query2 = rcsb.FieldQuery("exptl.method", exact_match="SOLUTION NMR") ids_1 = set(rcsb.search(query1)) ids_2 = set(rcsb.search(query2)) ids_or = set(rcsb.search(query1 | query2)) ids_and = set(rcsb.search(query1 & query2)) - assert ids_or == ids_1 | ids_2 + assert ids_or == ids_1 | ids_2 assert ids_and == ids_1 & ids_2 @@ -213,26 +202,19 @@ def test_search_composite(): ("non_polymer_entity", [] ), ("polymer_instance", ["1L2Y.A"]), ] -) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +) # fmt: skip +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_return_type(return_type, expected): query = rcsb.BasicQuery(TC5B_TERM) assert rcsb.search(query, return_type) == expected assert rcsb.count(query, return_type) == len(expected) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize("seed", np.arange(5)) def test_search_range(seed): query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) count = rcsb.count(query) ref_entries = rcsb.search(query) @@ -245,15 +227,11 @@ def test_search_range(seed): assert test_entries == ref_entries[range[0] : range[1]] -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize("as_sorting_object", [False, True]) def test_search_sort(as_sorting_object): query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) if as_sorting_object: sort_by = rcsb.Sorting("reflns.d_resolution_high", descending=False) @@ -263,8 +241,8 @@ def test_search_sort(as_sorting_object): resolutions = [] for pdb_id in entries[:5]: - pdbx_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx")) - resolutions.append(float(pdbx_file["reflns"]["d_resolution_high"])) + pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif")) + resolutions.append(pdbx_file.block["reflns"]["d_resolution_high"].as_item()) if as_sorting_object: # In the tested case the Sorting object uses ascending order @@ -274,20 +252,18 @@ def test_search_sort(as_sorting_object): assert resolutions == list(reversed(sorted(resolutions))) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_content_types(): # Query to limit the number of returned results # for improved performance query = rcsb.FieldQuery( - "rcsb_entity_host_organism.scientific_name", - exact_match="Homo sapiens" + "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) - experimental_set = set(rcsb.search(query, content_types=["experimental"])) + experimental_set = set(rcsb.search(query, content_types=["experimental"])) computational_set = set(rcsb.search(query, content_types=["computational"])) - combined_set = set(rcsb.search(query, content_types=["experimental", "computational"])) + combined_set = set( + rcsb.search(query, content_types=["experimental", "computational"]) + ) # If there are no results, the following tests make no sense assert len(combined_set) > 0 @@ -298,7 +274,9 @@ def test_search_content_types(): assert rcsb.count(query, content_types=["experimental"]) == len(experimental_set) assert rcsb.count(query, content_types=["computational"]) == len(computational_set) - assert rcsb.count(query, content_types=["experimental", "computational"]) == len(combined_set) + assert rcsb.count(query, content_types=["experimental", "computational"]) == len( + combined_set + ) # Expect an exception if no content_type with pytest.raises(ValueError): @@ -307,10 +285,7 @@ def test_search_content_types(): rcsb.count(query, content_types=[]) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") @pytest.mark.parametrize( "grouping, resolution_threshold, return_type, ref_groups", [ @@ -320,79 +295,65 @@ def test_search_content_types(): ), 0.7, "polymer_entity", - set([ - ("3X2M_1",), - ("6E6O_1",), - ("1YK4_1",), - ("5NW3_1",), - ("1US0_1",), - ("4HP2_1",), - ("2DSX_1",), - ("2VB1_1",), - ("7VOS_1", "5D8V_1", "3A38_1"), - ("1UCS_1",), - ("3NIR_1", "1EJG_1"), - ]) + set( + [ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("3A38_1", "5D8V_1", "7VOS_1"), + ("1UCS_1",), + ("1EJG_1", "3NIR_1"), + ] + ), ), - ( - rcsb.UniprotGrouping( - sort_by="rcsb_accession_info.initial_release_date" - ), + rcsb.UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"), 0.7, "polymer_entity", - set([ - ("3X2M_1",), - ("6E6O_1",), - ("1YK4_1",), - ("5NW3_1",), - ("1US0_1",), - ("4HP2_1",), - ("2DSX_1",), - ("2VB1_1",), - ("7VOS_1", "5D8V_1", "3A38_1"), - ("1UCS_1",), - ("3NIR_1", "1EJG_1"), - ]) + set( + [ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("3A38_1", "5D8V_1", "7VOS_1"), + ("1UCS_1",), + ("1EJG_1", "3NIR_1"), + ] + ), ), - ( - rcsb.DepositGrouping( - sort_by="rcsb_accession_info.initial_release_date" - ), + rcsb.DepositGrouping(sort_by="rcsb_accession_info.initial_release_date"), 0.9, "entry", - set([ - ("5R32",), - ("5RDH", "5RBR"), - ("7G0Z", "7FXV") - ]) - ) - ] + set([("5R32",), ("5RBR", "5RDH"), ("7FXV", "7G0Z")]), + ), + ], ) -def test_search_grouping(grouping, resolution_threshold, return_type, - ref_groups): +def test_search_grouping(grouping, resolution_threshold, return_type, ref_groups): """ Check whether the same result as in a known example is achieved. """ - query = ( - rcsb.FieldQuery( - "exptl.method", - exact_match="X-RAY DIFFRACTION" - ) - & rcsb.FieldQuery( - "rcsb_entry_info.resolution_combined", - range_closed=(0.0, resolution_threshold) - ) + query = rcsb.FieldQuery( + "exptl.method", exact_match="X-RAY DIFFRACTION" + ) & rcsb.FieldQuery( + "rcsb_entry_info.resolution_combined", range_closed=(0.0, resolution_threshold) ) - test_groups = list(rcsb.search( - query, return_type, - group_by=grouping, return_groups=True - ).values()) + test_groups = list( + rcsb.search(query, return_type, group_by=grouping, return_groups=True).values() + ) test_representatives = rcsb.search( - query, return_type, - group_by=grouping, return_groups=False + query, return_type, group_by=grouping, return_groups=False ) test_count = rcsb.count(query, return_type, group_by=grouping) @@ -402,10 +363,7 @@ def test_search_grouping(grouping, resolution_threshold, return_type, assert test_count == len(ref_groups) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" -) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_empty(): query = rcsb.BasicQuery("This will not match any ID") assert rcsb.search(query) == [] @@ -414,21 +372,9 @@ def test_search_empty(): @pytest.mark.parametrize( "field, params", - [ - ( - "invalid.field", - {"exact_match": "Some Value"} - ), - ( - "exptl.method", - {"less": 5} - ) - ] -) -@pytest.mark.skipif( - cannot_connect_to(RCSB_URL), - reason="RCSB PDB is not available" + [("invalid.field", {"exact_match": "Some Value"}), ("exptl.method", {"less": 5})], ) +@pytest.mark.skipif(cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available") def test_search_invalid(field, params): invalid_query = rcsb.FieldQuery(field, **params) with pytest.raises(RequestError, match="400"): diff --git a/tests/database/test_uniprot.py b/tests/database/test_uniprot.py index 53c12e60b..7af70393a 100644 --- a/tests/database/test_uniprot.py +++ b/tests/database/test_uniprot.py @@ -8,76 +8,51 @@ import biotite.database.uniprot as uniprot import biotite.sequence.io.fasta as fasta from biotite.database import RequestError -from ..util import cannot_connect_to - +from tests.util import cannot_connect_to UNIPROT_URL = "https://www.uniprot.org/" -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) -@pytest.mark.parametrize( - "as_file_like", - itertools.product([False, True]) -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") +@pytest.mark.parametrize("as_file_like", itertools.product([False, True])) def test_fetch(as_file_like): path = None if as_file_like else tempfile.gettempdir() # UniProtKB - file = uniprot.fetch( - "P12345", "fasta", path, overwrite=True - ) + file = uniprot.fetch("P12345", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 430 # UniRef - file = uniprot.fetch( - "UniRef90_P99999", "fasta", path, overwrite=True - ) + file = uniprot.fetch("UniRef90_P99999", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 105 # UniParc - file = uniprot.fetch( - "UPI000000001F", "fasta", path, overwrite=True - ) + file = uniprot.fetch("UPI000000001F", "fasta", path, overwrite=True) fasta_file = fasta.FastaFile.read(file) prot_seq = fasta.get_sequence(fasta_file) assert len(prot_seq) == 551 -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") @pytest.mark.parametrize("format", ["fasta", "gff", "txt", "xml", "rdf", "tab"]) def test_fetch_invalid(format): with pytest.raises(RequestError): - file = uniprot.fetch( - "xxxx", format, tempfile.gettempdir(), overwrite=True - ) + uniprot.fetch("xxxx", format, tempfile.gettempdir(), overwrite=True) -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") def test_search_simple(): query = uniprot.SimpleQuery("accession", "P12345") - assert uniprot.search(query) \ - == ['P12345'] + assert uniprot.search(query) == ["P12345"] -@pytest.mark.skipif( - cannot_connect_to(UNIPROT_URL), - reason="UniProt is not available" -) +@pytest.mark.skipif(cannot_connect_to(UNIPROT_URL), reason="UniProt is not available") def test_search_composite(): - query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery("reviewed", "true") - assert uniprot.search(query) \ - == ['P12345'] - + query = uniprot.SimpleQuery("accession", "P12345") & uniprot.SimpleQuery( + "reviewed", "true" + ) + assert uniprot.search(query) == ["P12345"] diff --git a/tests/sequence/align/util.py b/tests/sequence/align/conftest.py similarity index 91% rename from tests/sequence/align/util.py rename to tests/sequence/align/conftest.py index 191fbde6f..3320f4255 100644 --- a/tests/sequence/align/util.py +++ b/tests/sequence/align/conftest.py @@ -6,7 +6,7 @@ import pytest import biotite.sequence as seq import biotite.sequence.io.fasta as fasta -from ...util import data_dir +from tests.util import data_dir @pytest.fixture @@ -15,4 +15,4 @@ def sequences(): 10 Cas9 sequences. """ fasta_file = fasta.FastaFile.read(join(data_dir("sequence"), "cas9.fasta")) - return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()] \ No newline at end of file + return [seq.ProteinSequence(sequence) for sequence in fasta_file.values()] diff --git a/tests/sequence/align/test_alignment.py b/tests/sequence/align/test_alignment.py index a56cee3c8..971aecb73 100644 --- a/tests/sequence/align/test_alignment.py +++ b/tests/sequence/align/test_alignment.py @@ -6,8 +6,6 @@ import pytest import biotite.sequence as seq import biotite.sequence.align as align -from .util import sequences - def test_alignment_str(): @@ -16,12 +14,15 @@ def test_alignment_str(): """ seq1 = seq.NucleotideSequence("ACCTGA") seq2 = seq.NucleotideSequence("TATGCT") - ali_str = ["A-CCTGA----", - "----T-ATGCT"] + ali_str = [ + "A-CCTGA----", + "----T-ATGCT" + ] # fmt: skip trace = align.Alignment.trace_from_strings(ali_str) alignment = align.Alignment([seq1, seq2], trace, None) assert str(alignment).split("\n") == ali_str + def test_conversion_to_symbols(): """ Test conversion of alignments to strings. @@ -30,16 +31,20 @@ def test_conversion_to_symbols(): seq_str2 = "HA--PRDDADWKLHH" seq_str3 = "HA----DDADWKLHH" seq_strings = [seq_str1, seq_str2, seq_str3] - sequences = [seq.ProteinSequence(seq_str.replace("-","")) - for seq_str in seq_strings] + sequences = [ + seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings + ] trace = align.Alignment.trace_from_strings(seq_strings) alignment = align.Alignment(sequences, trace, score=None) # Test the conversion bach to strings of symbols symbols = align.get_symbols(alignment) - symbols = ["".join([sym if sym is not None else "-" for sym in sym_list]) - for sym_list in symbols] + symbols = [ + "".join([sym if sym is not None else "-" for sym in sym_list]) + for sym_list in symbols + ] assert symbols == seq_strings + def test_identity(): """ Test correct calculation of `get_sequence_identity()` via a known @@ -48,16 +53,18 @@ def test_identity(): seq_str1 = "--HAKLPRDD--WL--" seq_str2 = "FRHA--QRTDADWLHH" seq_strings = [seq_str1, seq_str2] - sequences = [seq.ProteinSequence(seq_str.replace("-","")) - for seq_str in seq_strings] + sequences = [ + seq.ProteinSequence(seq_str.replace("-", "")) for seq_str in seq_strings + ] trace = align.Alignment.trace_from_strings(seq_strings) alignment = align.Alignment(sequences, trace, score=None) # Assert correct sequence identity calculation modes = ["all", "not_terminal", "shortest"] - values = [6/16, 6/12, 6/10] + values = [6 / 16, 6 / 12, 6 / 10] for mode, value in zip(modes, values): assert align.get_sequence_identity(alignment, mode=mode) == value + @pytest.mark.parametrize("mode", ["all", "not_terminal", "shortest"]) def test_pairwise_identity(sequences, mode): """ @@ -66,19 +73,18 @@ def test_pairwise_identity(sequences, mode): """ sequences = sequences msa, _, _, _ = align.align_multiple( - sequences, - matrix=align.SubstitutionMatrix.std_protein_matrix() + sequences, matrix=align.SubstitutionMatrix.std_protein_matrix() ) - + ref_identity_matrix = np.zeros((len(sequences), len(sequences))) for i in range(len(sequences)): for j in range(len(sequences)): - ref_identity_matrix[i,j] = align.get_sequence_identity( - msa[:, [i,j]], mode=mode + ref_identity_matrix[i, j] = align.get_sequence_identity( + msa[:, [i, j]], mode=mode ) - + test_identity_matrix = align.get_pairwise_sequence_identity(msa, mode=mode) - + # Identity of two equal sequences should be 1, if only the length of # the sequence is counted if mode == "shortest": @@ -88,4 +94,4 @@ def test_pairwise_identity(sequences, mode): # Identity matrix is symmetric assert (test_identity_matrix == test_identity_matrix.T).all() # Pairwise identity must be equal in the two functions - assert (test_identity_matrix == ref_identity_matrix).all() \ No newline at end of file + assert (test_identity_matrix == ref_identity_matrix).all() diff --git a/tests/sequence/align/test_banded.py b/tests/sequence/align/test_banded.py index 351139925..85e297dcb 100644 --- a/tests/sequence/align/test_banded.py +++ b/tests/sequence/align/test_banded.py @@ -3,19 +3,16 @@ # information. import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align -from .util import sequences @pytest.mark.parametrize( - "gap_penalty, local, band_width", itertools.product( - [-10, (-10,-1)], - [False, True], - [2, 5, 20, 100] -)) + "gap_penalty, local, band_width", + itertools.product([-10, (-10, -1)], [False, True], [2, 5, 20, 100]), +) def test_simple_alignment(gap_penalty, local, band_width): """ Test `align_banded()` by comparing the output to `align_optimal()`. @@ -28,16 +25,19 @@ def test_simple_alignment(gap_penalty, local, band_width): matrix = align.SubstitutionMatrix.std_protein_matrix() ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=local, terminal_penalty=False + seq1, seq2, matrix, gap_penalty=gap_penalty, local=local, terminal_penalty=False ) # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments] - + test_alignments = align.align_banded( - seq1, seq2, matrix, (-band_width, band_width), - gap_penalty=gap_penalty, local=local + seq1, + seq2, + matrix, + (-band_width, band_width), + gap_penalty=gap_penalty, + local=local, ) assert len(test_alignments) == len(ref_alignments) @@ -46,11 +46,13 @@ def test_simple_alignment(gap_penalty, local, band_width): @pytest.mark.parametrize( - "gap_penalty, local, seq_indices", itertools.product( - [-10, (-10,-1)], - [False, True], - [(i,j) for i in range(10) for j in range(i+1)] -)) + "gap_penalty, local, seq_indices", + itertools.product( + [-10, (-10, -1)], + [False, True], + [(i, j) for i in range(10) for j in range(i + 1)], + ), +) def test_complex_alignment(sequences, gap_penalty, local, seq_indices): """ Test `align_banded()` by comparing the output to `align_optimal()`. @@ -59,28 +61,37 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): can return the optimal alignment(s). """ MAX_NUMBER = 100 - + matrix = align.SubstitutionMatrix.std_protein_matrix() index1, index2 = seq_indices seq1 = sequences[index1] seq2 = sequences[index2] ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=local, terminal_penalty=False, - max_number=MAX_NUMBER + seq1, + seq2, + matrix, + gap_penalty=gap_penalty, + local=local, + terminal_penalty=False, + max_number=MAX_NUMBER, ) # Remove terminal gaps in reference to obtain a true semi-global # alignment, as returned by align_banded() ref_alignments = [align.remove_terminal_gaps(al) for al in ref_alignments] - + identity = align.get_sequence_identity(ref_alignments[0]) # Use a relatively small band width, if the sequences are similar, # otherwise use the entire search space band_width = 100 if identity > 0.5 else len(seq1) + len(seq2) test_alignments = align.align_banded( - seq1, seq2, matrix, (-band_width, band_width), - gap_penalty=gap_penalty, local=local, max_number=MAX_NUMBER + seq1, + seq2, + matrix, + (-band_width, band_width), + gap_penalty=gap_penalty, + local=local, + max_number=MAX_NUMBER, ) try: @@ -103,18 +114,16 @@ def test_complex_alignment(sequences, gap_penalty, local, seq_indices): @pytest.mark.parametrize( - "length, excerpt_length, seed", itertools.product( - [1_000, 1_000_000], - [50, 500], - range(10) -)) + "length, excerpt_length, seed", + itertools.product([1_000, 1_000_000], [50, 500], range(10)), +) def test_large_sequence_mapping(length, excerpt_length, seed): """ Test whether an excerpt of a very large sequence is aligned to that sequence at the position, where the excerpt was taken from. """ BAND_WIDTH = 100 - + np.random.seed(seed) sequence = seq.NucleotideSequence() @@ -122,51 +131,37 @@ def test_large_sequence_mapping(length, excerpt_length, seed): excerpt_pos = np.random.randint(len(sequence) - excerpt_length) excerpt = sequence[excerpt_pos : excerpt_pos + excerpt_length] - diagonal = np.random.randint( - excerpt_pos - BAND_WIDTH, - excerpt_pos + BAND_WIDTH - ) - band = ( - diagonal - BAND_WIDTH, - diagonal + BAND_WIDTH - ) + diagonal = np.random.randint(excerpt_pos - BAND_WIDTH, excerpt_pos + BAND_WIDTH) + band = (diagonal - BAND_WIDTH, diagonal + BAND_WIDTH) print(band) print(len(sequence), len(excerpt)) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() - test_alignments = align.align_banded( - excerpt, sequence, matrix, band=band - ) + test_alignments = align.align_banded(excerpt, sequence, matrix, band=band) # The excerpt should be uniquely mappable to a single location on # the long sequence assert len(test_alignments) == 1 test_alignment = test_alignments[0] test_trace = test_alignment.trace - ref_trace = np.stack([ - np.arange(len(excerpt)), - np.arange(excerpt_pos, len(excerpt) + excerpt_pos) - ], axis=1) + ref_trace = np.stack( + [np.arange(len(excerpt)), np.arange(excerpt_pos, len(excerpt) + excerpt_pos)], + axis=1, + ) assert np.array_equal(test_trace, ref_trace) - @pytest.mark.parametrize( - "gap_penalty, local, seed", itertools.product( - [-10, (-10, -1)], - [False, True], - range(100) -)) + "gap_penalty, local, seed", + itertools.product([-10, (-10, -1)], [False, True], range(100)), +) def test_swapping(gap_penalty, local, seed): """ Check if `align_banded()` returns a 'swapped' alignment, if the order of input sequences is swapped. """ np.random.seed(seed) - band = ( - np.random.randint(-30, -10), - np.random.randint( 10, 30) - ) + band = (np.random.randint(-30, -10), np.random.randint(10, 30)) seq1, seq2 = _create_random_pair(seed) matrix = align.SubstitutionMatrix.std_protein_matrix() @@ -178,7 +173,7 @@ def test_swapping(gap_penalty, local, seed): seq2, seq1, matrix, band=band, local=local, gap_penalty=gap_penalty ) - if len(ref_alignments) != 1 or len(test_alignments) != 1: + if len(ref_alignments) != 1 or len(test_alignments) != 1: # If multiple optimal alignments exist, # it is not easy to assign a swapped one to an original one # therefore, simply return in this case @@ -187,16 +182,20 @@ def test_swapping(gap_penalty, local, seed): return ref_alignment = ref_alignments[0] test_alignment = test_alignments[0] - + assert test_alignment.sequences[0] == ref_alignment.sequences[1] assert test_alignment.sequences[1] == ref_alignment.sequences[0] assert np.array_equal(test_alignment.trace, ref_alignment.trace[:, ::-1]) - -def _create_random_pair(seed, length=100, max_subsitutions=5, - max_insertions=5, max_deletions=5, - max_truncations=5): +def _create_random_pair( + seed, + length=100, + max_subsitutions=5, + max_insertions=5, + max_deletions=5, + max_truncations=5, +): """ generate a pair of protein sequences. Each pair contains @@ -217,9 +216,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, subsitution_indices = np.random.choice( np.arange(len(mutant)), size=n_subsitutions, replace=False ) - subsitution_values = np.random.randint( - len(original.alphabet), size=n_subsitutions - ) + subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions) mutant.code[subsitution_indices] = subsitution_values # Random insertions @@ -227,9 +224,7 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, insertion_indices = np.random.choice( np.arange(len(mutant)), size=n_insertions, replace=False ) - insertion_values = np.random.randint( - len(original.alphabet), size=n_insertions - ) + insertion_values = np.random.randint(len(original.alphabet), size=n_insertions) mutant.code = np.insert(mutant.code, insertion_indices, insertion_values) # Random deletions @@ -241,12 +236,10 @@ def _create_random_pair(seed, length=100, max_subsitutions=5, # Truncate at both ends of original and mutant original = original[ - np.random.randint(max_truncations) : - -(1 + np.random.randint(max_truncations)) + np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations)) ] mutant = mutant[ - np.random.randint(max_truncations) : - -(1 + np.random.randint(max_truncations)) + np.random.randint(max_truncations) : -(1 + np.random.randint(max_truncations)) ] - return original, mutant \ No newline at end of file + return original, mutant diff --git a/tests/sequence/align/test_cigar.py b/tests/sequence/align/test_cigar.py index 2c4767ddc..e4ffe4b04 100644 --- a/tests/sequence/align/test_cigar.py +++ b/tests/sequence/align/test_cigar.py @@ -18,10 +18,12 @@ def _generate_cigar(seed): # Alternatingly insert matches and insertions/deletions cigar += f"{np.random.randint(1, 100)}M" op = align.CigarOp( - np.random.choice([ - align.CigarOp.INSERTION, - align.CigarOp.DELETION, - ]) + np.random.choice( + [ + align.CigarOp.INSERTION, + align.CigarOp.DELETION, + ] + ) ).to_cigar_symbol() cigar += f"{np.random.randint(1, 100)}{op}" # Alignment must end with a match @@ -34,8 +36,9 @@ def _generate_cigar(seed): return cigar -def _mutate_sequence(original, - max_subsitutions=50, max_insertions=50, max_deletions=50): +def _mutate_sequence( + original, max_subsitutions=50, max_insertions=50, max_deletions=50 +): """ Introduce random deletions, insertions and substitutions into a sequence. @@ -47,9 +50,7 @@ def _mutate_sequence(original, subsitution_indices = np.random.choice( np.arange(len(mutant)), size=n_subsitutions, replace=False ) - subsitution_values = np.random.randint( - len(original.alphabet), size=n_subsitutions - ) + subsitution_values = np.random.randint(len(original.alphabet), size=n_subsitutions) mutant.code[subsitution_indices] = subsitution_values # Random insertions @@ -57,9 +58,7 @@ def _mutate_sequence(original, insertion_indices = np.random.choice( np.arange(len(mutant)), size=n_insertions, replace=False ) - insertion_values = np.random.randint( - len(original.alphabet), size=n_insertions - ) + insertion_values = np.random.randint(len(original.alphabet), size=n_insertions) mutant.code = np.insert(mutant.code, insertion_indices, insertion_values) # Random deletions @@ -83,8 +82,8 @@ def test_cigar_conversion(cigar): # The sequences are arbitrary, only the alignment trace matters # However, they still need to be long enough for the number of CIGAR # operations - ref = seq.NucleotideSequence(["A"]*LENGTH) - seg = seq.NucleotideSequence(["A"]*LENGTH) + ref = seq.NucleotideSequence(["A"] * LENGTH) + seg = seq.NucleotideSequence(["A"] * LENGTH) alignment = align.read_alignment_from_cigar(cigar, 0, ref, seg) print(alignment) @@ -103,10 +102,9 @@ def test_cigar_conversion(cigar): [False, True], [False, True], [False, True], - ) + ), ) -def test_alignment_conversion(seed, local, distinguish_matches, - include_terminal_gaps): +def test_alignment_conversion(seed, local, distinguish_matches, include_terminal_gaps): """ Check whether an :class:`Alignment` converted into a CIGAR string and back again into an :class:`Alignment` gives the same result. @@ -114,20 +112,16 @@ def test_alignment_conversion(seed, local, distinguish_matches, REF_LENGTH = 1000 np.random.seed(seed) ref = seq.NucleotideSequence(ambiguous=False) - ref.code = np.random.randint( - 0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8 - ) + ref.code = np.random.randint(0, len(ref.alphabet), REF_LENGTH, dtype=np.uint8) excerpt_start = np.random.randint(0, 200) - excerpt_stop = np.random.randint(REF_LENGTH-200, REF_LENGTH) - seg = ref[excerpt_start: excerpt_stop] + excerpt_stop = np.random.randint(REF_LENGTH - 200, REF_LENGTH) + seg = ref[excerpt_start:excerpt_stop] seg = _mutate_sequence(seg) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() if local: - ref_ali = align.align_optimal( - ref, seg, matrix, local=True, max_number=1 - )[0] + ref_ali = align.align_optimal(ref, seg, matrix, local=True, max_number=1)[0] else: ref_ali = align.align_optimal( ref, seg, matrix, terminal_penalty=False, max_number=1 @@ -138,17 +132,15 @@ def test_alignment_conversion(seed, local, distinguish_matches, # Remove score as the compared reconstructed alignment does not # contain it either ref_ali.score = None - start_position = ref_ali.trace[0,0] + start_position = ref_ali.trace[0, 0] cigar = align.write_alignment_to_cigar( ref_ali, distinguish_matches=distinguish_matches, - include_terminal_gaps=include_terminal_gaps + include_terminal_gaps=include_terminal_gaps, ) - test_ali = align.read_alignment_from_cigar( - cigar, start_position, ref, seg - ) + test_ali = align.read_alignment_from_cigar(cigar, start_position, ref, seg) print(cigar) print("\n\n") @@ -156,4 +148,4 @@ def test_alignment_conversion(seed, local, distinguish_matches, print("\n\n") print(test_ali) print("\n\n") - assert test_ali == ref_ali \ No newline at end of file + assert test_ali == ref_ali diff --git a/tests/sequence/align/test_kmeralphabet.py b/tests/sequence/align/test_kmeralphabet.py index 1ea31a400..67b3f9b03 100644 --- a/tests/sequence/align/test_kmeralphabet.py +++ b/tests/sequence/align/test_kmeralphabet.py @@ -7,7 +7,6 @@ import biotite.sequence as seq import biotite.sequence.align as align - K = 3 @@ -15,21 +14,24 @@ def kmer_alphabet(): return align.KmerAlphabet(seq.ProteinSequence.alphabet, K) + @pytest.fixture def spaced_kmer_alphabet(): - return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0,1,2]) - + return align.KmerAlphabet(seq.ProteinSequence.alphabet, K, spacing=[0, 1, 2]) np.random.seed(0) N = 10 L = 30 + + @pytest.mark.parametrize( "ref_split_kmer_code", # Test for single instances as input - list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))) + + list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K))) + + # Test for multiple instances as input - list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K))) + list(np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, L, K))), ) def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code): """ @@ -38,15 +40,16 @@ def test_fuse_and_split(kmer_alphabet, ref_split_kmer_code): """ fused = kmer_alphabet.fuse(ref_split_kmer_code) test_split_kmer_code = kmer_alphabet.split(fused) - + assert test_split_kmer_code.tolist() == ref_split_kmer_code.tolist() np.random.seed(0) N = 10 + + @pytest.mark.parametrize( - "split_kmer_code", - np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K)) + "split_kmer_code", np.random.randint(len(seq.ProteinSequence.alphabet), size=(N, K)) ) def test_encode_and_decode(kmer_alphabet, split_kmer_code): """ @@ -58,7 +61,7 @@ def test_encode_and_decode(kmer_alphabet, split_kmer_code): ref_kmer_symbol = alph.decode_multiple(split_kmer_code) kmer_code = kmer_alphabet.encode(ref_kmer_symbol) test_kmer_symbol = kmer_alphabet.decode(kmer_code) - + assert test_kmer_symbol.tolist() == ref_kmer_symbol.tolist() @@ -86,6 +89,8 @@ def test_create_continuous_kmers(kmer_alphabet): N = 50 + + @pytest.mark.parametrize("seed", range(N)) def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed): """ @@ -99,8 +104,7 @@ def test_create_spaced_kmers(kmer_alphabet, spaced_kmer_alphabet, seed): np.random.seed(seed) sequence = seq.ProteinSequence() sequence.code = np.random.randint( - len(sequence.alphabet), - size=np.random.randint(MIN_LENGTH, MAX_LENGTH) + len(sequence.alphabet), size=np.random.randint(MIN_LENGTH, MAX_LENGTH) ) ref_kmers = kmer_alphabet.create_kmers(sequence.code) diff --git a/tests/sequence/align/test_kmersimilarity.py b/tests/sequence/align/test_kmersimilarity.py index a72aeca72..5f1fbbf44 100644 --- a/tests/sequence/align/test_kmersimilarity.py +++ b/tests/sequence/align/test_kmersimilarity.py @@ -15,22 +15,24 @@ def kmer_alphabet(): np.random.seed(0) N = 10 -@pytest.mark.parametrize("ref_kmer, threshold", zip( - np.random.randint(10000, size=N), - np.random.randint(-5, 15, size=N) -)) + + +@pytest.mark.parametrize( + "ref_kmer, threshold", + zip(np.random.randint(10000, size=N), np.random.randint(-5, 15, size=N)), +) def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold): """ Test if the similar k-mers given by :class:`ScoreThresholdRule` are equal to k-mers generated by a brute-force approach. """ matrix = align.SubstitutionMatrix.std_protein_matrix() - + ref_kmer_sequence = seq.ProteinSequence() ref_kmer_sequence.code = kmer_alphabet.split(ref_kmer) - + ref_sim_kmer_set = set() - # Iterate through all possible k-mers + # Iterate through all possible k-mers for kmer in range(len(kmer_alphabet)): kmer_sequence = seq.ProteinSequence() kmer_sequence.code = kmer_alphabet.split(kmer) @@ -40,7 +42,7 @@ def test_score_threshold_rule(kmer_alphabet, ref_kmer, threshold): # Add k-mer to list if the threshold score is reached if score >= threshold: ref_sim_kmer_set.add(kmer) - + test_rule = align.ScoreThresholdRule(matrix, threshold) test_sim_kmer_set = set(test_rule.similar_kmers(kmer_alphabet, ref_kmer)) @@ -68,4 +70,4 @@ def test_invalid_kmer(kmer_alphabet, invalid_kmer): align.SubstitutionMatrix.std_protein_matrix(), 0 ) with pytest.raises(seq.AlphabetError): - test_rule.similar_kmers(kmer_alphabet, invalid_kmer) \ No newline at end of file + test_rule.similar_kmers(kmer_alphabet, invalid_kmer) diff --git a/tests/sequence/align/test_kmertable.py b/tests/sequence/align/test_kmertable.py index 64439bd27..deb4b1923 100644 --- a/tests/sequence/align/test_kmertable.py +++ b/tests/sequence/align/test_kmertable.py @@ -4,9 +4,8 @@ import functools import itertools -import string import pickle -from typing import Any +import string import numpy as np import pytest import biotite.sequence as seq @@ -27,9 +26,7 @@ def __init__(self, n_buckets): def __getattr__(self, name): attr = getattr(align.BucketKmerTable, name) - if attr.__name__ in [ - "from_sequences", "from_kmers", "from_kmer_selection" - ]: + if attr.__name__ in ["from_sequences", "from_kmers", "from_kmer_selection"]: return functools.partial(attr, n_buckets=self._n_buckets) else: return attr @@ -47,10 +44,12 @@ def idfn(val): def k(): return 8 + @pytest.fixture def alphabet(): return seq.NucleotideSequence.unambiguous_alphabet() + @pytest.fixture def random_sequences(k, alphabet): N_SEQS = 10 @@ -75,10 +74,10 @@ def random_sequences(k, alphabet): # with less buckets than number of possible kmers ... FixedBucketKmerTable(1000), # ... and one test case with more buckets (perfect hashing) - FixedBucketKmerTable(1000000) - ] + FixedBucketKmerTable(1000000), + ], ), - ids = idfn + ids=idfn, ) def test_from_sequences(k, random_sequences, spacing, table_class): """ @@ -86,29 +85,23 @@ def test_from_sequences(k, random_sequences, spacing, table_class): sequence position, if the position is in the C-array of the corresponding k-mer. """ - table = table_class.from_sequences( - k, random_sequences, spacing=spacing - ) + table = table_class.from_sequences(k, random_sequences, spacing=spacing) kmer_alph = align.KmerAlphabet(random_sequences[0].alphabet, k, spacing) assert kmer_alph == table.kmer_alphabet for i, sequence in enumerate(random_sequences): for j in range(kmer_alph.kmer_array_length(len(sequence))): if spacing is None: - kmer = kmer_alph.fuse(sequence.code[j : j+k]) + kmer = kmer_alph.fuse(sequence.code[j : j + k]) else: kmer = kmer_alph.fuse(sequence.code[kmer_alph.spacing + j]) - assert np.array([i,j]) in table[kmer] + assert np.array([i, j]) in table[kmer] @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_kmers(k, random_sequences, table_class): """ @@ -128,12 +121,8 @@ def test_from_kmers(k, random_sequences, table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_kmer_selection(k, alphabet, random_sequences, table_class): """ @@ -149,8 +138,7 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): ] np.random.seed(0) filtered_pos_arrays = [ - np.random.randint(len(kmers), size=N_POSITIONS) - for kmers in kmer_arrays + np.random.randint(len(kmers), size=N_POSITIONS) for kmers in kmer_arrays ] filtered_kmer_arrays = [ kmers[filtered_pos] @@ -162,8 +150,9 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): # The total number of k-mers in the table # should be the total number of input k-mers - assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) \ - == np.sum([len(kmers) for kmers in filtered_kmer_arrays]) + assert np.sum(kmer_table.count(np.arange(len(kmer_alph)))) == np.sum( + [len(kmers) for kmers in filtered_kmer_arrays] + ) # Each k-mer in the table should be found # in the original k-mer sequences for kmer in range(len(kmer_alph)): @@ -173,12 +162,8 @@ def test_from_kmer_selection(k, alphabet, random_sequences, table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_from_tables(k, random_sequences, table_class): """ @@ -205,10 +190,8 @@ def test_from_positions(k, random_sequences): """ ref_table = align.KmerTable.from_sequences(k, random_sequences) - kmer_dict = {kmer : ref_table[kmer] for kmer in range(len(ref_table))} - test_table = align.KmerTable.from_positions( - ref_table.kmer_alphabet, kmer_dict - ) + kmer_dict = {kmer: ref_table[kmer] for kmer in range(len(ref_table))} + test_table = align.KmerTable.from_positions(ref_table.kmer_alphabet, kmer_dict) assert test_table == ref_table @@ -216,14 +199,10 @@ def test_from_positions(k, random_sequences): @pytest.mark.parametrize( "table_class, use_similarity_rule", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(10000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(10000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def test_match_table(table_class, use_similarity_rule): """ @@ -233,8 +212,7 @@ def test_match_table(table_class, use_similarity_rule): chosen to yield only the same k-mer as similar k-mer. """ alphabet = seq.LetterAlphabet(string.ascii_lowercase + "_") - phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" \ - "chuck_wood" + phrase1 = "how_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_" "chuck_wood" phrase2 = "woodchuck" sequence1 = seq.GeneralSequence(alphabet, phrase1) sequence2 = seq.GeneralSequence(alphabet, phrase2) @@ -244,30 +222,32 @@ def test_match_table(table_class, use_similarity_rule): table1 = table_class.from_sequences(4, [sequence1]) table2 = table_class.from_sequences(4, [sequence2]) - ref_matches = set([ - (0, 9), - (0, 22), - (1, 23), - (2, 24), - (3, 25), - (4, 26), - (5, 27), - (4, 32), - (5, 33), - (0, 43), - (1, 44), - (2, 45), - (3, 46), - (4, 47), - (5, 48), - (4, 59), - (5, 60), - (0, 65), - ]) + ref_matches = set( + [ + (0, 9), + (0, 22), + (1, 23), + (2, 24), + (3, 25), + (4, 26), + (5, 27), + (4, 32), + (5, 33), + (0, 43), + (1, 44), + (2, 45), + (3, 46), + (4, 47), + (5, 48), + (4, 59), + (5, 60), + (0, 65), + ] + ) test_matches = table1.match_table(table2, similarity_rule=rule) # the reference indices are irrelevant for this test - test_matches = test_matches[:, [1,3]] + test_matches = test_matches[:, [1, 3]] test_matches = set([tuple(match) for match in test_matches]) assert test_matches == ref_matches @@ -275,14 +255,10 @@ def test_match_table(table_class, use_similarity_rule): @pytest.mark.parametrize( "table_class, use_similarity_rule", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def test_match(k, random_sequences, table_class, use_similarity_rule): """ @@ -301,12 +277,8 @@ def test_match(k, random_sequences, table_class, use_similarity_rule): for i, kmer in enumerate(kmers): matches = table[kmer] matches = np.stack( - [ - np.full(len(matches), i, dtype=np.uint32), - matches[:,0], - matches[:,1] - ], - axis=1 + [np.full(len(matches), i, dtype=np.uint32), matches[:, 0], matches[:, 1]], + axis=1, ) ref_matches.append(matches) ref_matches = np.concatenate(ref_matches) @@ -319,12 +291,8 @@ def test_match(k, random_sequences, table_class, use_similarity_rule): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_match_kmer_selection(k, random_sequences, table_class): """ @@ -344,12 +312,8 @@ def test_match_kmer_selection(k, random_sequences, table_class): kmer = kmers[pos] matches = table[kmer] matches = np.stack( - [ - np.full(len(matches), pos, dtype=np.uint32), - matches[:,0], - matches[:,1] - ], - axis=1 + [np.full(len(matches), pos, dtype=np.uint32), matches[:, 0], matches[:, 1]], + axis=1, ) ref_matches.append(matches) ref_matches = np.concatenate(ref_matches) @@ -362,14 +326,10 @@ def test_match_kmer_selection(k, random_sequences, table_class): @pytest.mark.parametrize( "table_class, use_mask", itertools.product( - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - [False, True] + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + [False, True], ), - ids = idfn + ids=idfn, ) def test_match_equivalence(k, random_sequences, table_class, use_mask): """ @@ -391,27 +351,22 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask): query_mask = removal_masks[0] table_masks = removal_masks[1:] - table = table_class.from_sequences( - k, table_sequences, ignore_masks=table_masks - ) + table = table_class.from_sequences(k, table_sequences, ignore_masks=table_masks) # 42 -> Dummy value that is distinct from all reference indices ref_table = table_class.from_sequences( k, [query_sequence], [42], ignore_masks=[query_mask] ) ref_matches = table.match_table(ref_table) - assert np.all(ref_matches[:,0] == 42) + assert np.all(ref_matches[:, 0] == 42) # Store matches in set to remove the order dependency # The first column is not present in the matches # returned by 'match_sequence()' -> [:, 1:] ref_matches = set([tuple(match) for match in ref_matches[:, 1:]]) - test_matches = table.match( - query_sequence, ignore_mask=query_mask - ) + test_matches = table.match(query_sequence, ignore_mask=query_mask) test_matches = set([tuple(match) for match in test_matches]) - # Check if any match is found at all assert len(ref_matches) > 0 # The first column is not present in 'test_matches' @@ -433,7 +388,7 @@ def test_match_equivalence(k, random_sequences, table_class, use_mask): ), ], ids = idfn -) +) # fmt: skip def test_masking(k, input_mask, ref_output_mask): """ Explicitly test the conversion of removal masks to k-mer masks @@ -446,9 +401,7 @@ def test_masking(k, input_mask, ref_output_mask): sequence = seq.NucleotideSequence() sequence.code = np.zeros(len(input_mask)) - table = align.KmerTable.from_sequences( - k, [sequence], ignore_masks=[input_mask] - ) + table = align.KmerTable.from_sequences(k, [sequence], ignore_masks=[input_mask]) # Get the k-mer positions that were masked test_output_mask = np.zeros(len(ref_output_mask), dtype=bool) @@ -467,7 +420,7 @@ def test_masking(k, input_mask, ref_output_mask): (FixedBucketKmerTable(1000), True), (FixedBucketKmerTable(1000000), True), ], - ids = idfn + ids=idfn, ) def test_count(k, random_sequences, table_class, selected_kmers): """ @@ -476,9 +429,7 @@ def test_count(k, random_sequences, table_class, selected_kmers): """ N_KMERS = 100 - table = table_class.from_sequences( - k, random_sequences - ) + table = table_class.from_sequences(k, random_sequences) if selected_kmers: np.random.seed(0) @@ -486,9 +437,7 @@ def test_count(k, random_sequences, table_class, selected_kmers): ref_counts = [len(table[kmer]) for kmer in kmers] test_counts = table.count(kmers) else: - ref_counts = [ - len(table[kmer]) for kmer in range(len(table.kmer_alphabet)) - ] + ref_counts = [len(table[kmer]) for kmer in range(len(table.kmer_alphabet))] test_counts = table.count() assert test_counts.tolist() == ref_counts @@ -496,12 +445,8 @@ def test_count(k, random_sequences, table_class, selected_kmers): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_get_kmers(table_class): """ @@ -511,10 +456,7 @@ def test_get_kmers(table_class): """ np.random.seed(0) - kmer_alphabet = align.KmerAlphabet( - seq.NucleotideSequence.unambiguous_alphabet(), - 8 - ) + kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.unambiguous_alphabet(), 8) ref_mask = np.random.choice([False, True], size=len(kmer_alphabet)) ref_kmers = np.where(ref_mask)[0] table = table_class.from_kmers(kmer_alphabet, [ref_kmers]) @@ -526,12 +468,8 @@ def test_get_kmers(table_class): @pytest.mark.parametrize( "table_class", - [ - align.KmerTable, - FixedBucketKmerTable(1000), - FixedBucketKmerTable(1000000) - ], - ids = idfn + [align.KmerTable, FixedBucketKmerTable(1000), FixedBucketKmerTable(1000000)], + ids=idfn, ) def test_pickle(k, random_sequences, table_class): """ @@ -548,10 +486,7 @@ def test_pickle(k, random_sequences, table_class): @pytest.mark.parametrize( "n_kmers, load_factor", - itertools.product( - [1_000, 100_000, 10_000_000, 1_000_000_000], - [0.2, 1.0, 2.0] - ) + itertools.product([1_000, 100_000, 10_000_000, 1_000_000_000], [0.2, 1.0, 2.0]), ) def test_bucket_number(n_kmers, load_factor): """ @@ -563,7 +498,6 @@ def test_bucket_number(n_kmers, load_factor): min_n_buckets = int(n_kmers / load_factor) test_n_buckets = align.bucket_number(n_kmers, load_factor) - assert test_n_buckets >= min_n_buckets assert test_n_buckets <= min_n_buckets * 1.05 @@ -573,4 +507,4 @@ def _identity_rule(alphabet): np.fill_diagonal(score_matrix, 0) matrix = align.SubstitutionMatrix(alphabet, alphabet, score_matrix) rule = align.ScoreThresholdRule(matrix, 0) - return rule \ No newline at end of file + return rule diff --git a/tests/sequence/align/test_localgapped.py b/tests/sequence/align/test_localgapped.py index 7fbe19f48..714004118 100644 --- a/tests/sequence/align/test_localgapped.py +++ b/tests/sequence/align/test_localgapped.py @@ -3,25 +3,23 @@ # information. import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align -from .util import sequences @pytest.mark.parametrize( "gap_penalty, seed, threshold, direction, score_only", itertools.product( - [-10, (-10,-1)], + [-10, (-10, -1)], [(0, 0), (11, 11), (20, 19), (30, 29)], [20, 100, 500], - ["both", "upstream","downstream"], - [False, True] - ) + ["both", "upstream", "downstream"], + [False, True], + ), ) -def test_simple_alignment(gap_penalty, seed, threshold, - direction, score_only): +def test_simple_alignment(gap_penalty, seed, threshold, direction, score_only): """ Test `align_local_gapped()` by comparing the output to `align_optimal()`. @@ -34,22 +32,20 @@ def test_simple_alignment(gap_penalty, seed, threshold, matrix = align.SubstitutionMatrix.std_protein_matrix() ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=True + seq1, seq2, matrix, gap_penalty=gap_penalty, local=True ) # Limit reference alignment range to seed # if the alignment does not extend in both directions for alignment in ref_alignments: - seed_index = np.where(alignment.trace[:,0] == seed[0])[0][0] + seed_index = np.where(alignment.trace[:, 0] == seed[0])[0][0] if direction == "upstream": - alignment.trace = alignment.trace[:seed_index + 1] + alignment.trace = alignment.trace[: seed_index + 1] elif direction == "downstream": alignment.trace = alignment.trace[seed_index:] alignment.score = align.score(alignment, matrix, gap_penalty) - + test_result = align.align_local_gapped( - seq1, seq2, matrix, seed, threshold, gap_penalty, - 1000, direction, score_only + seq1, seq2, matrix, seed, threshold, gap_penalty, 1000, direction, score_only ) if score_only: @@ -66,13 +62,12 @@ def test_simple_alignment(gap_penalty, seed, threshold, @pytest.mark.parametrize( "gap_penalty, score_only, seq_indices", itertools.product( - [-10, (-10,-1)], + [-10, (-10, -1)], [False, True], - [(i,j) for i in range(10) for j in range(i+1)] - ) + [(i, j) for i in range(10) for j in range(i + 1)], + ), ) -def test_complex_alignment(sequences, gap_penalty, score_only, - seq_indices): +def test_complex_alignment(sequences, gap_penalty, score_only, seq_indices): """ Test `align_local_gapped()` by comparing the output to `align_optimal()`. @@ -84,24 +79,22 @@ def test_complex_alignment(sequences, gap_penalty, score_only, # The linear gap penalty for longer gaps easily exceeds # a small threshold -> increase threshold for linear penalty THRESHOLD = 200 if isinstance(gap_penalty, int) else 50 - + matrix = align.SubstitutionMatrix.std_protein_matrix() index1, index2 = seq_indices seq1 = sequences[index1] seq2 = sequences[index2] ref_alignments = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER + seq1, seq2, matrix, gap_penalty=gap_penalty, local=True, max_number=MAX_NUMBER ) # Select the center of the alignment as seed trace = ref_alignments[0].trace trace = trace[(trace != -1).all(axis=1)] seed = trace[len(trace) // 2] - + test_result = align.align_local_gapped( - seq1, seq2, matrix, seed, THRESHOLD, gap_penalty, - MAX_NUMBER, "both", score_only + seq1, seq2, matrix, seed, THRESHOLD, gap_penalty, MAX_NUMBER, "both", score_only ) if score_only: @@ -113,30 +106,29 @@ def test_complex_alignment(sequences, gap_penalty, score_only, test_alignments = test_result assert test_alignments[0].score == ref_alignments[0].score # Test if the score is also correctly calculated - assert align.score(test_alignments[0], matrix, gap_penalty) \ + assert ( + align.score(test_alignments[0], matrix, gap_penalty) == ref_alignments[0].score - if len(ref_alignments) < MAX_NUMBER \ - and len(test_alignments) < MAX_NUMBER: - # Only test if the exact same alignments were created, - # if the number of traces was not limited by MAX_NUMBER - for i, alignment in enumerate(test_alignments): - try: - assert alignment in ref_alignments - except AssertionError: - # Edge case: - # In rare case the local alignment may be - # slightly longer on the upstream side for - # 'align_local_ungapped()', since the - # upstream side is handled in an inverted - # manner - # However this does not effect the score - # Consequently, the exception is ignored - # if the alignment is longer than all - # reference alignments - if len(alignment) <= max( - [len(ali) for ali in ref_alignments] - ): - raise + ) + if len(ref_alignments) < MAX_NUMBER and len(test_alignments) < MAX_NUMBER: + # Only test if the exact same alignments were created, + # if the number of traces was not limited by MAX_NUMBER + for i, alignment in enumerate(test_alignments): + try: + assert alignment in ref_alignments + except AssertionError: + # Edge case: + # In rare case the local alignment may be + # slightly longer on the upstream side for + # 'align_local_ungapped()', since the + # upstream side is handled in an inverted + # manner + # However this does not effect the score + # Consequently, the exception is ignored + # if the alignment is longer than all + # reference alignments + if len(alignment) <= max([len(ali) for ali in ref_alignments]): + raise except AssertionError: print(f"Missing test alignment at index {i}:") print() @@ -151,11 +143,11 @@ def test_complex_alignment(sequences, gap_penalty, score_only, @pytest.mark.parametrize( "gap_penalty, direction, score_only, should_raise", itertools.product( - [-10, (-10,-1)], - ["both", "upstream","downstream"], + [-10, (-10, -1)], + ["both", "upstream", "downstream"], [False, True], - [False, True] - ) + [False, True], + ), ) def test_max_table_size(gap_penalty, direction, score_only, should_raise): """ @@ -171,7 +163,7 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise): max_table_size = 1_000_000_000 # Align a long random sequence to itself, - # effectively resulting in a global alignment + # effectively resulting in a global alignment np.random.seed(0) seq1 = seq.NucleotideSequence() seq1.code = np.random.randint(len(seq1.alphabet), size=10000) @@ -184,15 +176,31 @@ def test_max_table_size(gap_penalty, direction, score_only, should_raise): if should_raise: with pytest.raises(MemoryError): align.align_local_gapped( - seq1, seq1, matrix, seed, threshold, gap_penalty, 1, - direction, score_only, max_table_size + seq1, + seq1, + matrix, + seed, + threshold, + gap_penalty, + 1, + direction, + score_only, + max_table_size, ) else: result = align.align_local_gapped( - seq1, seq1, matrix, seed, threshold, gap_penalty, 1, - direction, score_only, max_table_size + seq1, + seq1, + matrix, + seed, + threshold, + gap_penalty, + 1, + direction, + score_only, + max_table_size, ) if not score_only and direction == "both": alignment = result[0] # Expect that no gaps are introduced - assert len(alignment) == len(seq1) \ No newline at end of file + assert len(alignment) == len(seq1) diff --git a/tests/sequence/align/test_localungapped.py b/tests/sequence/align/test_localungapped.py index b3f24dc59..11105a11a 100644 --- a/tests/sequence/align/test_localungapped.py +++ b/tests/sequence/align/test_localungapped.py @@ -66,15 +66,24 @@ ], [["both"], ["upstream"], ["downstream"]], # direction - + [[False], [True]], # score_only [[False], [True]], # uint8_code )] -) -def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, - ref_range1, ref_range2, - direction, score_only, uint8_code): +) # fmt: skip +def test_simple_alignments( + seq_type, + seq1, + seq2, + seed, + threshold, + ref_range1, + ref_range2, + direction, + score_only, + uint8_code, +): """ Check if `algin_local_ungapped()` produces correct alignments based on simple known examples. @@ -90,29 +99,26 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, seq1 = seq_type(seq1) seq2 = seq_type(seq2) - + if seq_type == seq.NucleotideSequence: matrix = align.SubstitutionMatrix.std_nucleotide_matrix() else: matrix = align.SubstitutionMatrix.std_protein_matrix() - + if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) - ref_alignment = align.Alignment( [seq1, seq2], - np.stack([ - np.arange(*ref_range1), - np.arange(*ref_range2) - ], axis=-1) + np.stack([np.arange(*ref_range1), np.arange(*ref_range2)], axis=-1), ) ref_score = align.score(ref_alignment, matrix) ref_alignment.score = ref_score test_result = align.align_local_ungapped( - seq1, seq2, matrix, seed, threshold, direction, score_only) - + seq1, seq2, matrix, seed, threshold, direction, score_only + ) + if score_only: assert test_result == ref_score else: @@ -120,10 +126,7 @@ def test_simple_alignments(seq_type, seq1, seq2, seed, threshold, @pytest.mark.parametrize( - "seed, uint8_code", itertools.product( - range(100), - [False, True] - ) + "seed, uint8_code", itertools.product(range(100), [False, True]) ) def test_random_alignment(seed, uint8_code): """ @@ -141,29 +144,26 @@ def test_random_alignment(seed, uint8_code): CONSERVED_ENDS = 5 MUTATION_PROB = 0.1 THRESHOLD = 100 - + np.random.seed(seed) # Create conserved regions conserved1 = ProteinSequence() - conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE+1) + conserved_len = np.random.randint(MIN_CONSERVED_SIZE, MAX_CONSERVED_SIZE + 1) conserved1.code = np.random.randint( # Do not include stop symbol for aesthetic reasons -> -1 - len(conserved1.alphabet)-1, - size=conserved_len + len(conserved1.alphabet) - 1, + size=conserved_len, ) conserved2 = ProteinSequence() # The second conserved regions is equal to the first one, # except a few point mutations conserved2.code = conserved1.code.copy() mutation_mask = np.random.choice( - [False, True], - size=conserved_len, - p = [1 - MUTATION_PROB, MUTATION_PROB] + [False, True], size=conserved_len, p=[1 - MUTATION_PROB, MUTATION_PROB] ) conserved2.code[mutation_mask] = np.random.randint( - len(conserved2.alphabet)-1, - size=np.count_nonzero(mutation_mask) + len(conserved2.alphabet) - 1, size=np.count_nonzero(mutation_mask) ) # Flank the conserved regions with equal termini to ensure # that the alignment extends from start to end of the region @@ -174,36 +174,33 @@ def test_random_alignment(seed, uint8_code): seq1 = ProteinSequence() seq2 = ProteinSequence() offset = [] - for sequence, conserved in zip( - (seq1, seq2), (conserved1, conserved2) - ): + for sequence, conserved in zip((seq1, seq2), (conserved1, conserved2)): sequence.code = np.random.randint( - len(sequence.alphabet)-1, - size=np.random.randint(MIN_SIZE, MAX_SIZE+1) + len(sequence.alphabet) - 1, size=np.random.randint(MIN_SIZE, MAX_SIZE + 1) ) # Place conserved region randomly within the sequence conserved_pos = np.random.randint(0, len(sequence) - len(conserved)) - sequence.code[conserved_pos : conserved_pos + len(conserved)] \ - = conserved.code + sequence.code[conserved_pos : conserved_pos + len(conserved)] = conserved.code offset.append(conserved_pos) # The seed is placed somewhere in the conserved region seed = np.array(offset) + np.random.randint(len(conserved)) - matrix = align.SubstitutionMatrix.std_protein_matrix() if not uint8_code: seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix) - + ref_score = align.align_optimal( - seq1, seq2, matrix, local=True, max_number=1, - # High gap penalty to prevent introduction of gaps, + seq1, + seq2, + matrix, + local=True, + max_number=1, + # High gap penalty to prevent introduction of gaps, # since 'align_local_ungapped()' is also no able to place gaps - gap_penalty=-1000 + gap_penalty=-1000, )[0].score - test_alignment = align.align_local_ungapped( - seq1, seq2, matrix, seed, THRESHOLD - ) + test_alignment = align.align_local_ungapped(seq1, seq2, matrix, seed, THRESHOLD) assert test_alignment.score == ref_score # Test if the score is also correctly calculated @@ -211,23 +208,23 @@ def test_random_alignment(seed, uint8_code): def _convert_to_uint16_code(seq1, seq2, matrix): - """ - Adjust sequences, so that they use 'uint16' as dtype for the - code. - This is a necessary test, since 'uint8' uses a separate - implementation. - """ - new_alph = seq.Alphabet(np.arange(500)) - code = seq1.code - seq1 = seq.GeneralSequence(new_alph) - seq1.code = code - code = seq2.code - seq2 = seq.GeneralSequence(new_alph) - seq2.code = code - # Adjust the substitution matrix as well, - # so that it is compatible with the new alphabet - score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32) - orig_len = len(matrix.score_matrix()) - score_matrix[:orig_len, :orig_len] = matrix.score_matrix() - matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix) - return seq1, seq2, matrix \ No newline at end of file + """ + Adjust sequences, so that they use 'uint16' as dtype for the + code. + This is a necessary test, since 'uint8' uses a separate + implementation. + """ + new_alph = seq.Alphabet(np.arange(500)) + code = seq1.code + seq1 = seq.GeneralSequence(new_alph) + seq1.code = code + code = seq2.code + seq2 = seq.GeneralSequence(new_alph) + seq2.code = code + # Adjust the substitution matrix as well, + # so that it is compatible with the new alphabet + score_matrix = np.zeros((len(new_alph), len(new_alph)), dtype=np.int32) + orig_len = len(matrix.score_matrix()) + score_matrix[:orig_len, :orig_len] = matrix.score_matrix() + matrix = align.SubstitutionMatrix(new_alph, new_alph, score_matrix) + return seq1, seq2, matrix diff --git a/tests/sequence/align/test_matrix.py b/tests/sequence/align/test_matrix.py index 79763213f..570878945 100644 --- a/tests/sequence/align/test_matrix.py +++ b/tests/sequence/align/test_matrix.py @@ -8,16 +8,22 @@ import biotite.sequence.align as align -@pytest.mark.parametrize("db_entry", [entry for entry - in align.SubstitutionMatrix.list_db() - if entry not in ["NUC","GONNET"]]) +@pytest.mark.parametrize( + "db_entry", + [ + entry + for entry in align.SubstitutionMatrix.list_db() + if entry not in ["NUC", "GONNET"] + ], +) def test_matrices(db_entry): """ Test for exceptions when reading matrix files. """ alph1 = seq.ProteinSequence.alphabet alph2 = seq.ProteinSequence.alphabet - matrix = align.SubstitutionMatrix(alph1, alph2, db_entry) + align.SubstitutionMatrix(alph1, alph2, db_entry) + def test_matrix_str(): """ @@ -26,11 +32,11 @@ def test_matrix_str(): """ alph1 = seq.Alphabet("abc") alph2 = seq.Alphabet("def") - score_matrix = np.arange(9).reshape((3,3)) + score_matrix = np.arange(9).reshape((3, 3)) matrix = align.SubstitutionMatrix(alph1, alph2, score_matrix) assert str(matrix) == "\n".join( [" d e f", "a 0 1 2", "b 3 4 5", "c 6 7 8"] - ) \ No newline at end of file + ) # fmt: skip diff --git a/tests/sequence/align/test_multiple.py b/tests/sequence/align/test_multiple.py index 3a5a470c9..47f5200c4 100644 --- a/tests/sequence/align/test_multiple.py +++ b/tests/sequence/align/test_multiple.py @@ -3,19 +3,14 @@ # information. import pytest -import biotite.sequence.align as align import biotite.application.muscle as muscle +import biotite.sequence.align as align from biotite.application import VersionError -from ...util import is_not_installed -from .util import sequences +from tests.util import is_not_installed - -@pytest.mark.skipif( - is_not_installed("muscle"), - reason="MUSCLE is not installed" -) -@pytest.mark.parametrize("gap_penalty", [-10, (-10,-1)]) +@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed") +@pytest.mark.parametrize("gap_penalty", [-10, (-10, -1)]) def test_align_multiple(sequences, gap_penalty): r""" Test `align_multiple()` function using actual long sequences, @@ -26,22 +21,18 @@ def test_align_multiple(sequences, gap_penalty): score of the MUSCLE alignment. """ matrix = align.SubstitutionMatrix.std_protein_matrix() - + test_alignment, order, tree, distances = align.align_multiple( sequences, matrix, gap_penalty=gap_penalty, terminal_penalty=True ) - test_score = align.score( - test_alignment, matrix, gap_penalty, terminal_penalty=True - ) - + test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True) + try: ref_alignment = muscle.MuscleApp.align( sequences, matrix=matrix, gap_penalty=gap_penalty ) except VersionError: - pytest.skip(f"Invalid Muscle software version") - ref_score = align.score( - ref_alignment, matrix, gap_penalty, terminal_penalty=True - ) - - assert test_score >= ref_score * 0.5 \ No newline at end of file + pytest.skip("Invalid Muscle software version") + ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True) + + assert test_score >= ref_score * 0.5 diff --git a/tests/sequence/align/test_pairwise.py b/tests/sequence/align/test_pairwise.py index 00717df15..712dfb6b8 100644 --- a/tests/sequence/align/test_pairwise.py +++ b/tests/sequence/align/test_pairwise.py @@ -5,12 +5,11 @@ import itertools import numpy as np import pytest +import biotite.application.muscle as muscle import biotite.sequence as seq import biotite.sequence.align as align -import biotite.application.muscle as muscle from biotite.application import VersionError -from ...util import is_not_installed -from .util import sequences +from tests.util import is_not_installed def test_align_ungapped(): @@ -26,32 +25,35 @@ def test_align_ungapped(): # [local, gap_penalty, input1, input2, expect] -align_cases = [(False,True, -7, "TATGGGTATCC","TATGTATAA", - ("TATGGGTATCC\nTATG--TATAA", - "TATGGGTATCC\nTAT-G-TATAA", - "TATGGGTATCC\nTAT--GTATAA",)), - (True, True, -6, "TATGGGTATCC","TATGTATAA", - ("TATGGGTAT\nTATG--TAT", - "TATGGGTAT\nTAT-G-TAT", - "TATGGGTAT\nTAT--GTAT",)), - (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", - ("TACTATGGGTATCC\nTCATATG--TATAA", - "TACTATGGGTATCC\nTCATAT--GTATAA",)), - (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", - ("TATGGGTAT\nTATG--TAT", - "TATGGGTAT\nTAT--GTAT",)), - (False,True, (-7,-1), "T","TTT", - ("T--\nTTT", - "--T\nTTT",)), - (False,True, -7, "TAAAGCGAAAT","TGCGT", - ("TAAAGCGAAAT\nT---GCG---T")), - (False,False,-7, "TAAAGCGAAAT","TGCGT", - ("TAAAGCGAAAT\n---TGCGT---")) - ] -@pytest.mark.parametrize("local, term, gap_penalty, input1, input2, expect", - align_cases) -def test_align_optimal_simple(local, term, gap_penalty, - input1, input2, expect): +align_cases = [ + (False,True, -7, "TATGGGTATCC","TATGTATAA", + ("TATGGGTATCC\nTATG--TATAA", + "TATGGGTATCC\nTAT-G-TATAA", + "TATGGGTATCC\nTAT--GTATAA",)), + (True, True, -6, "TATGGGTATCC","TATGTATAA", + ("TATGGGTAT\nTATG--TAT", + "TATGGGTAT\nTAT-G-TAT", + "TATGGGTAT\nTAT--GTAT",)), + (False,True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", + ("TACTATGGGTATCC\nTCATATG--TATAA", + "TACTATGGGTATCC\nTCATAT--GTATAA",)), + (True, True, (-7,-1), "TACTATGGGTATCC","TCATATGTATAA", + ("TATGGGTAT\nTATG--TAT", + "TATGGGTAT\nTAT--GTAT",)), + (False,True, (-7,-1), "T","TTT", + ("T--\nTTT", + "--T\nTTT",)), + (False,True, -7, "TAAAGCGAAAT","TGCGT", + ("TAAAGCGAAAT\nT---GCG---T")), + (False,False,-7, "TAAAGCGAAAT","TGCGT", + ("TAAAGCGAAAT\n---TGCGT---")) +] # fmt: skip + + +@pytest.mark.parametrize( + "local, term, gap_penalty, input1, input2, expect", align_cases +) +def test_align_optimal_simple(local, term, gap_penalty, input1, input2, expect): """ Test `align_optimal()` function using constructed test cases. """ @@ -59,29 +61,27 @@ def test_align_optimal_simple(local, term, gap_penalty, seq2 = seq.NucleotideSequence(input2) matrix = align.SubstitutionMatrix.std_nucleotide_matrix() # Test alignment function - alignments = align.align_optimal(seq1, seq2, - matrix, - gap_penalty=gap_penalty, terminal_penalty=term, - local=local) - + alignments = align.align_optimal( + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, local=local + ) + for ali in alignments: assert str(ali) in expect # Test if separate score function calculates the same score for ali in alignments: - score = align.score(ali, matrix, - gap_penalty=gap_penalty, terminal_penalty=term) + score = align.score(ali, matrix, gap_penalty=gap_penalty, terminal_penalty=term) assert score == ali.score -@pytest.mark.skipif( - is_not_installed("muscle"), - reason="MUSCLE is not installed" -) +@pytest.mark.skipif(is_not_installed("muscle"), reason="MUSCLE is not installed") # Ignore warning about MUSCLE writing no second guide tree @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize("gap_penalty, seq_indices", itertools.product( - [-10, (-10,-1)], [(i,j) for i in range(10) for j in range(i+1)] -)) +@pytest.mark.parametrize( + "gap_penalty, seq_indices", + itertools.product( + [-10, (-10, -1)], [(i, j) for i in range(10) for j in range(i + 1)] + ), +) def test_align_optimal_complex(sequences, gap_penalty, seq_indices): """ Test `align_optimal()` function using real world sequences, @@ -92,8 +92,7 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): seq1 = sequences[index1] seq2 = sequences[index2] test_alignment = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, terminal_penalty=True, max_number=1 + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=True, max_number=1 )[0] try: @@ -101,18 +100,14 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): [seq1, seq2], matrix=matrix, gap_penalty=gap_penalty ) except VersionError: - pytest.skip(f"Invalid Muscle software version") + pytest.skip("Invalid Muscle software version") # Check whether the score of the optimal alignments is the same # or higher as the MUSCLE alignment # Direct alignment comparison is not feasible, # since the treatment of terminal gaps is different in MUSCLE - test_score = align.score( - test_alignment, matrix, gap_penalty, terminal_penalty=True - ) - ref_score = align.score( - ref_alignment, matrix, gap_penalty, terminal_penalty=True - ) + test_score = align.score(test_alignment, matrix, gap_penalty, terminal_penalty=True) + ref_score = align.score(ref_alignment, matrix, gap_penalty, terminal_penalty=True) try: assert test_score >= ref_score except AssertionError: @@ -127,9 +122,8 @@ def test_align_optimal_complex(sequences, gap_penalty, seq_indices): @pytest.mark.parametrize( - "local, term, gap_penalty, seed", itertools.product( - [True, False], [True, False], [-5, -8, -10, -15], range(10) - ) + "local, term, gap_penalty, seed", + itertools.product([True, False], [True, False], [-5, -8, -10, -15], range(10)), ) def test_affine_gap_penalty(local, term, gap_penalty, seed): """ @@ -144,11 +138,9 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed): for _ in range(2): sequence = seq.NucleotideSequence() length = np.random.randint(*LENGTH_RANGE) - sequence.code = np.random.randint( - len(sequence.alphabet), size=length - ) + sequence.code = np.random.randint(len(sequence.alphabet), size=length) sequences.append(sequence) - + matrix = align.SubstitutionMatrix.std_nucleotide_matrix() ref_alignments = align.align_optimal( @@ -177,13 +169,15 @@ def test_affine_gap_penalty(local, term, gap_penalty, seed): @pytest.mark.parametrize( - "local, term, gap_penalty, seq_indices", itertools.product( - [True, False], [True, False], [-10, (-10,-1)], - [(i,j) for i in range(10) for j in range(i+1)] - ) + "local, term, gap_penalty, seq_indices", + itertools.product( + [True, False], + [True, False], + [-10, (-10, -1)], + [(i, j) for i in range(10) for j in range(i + 1)], + ), ) -def test_align_optimal_symmetry(sequences, local, term, gap_penalty, - seq_indices): +def test_align_optimal_symmetry(sequences, local, term, gap_penalty, seq_indices): """ Alignments should be indifferent about which sequence comes first. """ @@ -192,15 +186,23 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty, seq1 = sequences[index1] seq2 = sequences[index2] alignment1 = align.align_optimal( - seq1, seq2, matrix, - gap_penalty=gap_penalty, terminal_penalty=term, local=local, - max_number=1 + seq1, + seq2, + matrix, + gap_penalty=gap_penalty, + terminal_penalty=term, + local=local, + max_number=1, )[0] # Swap the sequences alignment2 = align.align_optimal( - seq2, seq1, matrix, - gap_penalty=gap_penalty, terminal_penalty=term, local=local, - max_number=1 + seq2, + seq1, + matrix, + gap_penalty=gap_penalty, + terminal_penalty=term, + local=local, + max_number=1, )[0] # Comparing all traces of both alignments to each other # would be unfeasible @@ -209,10 +211,12 @@ def test_align_optimal_symmetry(sequences, local, term, gap_penalty, @pytest.mark.parametrize( - "gap_penalty, term, seq_indices", itertools.product( - [-10, (-10,-1)], [False, True], - [(i,j) for i in range(10) for j in range(i+1)] - ) + "gap_penalty, term, seq_indices", + itertools.product( + [-10, (-10, -1)], + [False, True], + [(i, j) for i in range(10) for j in range(i + 1)], + ), ) def test_scoring(sequences, gap_penalty, term, seq_indices): """ @@ -224,12 +228,10 @@ def test_scoring(sequences, gap_penalty, term, seq_indices): seq1 = sequences[index1] seq2 = sequences[index2] alignment = align.align_optimal( - seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, - max_number=1 + seq1, seq2, matrix, gap_penalty=gap_penalty, terminal_penalty=term, max_number=1 )[0] try: - assert align.score(alignment, matrix, gap_penalty, term) \ - == alignment.score + assert align.score(alignment, matrix, gap_penalty, term) == alignment.score except AssertionError: print(alignment) - raise \ No newline at end of file + raise diff --git a/tests/sequence/align/test_permutation.py b/tests/sequence/align/test_permutation.py index 9f9085f52..1b22b5579 100644 --- a/tests/sequence/align/test_permutation.py +++ b/tests/sequence/align/test_permutation.py @@ -3,9 +3,9 @@ # information. import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align -import pytest def _create_frequency_permutation(k): @@ -34,10 +34,7 @@ def test_random_permutation_modulo(): np.iinfo(np.int64).max + 1, size=SEQ_LENGTH, dtype=np.int64 ) - ref_order = [ - (LCG_A * kmer.item() + LCG_C) % LCG_M - for kmer in kmers - ] + ref_order = [(LCG_A * kmer.item() + LCG_C) % LCG_M for kmer in kmers] permutation = align.RandomPermutation() test_order = permutation.permute(kmers) @@ -60,11 +57,9 @@ def test_random_permutation_randomness(): kmers = np.arange(0, SEQ_LENGTH, dtype=np.int64) permutation = align.RandomPermutation() order = permutation.permute(kmers) - positive = (np.sign(order) == 1) - n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode='valid') - distribution, _ = np.histogram( - n_positive, bins=np.arange(0, 10 * FRAME_SIZE) - ) + positive = np.sign(order) == 1 + n_positive = np.convolve(positive, np.ones(FRAME_SIZE), mode="valid") + distribution, _ = np.histogram(n_positive, bins=np.arange(0, 10 * FRAME_SIZE)) # Since each value in the k-mer array is unique, # all mapped values should be unique as well @@ -76,9 +71,7 @@ def test_random_permutation_randomness(): def test_frequency_permutation(): K = 5 - kmer_alphabet = align.KmerAlphabet( - seq.NucleotideSequence.alphabet_unamb, K - ) + kmer_alphabet = align.KmerAlphabet(seq.NucleotideSequence.alphabet_unamb, K) np.random.seed(0) # Generate a random count order for each k-mer # Use 'np.arange()' to generate a unique order, @@ -89,21 +82,24 @@ def test_frequency_permutation(): kmer_alphabet, # The actual k-mer positions are dummy values, # only the number of each k-mer is important for this test - {i: np.zeros((count, 2)) for i, count in enumerate(counts)} + {i: np.zeros((count, 2)) for i, count in enumerate(counts)}, ) permutation = align.FrequencyPermutation.from_table(kmer_table) kmers_sorted_by_frequency = np.argsort(counts) - assert permutation.permute(kmers_sorted_by_frequency).tolist() \ + assert ( + permutation.permute(kmers_sorted_by_frequency).tolist() == np.arange(len(kmer_alphabet), dtype=np.int64).tolist() + ) @pytest.mark.parametrize( - "kmer_range, permutation", [ + "kmer_range, permutation", + [ (np.iinfo(np.int64).max, align.RandomPermutation()), (int(4**5), _create_frequency_permutation(5)), (int(4**8), _create_frequency_permutation(8)), - ] + ], ) def test_min_max(kmer_range, permutation): """ diff --git a/tests/sequence/align/test_selector.py b/tests/sequence/align/test_selector.py index cd2bcc4bb..a062df7eb 100644 --- a/tests/sequence/align/test_selector.py +++ b/tests/sequence/align/test_selector.py @@ -11,12 +11,7 @@ @pytest.mark.parametrize( "seed, window, from_sequence, use_permutation", - itertools.product( - range(20), - [2, 5, 10, 25], - [False, True], - [False, True] - ) + itertools.product(range(20), [2, 5, 10, 25], [False, True], [False, True]), ) def test_minimizer(seed, window, from_sequence, use_permutation): """ @@ -40,23 +35,20 @@ def test_minimizer(seed, window, from_sequence, use_permutation): order = kmers # Use an inefficient but simple algorithm for comparison - ref_minimizer_pos = np.array([ - np.argmin(order[i : i + window]) + i - for i in range(len(order) - (window - 1)) - ]) + ref_minimizer_pos = np.array( + [np.argmin(order[i : i + window]) + i for i in range(len(order) - (window - 1))] + ) # Remove duplicates ref_minimizer_pos = np.unique(ref_minimizer_pos) ref_minimizers = kmers[ref_minimizer_pos] - minimizer_selector = align.MinimizerSelector( - kmer_alph, window, permutation - ) + minimizer_selector = align.MinimizerSelector(kmer_alph, window, permutation) if from_sequence: - test_minimizer_pos, test_minimizers \ - = minimizer_selector.select(sequence) + test_minimizer_pos, test_minimizers = minimizer_selector.select(sequence) else: - test_minimizer_pos, test_minimizers \ - = minimizer_selector.select_from_kmers(kmers) + test_minimizer_pos, test_minimizers = minimizer_selector.select_from_kmers( + kmers + ) assert test_minimizer_pos.tolist() == ref_minimizer_pos.tolist() assert test_minimizers.tolist() == ref_minimizers.tolist() @@ -69,10 +61,10 @@ def test_minimizer(seed, window, from_sequence, use_permutation): [2, 3, 5, 7], [(0,), (0, 1, 2), (0, -1), (-2, -1)], [False, True], - [False, True] + [False, True], ), # Print tuples in name of test - ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None + ids=lambda x: str(x).replace(" ", "") if isinstance(x, tuple) else None, ) def test_syncmer(seed, s, offset, from_sequence, use_permutation): """ @@ -113,11 +105,9 @@ def test_syncmer(seed, s, offset, from_sequence, use_permutation): sequence.alphabet, K, s, permutation, offset ) if from_sequence: - test_syncmer_pos, test_syncmers \ - = syncmer_selector.select(sequence) + test_syncmer_pos, test_syncmers = syncmer_selector.select(sequence) else: - test_syncmer_pos, test_syncmers \ - = syncmer_selector.select_from_kmers(kmers) + test_syncmer_pos, test_syncmers = syncmer_selector.select_from_kmers(kmers) assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist() assert test_syncmers.tolist() == ref_syncmers.tolist() @@ -141,14 +131,10 @@ def test_cached_syncmer(): np.random.seed(0) sequence.code = np.random.randint(len(sequence.alphabet), size=LENGTH) - syncmer_selector = align.SyncmerSelector( - sequence.alphabet, K, S - ) + syncmer_selector = align.SyncmerSelector(sequence.alphabet, K, S) ref_syncmer_pos, ref_syncmers = syncmer_selector.select(sequence) - cached_syncmer_selector = align.CachedSyncmerSelector( - sequence.alphabet, K, S - ) + cached_syncmer_selector = align.CachedSyncmerSelector(sequence.alphabet, K, S) test_syncmer_pos, test_syncmers = cached_syncmer_selector.select(sequence) assert test_syncmer_pos.tolist() == ref_syncmer_pos.tolist() @@ -159,13 +145,13 @@ def test_cached_syncmer(): "offset, exception_type", [ # Duplicate values - ((1, 1), ValueError), + ((1, 1), ValueError), ((0, 2, 0), ValueError), - ((0, -10), ValueError), + ((0, -10), ValueError), # Offset out of window range - ((-11,), IndexError), - ((10,), IndexError), - ] + ((-11,), IndexError), + ((10,), IndexError), + ], ) def test_syncmer_invalid_offset(offset, exception_type): """ @@ -176,7 +162,10 @@ def test_syncmer_invalid_offset(offset, exception_type): with pytest.raises(exception_type): align.SyncmerSelector( # Any alphabet would work here - seq.NucleotideSequence.alphabet_unamb, K, S, offset=offset + seq.NucleotideSequence.alphabet_unamb, + K, + S, + offset=offset, ) @@ -205,12 +194,9 @@ def test_mincode(use_permutation): permutation_range = len(kmer_alph) order = kmers - mincode_selector = align.MincodeSelector( - kmer_alph, COMPRESSION, permutation - ) + mincode_selector = align.MincodeSelector(kmer_alph, COMPRESSION, permutation) _, mincode_pos = mincode_selector.select_from_kmers(kmers) threshold = permutation_offset + permutation_range / COMPRESSION assert mincode_pos.tolist() == np.where(order < threshold)[0].tolist() - assert len(mincode_pos) * COMPRESSION \ - == pytest.approx(len(kmers), rel=0.02) \ No newline at end of file + assert len(mincode_pos) * COMPRESSION == pytest.approx(len(kmers), rel=0.02) diff --git a/tests/sequence/align/test_statistics.py b/tests/sequence/align/test_statistics.py index b9defcc46..cb0840a16 100644 --- a/tests/sequence/align/test_statistics.py +++ b/tests/sequence/align/test_statistics.py @@ -2,50 +2,55 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest import numpy as np +import pytest import biotite.sequence as seq import biotite.sequence.align as align from biotite.sequence.align.statistics import EValueEstimator -from .util import sequences - - -BACKGROUND = np.array(list({ - "A": 35155, - "C": 8669, - "D": 24161, - "E": 28354, - "F": 17367, - "G": 33229, - "H": 9906, - "I": 23161, - "K": 25872, - "L": 40625, - "M": 10101, - "N": 20212, - "P": 23435, - "Q": 19208, - "R": 23105, - "S": 32070, - "T": 26311, - "V": 29012, - "W": 5990, - "Y": 14488, - "B": 0, - "Z": 0, - "X": 0, - "*": 0, -}.values())) / 450431 + +BACKGROUND = ( + np.array( + list( + { + "A": 35155, + "C": 8669, + "D": 24161, + "E": 28354, + "F": 17367, + "G": 33229, + "H": 9906, + "I": 23161, + "K": 25872, + "L": 40625, + "M": 10101, + "N": 20212, + "P": 23435, + "Q": 19208, + "R": 23105, + "S": 32070, + "T": 26311, + "V": 29012, + "W": 5990, + "Y": 14488, + "B": 0, + "Z": 0, + "X": 0, + "*": 0, + }.values() + ) + ) + / 450431 +) @pytest.mark.parametrize( "matrix_name, gap_penalty, ref_lam, ref_k", [ ("BLOSUM62", (-10000, -10000), 0.318, 0.130), - ("BLOSUM62", ( -12, -2), 0.300, 0.090), - ("BLOSUM62", ( -5, -5), 0.131, 0.009), - ( "PAM250", ( -16, -1), 0.172, 0.018), - ] + ("BLOSUM62", (-12, -2), 0.300, 0.090), + ("BLOSUM62", (-5, -5), 0.131, 0.009), + ("PAM250", (-16, -1), 0.172, 0.018), + ], ) def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k): """ @@ -55,14 +60,13 @@ def test_distribution_param(matrix_name, gap_penalty, ref_lam, ref_k): """ SAMPLE_LENGTH = 500 SAMPLE_SIZE = 1000 - + alphabet = seq.ProteinSequence.alphabet matrix = align.SubstitutionMatrix(alphabet, alphabet, matrix_name) np.random.seed(0) estimator = align.EValueEstimator.from_samples( - alphabet, matrix, gap_penalty, BACKGROUND, - SAMPLE_LENGTH, SAMPLE_SIZE + alphabet, matrix, gap_penalty, BACKGROUND, SAMPLE_LENGTH, SAMPLE_SIZE ) # Due to relatively low sample size, expect rather large deviation @@ -85,35 +89,29 @@ def test_evalue(): matrix = align.SubstitutionMatrix.std_protein_matrix() estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, - BACKGROUND + seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND ) # Generate large number of alignments of random sequences np.random.seed(0) random_sequence_code = np.random.choice( - len(seq.ProteinSequence.alphabet), - size=(N_SAMPLES, 2, SEQ_LENGTH), - p=BACKGROUND + len(seq.ProteinSequence.alphabet), size=(N_SAMPLES, 2, SEQ_LENGTH), p=BACKGROUND ) sample_scores = np.zeros(N_SAMPLES, dtype=int) for i in range(N_SAMPLES): seq1 = seq.ProteinSequence() seq2 = seq.ProteinSequence() - seq1.code = random_sequence_code[i,0] - seq2.code = random_sequence_code[i,1] + seq1.code = random_sequence_code[i, 0] + seq2.code = random_sequence_code[i, 1] sample_scores[i] = align.align_optimal( - seq1, seq2, matrix, - local=True, gap_penalty=GAP_PENALTY, max_number=1 + seq1, seq2, matrix, local=True, gap_penalty=GAP_PENALTY, max_number=1 )[0].score e_values = [ 10 ** estimator.log_evalue(score, SEQ_LENGTH, SEQ_LENGTH * N_SAMPLES) for score in TEST_SCORES ] - counts = [ - np.count_nonzero(sample_scores >= score) for score in TEST_SCORES - ] + counts = [np.count_nonzero(sample_scores >= score) for score in TEST_SCORES] assert e_values == pytest.approx(counts, rel=0.5) @@ -133,45 +131,50 @@ def test_score_scaling(sequences): np.random.seed(0) std_estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, - BACKGROUND + seq.ProteinSequence.alphabet, matrix, GAP_PENALTY, BACKGROUND ) scores = [ align.align_optimal( - sequences[i], sequences[i+1], matrix, GAP_PENALTY, local=True, - max_number=1 - )[0].score for i in range(9) + sequences[i], + sequences[i + 1], + matrix, + GAP_PENALTY, + local=True, + max_number=1, + )[0].score + for i in range(9) ] - std_log_evalues = std_estimator.log_evalue( - scores, SEQ_LENGTH, SEQ_LENGTH - ) + std_log_evalues = std_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH) scaled_matrix = align.SubstitutionMatrix( seq.ProteinSequence.alphabet, seq.ProteinSequence.alphabet, - matrix.score_matrix() * SCALING_FACTOR + matrix.score_matrix() * SCALING_FACTOR, ) scaled_gap_penalty = ( GAP_PENALTY[0] * SCALING_FACTOR, - GAP_PENALTY[1] * SCALING_FACTOR + GAP_PENALTY[1] * SCALING_FACTOR, ) scaled_estimator = align.EValueEstimator.from_samples( - seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty, - BACKGROUND + seq.ProteinSequence.alphabet, scaled_matrix, scaled_gap_penalty, BACKGROUND ) scores = [ align.align_optimal( - sequences[i], sequences[i+1], scaled_matrix, scaled_gap_penalty, - local=True, max_number=1 - )[0].score for i in range(9) + sequences[i], + sequences[i + 1], + scaled_matrix, + scaled_gap_penalty, + local=True, + max_number=1, + )[0].score + for i in range(9) ] - scaled_log_evalues = scaled_estimator.log_evalue( - scores, SEQ_LENGTH, SEQ_LENGTH - ) + scaled_log_evalues = scaled_estimator.log_evalue(scores, SEQ_LENGTH, SEQ_LENGTH) # Due to relatively low sample size, expect rather large deviation - assert std_log_evalues.tolist() \ - == pytest.approx(scaled_log_evalues.tolist(), rel=0.2) + assert std_log_evalues.tolist() == pytest.approx( + scaled_log_evalues.tolist(), rel=0.2 + ) def test_invalid_scoring_scheme(): @@ -185,6 +188,6 @@ def test_invalid_scoring_scheme(): ) # Uniform background frequencies freq = np.ones(len(alph)) - + with pytest.raises(ValueError): - estimator = EValueEstimator.from_samples(alph, matrix, -10, freq) \ No newline at end of file + EValueEstimator.from_samples(alph, matrix, -10, freq) diff --git a/tests/sequence/test_alphabet.py b/tests/sequence/test_alphabet.py index b99756e79..ba6ef023f 100644 --- a/tests/sequence/test_alphabet.py +++ b/tests/sequence/test_alphabet.py @@ -3,16 +3,19 @@ # information. import itertools -import pytest import numpy as np +import pytest import biotite.sequence as seq - test_cases = { - "A" : [0], - "D" : [3], - "ABC" : [0,1,2,], - "ABAFF" : [0,1,0,5,5] + "A": [0], + "D": [3], + "ABC": [ + 0, + 1, + 2, + ], + "ABAFF": [0, 1, 0, 5, 5], } @@ -24,17 +27,17 @@ def alphabet_symbols(): @pytest.mark.parametrize( "symbols, exp_code, use_letter_alphabet", zip( - list(test_cases.keys() ) * 2, + list(test_cases.keys()) * 2, list(test_cases.values()) * 2, - [False] * len(test_cases) + [True] * len(test_cases) - ) + [False] * len(test_cases) + [True] * len(test_cases), + ), ) def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet): if use_letter_alphabet: alph = seq.LetterAlphabet(alphabet_symbols) else: alph = seq.Alphabet(alphabet_symbols) - + if len(symbols) == 1: assert alph.encode(symbols[0]) == exp_code[0] else: @@ -44,17 +47,17 @@ def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet): @pytest.mark.parametrize( "exp_symbols, code, use_letter_alphabet", zip( - list(test_cases.keys() ) * 2, + list(test_cases.keys()) * 2, list(test_cases.values()) * 2, - [False] * len(test_cases) + [True] * len(test_cases) - ) + [False] * len(test_cases) + [True] * len(test_cases), + ), ) def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet): if use_letter_alphabet: alph = seq.LetterAlphabet(alphabet_symbols) else: alph = seq.Alphabet(alphabet_symbols) - + code = np.array(code, dtype=np.uint8) if len(code) == 1: assert alph.decode(code[0]) == exp_symbols[0] @@ -64,9 +67,7 @@ def test_decoding(alphabet_symbols, exp_symbols, code, use_letter_alphabet): @pytest.mark.parametrize( "use_letter_alphabet, is_single_val", - itertools.product( - [False, True], [False, True] - ) + itertools.product([False, True], [False, True]), ) def test_error(alphabet_symbols, use_letter_alphabet, is_single_val): if use_letter_alphabet: @@ -96,8 +97,13 @@ def test_error(alphabet_symbols, use_letter_alphabet, is_single_val): @pytest.mark.parametrize( "symbols", - ["ABC", b"ABC", ["A","B","C"], - np.array(["A","B","C"]), np.array([b"A",b"B",b"C"])] + [ + "ABC", + b"ABC", + ["A", "B", "C"], + np.array(["A", "B", "C"]), + np.array([b"A", b"B", b"C"]), + ], ) def test_input_types(alphabet_symbols, symbols): """ @@ -108,13 +114,14 @@ def test_input_types(alphabet_symbols, symbols): alph = seq.LetterAlphabet(alphabet_symbols) code = alph.encode_multiple(symbols) conv_symbols = alph.decode_multiple(code) - - + if isinstance(symbols, bytes): symbols = symbols.decode("ASCII") assert list(conv_symbols) == list( - [symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol - for symbol in symbols] + [ + symbol.decode("ASCII") if isinstance(symbol, bytes) else symbol + for symbol in symbols + ] ) @@ -137,26 +144,24 @@ def test_contains(alphabet_symbols, use_letter_alphabet): @pytest.mark.parametrize( - "source_alph_symbols, target_alph_symbols", + "source_alph_symbols, target_alph_symbols", [ ("A", "AB"), (["foo", "bar"], ["bar", "foo", 42]), ("ACGT", "AGTC"), ("ACGT", "ACGNT"), (np.arange(0, 1000), np.arange(999, -1, -1)), - ] + ], ) def test_alphabet_mapper(source_alph_symbols, target_alph_symbols): CODE_LENGTH = 10000 source_alph = seq.Alphabet(source_alph_symbols) target_alph = seq.Alphabet(target_alph_symbols) mapper = seq.AlphabetMapper(source_alph, target_alph) - + ref_sequence = seq.GeneralSequence(source_alph) np.random.seed(0) - ref_sequence.code = np.random.randint( - len(source_alph), size=CODE_LENGTH, dtype=int - ) + ref_sequence.code = np.random.randint(len(source_alph), size=CODE_LENGTH, dtype=int) test_sequence = seq.GeneralSequence(target_alph) test_sequence.code = mapper[ref_sequence.code] @@ -164,22 +169,25 @@ def test_alphabet_mapper(source_alph_symbols, target_alph_symbols): assert test_sequence.symbols == ref_sequence.symbols -@pytest.mark.parametrize("alphabets, common_alph", [ - ( - [ +@pytest.mark.parametrize( + "alphabets, common_alph", + [ + ( + [ + seq.NucleotideSequence.alphabet_amb, + seq.NucleotideSequence.alphabet_unamb, + ], seq.NucleotideSequence.alphabet_amb, - seq.NucleotideSequence.alphabet_unamb, - ], - seq.NucleotideSequence.alphabet_amb - ), - ( - [ - seq.NucleotideSequence.alphabet_unamb, + ), + ( + [ + seq.NucleotideSequence.alphabet_unamb, + seq.NucleotideSequence.alphabet_amb, + ], seq.NucleotideSequence.alphabet_amb, - ], - seq.NucleotideSequence.alphabet_amb - ), -]) + ), + ], +) def test_common_alphabet(alphabets, common_alph): """ Check if :func:`common_alphabet()` correctly identifies the common @@ -188,13 +196,14 @@ def test_common_alphabet(alphabets, common_alph): seq.common_alphabet(alphabets) == common_alph - def test_common_alphabet_no_common(): """ Check if :func:`common_alphabet()` correctly identifies that no common alphabet exists in a simple known test case. """ - assert seq.common_alphabet([ - seq.NucleotideSequence.alphabet_unamb, - seq.ProteinSequence.alphabet - ]) is None \ No newline at end of file + assert ( + seq.common_alphabet( + [seq.NucleotideSequence.alphabet_unamb, seq.ProteinSequence.alphabet] + ) + is None + ) diff --git a/tests/sequence/test_annotation.py b/tests/sequence/test_annotation.py index 4ce771692..b1159933f 100644 --- a/tests/sequence/test_annotation.py +++ b/tests/sequence/test_annotation.py @@ -2,58 +2,62 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +from os.path import join import biotite.sequence as seq -from biotite.sequence import Location, Feature, Annotation, AnnotatedSequence import biotite.sequence.io.genbank as gb -import numpy as np -from os.path import join -from ..util import data_dir -import pytest +from biotite.sequence import AnnotatedSequence, Annotation, Feature, Location +from tests.util import data_dir def test_annotation_creation(): - feature1 = Feature("CDS", [seq.Location(1,2)], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [seq.Location(3,4)], qual={"gene" : "test2"}) + feature1 = Feature("CDS", [seq.Location(1, 2)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [seq.Location(3, 4)], qual={"gene": "test2"}) feature_list = [feature1, feature2] annotation = Annotation(feature_list) for feature in annotation: assert feature.key in [f.key for f in feature_list] - assert feature.qual["gene"] in [ - f.qual["gene"] for f in feature_list - ] + assert feature.qual["gene"] in [f.qual["gene"] for f in feature_list] + def test_annotation_concatenation(): - feature1 = Feature("CDS", [seq.Location(1,1)], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [seq.Location(2,2)], qual={"gene" : "test2"}) + feature1 = Feature("CDS", [seq.Location(1, 1)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [seq.Location(2, 2)], qual={"gene": "test2"}) annot1 = Annotation([feature1, feature2]) - feature3 = Feature("CDS", [seq.Location(3,3)], qual={"gene" : "test3"}) - feature4 = Feature("CDS", [seq.Location(4,4)], qual={"gene" : "test4"}) + feature3 = Feature("CDS", [seq.Location(3, 3)], qual={"gene": "test3"}) + feature4 = Feature("CDS", [seq.Location(4, 4)], qual={"gene": "test4"}) annot2 = Annotation([feature3, feature4]) - feature5 = Feature("CDS", [seq.Location(5,5)], qual={"gene" : "test5"}) + feature5 = Feature("CDS", [seq.Location(5, 5)], qual={"gene": "test5"}) concat = annot1 + annot2 + feature5 - assert set([f.qual["gene"] for f in concat]) \ - == set(["test1", "test2", "test3", "test4", "test5"]) + assert set([f.qual["gene"] for f in concat]) == set( + ["test1", "test2", "test3", "test4", "test5"] + ) + def test_annotation_indexing(): - feature1 = Feature("CDS", [Location(-10,30 )], qual={"gene" : "test1"}) - feature2 = Feature("CDS", [Location(20, 50 )], qual={"gene" : "test2"}) - feature3 = Feature("CDS", [Location(100,130)], qual={"gene" : "test3"}) - feature4 = Feature("CDS", [Location(150,250)], qual={"gene" : "test4"}) - feature5 = Feature("CDS", [Location(-50,200)], qual={"gene" : "test5"}) - annotation = Annotation([feature1,feature2,feature3,feature4,feature5]) + feature1 = Feature("CDS", [Location(-10, 30)], qual={"gene": "test1"}) + feature2 = Feature("CDS", [Location(20, 50)], qual={"gene": "test2"}) + feature3 = Feature("CDS", [Location(100, 130)], qual={"gene": "test3"}) + feature4 = Feature("CDS", [Location(150, 250)], qual={"gene": "test4"}) + feature5 = Feature("CDS", [Location(-50, 200)], qual={"gene": "test5"}) + annotation = Annotation([feature1, feature2, feature3, feature4, feature5]) sub_annot = annotation[40:150] # Only one location per feature - assert set([list(f.locs)[0].defect for f in sub_annot]) \ - == set([Location.Defect.MISS_LEFT, Location.Defect.NONE, - (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT)]) - assert set([f.qual["gene"] for f in sub_annot]) \ - == set(["test2", "test3", "test5"]) + assert set([list(f.locs)[0].defect for f in sub_annot]) == set( + [ + Location.Defect.MISS_LEFT, + Location.Defect.NONE, + (Location.Defect.MISS_LEFT | Location.Defect.MISS_RIGHT), + ] + ) + assert set([f.qual["gene"] for f in sub_annot]) == set(["test2", "test3", "test5"]) + def test_annotated_sequence(): sequence = seq.NucleotideSequence("ATGGCGTACGATTAGAAAAAAA") - feature1 = Feature("misc_feature", [Location(1,2), Location(11,12)], - {"note" : "walker"}) - feature2 = Feature("misc_feature", [Location(16,22)], {"note" : "poly-A"}) + feature1 = Feature( + "misc_feature", [Location(1, 2), Location(11, 12)], {"note": "walker"} + ) + feature2 = Feature("misc_feature", [Location(16, 22)], {"note": "poly-A"}) annotation = Annotation([feature1, feature2]) annot_seq = AnnotatedSequence(annotation, sequence) assert annot_seq[2] == "T" @@ -62,17 +66,19 @@ def test_annotated_sequence(): # test slicing with only stop annot_seq2 = annot_seq[:16] assert annot_seq2.sequence == seq.NucleotideSequence("ATGGCGTACGATTAG") - assert set([f.qual['note'] for f in annot_seq2.annotation]) == {'walker'} + assert set([f.qual["note"] for f in annot_seq2.annotation]) == {"walker"} # test slicing with only start annot_seq3 = annot_seq[16:] assert annot_seq3.sequence == seq.NucleotideSequence("AAAAAAA") - assert set([f.qual['note'] for f in annot_seq3.annotation]) == {'poly-A'} + assert set([f.qual["note"] for f in annot_seq3.annotation]) == {"poly-A"} # test slicing with start and stop annot_seq4 = annot_seq[1:17] - assert annot_seq4.sequence == seq.NucleotideSequence("ATGGCGTACGATTAGA") # sequences are 1-indexed - assert set([f.qual['note'] for f in annot_seq4.annotation]) == {'walker', 'poly-A'} + assert annot_seq4.sequence == seq.NucleotideSequence( + "ATGGCGTACGATTAGA" + ) # sequences are 1-indexed + assert set([f.qual["note"] for f in annot_seq4.annotation]) == {"walker", "poly-A"} assert annot_seq[feature1] == seq.NucleotideSequence("ATAT") assert annot_seq[feature2] == seq.NucleotideSequence("AAAAAAA") @@ -80,12 +86,17 @@ def test_annotated_sequence(): assert annot_seq.sequence == seq.NucleotideSequence("CCGGCGTACGCCTAGAAAAAAA") # test slicing with feature on minus strand - feature3 = Feature("misc_feature", [Location(1,4), Location(8,12)]) - feature4 = Feature("misc_feature_minus", [ - Location(1,4,strand=Location.Strand.REVERSE), - Location(8,12,strand=Location.Strand.REVERSE)]) + feature3 = Feature("misc_feature", [Location(1, 4), Location(8, 12)]) + feature4 = Feature( + "misc_feature_minus", + [ + Location(1, 4, strand=Location.Strand.REVERSE), + Location(8, 12, strand=Location.Strand.REVERSE), + ], + ) assert annot_seq[feature4] == annot_seq[feature3].reverse().complement() + def test_reverse_complement(): gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb")) annot_seq = gb.get_annotated_sequence(gb_file) diff --git a/tests/sequence/test_codon.py b/tests/sequence/test_codon.py index 315f2b0fc..fe8d38eb4 100644 --- a/tests/sequence/test_codon.py +++ b/tests/sequence/test_codon.py @@ -2,14 +2,41 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.sequence as seq import pytest +import biotite.sequence as seq -@pytest.mark.parametrize("table_id", - [1,2,3,4,5,6,9,10,11,12,13,14,16,21,22,23,24,25,26,27,28,29,30,31]) +@pytest.mark.parametrize( + "table_id", + [ + 1, + 2, + 3, + 4, + 5, + 6, + 9, + 10, + 11, + 12, + 13, + 14, + 16, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + ], +) def test_table_load(table_id): - table = seq.CodonTable.load(table_id) + seq.CodonTable.load(table_id) def test_table_indexing(): diff --git a/tests/sequence/test_fasta.py b/tests/sequence/test_fasta.py index 1b7103e30..68133f44b 100644 --- a/tests/sequence/test_fasta.py +++ b/tests/sequence/test_fasta.py @@ -2,18 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import itertools import glob import io -import biotite.sequence as seq -import biotite.sequence.io.fasta as fasta -import numpy as np +import itertools import os import os.path -from ..util import data_dir +import numpy as np import pytest +import biotite.sequence as seq +import biotite.sequence.io.fasta as fasta +from tests.util import data_dir + - def test_access_low_level(): path = os.path.join(data_dir("sequence"), "nuc.fasta") file = fasta.FastaFile.read(path) @@ -21,21 +21,21 @@ def test_access_low_level(): assert file["another dna sequence"] == "A" assert file["third dna sequence"] == "ACGT" assert dict(file.items()) == { - "dna sequence" : "ACGCTACGT", - "another dna sequence" : "A", - "third dna sequence" : "ACGT", - "rna sequence" : "ACGU", - "ambiguous rna sequence" : "ACGUNN", + "dna sequence": "ACGCTACGT", + "another dna sequence": "A", + "third dna sequence": "ACGT", + "rna sequence": "ACGU", + "ambiguous rna sequence": "ACGUNN", } file["another dna sequence"] = "AA" del file["dna sequence"] file["yet another sequence"] = "ACGT" assert dict(file.items()) == { - "another dna sequence" : "AA", - "third dna sequence" : "ACGT", - "rna sequence" : "ACGU", - "ambiguous rna sequence" : "ACGUNN", - "yet another sequence" : "ACGT", + "another dna sequence": "AA", + "third dna sequence": "ACGT", + "rna sequence": "ACGU", + "ambiguous rna sequence": "ACGUNN", + "yet another sequence": "ACGT", } @@ -45,16 +45,16 @@ def test_access_high_level(seq_type): file = fasta.FastaFile.read(path) sequences = fasta.get_sequences(file, seq_type=seq_type) assert sequences == { - "dna sequence" : seq.NucleotideSequence("ACGCTACGT", False), - "another dna sequence" : seq.NucleotideSequence("A", False), - "third dna sequence" : seq.NucleotideSequence("ACGT", False), - "rna sequence" : seq.NucleotideSequence("ACGT", False), - "ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True), + "dna sequence": seq.NucleotideSequence("ACGCTACGT", False), + "another dna sequence": seq.NucleotideSequence("A", False), + "third dna sequence": seq.NucleotideSequence("ACGT", False), + "rna sequence": seq.NucleotideSequence("ACGT", False), + "ambiguous rna sequence": seq.NucleotideSequence("ACGTNN", True), } @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_ambiguous(seq_type): path = os.path.join(data_dir("sequence"), "nuc.fasta") @@ -67,10 +67,8 @@ def test_sequence_conversion_ambiguous(seq_type): file, seq_type=None ) else: - assert seq_type(sequence) == fasta.get_sequence( - file, seq_type=seq_type - ) - + assert seq_type(sequence) == fasta.get_sequence(file, seq_type=seq_type) + seq_dict = fasta.get_sequences(file) file2 = fasta.FastaFile() fasta.set_sequences(file2, seq_dict) @@ -84,7 +82,7 @@ def test_sequence_conversion_ambiguous(seq_type): assert str(seq1) == str(seq2) else: assert seq_dict == seq_dict2 - + if seq_type is not None: sequence = "AACCTTGG" file3 = fasta.FastaFile() @@ -93,7 +91,7 @@ def test_sequence_conversion_ambiguous(seq_type): @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_protein(seq_type): path = os.path.join(data_dir("sequence"), "prot.fasta") @@ -112,7 +110,7 @@ def test_sequence_conversion_protein(seq_type): @pytest.mark.parametrize( - "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) + "seq_type", (None, seq.NucleotideSequence, seq.ProteinSequence) ) def test_sequence_conversion_invalid(seq_type): path = os.path.join(data_dir("sequence"), "invalid.fasta") @@ -134,32 +132,31 @@ def test_alignment_conversion(): path = os.path.join(data_dir("sequence"), "alignment.fasta") file = fasta.FastaFile.read(path) alignment = fasta.get_alignment(file) - assert str(alignment) == ("ADTRCGTARDCGTR-DRTCGRAGD\n" - "ADTRCGT---CGTRADRTCGRAGD\n" - "ADTRCGTARDCGTRADR--GRAGD") - + assert str(alignment) == ( + "ADTRCGTARDCGTR-DRTCGRAGD\n" + "ADTRCGT---CGTRADRTCGRAGD\n" + "ADTRCGTARDCGTRADR--GRAGD" + ) + file2 = fasta.FastaFile() - fasta.set_alignment(file2, alignment, seq_names=["seq1","seq2","seq3"]) + fasta.set_alignment(file2, alignment, seq_names=["seq1", "seq2", "seq3"]) alignment2 = fasta.get_alignment(file2) assert str(alignment) == str(alignment2) + @pytest.mark.parametrize( - "file_name", - glob.glob(os.path.join(data_dir("sequence"), "*.fasta")) + "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fasta")) ) def test_read_iter(file_name): ref_dict = dict(fasta.FastaFile.read(file_name).items()) - + test_dict = dict(fasta.FastaFile.read_iter(file_name)) assert test_dict == ref_dict @pytest.mark.parametrize( - "chars_per_line, n_sequences", itertools.product( - [80, 200], - [1, 10] - ) + "chars_per_line, n_sequences", itertools.product([80, 200], [1, 10]) ) def test_write_iter(chars_per_line, n_sequences): """ @@ -168,7 +165,6 @@ def test_write_iter(chars_per_line, n_sequences): random sequences. """ LENGTH_RANGE = (50, 150) - SCORE_RANGE = (10, 60) # Generate random sequences and scores np.random.seed(0) @@ -176,28 +172,24 @@ def test_write_iter(chars_per_line, n_sequences): for i in range(n_sequences): seq_length = np.random.randint(*LENGTH_RANGE) code = np.random.randint( - len(seq.NucleotideSequence.alphabet_unamb), - size=seq_length + len(seq.NucleotideSequence.alphabet_unamb), size=seq_length ) sequence = seq.NucleotideSequence() sequence.code = code sequences.append(sequence) - + fasta_file = fasta.FastaFile(chars_per_line) for i, sequence in enumerate(sequences): header = f"seq_{i}" fasta_file[header] = str(sequence) ref_file = io.StringIO() fasta_file.write(ref_file) - + test_file = io.StringIO() fasta.FastaFile.write_iter( test_file, - ( - (f"seq_{i}", str(sequence)) - for i, sequence in enumerate(sequences) - ), - chars_per_line + ((f"seq_{i}", str(sequence)) for i, sequence in enumerate(sequences)), + chars_per_line, ) - assert test_file.getvalue() == ref_file.getvalue() \ No newline at end of file + assert test_file.getvalue() == ref_file.getvalue() diff --git a/tests/sequence/test_fastq.py b/tests/sequence/test_fastq.py index d497787a8..3d0023337 100644 --- a/tests/sequence/test_fastq.py +++ b/tests/sequence/test_fastq.py @@ -5,43 +5,40 @@ import glob import io import itertools -from tempfile import TemporaryFile -import biotite.sequence as seq -import biotite.sequence.io.fastq as fastq -import numpy as np import os import os.path -from ..util import data_dir +from tempfile import TemporaryFile +import numpy as np import pytest +import biotite.sequence as seq +import biotite.sequence.io.fastq as fastq +from tests.util import data_dir + @pytest.mark.parametrize("chars_per_line", [None, 80]) def test_access(chars_per_line): path = os.path.join(data_dir("sequence"), "random.fastq") - file = fastq.FastqFile.read( - path, offset=33, chars_per_line=chars_per_line - ) + file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line) assert len(file) == 20 assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20)] - del(file["Read:05"]) + del file["Read:05"] assert len(file) == 19 - assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20) - if i+1 != 5] + assert list(file.keys()) == [f"Read:{i+1:02d}" for i in range(20) if i + 1 != 5] for seq_str, scores in file.values(): assert len(seq_str) == len(scores) assert (scores >= 0).all() seq_str = "ACTCGGT" - scores = np.array([10,12,20,11,0,80,42]) + scores = np.array([10, 12, 20, 11, 0, 80, 42]) file["test"] = seq_str, scores seq_str2, scores2 = file["test"] assert seq_str == seq_str2 assert np.array_equal(scores, scores2) + @pytest.mark.parametrize("chars_per_line", [None, 80]) def test_conversion(chars_per_line): path = os.path.join(data_dir("sequence"), "random.fastq") - fasta_file = fastq.FastqFile.read( - path, offset=33, chars_per_line=chars_per_line - ) + fasta_file = fastq.FastqFile.read(path, offset=33, chars_per_line=chars_per_line) ref_content = dict(fasta_file.items()) fasta_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line) @@ -51,48 +48,46 @@ def test_conversion(chars_per_line): fasta_file.write(temp) temp.seek(0) - fasta_file = fastq.FastqFile.read( - temp, offset=33, chars_per_line=chars_per_line - ) + fasta_file = fastq.FastqFile.read(temp, offset=33, chars_per_line=chars_per_line) content = dict(fasta_file.items()) temp.close() - + for identifier in ref_content: ref_sequence, ref_scores = ref_content[identifier] test_sequence, test_scores = content[identifier] assert test_sequence == ref_sequence assert np.array_equal(test_scores, ref_scores) + def test_rna_conversion(): sequence = seq.NucleotideSequence("ACGT") scores = np.array([0, 0, 0, 0]) fastq_file = fastq.FastqFile(offset="Sanger") fastq.set_sequence(fastq_file, sequence, scores, "seq1", as_rna=False) fastq.set_sequence(fastq_file, sequence, scores, "seq2", as_rna=True) - assert fastq_file["seq1"][0] == "ACGT" + assert fastq_file["seq1"][0] == "ACGT" assert fastq_file["seq2"][0] == "ACGU" + @pytest.mark.parametrize( - "file_name", - glob.glob(os.path.join(data_dir("sequence"), "*.fastq")) + "file_name", glob.glob(os.path.join(data_dir("sequence"), "*.fastq")) ) def test_read_iter(file_name): ref_dict = dict(fastq.FastqFile.read(file_name, offset="Sanger").items()) - + test_dict = dict(fastq.FastqFile.read_iter(file_name, offset="Sanger")) - for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) \ - in zip(test_dict.items(), ref_dict.items()): - assert test_id == ref_id - assert test_seq == ref_seq - assert (test_sc == ref_sc).all() + for (test_id, (test_seq, test_sc)), (ref_id, (ref_seq, ref_sc)) in zip( + test_dict.items(), ref_dict.items() + ): + assert test_id == ref_id + assert test_seq == ref_seq + assert (test_sc == ref_sc).all() + @pytest.mark.parametrize( - "offset, chars_per_line, n_sequences", itertools.product( - [33, 42, "Solexa"], - [None, 80], - [1, 10] - ) + "offset, chars_per_line, n_sequences", + itertools.product([33, 42, "Solexa"], [None, 80], [1, 10]), ) def test_write_iter(offset, chars_per_line, n_sequences): """ @@ -110,22 +105,21 @@ def test_write_iter(offset, chars_per_line, n_sequences): for i in range(n_sequences): seq_length = np.random.randint(*LENGTH_RANGE) code = np.random.randint( - len(seq.NucleotideSequence.alphabet_unamb), - size=seq_length + len(seq.NucleotideSequence.alphabet_unamb), size=seq_length ) sequence = seq.NucleotideSequence() sequence.code = code sequences.append(sequence) score = np.random.randint(*SCORE_RANGE, size=seq_length) scores.append(score) - + fastq_file = fastq.FastqFile(offset, chars_per_line) for i, (sequence, score) in enumerate(zip(sequences, scores)): identifier = f"seq_{i}" fastq_file[identifier] = (str(sequence), score) ref_file = io.StringIO() fastq_file.write(ref_file) - + test_file = io.StringIO() fastq.FastqFile.write_iter( test_file, @@ -133,7 +127,8 @@ def test_write_iter(offset, chars_per_line, n_sequences): (f"seq_{i}", (str(sequence), score)) for i, (sequence, score) in enumerate(zip(sequences, scores)) ), - offset, chars_per_line + offset, + chars_per_line, ) - assert test_file.getvalue() == ref_file.getvalue() \ No newline at end of file + assert test_file.getvalue() == ref_file.getvalue() diff --git a/tests/sequence/test_genbank.py b/tests/sequence/test_genbank.py index 6ecefd061..d96cbddc6 100644 --- a/tests/sequence/test_genbank.py +++ b/tests/sequence/test_genbank.py @@ -2,20 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import TemporaryFile import glob from os.path import join +from tempfile import TemporaryFile +import pytest import biotite.sequence as seq import biotite.sequence.io.genbank as gb -import numpy as np -import pytest -from ..util import data_dir +from tests.util import data_dir @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), "*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_contiguous_field_pos(path): """ @@ -25,7 +24,7 @@ def test_contiguous_field_pos(path): assert gb_file._field_pos[0][0] == 0 for i in range(1, len(gb_file._field_pos)): start, _, _ = gb_file._field_pos[i] - _, stop, _ = gb_file._field_pos[i-1] + _, stop, _ = gb_file._field_pos[i - 1] assert start == stop @@ -37,27 +36,23 @@ def test_file_access(): gb_file = gb.GenBankFile() gb_file.append("SOMEFIELD", ["Some content", "some other content"]) gb_file.insert(0, "OTHERFIELD", ["Additional content"]) - assert gb_file[1] \ - == ("SOMEFIELD", ["Some content", "some other content"], {}) - gb_file[1] \ - = "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]} + assert gb_file[1] == ("SOMEFIELD", ["Some content", "some other content"], {}) + gb_file[1] = "NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]} gb_file.append("THIRDFIELD", ["Supplementary content"]) assert len(gb_file) == 3 assert gb_file[0] == ("OTHERFIELD", ["Additional content"], {}) del gb_file[0] - assert gb_file[0] \ - == ("NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}) + assert gb_file[0] == ("NEWFIELD", ["Extra content"], {"SUBFIELD": ["L 1", "L 2"]}) del gb_file[0] assert gb_file[0] == ("THIRDFIELD", ["Supplementary content"], {}) del gb_file[0] assert len(gb_file) == 0 - @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), "*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_conversion_lowlevel(path): """ @@ -72,7 +67,7 @@ def test_conversion_lowlevel(path): gb_file.append(name, content, subfields) temp = TemporaryFile("w+") gb_file.write(temp) - + temp.seek(0) gb_file = gb.GenBankFile.read(temp) temp.close() @@ -82,8 +77,8 @@ def test_conversion_lowlevel(path): @pytest.mark.parametrize( "path", - glob.glob(join(data_dir("sequence"), "*.gb")) + \ - glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")) + glob.glob(join(data_dir("sequence"), "*.gb")) + + glob.glob(join(data_dir("sequence"), "[!multifile]*.gp")), ) def test_conversion_highlevel(path): """ @@ -101,44 +96,55 @@ def test_conversion_highlevel(path): gb.set_annotated_sequence(gb_file, ref_annot_seq) temp = TemporaryFile("w+") gb_file.write(temp) - + temp.seek(0) gb_file = gb.GenBankFile.read(temp) temp.close() test_locus = gb.get_locus(gb_file) test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix) assert test_locus == ref_locus - assert test_annot_seq.sequence == ref_annot_seq.sequence - assert test_annot_seq.annotation == ref_annot_seq.annotation + assert test_annot_seq.sequence == ref_annot_seq.sequence + assert test_annot_seq.annotation == ref_annot_seq.annotation assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start def test_genbank_utility_gb(): """ Check whether the high-level utility functions return the expected - content of a known GenBank file. + content of a known GenBank file. """ gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb")) - assert gb.get_locus(gb_file) \ - == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017") - assert gb.get_definition(gb_file) \ - == ("Escherichia coli BL21(DE3), complete genome.") + assert gb.get_locus(gb_file) == ( + "CP001509", + 4558953, + "DNA", + True, + "BCT", + "16-FEB-2017", + ) + assert gb.get_definition(gb_file) == ( + "Escherichia coli BL21(DE3), complete genome." + ) assert gb.get_version(gb_file) == "CP001509.3" assert gb.get_gi(gb_file) == 296142109 - assert gb.get_db_link(gb_file) \ - == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"} + assert gb.get_db_link(gb_file) == { + "BioProject": "PRJNA20713", + "BioSample": "SAMN02603478", + } annotation = gb.get_annotation(gb_file, include_only=["CDS"]) feature = seq.Feature( "CDS", [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)], - {"gene": "yaaA", "transl_table": "11"} + {"gene": "yaaA", "transl_table": "11"}, ) in_annotation = False for f in annotation: - if f.key == feature.key and f.locs == feature.locs and \ - all([(key, val in f.qual.items()) - for key, val in feature.qual.items()]): - in_annotation = True + if ( + f.key == feature.key + and f.locs == feature.locs + and all([(key, val in f.qual.items()) for key, val in feature.qual.items()]) + ): + in_annotation = True assert in_annotation assert len(gb.get_sequence(gb_file, format="gb")) == 4558953 @@ -146,30 +152,34 @@ def test_genbank_utility_gb(): def test_genbank_utility_gp(): """ Check whether the high-level utility functions return the expected - content of a known GenPept file. + content of a known GenPept file. """ gp_file = gb.GenBankFile.read(join(data_dir("sequence"), "bt_lysozyme.gp")) - #[print(e) for e in gp_file._field_pos] - assert gb.get_locus(gp_file) \ - == ("AAC37312", 147, None, False, "MAM", "27-APR-1993") + # [print(e) for e in gp_file._field_pos] + assert gb.get_locus(gp_file) == ("AAC37312", 147, None, False, "MAM", "27-APR-1993") assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]." assert gb.get_version(gp_file) == "AAC37312.1" assert gb.get_gi(gp_file) == 163334 annotation = gb.get_annotation(gp_file) feature = seq.Feature( "Site", - [seq.Location(start, stop) for start, stop in zip( - [52,55,62,76,78,81,117,120,125], - [53,55,62,76,78,81,117,120,126] - )], - {"note": "lysozyme catalytic cleft [active]", "site_type": "active"} + [ + seq.Location(start, stop) + for start, stop in zip( + [52, 55, 62, 76, 78, 81, 117, 120, 125], + [53, 55, 62, 76, 78, 81, 117, 120, 126], + ) + ], + {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}, ) in_annotation = False for f in annotation: - if f.key == feature.key and f.locs == feature.locs and \ - all([(key, val in f.qual.items()) - for key, val in feature.qual.items()]): - in_annotation = True + if ( + f.key == feature.key + and f.locs == feature.locs + and all([(key, val in f.qual.items()) for key, val in feature.qual.items()]) + ): + in_annotation = True assert in_annotation assert len(gb.get_sequence(gp_file, format="gp")) == 147 @@ -184,21 +194,27 @@ def test_multi_file(): "locus_content, expected_result", [ ( - "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006", - ("AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID", 1224, "DNA", False, "VRT", "14-NOV-2006") + "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID 1224 bp DNA linear VRT 14-NOV-2006", + ( + "AJ311647LOOOOOOOOOOOOOOOOOOOOOOOOOONGID", + 1224, + "DNA", + False, + "VRT", + "14-NOV-2006", + ), ), ( - "SCU49845 5028 bp DNA PLN 21-JUN-1999", - ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999") + "SCU49845 5028 bp DNA PLN 21-JUN-1999", + ("SCU49845", 5028, "DNA", False, "PLN", "21-JUN-1999"), ), ( - "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999", - ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999") - ) - ] + "123MissingMolTypeAndCircular 5028 bp PLN 21-JUN-1999", + ("123MissingMolTypeAndCircular", 5028, None, False, "PLN", "21-JUN-1999"), + ), + ], ) def test_parse_locus(locus_content, expected_result): gb_file = gb.GenBankFile() gb_file.append("LOCUS", [locus_content]) assert gb.get_locus(gb_file) == expected_result - \ No newline at end of file diff --git a/tests/sequence/test_generalio.py b/tests/sequence/test_generalio.py index a5b21315b..36ba1a7e4 100644 --- a/tests/sequence/test_generalio.py +++ b/tests/sequence/test_generalio.py @@ -2,33 +2,24 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile -import biotite -import biotite.sequence as seq -import biotite.sequence.io as seqio -import numpy as np import glob from os.path import join -from ..util import data_dir +from tempfile import NamedTemporaryFile import pytest +import biotite.sequence.io as seqio +from tests.util import data_dir -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("sequence"), "random.*")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*"))) def test_loading_single(path): - ref_sequence = seqio.load_sequence( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta")) sequence = seqio.load_sequence(path) assert ref_sequence == sequence @pytest.mark.parametrize("suffix", ["fasta", "fastq"]) def test_saving_single(suffix): - ref_sequence = seqio.load_sequence( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequence = seqio.load_sequence(join(data_dir("sequence"), "random.fasta")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") try: seqio.save_sequence(temp.name, ref_sequence) @@ -37,22 +28,16 @@ def test_saving_single(suffix): pytest.skip("Permission is denied") -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("sequence"), "random.*")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("sequence"), "random.*"))) def test_loading_multiple(path): - ref_sequences = seqio.load_sequences( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequences = seqio.load_sequences(join(data_dir("sequence"), "random.fasta")) sequences = seqio.load_sequences(path) assert ref_sequences == sequences @pytest.mark.parametrize("suffix", ["fasta", "fastq"]) def test_saving_multiple(suffix): - ref_sequences = seqio.load_sequences( - join(data_dir("sequence"), "random.fasta") - ) + ref_sequences = seqio.load_sequences(join(data_dir("sequence"), "random.fasta")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") try: seqio.save_sequences(temp.name, ref_sequences) @@ -60,6 +45,7 @@ def test_saving_multiple(suffix): # This error might occur on AppVeyor pytest.skip("Permission is denied") + @pytest.mark.parametrize("file_name", ["gg_avidin.gb", "bt_lysozyme.gp"]) def test_genbank(file_name): """ @@ -73,4 +59,4 @@ def test_genbank(file_name): seqio.save_sequence(temp.name, sequence) except PermissionError: # This error might occur on AppVeyor - pytest.skip("Permission is denied") \ No newline at end of file + pytest.skip("Permission is denied") diff --git a/tests/sequence/test_gff.py b/tests/sequence/test_gff.py index 5c6ee77b4..0713a8324 100644 --- a/tests/sequence/test_gff.py +++ b/tests/sequence/test_gff.py @@ -2,19 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import TemporaryFile from os.path import join +from tempfile import TemporaryFile +import pytest import biotite.sequence as seq -import biotite.sequence.io.gff as gff import biotite.sequence.io.genbank as gb -import numpy as np -import pytest -from ..util import data_dir +import biotite.sequence.io.gff as gff +from tests.util import data_dir @pytest.mark.parametrize( - "path", - ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] + "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] ) def test_conversion_lowlevel(path): """ @@ -38,8 +36,7 @@ def test_conversion_lowlevel(path): @pytest.mark.parametrize( - "path", - ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] + "path", ["bt_lysozyme.gff3", "gg_avidin.gff3", "ec_bl21.gff3", "sc_chrom1.gff3"] ) def test_conversion_highlevel(path): """ @@ -69,7 +66,7 @@ def test_conversion_highlevel(path): for _, _, type, _, _, _, _, phase, _ in gff_file: if type == "CDS": test_phases.append(phase) - + assert ref_annot == test_annot assert test_phases == ref_phases @@ -87,7 +84,7 @@ def test_genbank_consistency(path): gff_file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3")) test_annot = gff.get_annotation(gff_file) - + # Remove qualifiers, since they will be different # in GFF3 and GenBank ref_annot = seq.Annotation( @@ -115,7 +112,7 @@ def test_file_access(): file. """ file = gff.GFFFile() - entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + entry_scaffold = ("ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) entry = ("a",) + entry_scaffold file.append(*entry) assert file[0] == entry @@ -124,8 +121,11 @@ def test_file_access(): file[1] = ("d",) + entry_scaffold file.insert(3, *(("e",) + entry_scaffold)) del file[2] - assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] \ - == ["a", "d", "e", ] + assert [seqid for seqid, _, _, _, _, _, _, _, _ in file] == [ + "a", + "d", + "e", + ] def test_entry_indexing(): @@ -134,17 +134,14 @@ def test_entry_indexing(): test file with multiple directives, including '##FASTA'. """ with pytest.warns(UserWarning): - file = gff.GFFFile.read( - join(data_dir("sequence"), "indexing_test.gff3") - ) + file = gff.GFFFile.read(join(data_dir("sequence"), "indexing_test.gff3")) assert file._directives == [ ("directive 1", 1), ("directive 2", 2), ("directive 3", 7), ("FASTA", 8), ] - assert file._entries == [3,4,6] - + assert file._entries == [3, 4, 6] def test_percent_encoding(): @@ -153,21 +150,19 @@ def test_percent_encoding(): artificial test file. """ file = gff.GFFFile.read(join(data_dir("sequence"), "percent_test.gff3")) - seqid, source, type, start, end, score, strand, phase, attrib \ - = file[0] + seqid, source, type, start, end, score, strand, phase, attrib = file[0] assert seqid == "123,456" assert source == "ääh" assert type == "regi&n" assert attrib == { - "ID" : "AnID;AnotherID", - "Name" : "Ångström", - "c$l$r": "red\tgreen\tblue" + "ID": "AnID;AnotherID", + "Name": "Ångström", + "c$l$r": "red\tgreen\tblue", } file2 = gff.GFFFile() - file.append(seqid, source, type, start, end, score, strand, phase, attrib) - assert (seqid, source, type, start, end, score, strand, phase, attrib) \ - == file[0] + file2.append(seqid, source, type, start, end, score, strand, phase, attrib) + assert (seqid, source, type, start, end, score, strand, phase, attrib) == file2[0] def test_error(): @@ -177,16 +172,17 @@ def test_error(): file = gff.GFFFile() with pytest.raises(ValueError): # 'seqid' beginning with '>' is not legal - file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append(">xyz", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("", "ab", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append("", "ab", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id":"foo"}) + file.append("xyz", "", "cd", 1, 2, None, None, None, {"Id": "foo"}) with pytest.raises(ValueError): # String fields must not be empty - file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id":"foo"}) + file.append("xyz", "ab", "", 1, 2, None, None, None, {"Id": "foo"}) + def test_feature_without_id(): """ @@ -194,12 +190,14 @@ def test_feature_without_id(): locations and consequently multiple entries in the GFF3 file. """ annot = seq.Annotation( - [seq.Feature( - key = "CDS", - locs = [seq.Location(1,2), seq.Location(4,5)], - qual = {"some" : "qualifiers"} - )] + [ + seq.Feature( + key="CDS", + locs=[seq.Location(1, 2), seq.Location(4, 5)], + qual={"some": "qualifiers"}, + ) + ] ) file = gff.GFFFile() with pytest.raises(ValueError): - gff.set_annotation(file, annot) \ No newline at end of file + gff.set_annotation(file, annot) diff --git a/tests/sequence/test_graphics.py b/tests/sequence/test_graphics.py index bfad27840..cddb45ad6 100644 --- a/tests/sequence/test_graphics.py +++ b/tests/sequence/test_graphics.py @@ -2,23 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import dirname, abspath, join import glob +from os.path import abspath, dirname, join import pytest import biotite.sequence as seq -from ..util import cannot_import +from tests.util import cannot_import -@pytest.mark.skipif( - cannot_import("matplotlib"), reason="Matplotlib is not installed" -) +@pytest.mark.skipif(cannot_import("matplotlib"), reason="Matplotlib is not installed") @pytest.mark.parametrize( - "scheme_path", glob.glob( - join( - dirname(abspath(seq.__file__)), - "graphics", "color_schemes", "*.json" - ) - ) + "scheme_path", + glob.glob( + join(dirname(abspath(seq.__file__)), "graphics", "color_schemes", "*.json") + ), ) def test_load_color_scheme(scheme_path): from matplotlib.colors import to_rgb @@ -27,9 +23,9 @@ def test_load_color_scheme(scheme_path): supported_alphabets = [ seq.NucleotideSequence.alphabet_amb, seq.ProteinSequence.alphabet, - seq.LetterAlphabet("abcdefghijklmnop") # Protein block alphabet + seq.LetterAlphabet("abcdefghijklmnop"), # Protein block alphabet ] - + test_scheme = graphics.load_color_scheme(scheme_path) assert test_scheme["alphabet"] in supported_alphabets @@ -37,4 +33,4 @@ def test_load_color_scheme(scheme_path): for color in test_scheme["colors"]: if color is not None: # Should not raise error - to_rgb(color) \ No newline at end of file + to_rgb(color) diff --git a/tests/sequence/test_phylo.py b/tests/sequence/test_phylo.py index e2e52349c..0943d6002 100644 --- a/tests/sequence/test_phylo.py +++ b/tests/sequence/test_phylo.py @@ -7,7 +7,7 @@ import pytest import biotite import biotite.sequence.phylo as phylo -from ..util import data_dir +from tests.util import data_dir @pytest.fixture @@ -43,10 +43,12 @@ def test_upgma(tree, upgma_newick): for i in range(len(tree)): for j in range(len(tree)): # Check for equal distances and equal topologies - assert tree.get_distance(i,j) \ - == pytest.approx(ref_tree.get_distance(i,j), abs=1e-3) - assert tree.get_distance(i,j, topological=True) \ - == ref_tree.get_distance(i,j, topological=True) + assert tree.get_distance(i, j) == pytest.approx( + ref_tree.get_distance(i, j), abs=1e-3 + ) + assert tree.get_distance(i, j, topological=True) == ref_tree.get_distance( + i, j, topological=True + ) def test_neighbor_joining(): @@ -60,34 +62,36 @@ def test_neighbor_joining(): [ 7, 10, 7, 0, 5, 9], [ 6, 9, 6, 5, 0, 8], [ 8, 11, 8, 9, 8, 0], - ]) - - ref_tree = phylo.Tree(phylo.TreeNode( - [ - phylo.TreeNode( - [ - phylo.TreeNode( - [ - phylo.TreeNode(index=0), - phylo.TreeNode(index=1), - ], - [1,4] - ), - phylo.TreeNode(index=2), - ], - [1, 2] - ), - phylo.TreeNode( - [ - phylo.TreeNode(index=3), - phylo.TreeNode(index=4), - ], - [3,2] - ), - phylo.TreeNode(index=5), - ], - [1,1,5] - )) + ]) # fmt: skip + + ref_tree = phylo.Tree( + phylo.TreeNode( + [ + phylo.TreeNode( + [ + phylo.TreeNode( + [ + phylo.TreeNode(index=0), + phylo.TreeNode(index=1), + ], + [1, 4], + ), + phylo.TreeNode(index=2), + ], + [1, 2], + ), + phylo.TreeNode( + [ + phylo.TreeNode(index=3), + phylo.TreeNode(index=4), + ], + [3, 2], + ), + phylo.TreeNode(index=5), + ], + [1, 1, 5], + ) + ) test_tree = phylo.neighbor_joining(dist) @@ -106,20 +110,20 @@ def test_node_distance(tree): assert leaf.distance_to(tree.root) == dist # Example topological distances assert tree.get_distance(0, 19, True) == 9 - assert tree.get_distance(4, 2, True) == 10 - + assert tree.get_distance(4, 2, True) == 10 + # All pairwise leaf node distances should be sufficient # to reconstruct the same tree via UPGMA ref_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - ref_dist_mat[i,j] = tree.get_distance(i,j) + ref_dist_mat[i, j] = tree.get_distance(i, j) assert np.allclose(ref_dist_mat, ref_dist_mat.T) new_tree = phylo.upgma(ref_dist_mat) test_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - test_dist_mat[i,j] = new_tree.get_distance(i,j) + test_dist_mat[i, j] = new_tree.get_distance(i, j) assert np.allclose(test_dist_mat, ref_dist_mat) @@ -136,19 +140,18 @@ def test_distances(tree): assert leaf.distance_to(tree.root) == dist # Example topological distances assert tree.get_distance(0, 19, True) == 9 - assert tree.get_distance(4, 2, True) == 10 + assert tree.get_distance(4, 2, True) == 10 def test_get_leaves(tree): # Manual example cases - node = tree.leaves[6] assert set(tree.leaves[6].parent.get_indices()) == set( - [6,11,2,3,13,8,14,5,0,15,16] + [6, 11, 2, 3, 13, 8, 14, 5, 0, 15, 16] ) assert set(tree.leaves[10].get_indices()) == set([10]) assert tree.root.get_leaf_count() == 20 - + def test_copy(tree): assert tree is not tree.copy() assert tree == tree.copy() @@ -190,30 +193,33 @@ def test_immutability(): phylo.Tree(node1) -@pytest.mark.parametrize("newick, labels, error", [ - # Reference index out of range - ("((1,0),4),2);", None, biotite.InvalidFileError), - # Empty string - ("", None, biotite.InvalidFileError), - # Empty node - ("();", None, biotite.InvalidFileError), - # Missing brackets - ("((0,1,(2,3));", None, biotite.InvalidFileError), - # A node with three leaves - ("((0,1),(2,3),(4,5));", None, None), - # A node with one leaf - ("((0,1),(2,3),(4));", None, None), - # Named intermediate nodes - ("((0,1,3)A,2)B;", None, None), - # Named intermediate nodes and distances - ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None), - # Nodes with labels - ("((((A,B),(C,D)),E),F);", ["A","B","C","D","E","F"], None), - # Nodes with labels and distances - ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A","B","C","D","E","F"], None), - # Newick with spaces - (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None), -]) +@pytest.mark.parametrize( + "newick, labels, error", + [ + # Reference index out of range + ("((1,0),4),2);", None, biotite.InvalidFileError), + # Empty string + ("", None, biotite.InvalidFileError), + # Empty node + ("();", None, biotite.InvalidFileError), + # Missing brackets + ("((0,1,(2,3));", None, biotite.InvalidFileError), + # A node with three leaves + ("((0,1),(2,3),(4,5));", None, None), + # A node with one leaf + ("((0,1),(2,3),(4));", None, None), + # Named intermediate nodes + ("((0,1,3)A,2)B;", None, None), + # Named intermediate nodes and distances + ("((0:1.0,1:3.0,3:5.0)A:2.0,2:5.0)B;", None, None), + # Nodes with labels + ("((((A,B),(C,D)),E),F);", ["A", "B", "C", "D", "E", "F"], None), + # Nodes with labels and distances + ("((((A:1,B:2),(C:3,D:4)),E:5),F:6);", ["A", "B", "C", "D", "E", "F"], None), + # Newick with spaces + (" ( 0 : 1.0 , 1 : 3.0 ) A ; ", None, None), + ], +) def test_newick_simple(newick, labels, error): # Read, write and read again a Newick notation and expect # the same reult from both reads @@ -223,8 +229,8 @@ def test_newick_simple(newick, labels, error): tree2 = phylo.Tree.from_newick(newick, labels) assert tree1 == tree2 else: - with pytest.raises(error): - tree1 = phylo.Tree.from_newick(newick, labels) + with pytest.raises(error): + tree1 = phylo.Tree.from_newick(newick, labels) @pytest.mark.parametrize("use_labels", [False, True]) @@ -243,14 +249,16 @@ def test_newick_complex(upgma_newick, use_labels): def test_newick_rounding(): # Create the distance matrix distances = np.array( - [[0. , 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76], - [0.53, 0. , 0.59, 0.41, 0.35, 0.87, 1.03, 0.83], - [0.93, 0.59, 0. , 0.16, 0.58, 0.55, 1.59, 1.19], - [0.78, 0.41, 0.16, 0. , 0.42, 0.69, 1.4 , 1.18], - [0.38, 0.35, 0.58, 0.42, 0. , 1.02, 1.11, 0.89], - [0.99, 0.87, 0.55, 0.69, 1.02, 0. , 1.47, 1.26], - [1.02, 1.03, 1.59, 1.4 , 1.11, 1.47, 0. , 1.39], - [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0. ]] + [ + [0.0, 0.53, 0.93, 0.78, 0.38, 0.99, 1.02, 0.76], + [0.53, 0.0, 0.59, 0.41, 0.35, 0.87, 1.03, 0.83], + [0.93, 0.59, 0.0, 0.16, 0.58, 0.55, 1.59, 1.19], + [0.78, 0.41, 0.16, 0.0, 0.42, 0.69, 1.4, 1.18], + [0.38, 0.35, 0.58, 0.42, 0.0, 1.02, 1.11, 0.89], + [0.99, 0.87, 0.55, 0.69, 1.02, 0.0, 1.47, 1.26], + [1.02, 1.03, 1.59, 1.4, 1.11, 1.47, 0.0, 1.39], + [0.76, 0.83, 1.19, 1.18, 0.89, 1.26, 1.39, 0.0], + ] ) # Create the tree tree = phylo.neighbor_joining(distances) @@ -270,12 +278,15 @@ def test_newick_rounding(): ) -@pytest.mark.parametrize("newick_in, exp_newick_out", [ - ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;" ), - ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;" ), - ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"), - ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;" ), -]) +@pytest.mark.parametrize( + "newick_in, exp_newick_out", + [ + ("(0:1.0, 1:2.0);", "(0:1.0,1:2.0):0.0;"), + ("(0:1.0, 1:2.0, 2:3.0);", "((0:1.0,1:2.0):0.0,2:3.0):0.0;"), + ("(((0:1.0, 1:2.0):10.0):5.0, 2:8.0);", "((0:1.0,1:2.0):15.0,2:8.0):0.0;"), + ("((0:1.0, 1:2.0):10.0):5.0;", "(0:1.0,1:2.0):0.0;"), + ], +) def test_as_binary_cases(newick_in, exp_newick_out): """ Test the `as_binary()` function based on known cases. @@ -296,13 +307,13 @@ def test_as_binary_distances(): ref_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - ref_dist_mat[i,j] = tree.get_distance(i,j) - + ref_dist_mat[i, j] = tree.get_distance(i, j) + bin_tree = phylo.as_binary(tree) test_dist_mat = np.zeros((len(tree), len(tree))) for i in range(len(tree)): for j in range(len(tree)): - test_dist_mat[i,j] = bin_tree.get_distance(i,j) + test_dist_mat[i, j] = bin_tree.get_distance(i, j) assert np.allclose(test_dist_mat, ref_dist_mat) @@ -313,26 +324,27 @@ def test_equality(tree): """ assert tree == tree.copy() # Order of children is not important - assert tree == phylo.Tree(phylo.TreeNode( - [tree.root.children[1].copy(), tree.root.children[0].copy()], - [tree.root.children[1].distance, tree.root.children[0].distance] - )) + assert tree == phylo.Tree( + phylo.TreeNode( + [tree.root.children[1].copy(), tree.root.children[0].copy()], + [tree.root.children[1].distance, tree.root.children[0].distance], + ) + ) # Different distance -> Unequal tree - assert tree != phylo.Tree(phylo.TreeNode( - [tree.root.children[0].copy(), tree.root.children[1].copy()], - [tree.root.children[0].distance, 42] - )) + assert tree != phylo.Tree( + phylo.TreeNode( + [tree.root.children[0].copy(), tree.root.children[1].copy()], + [tree.root.children[0].distance, 42], + ) + ) # Additional node -> Unequal tree - assert tree != phylo.Tree(phylo.TreeNode( - [ - tree.root.children[0].copy(), - tree.root.children[1].copy(), - phylo.TreeNode(index=len(tree)) - ], - [ - tree.root.children[0].distance, - tree.root.children[1].distance, - 42 - ] - )) - + assert tree != phylo.Tree( + phylo.TreeNode( + [ + tree.root.children[0].copy(), + tree.root.children[1].copy(), + phylo.TreeNode(index=len(tree)), + ], + [tree.root.children[0].distance, tree.root.children[1].distance, 42], + ) + ) diff --git a/tests/sequence/test_profile.py b/tests/sequence/test_profile.py index 658779bd2..3f7669bbd 100644 --- a/tests/sequence/test_profile.py +++ b/tests/sequence/test_profile.py @@ -11,24 +11,43 @@ def test_from_alignment(): seq1 = seq.NucleotideSequence("CGTCAT") seq2 = seq.NucleotideSequence("TCATGC") - ali_str = ["CGTCAT--", - "--TCATGC"] + ali_str = ["CGTCAT--", "--TCATGC"] trace = align.Alignment.trace_from_strings(ali_str) alignment = align.Alignment([seq1, seq2], trace, None) profile = seq.SequenceProfile.from_alignment(alignment) - symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) assert np.array_equal(symbols, profile.symbols) assert np.array_equal(gaps, profile.gaps) - assert (alphabet == profile.alphabet) + assert alphabet == profile.alphabet def test_to_consensus_nuc(): - symbols = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) profile = seq.SequenceProfile(symbols, gaps, alphabet) @@ -37,8 +56,18 @@ def test_to_consensus_nuc(): def test_to_consensus_nuc_ambiguous(): - symbols = np.array([[1, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]) + symbols = np.array( + [ + [1, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ) gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1]) alphabet = seq.Alphabet(["A", "C", "G", "T"]) profile = seq.SequenceProfile(symbols, gaps, alphabet) @@ -48,45 +77,65 @@ def test_to_consensus_nuc_ambiguous(): def test_to_consensus_prot(): # Avidin protein sequence - seq1 = seq.ProteinSequence("MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP" - "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE") + seq1 = seq.ProteinSequence( + "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQP" + "TFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE" + ) # Streptavidin protein sequence - seq2 = seq.ProteinSequence("MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA" - "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN" - "GNPLDAVQQ") + seq2 = seq.ProteinSequence( + "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSA" + "PATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNN" + "GNPLDAVQQ" + ) matrix = align.SubstitutionMatrix.std_protein_matrix() alignment = align.align_optimal(seq1, seq2, matrix)[0] profile = seq.SequenceProfile.from_alignment(alignment) - assert seq.ProteinSequence("MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD" - "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG" - "INIFNPLDAQKE") == profile.to_consensus() + assert ( + seq.ProteinSequence( + "MRHIATAAIALSLLLLSITALASADPGKDSKAQLSAAEAGITGKWTNDLGSNFIIGAVGADGAFTGTYESAVGNAESNEIKEGPLD" + "GAPATDGKGTALGWTFAFKNNWKFAESATTFSGQCFGGADARINGKELLTKGTMEANAWKSTLLGHDSFSKVKDIAADIDAAKKAG" + "INIFNPLDAQKE" + ) + == profile.to_consensus() + ) def test_new_position_matrices(): - seqs = [seq.NucleotideSequence("AAGAAT"), - seq.NucleotideSequence("ATCATA"), - seq.NucleotideSequence("AAGTAA"), - seq.NucleotideSequence("AACAAA"), - seq.NucleotideSequence("ATTAAA"), - seq.NucleotideSequence("AAGAAT")] + seqs = [ + seq.NucleotideSequence("AAGAAT"), + seq.NucleotideSequence("ATCATA"), + seq.NucleotideSequence("AAGTAA"), + seq.NucleotideSequence("AACAAA"), + seq.NucleotideSequence("ATTAAA"), + seq.NucleotideSequence("AAGAAT"), + ] alignment = align.Alignment( sequences=seqs, - trace=np.tile(np.arange(len(seqs[0])), len(seqs)) \ - .reshape(len(seqs), len(seqs[0])) \ - .transpose(), - score=0 + trace=np.tile(np.arange(len(seqs[0])), len(seqs)) + .reshape(len(seqs), len(seqs[0])) + .transpose(), + score=0, ) profile = seq.SequenceProfile.from_alignment(alignment) - probability_matrix = np.array([[1., 0., 0., 0., ], - [0.66666667, 0., 0., 0.33333333], - [0., 0.33333333, 0.5, 0.16666667], - [0.83333333, 0., 0., 0.16666667], - [0.83333333, 0., 0., 0.16666667], - [0.66666667, 0., 0., 0.33333333]]) + probability_matrix = np.array( + [ + [ + 1.0, + 0.0, + 0.0, + 0.0, + ], + [0.66666667, 0.0, 0.0, 0.33333333], + [0.0, 0.33333333, 0.5, 0.16666667], + [0.83333333, 0.0, 0.0, 0.16666667], + [0.83333333, 0.0, 0.0, 0.16666667], + [0.66666667, 0.0, 0.0, 0.33333333], + ] + ) ppm = profile.probability_matrix() @@ -98,25 +147,35 @@ def test_new_position_matrices(): ppm = profile.probability_matrix(pseudocount=1) - probability_matrix = np.array([[0.89285714, 0.03571429, 0.03571429, 0.03571429], - [0.60714286, 0.03571429, 0.03571429, 0.32142857], - [0.03571429, 0.32142857, 0.46428571, 0.17857143], - [0.75, 0.03571429, 0.03571429, 0.17857143], - [0.75, 0.03571429, 0.03571429, 0.17857143], - [0.60714286, 0.03571429, 0.03571429, 0.32142857]]) + probability_matrix = np.array( + [ + [0.89285714, 0.03571429, 0.03571429, 0.03571429], + [0.60714286, 0.03571429, 0.03571429, 0.32142857], + [0.03571429, 0.32142857, 0.46428571, 0.17857143], + [0.75, 0.03571429, 0.03571429, 0.17857143], + [0.75, 0.03571429, 0.03571429, 0.17857143], + [0.60714286, 0.03571429, 0.03571429, 0.32142857], + ] + ) assert np.allclose(probability_matrix, ppm, atol=1e-3) - probability = profile.sequence_probability(seq.NucleotideSequence("AAAAAA"), pseudocount=1) + probability = profile.sequence_probability( + seq.NucleotideSequence("AAAAAA"), pseudocount=1 + ) assert probability == pytest.approx(0.0066, abs=1e-3) - log_odds_matrix = np.array([[1.83650127, -2.80735492, -2.80735492, -2.80735492], - [1.28010792, -2.80735492, -2.80735492, 0.36257008], - [-2.80735492, 0.36257008, 0.8930848, -0.48542683], - [1.5849625, -2.80735492, -2.80735492, -0.48542683], - [1.5849625, -2.80735492, -2.80735492, -0.48542683], - [1.28010792, -2.80735492, -2.80735492, 0.36257008]]) + log_odds_matrix = np.array( + [ + [1.83650127, -2.80735492, -2.80735492, -2.80735492], + [1.28010792, -2.80735492, -2.80735492, 0.36257008], + [-2.80735492, 0.36257008, 0.8930848, -0.48542683], + [1.5849625, -2.80735492, -2.80735492, -0.48542683], + [1.5849625, -2.80735492, -2.80735492, -0.48542683], + [1.28010792, -2.80735492, -2.80735492, 0.36257008], + ] + ) pwm = profile.log_odds_matrix(pseudocount=1) diff --git a/tests/sequence/test_search.py b/tests/sequence/test_search.py index 7ef2b4618..c2150afac 100644 --- a/tests/sequence/test_search.py +++ b/tests/sequence/test_search.py @@ -3,8 +3,6 @@ # information. import biotite.sequence as seq -import numpy as np -import pytest def test_find_subsequence(): @@ -13,12 +11,13 @@ def test_find_subsequence(): main_seq = seq.NucleotideSequence(string) sub_seq = seq.NucleotideSequence(substring) matches = seq.find_subsequence(main_seq, sub_seq) - assert list(matches) == [4,8] - + assert list(matches) == [4, 8] + + def test_find_symbol(): string = "ATACGCTTGCT" symbol = "T" dna = seq.NucleotideSequence(string) - assert list(seq.find_symbol(dna, symbol)) == [1,6,7,10] + assert list(seq.find_symbol(dna, symbol)) == [1, 6, 7, 10] assert seq.find_symbol_first(dna, symbol) == 1 - assert seq.find_symbol_last(dna, symbol) == 10 \ No newline at end of file + assert seq.find_symbol_last(dna, symbol) == 10 diff --git a/tests/sequence/test_seqtypes.py b/tests/sequence/test_seqtypes.py index 157d8d9ff..086f972a9 100644 --- a/tests/sequence/test_seqtypes.py +++ b/tests/sequence/test_seqtypes.py @@ -2,9 +2,8 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.sequence as seq -import numpy as np import pytest +import biotite.sequence as seq def test_nucleotide_construction(): @@ -23,26 +22,31 @@ def test_reverse_complement(): dna = seq.NucleotideSequence(string) assert str(dna.reverse().complement()) == "RNTAACGCATT" + def test_stop_removal(): string = "LYG*GR*" protein = seq.ProteinSequence(string) assert str(protein.remove_stops()) == string.replace("*", "") -@pytest.mark.parametrize("dna_str, protein_str", - [("CACATAGCATGA", "HIA*"), - ("ATGTAGCTA", "M*L")]) +@pytest.mark.parametrize( + "dna_str, protein_str", [("CACATAGCATGA", "HIA*"), ("ATGTAGCTA", "M*L")] +) def test_full_translation(dna_str, protein_str): dna = seq.NucleotideSequence(dna_str) protein = dna.translate(complete=True) assert protein_str == str(protein) -@pytest.mark.parametrize("dna_str, protein_str_list", - [("CA", []), - ("GAATGCACTGAGATGCAATAG", ["MH*","MQ*"]), - ("ATGCACATGTAGGG", ["MHM*","M*"]), - ("GATGCATGTGAAAA", ["MHVK","M*"])]) +@pytest.mark.parametrize( + "dna_str, protein_str_list", + [ + ("CA", []), + ("GAATGCACTGAGATGCAATAG", ["MH*", "MQ*"]), + ("ATGCACATGTAGGG", ["MHM*", "M*"]), + ("GATGCATGTGAAAA", ["MHVK", "M*"]), + ], +) def test_frame_translation(dna_str, protein_str_list): dna = seq.NucleotideSequence(dna_str) proteins, pos = dna.translate(complete=False) @@ -50,8 +54,8 @@ def test_frame_translation(dna_str, protein_str_list): assert set([str(protein) for protein in proteins]) == set(protein_str_list) # Test if the positions are also right # -> Get sequence slice and translate completely - assert set([str(dna[start : stop].translate(complete=True)) - for start, stop in pos] + assert set( + [str(dna[start:stop].translate(complete=True)) for start, stop in pos] ) == set(protein_str_list) @@ -76,7 +80,7 @@ def test_letter_conversion(): @pytest.mark.parametrize( "monoisotopic, expected_mol_weight_protein", # Reference values taken from https://web.expasy.org/compute_pi/ - [(True, 2231.06), (False, 2232.56)] + [(True, 2231.06), (False, 2232.56)], ) def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein): """ @@ -84,8 +88,5 @@ def test_get_molecular_weight(monoisotopic, expected_mol_weight_protein): correctly. """ protein = seq.ProteinSequence("ACDEFGHIKLMNPQRSTVW") - mol_weight_protein = protein.get_molecular_weight( - monoisotopic=monoisotopic) - assert mol_weight_protein == \ - pytest.approx(expected_mol_weight_protein, abs=1e-2) - + mol_weight_protein = protein.get_molecular_weight(monoisotopic=monoisotopic) + assert mol_weight_protein == pytest.approx(expected_mol_weight_protein, abs=1e-2) diff --git a/tests/sequence/test_sequence.py b/tests/sequence/test_sequence.py index 78a815b5f..bfffaedb5 100644 --- a/tests/sequence/test_sequence.py +++ b/tests/sequence/test_sequence.py @@ -2,8 +2,8 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest import numpy as np +import pytest import biotite.sequence as seq @@ -13,13 +13,15 @@ def test_encoding(): string2 = str(dna) assert string1 == string2 + def test_validity_check(): dna = seq.NucleotideSequence() - dna.code = np.array([0,1,0,3,3]) + dna.code = np.array([0, 1, 0, 3, 3]) assert dna.is_valid() - dna.code = np.array([0,1,4,3,3]) + dna.code = np.array([0, 1, 4, 3, 3]) assert not dna.is_valid() - + + def test_access(): string = "AATGCGTTA" dna = seq.NucleotideSequence(string) @@ -28,6 +30,7 @@ def test_access(): dna = dna[3:-2] assert "GCGT" == str(dna) + def test_manipulation(): dna_seq = seq.NucleotideSequence("ACGTA") dna_copy = dna_seq.copy() @@ -37,12 +40,13 @@ def test_manipulation(): dna_copy[0:2] = dna_copy[3:5] assert "TAGTA" == str(dna_copy) dna_copy = dna_seq.copy() - dna_copy[np.array([True,False,False,False,True])] = "T" + dna_copy[np.array([True, False, False, False, True])] = "T" assert "TCGTT" == str(dna_copy) dna_copy = dna_seq.copy() - dna_copy[1:4] = np.array([0,1,2]) + dna_copy[1:4] = np.array([0, 1, 2]) assert "AACGA" == str(dna_copy) + def test_concatenation(): str1 = "AAGTTA" str2 = "CGA" @@ -54,16 +58,19 @@ def test_concatenation(): concat_seq = seq.NucleotideSequence(str3) + seq.NucleotideSequence(str1) assert str3 + str1 == str(concat_seq) + def test_frequency(): string = "ACGCGAGAAAGCGGG" dna = seq.NucleotideSequence(string) assert dna.get_symbol_frequency() == {"A": 5, "C": 3, "G": 7, "T": 0} - + + def test_alph_error(): string = "AATGCGTUTA" with pytest.raises(seq.AlphabetError): seq.NucleotideSequence(string) + def test_alphabet_extension(): alph1 = seq.Alphabet("abc") alph2 = seq.Alphabet("abc") @@ -73,4 +80,4 @@ def test_alphabet_extension(): assert alph2.extends(alph1) assert not alph3.extends(alph1) assert alph4.extends(alph1) - assert not alph1.extends(alph4) \ No newline at end of file + assert not alph1.extends(alph4) diff --git a/tests/structure/data/1aki.mmtf b/tests/structure/data/1aki.mmtf deleted file mode 100644 index cff67a7ba..000000000 Binary files a/tests/structure/data/1aki.mmtf and /dev/null differ diff --git a/tests/structure/data/1aki.npz b/tests/structure/data/1aki.npz deleted file mode 100644 index 533f48e12..000000000 Binary files a/tests/structure/data/1aki.npz and /dev/null differ diff --git a/tests/structure/data/1crr.mmtf b/tests/structure/data/1crr.mmtf deleted file mode 100644 index 64b2b7e77..000000000 Binary files a/tests/structure/data/1crr.mmtf and /dev/null differ diff --git a/tests/structure/data/1crr.npz b/tests/structure/data/1crr.npz deleted file mode 100644 index 6707a41ef..000000000 Binary files a/tests/structure/data/1crr.npz and /dev/null differ diff --git a/tests/structure/data/1dix.mmtf b/tests/structure/data/1dix.mmtf deleted file mode 100644 index 6815ba732..000000000 Binary files a/tests/structure/data/1dix.mmtf and /dev/null differ diff --git a/tests/structure/data/1dix.npz b/tests/structure/data/1dix.npz deleted file mode 100644 index dd343f81b..000000000 Binary files a/tests/structure/data/1dix.npz and /dev/null differ diff --git a/tests/structure/data/1f2n.mmtf b/tests/structure/data/1f2n.mmtf deleted file mode 100644 index 810001ddc..000000000 Binary files a/tests/structure/data/1f2n.mmtf and /dev/null differ diff --git a/tests/structure/data/1f2n.npz b/tests/structure/data/1f2n.npz deleted file mode 100644 index 51f42e3da..000000000 Binary files a/tests/structure/data/1f2n.npz and /dev/null differ diff --git a/tests/structure/data/1gya.mmtf b/tests/structure/data/1gya.mmtf deleted file mode 100644 index ac3f74517..000000000 Binary files a/tests/structure/data/1gya.mmtf and /dev/null differ diff --git a/tests/structure/data/1gya.npz b/tests/structure/data/1gya.npz deleted file mode 100644 index 134ecbd98..000000000 Binary files a/tests/structure/data/1gya.npz and /dev/null differ diff --git a/tests/structure/data/1igy.mmtf b/tests/structure/data/1igy.mmtf deleted file mode 100644 index 17abe292a..000000000 Binary files a/tests/structure/data/1igy.mmtf and /dev/null differ diff --git a/tests/structure/data/1igy.npz b/tests/structure/data/1igy.npz deleted file mode 100644 index 4575dc284..000000000 Binary files a/tests/structure/data/1igy.npz and /dev/null differ diff --git a/tests/structure/data/1l2y.mmtf b/tests/structure/data/1l2y.mmtf deleted file mode 100644 index 8ed066093..000000000 Binary files a/tests/structure/data/1l2y.mmtf and /dev/null differ diff --git a/tests/structure/data/1l2y.npz b/tests/structure/data/1l2y.npz deleted file mode 100644 index 3738a6fc9..000000000 Binary files a/tests/structure/data/1l2y.npz and /dev/null differ diff --git a/tests/structure/data/1l2y.tng b/tests/structure/data/1l2y.tng deleted file mode 100644 index de71cdf3a..000000000 Binary files a/tests/structure/data/1l2y.tng and /dev/null differ diff --git a/tests/structure/data/1o1z.mmtf b/tests/structure/data/1o1z.mmtf deleted file mode 100644 index 8646d2cff..000000000 Binary files a/tests/structure/data/1o1z.mmtf and /dev/null differ diff --git a/tests/structure/data/1o1z.npz b/tests/structure/data/1o1z.npz deleted file mode 100644 index 147abfcbd..000000000 Binary files a/tests/structure/data/1o1z.npz and /dev/null differ diff --git a/tests/structure/data/2axd.mmtf b/tests/structure/data/2axd.mmtf deleted file mode 100644 index f7151958b..000000000 Binary files a/tests/structure/data/2axd.mmtf and /dev/null differ diff --git a/tests/structure/data/2d0f.mmtf b/tests/structure/data/2d0f.mmtf deleted file mode 100644 index 08678d613..000000000 Binary files a/tests/structure/data/2d0f.mmtf and /dev/null differ diff --git a/tests/structure/data/2d0f.npz b/tests/structure/data/2d0f.npz deleted file mode 100644 index b9cd24ee8..000000000 Binary files a/tests/structure/data/2d0f.npz and /dev/null differ diff --git a/tests/structure/data/3o5r.mmtf b/tests/structure/data/3o5r.mmtf deleted file mode 100644 index 24f328e7d..000000000 Binary files a/tests/structure/data/3o5r.mmtf and /dev/null differ diff --git a/tests/structure/data/3o5r.npz b/tests/structure/data/3o5r.npz deleted file mode 100644 index 0ec79942b..000000000 Binary files a/tests/structure/data/3o5r.npz and /dev/null differ diff --git a/tests/structure/data/4gxy.mmtf b/tests/structure/data/4gxy.mmtf deleted file mode 100644 index 02bd05703..000000000 Binary files a/tests/structure/data/4gxy.mmtf and /dev/null differ diff --git a/tests/structure/data/4gxy.npz b/tests/structure/data/4gxy.npz deleted file mode 100644 index 077caefb9..000000000 Binary files a/tests/structure/data/4gxy.npz and /dev/null differ diff --git a/tests/structure/data/4p5j.mmtf b/tests/structure/data/4p5j.mmtf deleted file mode 100644 index 4633b1d14..000000000 Binary files a/tests/structure/data/4p5j.mmtf and /dev/null differ diff --git a/tests/structure/data/4p5j.npz b/tests/structure/data/4p5j.npz deleted file mode 100644 index 1b7b22463..000000000 Binary files a/tests/structure/data/4p5j.npz and /dev/null differ diff --git a/tests/structure/data/5eil.mmtf b/tests/structure/data/5eil.mmtf deleted file mode 100644 index 9973da4d0..000000000 Binary files a/tests/structure/data/5eil.mmtf and /dev/null differ diff --git a/tests/structure/data/5eil.npz b/tests/structure/data/5eil.npz deleted file mode 100644 index 56cdbc74a..000000000 Binary files a/tests/structure/data/5eil.npz and /dev/null differ diff --git a/tests/structure/data/5h73.mmtf b/tests/structure/data/5h73.mmtf deleted file mode 100644 index a9036dc69..000000000 Binary files a/tests/structure/data/5h73.mmtf and /dev/null differ diff --git a/tests/structure/data/5h73.npz b/tests/structure/data/5h73.npz deleted file mode 100644 index 8171e6660..000000000 Binary files a/tests/structure/data/5h73.npz and /dev/null differ diff --git a/tests/structure/data/5ugo.mmtf b/tests/structure/data/5ugo.mmtf deleted file mode 100644 index 947dcaa1a..000000000 Binary files a/tests/structure/data/5ugo.mmtf and /dev/null differ diff --git a/tests/structure/data/5ugo.npz b/tests/structure/data/5ugo.npz deleted file mode 100644 index da1d6070c..000000000 Binary files a/tests/structure/data/5ugo.npz and /dev/null differ diff --git a/tests/structure/data/5zng.mmtf b/tests/structure/data/5zng.mmtf deleted file mode 100644 index affa05611..000000000 Binary files a/tests/structure/data/5zng.mmtf and /dev/null differ diff --git a/tests/structure/data/5zng.npz b/tests/structure/data/5zng.npz deleted file mode 100644 index afa1e2a22..000000000 Binary files a/tests/structure/data/5zng.npz and /dev/null differ diff --git a/tests/structure/data/7gsa.mmtf b/tests/structure/data/7gsa.mmtf deleted file mode 100644 index e0c18fe6f..000000000 Binary files a/tests/structure/data/7gsa.mmtf and /dev/null differ diff --git a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py index c81f9e050..e0c2fa1f4 100644 --- a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py +++ b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py @@ -1,35 +1,35 @@ -import pandas as pd import argparse -import numpy as np import json +import numpy as np +import pandas as pd + def process(input, output, chain): data = pd.read_csv(input) # Only retain rows with basepair annotation - data = data[data['Leontis-Westhof'].notna()] + data = data[data["Leontis-Westhof"].notna()] output_list = [] for _, row in data.iterrows(): - - nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]] # Extract the Leontis-Westhof annotation - lw_string = row['Leontis-Westhof'] + lw_string = row["Leontis-Westhof"] # Some interactions are labelled with `n` for near. These are # ignored - if lw_string[0] == 'n': + if lw_string[0] == "n": continue # Get sugar orientation from string (`c` = cis, `t` = trans) sugar_orientation = lw_string[0] # The residue ids of the nucleotides - res_ids = [None]*2 + res_ids = [None] * 2 for i, nucleotide in enumerate(nucleotides): - nucleotide_list = nucleotide.split('.') + nucleotide_list = nucleotide.split(".") # if the nucleotide is not part of the specified chain, skip # base pair @@ -41,37 +41,28 @@ def process(input, output, chain): if None in res_ids: continue - if sugar_orientation == 'c': + if sugar_orientation == "c": sugar_orientation = 1 - elif sugar_orientation == 't': + elif sugar_orientation == "t": sugar_orientation = 2 this_output = sorted((int(res_ids[0]), int(res_ids[1]))) this_output.append(int(sugar_orientation)) output_list.append(this_output) output_list = np.unique(output_list, axis=0).tolist() - with open(output, 'w') as f: + with open(output, "w") as f: json.dump(output_list, f, indent=1) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Parse the glycosidic bond orientation annotations in the " "NAKB-database for a specific chain. The annotations can be " "downloaded in the section 'Base Pairs'." ) - parser.add_argument( - "infile", - help="The path to the input file." - ) - parser.add_argument( - "outfile", - help="The path to the output JSON file." - ) - parser.add_argument( - "chain", - help="The chain ID to be extracted." - ) + parser.add_argument("infile", help="The path to the input file.") + parser.add_argument("outfile", help="The path to the output JSON file.") + parser.add_argument("chain", help="The chain ID to be extracted.") args = parser.parse_args() process(args.infile, args.outfile, args.chain) - diff --git a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py index 1a46eb4d3..bdcd1f586 100644 --- a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py +++ b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py @@ -1,36 +1,37 @@ -import pandas as pd import argparse import json import numpy as np +import pandas as pd + def process(input, output, chain): data = pd.read_csv(input) # Only retain rows with basepair annotation - data = data[data['Leontis-Westhof'].notna()] + data = data[data["Leontis-Westhof"].notna()] output_list = [] for _, row in data.iterrows(): - nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + nucleotides = [row["Nucleotide 1"], row["Nucleotide 2"]] # Extract the Leontis-Westhof annotation - lw_string = row['Leontis-Westhof'] + lw_string = row["Leontis-Westhof"] # Some interactions are labelled with `n` for near. These are # ignored - if lw_string[0] == 'n': + if lw_string[0] == "n": continue # Get edge annotations from string edges = [lw_string[-2], lw_string[-1]] - + # Dont allow unspecified edges in test data - if '.' in edges: + if "." in edges: continue - res_ids = [None]*2 + res_ids = [None] * 2 for i, nucleotide in enumerate(nucleotides): - nucleotide_list = nucleotide.split('.') + nucleotide_list = nucleotide.split(".") # if the nucleotide is not part of the specified chain, skip # base pair @@ -43,11 +44,11 @@ def process(input, output, chain): continue for i, edge in enumerate(edges): - if edge == 'W': + if edge == "W": edges[i] = 1 - if edge == 'H': + if edge == "H": edges[i] = 2 - if edge == 'S': + if edge == "S": edges[i] = 3 # Lower residue id on the left, higher residue id on the right @@ -62,28 +63,19 @@ def process(input, output, chain): ) output_list = np.unique(output_list, axis=0).tolist() - with open(output, 'w') as f: + with open(output, "w") as f: json.dump(output_list, f, indent=1) + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Parse the edge type annotations in the NAKB-database for " "a specific chain. The annotations can be downloaded in the section " "'Base Pairs'." ) - parser.add_argument( - "infile", - help="The path to the input file." - ) - parser.add_argument( - "outfile", - help="The path to the output JSON file." - ) - parser.add_argument( - "chain", - help="The chain ID to be extracted." - ) + parser.add_argument("infile", help="The path to the input file.") + parser.add_argument("outfile", help="The path to the output JSON file.") + parser.add_argument("chain", help="The chain ID to be extracted.") args = parser.parse_args() process(args.infile, args.outfile, args.chain) - diff --git a/tests/structure/data/create_test_structures.py b/tests/structure/data/create_test_structures.py index cbe2216ab..da0f0ff48 100644 --- a/tests/structure/data/create_test_structures.py +++ b/tests/structure/data/create_test_structures.py @@ -1,74 +1,85 @@ import argparse -import subprocess -from os.path import join import logging -import os +import subprocess import sys +from os.path import join import biotite -from biotite.database import RequestError import biotite.database.rcsb as rcsb import biotite.structure.io as strucio +from biotite.database import RequestError def create(pdb_id, directory, include_gro): - # Create *.pdb", *.cif and *.mmtf - for file_format in ["pdb", "cif", "bcif", "mmtf"]: + # Create *.pdb", *.cif and *.bcif + for file_format in ["pdb", "cif", "bcif"]: try: rcsb.fetch(pdb_id, file_format, directory, overwrite=True) except RequestError: # PDB entry is not provided in this format pass try: - array = strucio.load_structure(join(directory, pdb_id+".pdb")) + array = strucio.load_structure(join(directory, pdb_id + ".pdb")) except biotite.InvalidFileError: # Structure probably contains multiple models with different # number of atoms # -> Cannot load AtomArrayStack - # -> Skip writing GRO and NPZ file + # -> Skip writing GRO file return - # Create *.gro file - strucio.save_structure(join(directory, pdb_id+".npz"), array) # Create *.gro files using GROMACS # Clean PDB file -> remove inscodes and altlocs if include_gro: cleaned_file_name = biotite.temp_file("pdb") strucio.save_structure(cleaned_file_name, array) # Run GROMACS for file conversion - subprocess.run([ - "editconf", - "-f", cleaned_file_name, - "-o", join(directory, pdb_id+".gro") - ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run( + [ + "editconf", + "-f", + cleaned_file_name, + "-o", + join(directory, pdb_id + ".gro"), + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Create structure files for unit tests " - "in all supported formats from PDB ID " - "(excluding GROMACS trajectory files)" - ) - parser.add_argument( - "--dir", "-d", dest="directory", default=".", - help="the Biotite project directory to put the test files into" + "in all supported formats from PDB ID " + "(excluding GROMACS trajectory files)" ) parser.add_argument( - "--id", "-i", dest="id", - help="the PDB ID" + "--dir", + "-d", + dest="directory", + default=".", + help="the Biotite project directory to put the test files into", ) + parser.add_argument("--id", "-i", dest="id", help="the PDB ID") parser.add_argument( - "--file", "-f", dest="file", - help="read mutliple PDB IDs from text file (line break separated IDs)" + "--file", + "-f", + dest="file", + help="read mutliple PDB IDs from text file (line break separated IDs)", ) parser.add_argument( - "--gromacs", "-g", action="store_true", dest="include_gro", - help="Create '*.gro' files using the Gromacs software" + "--gromacs", + "-g", + action="store_true", + dest="include_gro", + help="Create '*.gro' files using the Gromacs software", ) args = parser.parse_args() if args.file is not None: with open(args.file, "r") as file: - pdb_ids = [pdb_id.strip().lower() for pdb_id - in file.read().split("\n") if len(pdb_id.strip()) != 0] + pdb_ids = [ + pdb_id.strip().lower() + for pdb_id in file.read().split("\n") + if len(pdb_id.strip()) != 0 + ] elif args.id is not None: pdb_ids = [args.id.lower()] else: @@ -81,4 +92,4 @@ def create(pdb_id, directory, include_gro): create(pdb_id, args.directory, args.include_gro) except: print() - raise \ No newline at end of file + raise diff --git a/tests/structure/data/misc/README.rst b/tests/structure/data/misc/README.rst new file mode 100644 index 000000000..8fde76f6f --- /dev/null +++ b/tests/structure/data/misc/README.rst @@ -0,0 +1,4 @@ +Miscellaneous data sets +======================= + +The datasets were created using the corresponding ``.py`` files with the same prefix. \ No newline at end of file diff --git a/tests/structure/data/misc/hbond.json b/tests/structure/data/misc/hbond.json new file mode 100644 index 000000000..fde90b766 --- /dev/null +++ b/tests/structure/data/misc/hbond.json @@ -0,0 +1,8085 @@ +{ + "1l2y": { + "single_model": [ + [ + 56, + 64, + 3 + ], + [ + 75, + 84, + 3 + ], + [ + 92, + 106, + 19 + ], + [ + 100, + 111, + 229 + ], + [ + 116, + 124, + 38 + ], + [ + 135, + 144, + 59 + ], + [ + 143, + 155, + 82 + ], + [ + 169, + 173, + 119 + ], + [ + 176, + 180, + 95 + ], + [ + 197, + 203, + 172 + ], + [ + 208, + 214, + 179 + ], + [ + 208, + 214, + 186 + ], + [ + 233, + 245, + 213 + ], + [ + 213, + 218, + 160 + ] + ], + "all_models": [ + [ + 0, + 9, + 6 + ], + [ + 0, + 9, + 297 + ], + [ + 0, + 10, + 6 + ], + [ + 16, + 24, + 6 + ], + [ + 16, + 24, + 7 + ], + [ + 35, + 47, + 3 + ], + [ + 35, + 47, + 6 + ], + [ + 56, + 64, + 3 + ], + [ + 75, + 84, + 3 + ], + [ + 75, + 84, + 19 + ], + [ + 92, + 106, + 19 + ], + [ + 100, + 111, + 229 + ], + [ + 116, + 124, + 38 + ], + [ + 135, + 144, + 59 + ], + [ + 135, + 144, + 78 + ], + [ + 143, + 154, + 82 + ], + [ + 143, + 154, + 163 + ], + [ + 143, + 155, + 82 + ], + [ + 143, + 155, + 138 + ], + [ + 157, + 165, + 78 + ], + [ + 157, + 165, + 95 + ], + [ + 169, + 173, + 119 + ], + [ + 176, + 180, + 95 + ], + [ + 197, + 203, + 172 + ], + [ + 197, + 203, + 179 + ], + [ + 208, + 214, + 179 + ], + [ + 208, + 214, + 186 + ], + [ + 219, + 223, + 186 + ], + [ + 226, + 237, + 211 + ], + [ + 233, + 245, + 164 + ], + [ + 233, + 245, + 211 + ], + [ + 233, + 245, + 213 + ], + [ + 235, + 246, + 164 + ], + [ + 235, + 247, + 164 + ], + [ + 236, + 248, + 164 + ], + [ + 236, + 249, + 164 + ], + [ + 202, + 207, + 172 + ], + [ + 202, + 207, + 200 + ], + [ + 213, + 218, + 95 + ], + [ + 213, + 218, + 160 + ], + [ + 213, + 218, + 163 + ], + [ + 213, + 218, + 164 + ], + [ + 213, + 218, + 179 + ], + [ + 297, + 303, + 281 + ], + [ + 297, + 303, + 295 + ], + [ + 297, + 303, + 298 + ] + ] + }, + "1gya": { + "single_model": [ + [ + 96, + 104, + 1609 + ], + [ + 240, + 248, + 1150 + ], + [ + 456, + 464, + 1425 + ], + [ + 468, + 476, + 1425 + ], + [ + 480, + 488, + 734 + ], + [ + 499, + 508, + 1395 + ], + [ + 521, + 535, + 707 + ], + [ + 529, + 540, + 1088 + ], + [ + 545, + 554, + 1357 + ], + [ + 560, + 569, + 644 + ], + [ + 607, + 615, + 563 + ], + [ + 641, + 650, + 563 + ], + [ + 685, + 693, + 524 + ], + [ + 704, + 709, + 524 + ], + [ + 714, + 723, + 866 + ], + [ + 731, + 742, + 483 + ], + [ + 812, + 821, + 778 + ], + [ + 863, + 874, + 717 + ], + [ + 968, + 980, + 923 + ], + [ + 989, + 998, + 1128 + ], + [ + 1030, + 1041, + 1095 + ], + [ + 1085, + 1089, + 1033 + ], + [ + 1092, + 1099, + 1078 + ], + [ + 1106, + 1114, + 276 + ], + [ + 1125, + 1134, + 992 + ], + [ + 1133, + 1144, + 234 + ], + [ + 1147, + 1155, + 243 + ], + [ + 1333, + 1345, + 1628 + ], + [ + 1354, + 1363, + 548 + ], + [ + 1376, + 1383, + 1587 + ], + [ + 1392, + 1398, + 502 + ], + [ + 1403, + 1411, + 1553 + ], + [ + 1422, + 1434, + 471 + ], + [ + 1443, + 1451, + 1501 + ], + [ + 1455, + 1462, + 428 + ], + [ + 1491, + 1495, + 1446 + ], + [ + 1498, + 1507, + 1449 + ], + [ + 1534, + 1541, + 1501 + ], + [ + 1584, + 1593, + 1379 + ], + [ + 1625, + 1636, + 1336 + ], + [ + 1657, + 1665, + 1305 + ], + [ + 1433, + 1442, + 1527 + ] + ], + "all_models": [ + [ + 96, + 104, + 1609 + ], + [ + 115, + 124, + 122 + ], + [ + 130, + 137, + 1648 + ], + [ + 168, + 172, + 1679 + ], + [ + 204, + 208, + 1209 + ], + [ + 204, + 208, + 1225 + ], + [ + 211, + 220, + 1206 + ], + [ + 211, + 220, + 1209 + ], + [ + 228, + 236, + 218 + ], + [ + 240, + 248, + 1150 + ], + [ + 266, + 271, + 299 + ], + [ + 273, + 281, + 1109 + ], + [ + 348, + 359, + 326 + ], + [ + 413, + 421, + 407 + ], + [ + 413, + 421, + 419 + ], + [ + 425, + 433, + 405 + ], + [ + 425, + 433, + 419 + ], + [ + 425, + 433, + 420 + ], + [ + 456, + 464, + 1425 + ], + [ + 468, + 476, + 1425 + ], + [ + 480, + 488, + 734 + ], + [ + 499, + 508, + 1395 + ], + [ + 507, + 519, + 760 + ], + [ + 521, + 535, + 707 + ], + [ + 529, + 540, + 1088 + ], + [ + 545, + 554, + 1357 + ], + [ + 560, + 569, + 644 + ], + [ + 568, + 580, + 614 + ], + [ + 568, + 581, + 614 + ], + [ + 582, + 589, + 548 + ], + [ + 582, + 589, + 1317 + ], + [ + 596, + 602, + 587 + ], + [ + 607, + 615, + 563 + ], + [ + 607, + 615, + 601 + ], + [ + 607, + 615, + 613 + ], + [ + 619, + 628, + 563 + ], + [ + 619, + 628, + 585 + ], + [ + 641, + 650, + 563 + ], + [ + 685, + 693, + 524 + ], + [ + 704, + 709, + 524 + ], + [ + 714, + 723, + 866 + ], + [ + 722, + 729, + 507 + ], + [ + 731, + 742, + 483 + ], + [ + 775, + 784, + 474 + ], + [ + 812, + 821, + 778 + ], + [ + 863, + 874, + 717 + ], + [ + 920, + 929, + 886 + ], + [ + 968, + 980, + 923 + ], + [ + 989, + 998, + 1128 + ], + [ + 997, + 1008, + 949 + ], + [ + 997, + 1010, + 1014 + ], + [ + 1030, + 1041, + 1095 + ], + [ + 1085, + 1089, + 1033 + ], + [ + 1092, + 1099, + 1078 + ], + [ + 1106, + 1114, + 276 + ], + [ + 1125, + 1134, + 992 + ], + [ + 1133, + 1144, + 234 + ], + [ + 1133, + 1144, + 235 + ], + [ + 1147, + 1155, + 243 + ], + [ + 1166, + 1175, + 945 + ], + [ + 1166, + 1175, + 957 + ], + [ + 1194, + 1202, + 207 + ], + [ + 1194, + 1202, + 1206 + ], + [ + 1206, + 1214, + 211 + ], + [ + 1206, + 1214, + 214 + ], + [ + 1273, + 1281, + 1228 + ], + [ + 1285, + 1294, + 1228 + ], + [ + 1285, + 1294, + 1250 + ], + [ + 1293, + 1300, + 1288 + ], + [ + 1302, + 1310, + 1276 + ], + [ + 1314, + 1322, + 587 + ], + [ + 1314, + 1322, + 1308 + ], + [ + 1333, + 1345, + 1628 + ], + [ + 1354, + 1363, + 548 + ], + [ + 1362, + 1374, + 553 + ], + [ + 1376, + 1383, + 1587 + ], + [ + 1392, + 1398, + 502 + ], + [ + 1403, + 1411, + 1553 + ], + [ + 1422, + 1434, + 471 + ], + [ + 1422, + 1434, + 474 + ], + [ + 1443, + 1451, + 1449 + ], + [ + 1443, + 1451, + 1501 + ], + [ + 1455, + 1462, + 428 + ], + [ + 1469, + 1478, + 1449 + ], + [ + 1469, + 1478, + 1450 + ], + [ + 1491, + 1495, + 1446 + ], + [ + 1491, + 1495, + 1458 + ], + [ + 1498, + 1507, + 1449 + ], + [ + 1520, + 1528, + 1527 + ], + [ + 1527, + 1532, + 1520 + ], + [ + 1527, + 1532, + 1576 + ], + [ + 1534, + 1541, + 1501 + ], + [ + 1550, + 1558, + 1406 + ], + [ + 1584, + 1593, + 1379 + ], + [ + 1606, + 1614, + 75 + ], + [ + 1625, + 1636, + 1336 + ], + [ + 1645, + 1653, + 99 + ], + [ + 1645, + 1653, + 135 + ], + [ + 1657, + 1665, + 1305 + ], + [ + 1676, + 1685, + 133 + ], + [ + 1698, + 1706, + 1292 + ], + [ + 1717, + 1726, + 171 + ], + [ + 1749, + 1761, + 1720 + ], + [ + 1749, + 1761, + 1724 + ], + [ + 63, + 68, + 61 + ], + [ + 135, + 140, + 99 + ], + [ + 407, + 412, + 413 + ], + [ + 587, + 592, + 585 + ], + [ + 587, + 592, + 596 + ], + [ + 587, + 592, + 1333 + ], + [ + 1097, + 1102, + 298 + ], + [ + 1344, + 1353, + 979 + ], + [ + 1344, + 1353, + 1276 + ], + [ + 1433, + 1442, + 1397 + ], + [ + 1433, + 1442, + 1527 + ] + ] + }, + "1igy": { + "single_model": [ + [ + 28, + 38, + 206 + ], + [ + 50, + 57, + 190 + ], + [ + 87, + 95, + 952 + ], + [ + 104, + 112, + 974 + ], + [ + 113, + 119, + 142 + ], + [ + 134, + 143, + 116 + ], + [ + 161, + 168, + 700 + ], + [ + 178, + 186, + 682 + ], + [ + 196, + 202, + 661 + ], + [ + 203, + 212, + 31 + ], + [ + 216, + 221, + 643 + ], + [ + 222, + 228, + 18 + ], + [ + 259, + 266, + 246 + ], + [ + 267, + 274, + 629 + ], + [ + 276, + 288, + 254 + ], + [ + 298, + 304, + 828 + ], + [ + 322, + 334, + 808 + ], + [ + 336, + 345, + 420 + ], + [ + 348, + 357, + 785 + ], + [ + 356, + 358, + 2404 + ], + [ + 390, + 399, + 363 + ], + [ + 417, + 426, + 339 + ], + [ + 425, + 428, + 544 + ], + [ + 439, + 447, + 309 + ], + [ + 490, + 498, + 487 + ], + [ + 497, + 500, + 460 + ], + [ + 510, + 514, + 590 + ], + [ + 518, + 530, + 442 + ], + [ + 561, + 569, + 567 + ], + [ + 579, + 584, + 764 + ], + [ + 579, + 584, + 765 + ], + [ + 580, + 586, + 764 + ], + [ + 587, + 598, + 557 + ], + [ + 599, + 606, + 691 + ], + [ + 613, + 619, + 673 + ], + [ + 626, + 632, + 652 + ], + [ + 688, + 695, + 602 + ], + [ + 697, + 705, + 164 + ], + [ + 706, + 712, + 573 + ], + [ + 767, + 775, + 745 + ], + [ + 782, + 790, + 351 + ], + [ + 791, + 803, + 943 + ], + [ + 811, + 816, + 808 + ], + [ + 825, + 829, + 301 + ], + [ + 830, + 839, + 907 + ], + [ + 842, + 846, + 279 + ], + [ + 904, + 911, + 837 + ], + [ + 925, + 929, + 821 + ], + [ + 935, + 939, + 46 + ], + [ + 940, + 947, + 794 + ], + [ + 949, + 958, + 69 + ], + [ + 957, + 961, + 789 + ], + [ + 962, + 970, + 779 + ], + [ + 971, + 980, + 90 + ], + [ + 981, + 989, + 1556 + ], + [ + 990, + 999, + 107 + ], + [ + 1010, + 1015, + 1006 + ], + [ + 1010, + 1015, + 1023 + ], + [ + 1012, + 1017, + 1594 + ], + [ + 1035, + 1040, + 1283 + ], + [ + 1088, + 1099, + 1213 + ], + [ + 1140, + 1149, + 1119 + ], + [ + 1152, + 1160, + 1117 + ], + [ + 1170, + 1176, + 1143 + ], + [ + 1188, + 1193, + 1694 + ], + [ + 1194, + 1200, + 1148 + ], + [ + 1202, + 1209, + 1676 + ], + [ + 1210, + 1217, + 1091 + ], + [ + 1225, + 1236, + 1074 + ], + [ + 1237, + 1245, + 1642 + ], + [ + 1268, + 1279, + 1620 + ], + [ + 1280, + 1292, + 1038 + ], + [ + 1332, + 1340, + 1863 + ], + [ + 1351, + 1360, + 1847 + ], + [ + 1380, + 1389, + 1831 + ], + [ + 1388, + 1391, + 1851 + ], + [ + 1393, + 1401, + 1427 + ], + [ + 1402, + 1410, + 1432 + ], + [ + 1416, + 1422, + 1431 + ], + [ + 1487, + 1495, + 1667 + ], + [ + 1578, + 1587, + 1567 + ], + [ + 1591, + 1599, + 1567 + ], + [ + 1600, + 1606, + 1564 + ], + [ + 1617, + 1629, + 1271 + ], + [ + 1648, + 1654, + 1510 + ], + [ + 1664, + 1671, + 1490 + ], + [ + 1682, + 1689, + 1477 + ], + [ + 1741, + 1753, + 1703 + ], + [ + 1755, + 1764, + 1712 + ], + [ + 1765, + 1776, + 1725 + ], + [ + 1765, + 1776, + 1734 + ], + [ + 1775, + 1780, + 1728 + ], + [ + 1782, + 1792, + 1744 + ], + [ + 1795, + 1803, + 1409 + ], + [ + 1806, + 1812, + 1408 + ], + [ + 1814, + 1826, + 1977 + ], + [ + 1828, + 1835, + 1383 + ], + [ + 1837, + 1843, + 1956 + ], + [ + 1860, + 1867, + 1335 + ], + [ + 1875, + 1880, + 1895 + ], + [ + 1875, + 1880, + 1900 + ], + [ + 1878, + 1881, + 1297 + ], + [ + 1882, + 1891, + 1875 + ], + [ + 1890, + 1894, + 1885 + ], + [ + 1895, + 1902, + 1875 + ], + [ + 1904, + 1910, + 1900 + ], + [ + 1936, + 1944, + 1857 + ], + [ + 1997, + 2008, + 1798 + ], + [ + 2053, + 2062, + 2251 + ], + [ + 2075, + 2084, + 2232 + ], + [ + 2105, + 2109, + 2102 + ], + [ + 2110, + 2115, + 2124 + ], + [ + 2135, + 2140, + 3127 + ], + [ + 2184, + 2191, + 2858 + ], + [ + 2205, + 2213, + 2836 + ], + [ + 2229, + 2238, + 2078 + ], + [ + 2314, + 2326, + 2287 + ], + [ + 2384, + 2393, + 2467 + ], + [ + 2392, + 2394, + 2957 + ], + [ + 2392, + 2396, + 2419 + ], + [ + 2397, + 2406, + 2941 + ], + [ + 2418, + 2423, + 2471 + ], + [ + 2418, + 2423, + 2472 + ], + [ + 2438, + 2447, + 2412 + ], + [ + 2446, + 2449, + 2429 + ], + [ + 2446, + 2449, + 2436 + ], + [ + 2446, + 2449, + 2438 + ], + [ + 2464, + 2473, + 2387 + ], + [ + 2490, + 2498, + 2362 + ], + [ + 2518, + 2526, + 2340 + ], + [ + 2534, + 2536, + 2550 + ], + [ + 2628, + 2637, + 2636 + ], + [ + 2636, + 2639, + 2677 + ], + [ + 2657, + 2668, + 2620 + ], + [ + 2669, + 2678, + 2631 + ], + [ + 2691, + 2700, + 2660 + ], + [ + 2699, + 2702, + 2643 + ], + [ + 2704, + 2709, + 2660 + ], + [ + 2710, + 2717, + 2845 + ], + [ + 2719, + 2727, + 2614 + ], + [ + 2746, + 2751, + 2317 + ], + [ + 2747, + 2753, + 2317 + ], + [ + 2772, + 2776, + 2558 + ], + [ + 2788, + 2794, + 2757 + ], + [ + 2816, + 2821, + 2225 + ], + [ + 2829, + 2831, + 2812 + ], + [ + 2842, + 2852, + 2713 + ], + [ + 2855, + 2863, + 2187 + ], + [ + 2864, + 2870, + 2694 + ], + [ + 2880, + 2888, + 2168 + ], + [ + 2889, + 2896, + 2922 + ], + [ + 2906, + 2914, + 2912 + ], + [ + 2915, + 2923, + 2892 + ], + [ + 2915, + 2923, + 2901 + ], + [ + 2924, + 2930, + 2901 + ], + [ + 2938, + 2945, + 2400 + ], + [ + 2946, + 2958, + 3101 + ], + [ + 2960, + 2972, + 2378 + ], + [ + 3006, + 3015, + 2331 + ], + [ + 3068, + 3075, + 3034 + ], + [ + 3116, + 3123, + 2935 + ], + [ + 3124, + 3131, + 2119 + ], + [ + 3141, + 3147, + 2138 + ], + [ + 3157, + 3162, + 3146 + ], + [ + 3171, + 3175, + 3179 + ], + [ + 3185, + 3192, + 3431 + ], + [ + 3224, + 3236, + 3382 + ], + [ + 3245, + 3253, + 3370 + ], + [ + 3272, + 3278, + 3263 + ], + [ + 3300, + 3302, + 3275 + ], + [ + 3304, + 3311, + 3289 + ], + [ + 3358, + 3366, + 3745 + ], + [ + 3372, + 3378, + 3729 + ], + [ + 3379, + 3387, + 3227 + ], + [ + 3388, + 3395, + 3712 + ], + [ + 3396, + 3405, + 3211 + ], + [ + 3409, + 3413, + 3689 + ], + [ + 3414, + 3426, + 3689 + ], + [ + 3428, + 3439, + 3188 + ], + [ + 3472, + 3479, + 3894 + ], + [ + 3489, + 3496, + 3875 + ], + [ + 3514, + 3522, + 3859 + ], + [ + 3525, + 3531, + 3878 + ], + [ + 3525, + 3531, + 3879 + ], + [ + 3538, + 3544, + 3517 + ], + [ + 3546, + 3554, + 3501 + ], + [ + 3584, + 3594, + 3737 + ], + [ + 3606, + 3617, + 3721 + ], + [ + 3631, + 3638, + 3703 + ], + [ + 3648, + 3657, + 3680 + ], + [ + 3677, + 3685, + 3651 + ], + [ + 3709, + 3717, + 3391 + ], + [ + 3726, + 3732, + 3375 + ], + [ + 3734, + 3740, + 3587 + ], + [ + 3742, + 3749, + 3361 + ], + [ + 3750, + 3757, + 3574 + ], + [ + 3774, + 3780, + 3327 + ], + [ + 3806, + 3811, + 4007 + ], + [ + 3829, + 3838, + 3826 + ], + [ + 3856, + 3863, + 3520 + ], + [ + 3865, + 3871, + 3981 + ], + [ + 3872, + 3880, + 3492 + ], + [ + 3879, + 3881, + 3514 + ], + [ + 3883, + 3890, + 3964 + ], + [ + 3891, + 3896, + 3475 + ], + [ + 3906, + 3909, + 3443 + ], + [ + 3923, + 3929, + 3913 + ], + [ + 3931, + 3937, + 3900 + ], + [ + 3931, + 3937, + 3913 + ], + [ + 3961, + 3968, + 3886 + ], + [ + 3978, + 3987, + 3868 + ], + [ + 4004, + 4012, + 3851 + ], + [ + 4081, + 4083, + 4076 + ], + [ + 4100, + 4108, + 4089 + ], + [ + 4166, + 4172, + 4163 + ], + [ + 4182, + 4193, + 4380 + ], + [ + 4203, + 4214, + 4364 + ], + [ + 4262, + 4270, + 4232 + ], + [ + 4271, + 4278, + 4232 + ], + [ + 4280, + 4288, + 4252 + ], + [ + 4289, + 4297, + 4252 + ], + [ + 4316, + 4323, + 4292 + ], + [ + 4353, + 4360, + 4815 + ], + [ + 4361, + 4368, + 4206 + ], + [ + 4377, + 4384, + 4185 + ], + [ + 4385, + 4392, + 4785 + ], + [ + 4393, + 4400, + 4163 + ], + [ + 4410, + 4418, + 4396 + ], + [ + 4440, + 4448, + 4422 + ], + [ + 4449, + 4457, + 4446 + ], + [ + 4483, + 4492, + 4468 + ], + [ + 4507, + 4513, + 4979 + ], + [ + 4523, + 4530, + 4803 + ], + [ + 4531, + 4542, + 4959 + ], + [ + 4551, + 4559, + 4534 + ], + [ + 4589, + 4596, + 4587 + ], + [ + 4619, + 4624, + 4615 + ], + [ + 4714, + 4725, + 4747 + ], + [ + 4753, + 4764, + 4413 + ], + [ + 4812, + 4820, + 4801 + ], + [ + 4828, + 4836, + 4336 + ], + [ + 4859, + 4868, + 4866 + ], + [ + 4896, + 4904, + 4849 + ], + [ + 4905, + 4913, + 4862 + ], + [ + 4916, + 4920, + 4874 + ], + [ + 4916, + 4920, + 4883 + ], + [ + 4929, + 4932, + 4546 + ], + [ + 4934, + 4943, + 4557 + ], + [ + 4956, + 4965, + 4534 + ], + [ + 4993, + 5000, + 5067 + ], + [ + 5001, + 5009, + 4478 + ], + [ + 5020, + 5025, + 5017 + ], + [ + 5091, + 5095, + 4177 + ], + [ + 5122, + 5131, + 5119 + ], + [ + 5130, + 5133, + 6024 + ], + [ + 5152, + 5156, + 5138 + ], + [ + 5182, + 5193, + 5462 + ], + [ + 5192, + 5197, + 5441 + ], + [ + 5212, + 5221, + 5432 + ], + [ + 5232, + 5244, + 5414 + ], + [ + 5255, + 5263, + 5398 + ], + [ + 5298, + 5307, + 5305 + ], + [ + 5365, + 5367, + 5354 + ], + [ + 5370, + 5377, + 5845 + ], + [ + 5386, + 5394, + 5825 + ], + [ + 5395, + 5402, + 5258 + ], + [ + 5404, + 5410, + 5804 + ], + [ + 5411, + 5419, + 5235 + ], + [ + 5420, + 5428, + 5782 + ], + [ + 5459, + 5470, + 5185 + ], + [ + 5506, + 5513, + 5998 + ], + [ + 5523, + 5532, + 5982 + ], + [ + 5541, + 5548, + 5806 + ], + [ + 5583, + 5592, + 5564 + ], + [ + 5625, + 5628, + 5618 + ], + [ + 5629, + 5641, + 5611 + ], + [ + 5643, + 5652, + 5812 + ], + [ + 5667, + 5674, + 5790 + ], + [ + 5713, + 5721, + 5748 + ], + [ + 5722, + 5729, + 5719 + ], + [ + 5745, + 5751, + 5737 + ], + [ + 5779, + 5786, + 5423 + ], + [ + 5787, + 5799, + 5672 + ], + [ + 5801, + 5807, + 5407 + ], + [ + 5809, + 5818, + 5646 + ], + [ + 5817, + 5820, + 11881 + ], + [ + 5817, + 5820, + 11882 + ], + [ + 5822, + 5830, + 5389 + ], + [ + 5842, + 5849, + 5373 + ], + [ + 5850, + 5859, + 5858 + ], + [ + 5890, + 5893, + 5858 + ], + [ + 5894, + 5908, + 5853 + ], + [ + 5910, + 5919, + 5865 + ], + [ + 5920, + 5925, + 5878 + ], + [ + 5926, + 5930, + 5897 + ], + [ + 5951, + 5962, + 6134 + ], + [ + 5963, + 5970, + 5552 + ], + [ + 5972, + 5978, + 6113 + ], + [ + 5979, + 5985, + 5526 + ], + [ + 5987, + 5994, + 6094 + ], + [ + 5995, + 6003, + 5509 + ], + [ + 6010, + 6015, + 5491 + ], + [ + 6013, + 6016, + 5462 + ], + [ + 6032, + 6040, + 6007 + ], + [ + 6047, + 6052, + 6061 + ], + [ + 6065, + 6075, + 6035 + ], + [ + 6078, + 6088, + 6044 + ], + [ + 6091, + 6098, + 5990 + ], + [ + 6110, + 6119, + 5975 + ], + [ + 6131, + 6139, + 5954 + ], + [ + 6190, + 6200, + 6368 + ], + [ + 6212, + 6219, + 6352 + ], + [ + 6249, + 6257, + 7114 + ], + [ + 6266, + 6274, + 7136 + ], + [ + 6275, + 6281, + 6304 + ], + [ + 6296, + 6305, + 6278 + ], + [ + 6323, + 6330, + 6862 + ], + [ + 6358, + 6364, + 6823 + ], + [ + 6365, + 6374, + 6193 + ], + [ + 6378, + 6383, + 6805 + ], + [ + 6384, + 6390, + 6180 + ], + [ + 6421, + 6428, + 6408 + ], + [ + 6429, + 6436, + 6791 + ], + [ + 6438, + 6450, + 6416 + ], + [ + 6460, + 6466, + 6990 + ], + [ + 6484, + 6496, + 6970 + ], + [ + 6498, + 6507, + 6582 + ], + [ + 6510, + 6519, + 6947 + ], + [ + 6552, + 6561, + 6525 + ], + [ + 6579, + 6588, + 6501 + ], + [ + 6587, + 6590, + 6706 + ], + [ + 6601, + 6609, + 6471 + ], + [ + 6610, + 6618, + 6471 + ], + [ + 6619, + 6631, + 6647 + ], + [ + 6652, + 6660, + 6649 + ], + [ + 6659, + 6662, + 6622 + ], + [ + 6672, + 6676, + 6752 + ], + [ + 6680, + 6692, + 6604 + ], + [ + 6723, + 6731, + 6729 + ], + [ + 6741, + 6746, + 6926 + ], + [ + 6741, + 6746, + 6927 + ], + [ + 6742, + 6748, + 6926 + ], + [ + 6749, + 6760, + 6719 + ], + [ + 6761, + 6768, + 6853 + ], + [ + 6775, + 6781, + 6835 + ], + [ + 6788, + 6794, + 6814 + ], + [ + 6850, + 6857, + 6764 + ], + [ + 6859, + 6867, + 6326 + ], + [ + 6868, + 6874, + 6735 + ], + [ + 6920, + 6928, + 6895 + ], + [ + 6929, + 6937, + 6907 + ], + [ + 6944, + 6952, + 6513 + ], + [ + 6953, + 6965, + 7105 + ], + [ + 6973, + 6978, + 6970 + ], + [ + 6992, + 7001, + 7069 + ], + [ + 7004, + 7008, + 6441 + ], + [ + 7066, + 7073, + 6999 + ], + [ + 7087, + 7091, + 6983 + ], + [ + 7097, + 7101, + 6208 + ], + [ + 7102, + 7109, + 6956 + ], + [ + 7111, + 7120, + 6231 + ], + [ + 7119, + 7121, + 6951 + ], + [ + 7124, + 7132, + 6941 + ], + [ + 7133, + 7142, + 6252 + ], + [ + 7143, + 7151, + 7718 + ], + [ + 7152, + 7161, + 6269 + ], + [ + 7172, + 7177, + 7168 + ], + [ + 7172, + 7177, + 7185 + ], + [ + 7174, + 7178, + 7765 + ], + [ + 7174, + 7179, + 7756 + ], + [ + 7197, + 7202, + 7445 + ], + [ + 7216, + 7223, + 7411 + ], + [ + 7250, + 7261, + 7375 + ], + [ + 7302, + 7311, + 7281 + ], + [ + 7314, + 7322, + 7279 + ], + [ + 7332, + 7338, + 7305 + ], + [ + 7350, + 7355, + 7856 + ], + [ + 7356, + 7362, + 7310 + ], + [ + 7364, + 7371, + 7838 + ], + [ + 7372, + 7379, + 7253 + ], + [ + 7399, + 7407, + 7804 + ], + [ + 7408, + 7416, + 7219 + ], + [ + 7430, + 7441, + 7782 + ], + [ + 7442, + 7454, + 7200 + ], + [ + 7485, + 7493, + 7466 + ], + [ + 7494, + 7502, + 8025 + ], + [ + 7513, + 7522, + 8009 + ], + [ + 7542, + 7551, + 7993 + ], + [ + 7550, + 7553, + 8013 + ], + [ + 7555, + 7563, + 7589 + ], + [ + 7564, + 7572, + 7594 + ], + [ + 7578, + 7584, + 7593 + ], + [ + 7649, + 7657, + 7829 + ], + [ + 7740, + 7749, + 7729 + ], + [ + 7753, + 7761, + 7729 + ], + [ + 7762, + 7768, + 7726 + ], + [ + 7779, + 7791, + 7433 + ], + [ + 7810, + 7816, + 7672 + ], + [ + 7826, + 7833, + 7652 + ], + [ + 7844, + 7851, + 7639 + ], + [ + 7903, + 7915, + 7865 + ], + [ + 7917, + 7926, + 7874 + ], + [ + 7927, + 7938, + 7887 + ], + [ + 7927, + 7938, + 7896 + ], + [ + 7937, + 7942, + 7890 + ], + [ + 7944, + 7954, + 7906 + ], + [ + 7957, + 7965, + 7571 + ], + [ + 7968, + 7974, + 7570 + ], + [ + 7976, + 7988, + 8139 + ], + [ + 7990, + 7997, + 7545 + ], + [ + 7999, + 8005, + 8118 + ], + [ + 8016, + 8021, + 8101 + ], + [ + 8022, + 8029, + 7497 + ], + [ + 8037, + 8042, + 8057 + ], + [ + 8037, + 8042, + 8062 + ], + [ + 8040, + 8043, + 7459 + ], + [ + 8044, + 8053, + 8037 + ], + [ + 8057, + 8064, + 8037 + ], + [ + 8066, + 8072, + 8062 + ], + [ + 8098, + 8106, + 8019 + ], + [ + 8183, + 8185, + 8155 + ], + [ + 8215, + 8224, + 8413 + ], + [ + 8237, + 8246, + 8394 + ], + [ + 8267, + 8271, + 8264 + ], + [ + 8272, + 8277, + 8286 + ], + [ + 8297, + 8302, + 9289 + ], + [ + 8346, + 8353, + 9020 + ], + [ + 8367, + 8375, + 8998 + ], + [ + 8391, + 8400, + 8240 + ], + [ + 8476, + 8488, + 8449 + ], + [ + 8546, + 8555, + 8629 + ], + [ + 8554, + 8556, + 8581 + ], + [ + 8554, + 8557, + 9119 + ], + [ + 8559, + 8568, + 9103 + ], + [ + 8580, + 8585, + 8633 + ], + [ + 8580, + 8585, + 8634 + ], + [ + 8600, + 8609, + 8574 + ], + [ + 8608, + 8611, + 8591 + ], + [ + 8608, + 8611, + 8598 + ], + [ + 8626, + 8635, + 8549 + ], + [ + 8652, + 8660, + 8524 + ], + [ + 8680, + 8688, + 8502 + ], + [ + 8696, + 8698, + 8712 + ], + [ + 8790, + 8799, + 8798 + ], + [ + 8798, + 8801, + 8839 + ], + [ + 8819, + 8830, + 8782 + ], + [ + 8831, + 8840, + 8793 + ], + [ + 8853, + 8862, + 8822 + ], + [ + 8866, + 8871, + 8822 + ], + [ + 8872, + 8879, + 9007 + ], + [ + 8881, + 8889, + 8776 + ], + [ + 8908, + 8913, + 8479 + ], + [ + 8909, + 8915, + 8479 + ], + [ + 8934, + 8938, + 8720 + ], + [ + 8950, + 8956, + 8919 + ], + [ + 8978, + 8983, + 8387 + ], + [ + 8995, + 9003, + 8370 + ], + [ + 9004, + 9014, + 8875 + ], + [ + 9017, + 9025, + 8349 + ], + [ + 9026, + 9032, + 8856 + ], + [ + 9042, + 9050, + 8330 + ], + [ + 9051, + 9058, + 9084 + ], + [ + 9068, + 9076, + 9074 + ], + [ + 9077, + 9085, + 9063 + ], + [ + 9086, + 9092, + 9063 + ], + [ + 9100, + 9107, + 8562 + ], + [ + 9108, + 9120, + 9263 + ], + [ + 9122, + 9134, + 8540 + ], + [ + 9168, + 9177, + 8493 + ], + [ + 9230, + 9237, + 9196 + ], + [ + 9278, + 9285, + 9097 + ], + [ + 9286, + 9293, + 8281 + ], + [ + 9303, + 9309, + 8300 + ], + [ + 9319, + 9324, + 9308 + ], + [ + 9333, + 9337, + 9341 + ], + [ + 9347, + 9354, + 9593 + ], + [ + 9386, + 9398, + 9544 + ], + [ + 9407, + 9415, + 9532 + ], + [ + 9434, + 9440, + 9425 + ], + [ + 9462, + 9464, + 9437 + ], + [ + 9466, + 9473, + 9451 + ], + [ + 9520, + 9528, + 9907 + ], + [ + 9534, + 9540, + 9891 + ], + [ + 9541, + 9549, + 9389 + ], + [ + 9550, + 9557, + 9874 + ], + [ + 9558, + 9567, + 9373 + ], + [ + 9571, + 9575, + 9851 + ], + [ + 9576, + 9588, + 9851 + ], + [ + 9590, + 9601, + 9350 + ], + [ + 9634, + 9641, + 10056 + ], + [ + 9651, + 9658, + 10037 + ], + [ + 9676, + 9684, + 10021 + ], + [ + 9687, + 9693, + 10040 + ], + [ + 9687, + 9693, + 10041 + ], + [ + 9700, + 9706, + 9679 + ], + [ + 9708, + 9716, + 9663 + ], + [ + 9746, + 9756, + 9899 + ], + [ + 9768, + 9779, + 9883 + ], + [ + 9793, + 9800, + 9865 + ], + [ + 9810, + 9819, + 9842 + ], + [ + 9839, + 9847, + 9813 + ], + [ + 9871, + 9879, + 9553 + ], + [ + 9888, + 9894, + 9537 + ], + [ + 9896, + 9902, + 9749 + ], + [ + 9904, + 9911, + 9523 + ], + [ + 9912, + 9919, + 9736 + ], + [ + 9936, + 9942, + 9489 + ], + [ + 9959, + 9970, + 9939 + ], + [ + 9969, + 9975, + 10183 + ], + [ + 9991, + 10000, + 9988 + ], + [ + 10018, + 10025, + 9682 + ], + [ + 10027, + 10033, + 10143 + ], + [ + 10034, + 10042, + 9654 + ], + [ + 10041, + 10043, + 9676 + ], + [ + 10045, + 10052, + 10126 + ], + [ + 10053, + 10058, + 9637 + ], + [ + 10068, + 10071, + 9605 + ], + [ + 10085, + 10091, + 10075 + ], + [ + 10093, + 10099, + 10062 + ], + [ + 10093, + 10099, + 10075 + ], + [ + 10123, + 10130, + 10048 + ], + [ + 10140, + 10149, + 10030 + ], + [ + 10166, + 10174, + 10013 + ], + [ + 10175, + 10182, + 10156 + ], + [ + 10199, + 10203, + 9419 + ], + [ + 10200, + 10206, + 7276 + ], + [ + 10328, + 10334, + 10325 + ], + [ + 10344, + 10355, + 10542 + ], + [ + 10365, + 10376, + 10526 + ], + [ + 10424, + 10432, + 10394 + ], + [ + 10433, + 10440, + 10394 + ], + [ + 10442, + 10450, + 10414 + ], + [ + 10451, + 10459, + 10414 + ], + [ + 10478, + 10485, + 10454 + ], + [ + 10515, + 10522, + 10977 + ], + [ + 10523, + 10530, + 10368 + ], + [ + 10539, + 10546, + 10347 + ], + [ + 10547, + 10554, + 10947 + ], + [ + 10555, + 10562, + 10325 + ], + [ + 10572, + 10580, + 10558 + ], + [ + 10602, + 10610, + 10584 + ], + [ + 10611, + 10619, + 10608 + ], + [ + 10645, + 10654, + 10630 + ], + [ + 10669, + 10675, + 11141 + ], + [ + 10685, + 10692, + 10965 + ], + [ + 10693, + 10704, + 11121 + ], + [ + 10713, + 10721, + 10696 + ], + [ + 10751, + 10758, + 10749 + ], + [ + 10781, + 10786, + 10777 + ], + [ + 10876, + 10887, + 10909 + ], + [ + 10915, + 10926, + 10575 + ], + [ + 10974, + 10982, + 10963 + ], + [ + 10990, + 10998, + 10498 + ], + [ + 11021, + 11030, + 11028 + ], + [ + 11033, + 11041, + 11002 + ], + [ + 11042, + 11056, + 11011 + ], + [ + 11058, + 11066, + 11011 + ], + [ + 11067, + 11075, + 11024 + ], + [ + 11078, + 11082, + 11036 + ], + [ + 11091, + 11093, + 10708 + ], + [ + 11096, + 11105, + 10719 + ], + [ + 11118, + 11127, + 10696 + ], + [ + 11155, + 11162, + 11229 + ], + [ + 11163, + 11171, + 10640 + ], + [ + 11182, + 11187, + 11179 + ], + [ + 11253, + 11257, + 10339 + ], + [ + 11284, + 11293, + 11281 + ], + [ + 11292, + 11295, + 12186 + ], + [ + 11314, + 11318, + 11300 + ], + [ + 11344, + 11355, + 11624 + ], + [ + 11354, + 11359, + 11603 + ], + [ + 11374, + 11383, + 11594 + ], + [ + 11394, + 11406, + 11576 + ], + [ + 11417, + 11425, + 11560 + ], + [ + 11460, + 11469, + 11467 + ], + [ + 11527, + 11531, + 11516 + ], + [ + 11532, + 11539, + 12007 + ], + [ + 11548, + 11556, + 11987 + ], + [ + 11557, + 11564, + 11420 + ], + [ + 11566, + 11572, + 11966 + ], + [ + 11573, + 11581, + 11397 + ], + [ + 11582, + 11590, + 11944 + ], + [ + 11621, + 11632, + 11347 + ], + [ + 11668, + 11675, + 12160 + ], + [ + 11685, + 11694, + 12144 + ], + [ + 11703, + 11710, + 11968 + ], + [ + 11745, + 11754, + 11726 + ], + [ + 11787, + 11790, + 11780 + ], + [ + 11791, + 11803, + 11773 + ], + [ + 11805, + 11814, + 11974 + ], + [ + 11829, + 11836, + 11952 + ], + [ + 11875, + 11883, + 11910 + ], + [ + 11884, + 11891, + 11881 + ], + [ + 11907, + 11913, + 11899 + ], + [ + 11941, + 11948, + 11585 + ], + [ + 11949, + 11961, + 11834 + ], + [ + 11963, + 11969, + 11569 + ], + [ + 11971, + 11980, + 11808 + ], + [ + 11979, + 11983, + 5719 + ], + [ + 11979, + 11983, + 5720 + ], + [ + 11984, + 11992, + 11551 + ], + [ + 12000, + 12002, + 5719 + ], + [ + 12004, + 12011, + 11535 + ], + [ + 12012, + 12021, + 12020 + ], + [ + 12052, + 12055, + 12020 + ], + [ + 12056, + 12070, + 12015 + ], + [ + 12072, + 12081, + 12027 + ], + [ + 12082, + 12087, + 12040 + ], + [ + 12088, + 12092, + 12059 + ], + [ + 12088, + 12092, + 12075 + ], + [ + 12113, + 12124, + 12296 + ], + [ + 12125, + 12132, + 11714 + ], + [ + 12134, + 12140, + 12275 + ], + [ + 12141, + 12147, + 11688 + ], + [ + 12149, + 12156, + 12256 + ], + [ + 12157, + 12165, + 11671 + ], + [ + 12172, + 12177, + 11653 + ], + [ + 12175, + 12178, + 11624 + ], + [ + 12194, + 12202, + 12169 + ], + [ + 12209, + 12214, + 12223 + ], + [ + 12227, + 12237, + 12197 + ], + [ + 12240, + 12250, + 12206 + ], + [ + 12253, + 12260, + 12152 + ], + [ + 12272, + 12281, + 12137 + ], + [ + 12293, + 12301, + 12116 + ], + [ + 174, + 177, + 178 + ], + [ + 272, + 275, + 262 + ], + [ + 675, + 678, + 679 + ], + [ + 711, + 713, + 709 + ], + [ + 866, + 868, + 864 + ], + [ + 1175, + 1177, + 1173 + ], + [ + 1512, + 1514, + 1518 + ], + [ + 1628, + 1630, + 979 + ], + [ + 1687, + 1690, + 1199 + ], + [ + 1865, + 1868, + 1863 + ], + [ + 1900, + 1903, + 1875 + ], + [ + 1909, + 1911, + 1926 + ], + [ + 1971, + 1973, + 1974 + ], + [ + 2181, + 2183, + 2867 + ], + [ + 2333, + 2336, + 2331 + ], + [ + 2558, + 2560, + 2548 + ], + [ + 2614, + 2616, + 2719 + ], + [ + 2733, + 2736, + 2737 + ], + [ + 2785, + 2787, + 2757 + ], + [ + 2785, + 2787, + 2760 + ], + [ + 2793, + 2795, + 2791 + ], + [ + 2812, + 2815, + 2819 + ], + [ + 3146, + 3148, + 3157 + ], + [ + 3329, + 3331, + 3316 + ], + [ + 3543, + 3545, + 3546 + ], + [ + 3665, + 3667, + 3663 + ], + [ + 3705, + 3708, + 3391 + ], + [ + 3787, + 3789, + 3790 + ], + [ + 3826, + 3828, + 3793 + ], + [ + 3826, + 3828, + 3829 + ], + [ + 4163, + 4165, + 4166 + ], + [ + 4366, + 4369, + 4370 + ], + [ + 4615, + 4618, + 4619 + ], + [ + 4615, + 4618, + 4625 + ], + [ + 4642, + 4645, + 4640 + ], + [ + 5017, + 5019, + 5020 + ], + [ + 5383, + 5385, + 5837 + ], + [ + 6280, + 6282, + 7155 + ], + [ + 6336, + 6339, + 6340 + ], + [ + 6389, + 6391, + 6180 + ], + [ + 6495, + 6497, + 9186 + ], + [ + 6569, + 6571, + 6567 + ], + [ + 7028, + 7030, + 7026 + ], + [ + 7071, + 7074, + 6999 + ], + [ + 7674, + 7676, + 7680 + ], + [ + 7775, + 7778, + 7760 + ], + [ + 7849, + 7852, + 7361 + ], + [ + 8062, + 8065, + 8037 + ], + [ + 8071, + 8073, + 8088 + ], + [ + 8088, + 8090, + 8071 + ], + [ + 8495, + 8498, + 8493 + ], + [ + 8712, + 8714, + 8696 + ], + [ + 8720, + 8722, + 8934 + ], + [ + 8776, + 8778, + 8881 + ], + [ + 8947, + 8949, + 8919 + ], + [ + 8947, + 8949, + 8922 + ], + [ + 9056, + 9059, + 9084 + ], + [ + 9133, + 9135, + 6518 + ], + [ + 9705, + 9707, + 9679 + ], + [ + 9827, + 9829, + 9825 + ], + [ + 10106, + 10109, + 10104 + ], + [ + 10283, + 10286, + 10281 + ], + [ + 10325, + 10327, + 10328 + ], + [ + 10438, + 10441, + 10394 + ], + [ + 10491, + 10494, + 10489 + ], + [ + 10674, + 10676, + 10672 + ], + [ + 10804, + 10807, + 10802 + ], + [ + 10903, + 10905, + 10894 + ], + [ + 10965, + 10967, + 10802 + ], + [ + 11179, + 11181, + 11182 + ], + [ + 11281, + 11283, + 11284 + ], + [ + 11545, + 11547, + 11999 + ], + [ + 12130, + 12133, + 12290 + ] + ], + "all_models": [ + [ + 28, + 38, + 206 + ], + [ + 50, + 57, + 190 + ], + [ + 87, + 95, + 952 + ], + [ + 104, + 112, + 974 + ], + [ + 113, + 119, + 142 + ], + [ + 134, + 143, + 116 + ], + [ + 161, + 168, + 700 + ], + [ + 178, + 186, + 682 + ], + [ + 196, + 202, + 661 + ], + [ + 203, + 212, + 31 + ], + [ + 216, + 221, + 643 + ], + [ + 222, + 228, + 18 + ], + [ + 259, + 266, + 246 + ], + [ + 267, + 274, + 629 + ], + [ + 276, + 288, + 254 + ], + [ + 298, + 304, + 828 + ], + [ + 322, + 334, + 808 + ], + [ + 336, + 345, + 420 + ], + [ + 348, + 357, + 785 + ], + [ + 356, + 358, + 2404 + ], + [ + 390, + 399, + 363 + ], + [ + 417, + 426, + 339 + ], + [ + 425, + 428, + 544 + ], + [ + 439, + 447, + 309 + ], + [ + 490, + 498, + 487 + ], + [ + 497, + 500, + 460 + ], + [ + 510, + 514, + 590 + ], + [ + 518, + 530, + 442 + ], + [ + 561, + 569, + 567 + ], + [ + 579, + 584, + 764 + ], + [ + 579, + 584, + 765 + ], + [ + 580, + 586, + 764 + ], + [ + 587, + 598, + 557 + ], + [ + 599, + 606, + 691 + ], + [ + 613, + 619, + 673 + ], + [ + 626, + 632, + 652 + ], + [ + 688, + 695, + 602 + ], + [ + 697, + 705, + 164 + ], + [ + 706, + 712, + 573 + ], + [ + 767, + 775, + 745 + ], + [ + 782, + 790, + 351 + ], + [ + 791, + 803, + 943 + ], + [ + 811, + 816, + 808 + ], + [ + 825, + 829, + 301 + ], + [ + 830, + 839, + 907 + ], + [ + 842, + 846, + 279 + ], + [ + 904, + 911, + 837 + ], + [ + 925, + 929, + 821 + ], + [ + 935, + 939, + 46 + ], + [ + 940, + 947, + 794 + ], + [ + 949, + 958, + 69 + ], + [ + 957, + 961, + 789 + ], + [ + 962, + 970, + 779 + ], + [ + 971, + 980, + 90 + ], + [ + 981, + 989, + 1556 + ], + [ + 990, + 999, + 107 + ], + [ + 1010, + 1015, + 1006 + ], + [ + 1010, + 1015, + 1023 + ], + [ + 1012, + 1017, + 1594 + ], + [ + 1035, + 1040, + 1283 + ], + [ + 1088, + 1099, + 1213 + ], + [ + 1140, + 1149, + 1119 + ], + [ + 1152, + 1160, + 1117 + ], + [ + 1170, + 1176, + 1143 + ], + [ + 1188, + 1193, + 1694 + ], + [ + 1194, + 1200, + 1148 + ], + [ + 1202, + 1209, + 1676 + ], + [ + 1210, + 1217, + 1091 + ], + [ + 1225, + 1236, + 1074 + ], + [ + 1237, + 1245, + 1642 + ], + [ + 1268, + 1279, + 1620 + ], + [ + 1280, + 1292, + 1038 + ], + [ + 1332, + 1340, + 1863 + ], + [ + 1351, + 1360, + 1847 + ], + [ + 1380, + 1389, + 1831 + ], + [ + 1388, + 1391, + 1851 + ], + [ + 1393, + 1401, + 1427 + ], + [ + 1402, + 1410, + 1432 + ], + [ + 1416, + 1422, + 1431 + ], + [ + 1487, + 1495, + 1667 + ], + [ + 1578, + 1587, + 1567 + ], + [ + 1591, + 1599, + 1567 + ], + [ + 1600, + 1606, + 1564 + ], + [ + 1617, + 1629, + 1271 + ], + [ + 1648, + 1654, + 1510 + ], + [ + 1664, + 1671, + 1490 + ], + [ + 1682, + 1689, + 1477 + ], + [ + 1741, + 1753, + 1703 + ], + [ + 1755, + 1764, + 1712 + ], + [ + 1765, + 1776, + 1725 + ], + [ + 1765, + 1776, + 1734 + ], + [ + 1775, + 1780, + 1728 + ], + [ + 1782, + 1792, + 1744 + ], + [ + 1795, + 1803, + 1409 + ], + [ + 1806, + 1812, + 1408 + ], + [ + 1814, + 1826, + 1977 + ], + [ + 1828, + 1835, + 1383 + ], + [ + 1837, + 1843, + 1956 + ], + [ + 1860, + 1867, + 1335 + ], + [ + 1875, + 1880, + 1895 + ], + [ + 1875, + 1880, + 1900 + ], + [ + 1878, + 1881, + 1297 + ], + [ + 1882, + 1891, + 1875 + ], + [ + 1890, + 1894, + 1885 + ], + [ + 1895, + 1902, + 1875 + ], + [ + 1904, + 1910, + 1900 + ], + [ + 1936, + 1944, + 1857 + ], + [ + 1997, + 2008, + 1798 + ], + [ + 2053, + 2062, + 2251 + ], + [ + 2075, + 2084, + 2232 + ], + [ + 2105, + 2109, + 2102 + ], + [ + 2110, + 2115, + 2124 + ], + [ + 2135, + 2140, + 3127 + ], + [ + 2184, + 2191, + 2858 + ], + [ + 2205, + 2213, + 2836 + ], + [ + 2229, + 2238, + 2078 + ], + [ + 2314, + 2326, + 2287 + ], + [ + 2384, + 2393, + 2467 + ], + [ + 2392, + 2394, + 2957 + ], + [ + 2392, + 2396, + 2419 + ], + [ + 2397, + 2406, + 2941 + ], + [ + 2418, + 2423, + 2471 + ], + [ + 2418, + 2423, + 2472 + ], + [ + 2438, + 2447, + 2412 + ], + [ + 2446, + 2449, + 2429 + ], + [ + 2446, + 2449, + 2436 + ], + [ + 2446, + 2449, + 2438 + ], + [ + 2464, + 2473, + 2387 + ], + [ + 2490, + 2498, + 2362 + ], + [ + 2518, + 2526, + 2340 + ], + [ + 2534, + 2536, + 2550 + ], + [ + 2628, + 2637, + 2636 + ], + [ + 2636, + 2639, + 2677 + ], + [ + 2657, + 2668, + 2620 + ], + [ + 2669, + 2678, + 2631 + ], + [ + 2691, + 2700, + 2660 + ], + [ + 2699, + 2702, + 2643 + ], + [ + 2704, + 2709, + 2660 + ], + [ + 2710, + 2717, + 2845 + ], + [ + 2719, + 2727, + 2614 + ], + [ + 2746, + 2751, + 2317 + ], + [ + 2747, + 2753, + 2317 + ], + [ + 2772, + 2776, + 2558 + ], + [ + 2788, + 2794, + 2757 + ], + [ + 2816, + 2821, + 2225 + ], + [ + 2829, + 2831, + 2812 + ], + [ + 2842, + 2852, + 2713 + ], + [ + 2855, + 2863, + 2187 + ], + [ + 2864, + 2870, + 2694 + ], + [ + 2880, + 2888, + 2168 + ], + [ + 2889, + 2896, + 2922 + ], + [ + 2906, + 2914, + 2912 + ], + [ + 2915, + 2923, + 2892 + ], + [ + 2915, + 2923, + 2901 + ], + [ + 2924, + 2930, + 2901 + ], + [ + 2938, + 2945, + 2400 + ], + [ + 2946, + 2958, + 3101 + ], + [ + 2960, + 2972, + 2378 + ], + [ + 3006, + 3015, + 2331 + ], + [ + 3068, + 3075, + 3034 + ], + [ + 3116, + 3123, + 2935 + ], + [ + 3124, + 3131, + 2119 + ], + [ + 3141, + 3147, + 2138 + ], + [ + 3157, + 3162, + 3146 + ], + [ + 3171, + 3175, + 3179 + ], + [ + 3185, + 3192, + 3431 + ], + [ + 3224, + 3236, + 3382 + ], + [ + 3245, + 3253, + 3370 + ], + [ + 3272, + 3278, + 3263 + ], + [ + 3300, + 3302, + 3275 + ], + [ + 3304, + 3311, + 3289 + ], + [ + 3358, + 3366, + 3745 + ], + [ + 3372, + 3378, + 3729 + ], + [ + 3379, + 3387, + 3227 + ], + [ + 3388, + 3395, + 3712 + ], + [ + 3396, + 3405, + 3211 + ], + [ + 3409, + 3413, + 3689 + ], + [ + 3414, + 3426, + 3689 + ], + [ + 3428, + 3439, + 3188 + ], + [ + 3472, + 3479, + 3894 + ], + [ + 3489, + 3496, + 3875 + ], + [ + 3514, + 3522, + 3859 + ], + [ + 3525, + 3531, + 3878 + ], + [ + 3525, + 3531, + 3879 + ], + [ + 3538, + 3544, + 3517 + ], + [ + 3546, + 3554, + 3501 + ], + [ + 3584, + 3594, + 3737 + ], + [ + 3606, + 3617, + 3721 + ], + [ + 3631, + 3638, + 3703 + ], + [ + 3648, + 3657, + 3680 + ], + [ + 3677, + 3685, + 3651 + ], + [ + 3709, + 3717, + 3391 + ], + [ + 3726, + 3732, + 3375 + ], + [ + 3734, + 3740, + 3587 + ], + [ + 3742, + 3749, + 3361 + ], + [ + 3750, + 3757, + 3574 + ], + [ + 3774, + 3780, + 3327 + ], + [ + 3806, + 3811, + 4007 + ], + [ + 3829, + 3838, + 3826 + ], + [ + 3856, + 3863, + 3520 + ], + [ + 3865, + 3871, + 3981 + ], + [ + 3872, + 3880, + 3492 + ], + [ + 3879, + 3881, + 3514 + ], + [ + 3883, + 3890, + 3964 + ], + [ + 3891, + 3896, + 3475 + ], + [ + 3906, + 3909, + 3443 + ], + [ + 3923, + 3929, + 3913 + ], + [ + 3931, + 3937, + 3900 + ], + [ + 3931, + 3937, + 3913 + ], + [ + 3961, + 3968, + 3886 + ], + [ + 3978, + 3987, + 3868 + ], + [ + 4004, + 4012, + 3851 + ], + [ + 4081, + 4083, + 4076 + ], + [ + 4100, + 4108, + 4089 + ], + [ + 4166, + 4172, + 4163 + ], + [ + 4182, + 4193, + 4380 + ], + [ + 4203, + 4214, + 4364 + ], + [ + 4262, + 4270, + 4232 + ], + [ + 4271, + 4278, + 4232 + ], + [ + 4280, + 4288, + 4252 + ], + [ + 4289, + 4297, + 4252 + ], + [ + 4316, + 4323, + 4292 + ], + [ + 4353, + 4360, + 4815 + ], + [ + 4361, + 4368, + 4206 + ], + [ + 4377, + 4384, + 4185 + ], + [ + 4385, + 4392, + 4785 + ], + [ + 4393, + 4400, + 4163 + ], + [ + 4410, + 4418, + 4396 + ], + [ + 4440, + 4448, + 4422 + ], + [ + 4449, + 4457, + 4446 + ], + [ + 4483, + 4492, + 4468 + ], + [ + 4507, + 4513, + 4979 + ], + [ + 4523, + 4530, + 4803 + ], + [ + 4531, + 4542, + 4959 + ], + [ + 4551, + 4559, + 4534 + ], + [ + 4589, + 4596, + 4587 + ], + [ + 4619, + 4624, + 4615 + ], + [ + 4714, + 4725, + 4747 + ], + [ + 4753, + 4764, + 4413 + ], + [ + 4812, + 4820, + 4801 + ], + [ + 4828, + 4836, + 4336 + ], + [ + 4859, + 4868, + 4866 + ], + [ + 4896, + 4904, + 4849 + ], + [ + 4905, + 4913, + 4862 + ], + [ + 4916, + 4920, + 4874 + ], + [ + 4916, + 4920, + 4883 + ], + [ + 4929, + 4932, + 4546 + ], + [ + 4934, + 4943, + 4557 + ], + [ + 4956, + 4965, + 4534 + ], + [ + 4993, + 5000, + 5067 + ], + [ + 5001, + 5009, + 4478 + ], + [ + 5020, + 5025, + 5017 + ], + [ + 5091, + 5095, + 4177 + ], + [ + 5122, + 5131, + 5119 + ], + [ + 5130, + 5133, + 6024 + ], + [ + 5152, + 5156, + 5138 + ], + [ + 5182, + 5193, + 5462 + ], + [ + 5192, + 5197, + 5441 + ], + [ + 5212, + 5221, + 5432 + ], + [ + 5232, + 5244, + 5414 + ], + [ + 5255, + 5263, + 5398 + ], + [ + 5298, + 5307, + 5305 + ], + [ + 5365, + 5367, + 5354 + ], + [ + 5370, + 5377, + 5845 + ], + [ + 5386, + 5394, + 5825 + ], + [ + 5395, + 5402, + 5258 + ], + [ + 5404, + 5410, + 5804 + ], + [ + 5411, + 5419, + 5235 + ], + [ + 5420, + 5428, + 5782 + ], + [ + 5459, + 5470, + 5185 + ], + [ + 5506, + 5513, + 5998 + ], + [ + 5523, + 5532, + 5982 + ], + [ + 5541, + 5548, + 5806 + ], + [ + 5583, + 5592, + 5564 + ], + [ + 5625, + 5628, + 5618 + ], + [ + 5629, + 5641, + 5611 + ], + [ + 5643, + 5652, + 5812 + ], + [ + 5667, + 5674, + 5790 + ], + [ + 5713, + 5721, + 5748 + ], + [ + 5722, + 5729, + 5719 + ], + [ + 5745, + 5751, + 5737 + ], + [ + 5779, + 5786, + 5423 + ], + [ + 5787, + 5799, + 5672 + ], + [ + 5801, + 5807, + 5407 + ], + [ + 5809, + 5818, + 5646 + ], + [ + 5817, + 5820, + 11881 + ], + [ + 5817, + 5820, + 11882 + ], + [ + 5822, + 5830, + 5389 + ], + [ + 5842, + 5849, + 5373 + ], + [ + 5850, + 5859, + 5858 + ], + [ + 5890, + 5893, + 5858 + ], + [ + 5894, + 5908, + 5853 + ], + [ + 5910, + 5919, + 5865 + ], + [ + 5920, + 5925, + 5878 + ], + [ + 5926, + 5930, + 5897 + ], + [ + 5951, + 5962, + 6134 + ], + [ + 5963, + 5970, + 5552 + ], + [ + 5972, + 5978, + 6113 + ], + [ + 5979, + 5985, + 5526 + ], + [ + 5987, + 5994, + 6094 + ], + [ + 5995, + 6003, + 5509 + ], + [ + 6010, + 6015, + 5491 + ], + [ + 6013, + 6016, + 5462 + ], + [ + 6032, + 6040, + 6007 + ], + [ + 6047, + 6052, + 6061 + ], + [ + 6065, + 6075, + 6035 + ], + [ + 6078, + 6088, + 6044 + ], + [ + 6091, + 6098, + 5990 + ], + [ + 6110, + 6119, + 5975 + ], + [ + 6131, + 6139, + 5954 + ], + [ + 6190, + 6200, + 6368 + ], + [ + 6212, + 6219, + 6352 + ], + [ + 6249, + 6257, + 7114 + ], + [ + 6266, + 6274, + 7136 + ], + [ + 6275, + 6281, + 6304 + ], + [ + 6296, + 6305, + 6278 + ], + [ + 6323, + 6330, + 6862 + ], + [ + 6358, + 6364, + 6823 + ], + [ + 6365, + 6374, + 6193 + ], + [ + 6378, + 6383, + 6805 + ], + [ + 6384, + 6390, + 6180 + ], + [ + 6421, + 6428, + 6408 + ], + [ + 6429, + 6436, + 6791 + ], + [ + 6438, + 6450, + 6416 + ], + [ + 6460, + 6466, + 6990 + ], + [ + 6484, + 6496, + 6970 + ], + [ + 6498, + 6507, + 6582 + ], + [ + 6510, + 6519, + 6947 + ], + [ + 6552, + 6561, + 6525 + ], + [ + 6579, + 6588, + 6501 + ], + [ + 6587, + 6590, + 6706 + ], + [ + 6601, + 6609, + 6471 + ], + [ + 6610, + 6618, + 6471 + ], + [ + 6619, + 6631, + 6647 + ], + [ + 6652, + 6660, + 6649 + ], + [ + 6659, + 6662, + 6622 + ], + [ + 6672, + 6676, + 6752 + ], + [ + 6680, + 6692, + 6604 + ], + [ + 6723, + 6731, + 6729 + ], + [ + 6741, + 6746, + 6926 + ], + [ + 6741, + 6746, + 6927 + ], + [ + 6742, + 6748, + 6926 + ], + [ + 6749, + 6760, + 6719 + ], + [ + 6761, + 6768, + 6853 + ], + [ + 6775, + 6781, + 6835 + ], + [ + 6788, + 6794, + 6814 + ], + [ + 6850, + 6857, + 6764 + ], + [ + 6859, + 6867, + 6326 + ], + [ + 6868, + 6874, + 6735 + ], + [ + 6920, + 6928, + 6895 + ], + [ + 6929, + 6937, + 6907 + ], + [ + 6944, + 6952, + 6513 + ], + [ + 6953, + 6965, + 7105 + ], + [ + 6973, + 6978, + 6970 + ], + [ + 6992, + 7001, + 7069 + ], + [ + 7004, + 7008, + 6441 + ], + [ + 7066, + 7073, + 6999 + ], + [ + 7087, + 7091, + 6983 + ], + [ + 7097, + 7101, + 6208 + ], + [ + 7102, + 7109, + 6956 + ], + [ + 7111, + 7120, + 6231 + ], + [ + 7119, + 7121, + 6951 + ], + [ + 7124, + 7132, + 6941 + ], + [ + 7133, + 7142, + 6252 + ], + [ + 7143, + 7151, + 7718 + ], + [ + 7152, + 7161, + 6269 + ], + [ + 7172, + 7177, + 7168 + ], + [ + 7172, + 7177, + 7185 + ], + [ + 7174, + 7178, + 7765 + ], + [ + 7174, + 7179, + 7756 + ], + [ + 7197, + 7202, + 7445 + ], + [ + 7216, + 7223, + 7411 + ], + [ + 7250, + 7261, + 7375 + ], + [ + 7302, + 7311, + 7281 + ], + [ + 7314, + 7322, + 7279 + ], + [ + 7332, + 7338, + 7305 + ], + [ + 7350, + 7355, + 7856 + ], + [ + 7356, + 7362, + 7310 + ], + [ + 7364, + 7371, + 7838 + ], + [ + 7372, + 7379, + 7253 + ], + [ + 7399, + 7407, + 7804 + ], + [ + 7408, + 7416, + 7219 + ], + [ + 7430, + 7441, + 7782 + ], + [ + 7442, + 7454, + 7200 + ], + [ + 7485, + 7493, + 7466 + ], + [ + 7494, + 7502, + 8025 + ], + [ + 7513, + 7522, + 8009 + ], + [ + 7542, + 7551, + 7993 + ], + [ + 7550, + 7553, + 8013 + ], + [ + 7555, + 7563, + 7589 + ], + [ + 7564, + 7572, + 7594 + ], + [ + 7578, + 7584, + 7593 + ], + [ + 7649, + 7657, + 7829 + ], + [ + 7740, + 7749, + 7729 + ], + [ + 7753, + 7761, + 7729 + ], + [ + 7762, + 7768, + 7726 + ], + [ + 7779, + 7791, + 7433 + ], + [ + 7810, + 7816, + 7672 + ], + [ + 7826, + 7833, + 7652 + ], + [ + 7844, + 7851, + 7639 + ], + [ + 7903, + 7915, + 7865 + ], + [ + 7917, + 7926, + 7874 + ], + [ + 7927, + 7938, + 7887 + ], + [ + 7927, + 7938, + 7896 + ], + [ + 7937, + 7942, + 7890 + ], + [ + 7944, + 7954, + 7906 + ], + [ + 7957, + 7965, + 7571 + ], + [ + 7968, + 7974, + 7570 + ], + [ + 7976, + 7988, + 8139 + ], + [ + 7990, + 7997, + 7545 + ], + [ + 7999, + 8005, + 8118 + ], + [ + 8016, + 8021, + 8101 + ], + [ + 8022, + 8029, + 7497 + ], + [ + 8037, + 8042, + 8057 + ], + [ + 8037, + 8042, + 8062 + ], + [ + 8040, + 8043, + 7459 + ], + [ + 8044, + 8053, + 8037 + ], + [ + 8057, + 8064, + 8037 + ], + [ + 8066, + 8072, + 8062 + ], + [ + 8098, + 8106, + 8019 + ], + [ + 8183, + 8185, + 8155 + ], + [ + 8215, + 8224, + 8413 + ], + [ + 8237, + 8246, + 8394 + ], + [ + 8267, + 8271, + 8264 + ], + [ + 8272, + 8277, + 8286 + ], + [ + 8297, + 8302, + 9289 + ], + [ + 8346, + 8353, + 9020 + ], + [ + 8367, + 8375, + 8998 + ], + [ + 8391, + 8400, + 8240 + ], + [ + 8476, + 8488, + 8449 + ], + [ + 8546, + 8555, + 8629 + ], + [ + 8554, + 8556, + 8581 + ], + [ + 8554, + 8557, + 9119 + ], + [ + 8559, + 8568, + 9103 + ], + [ + 8580, + 8585, + 8633 + ], + [ + 8580, + 8585, + 8634 + ], + [ + 8600, + 8609, + 8574 + ], + [ + 8608, + 8611, + 8591 + ], + [ + 8608, + 8611, + 8598 + ], + [ + 8626, + 8635, + 8549 + ], + [ + 8652, + 8660, + 8524 + ], + [ + 8680, + 8688, + 8502 + ], + [ + 8696, + 8698, + 8712 + ], + [ + 8790, + 8799, + 8798 + ], + [ + 8798, + 8801, + 8839 + ], + [ + 8819, + 8830, + 8782 + ], + [ + 8831, + 8840, + 8793 + ], + [ + 8853, + 8862, + 8822 + ], + [ + 8866, + 8871, + 8822 + ], + [ + 8872, + 8879, + 9007 + ], + [ + 8881, + 8889, + 8776 + ], + [ + 8908, + 8913, + 8479 + ], + [ + 8909, + 8915, + 8479 + ], + [ + 8934, + 8938, + 8720 + ], + [ + 8950, + 8956, + 8919 + ], + [ + 8978, + 8983, + 8387 + ], + [ + 8995, + 9003, + 8370 + ], + [ + 9004, + 9014, + 8875 + ], + [ + 9017, + 9025, + 8349 + ], + [ + 9026, + 9032, + 8856 + ], + [ + 9042, + 9050, + 8330 + ], + [ + 9051, + 9058, + 9084 + ], + [ + 9068, + 9076, + 9074 + ], + [ + 9077, + 9085, + 9063 + ], + [ + 9086, + 9092, + 9063 + ], + [ + 9100, + 9107, + 8562 + ], + [ + 9108, + 9120, + 9263 + ], + [ + 9122, + 9134, + 8540 + ], + [ + 9168, + 9177, + 8493 + ], + [ + 9230, + 9237, + 9196 + ], + [ + 9278, + 9285, + 9097 + ], + [ + 9286, + 9293, + 8281 + ], + [ + 9303, + 9309, + 8300 + ], + [ + 9319, + 9324, + 9308 + ], + [ + 9333, + 9337, + 9341 + ], + [ + 9347, + 9354, + 9593 + ], + [ + 9386, + 9398, + 9544 + ], + [ + 9407, + 9415, + 9532 + ], + [ + 9434, + 9440, + 9425 + ], + [ + 9462, + 9464, + 9437 + ], + [ + 9466, + 9473, + 9451 + ], + [ + 9520, + 9528, + 9907 + ], + [ + 9534, + 9540, + 9891 + ], + [ + 9541, + 9549, + 9389 + ], + [ + 9550, + 9557, + 9874 + ], + [ + 9558, + 9567, + 9373 + ], + [ + 9571, + 9575, + 9851 + ], + [ + 9576, + 9588, + 9851 + ], + [ + 9590, + 9601, + 9350 + ], + [ + 9634, + 9641, + 10056 + ], + [ + 9651, + 9658, + 10037 + ], + [ + 9676, + 9684, + 10021 + ], + [ + 9687, + 9693, + 10040 + ], + [ + 9687, + 9693, + 10041 + ], + [ + 9700, + 9706, + 9679 + ], + [ + 9708, + 9716, + 9663 + ], + [ + 9746, + 9756, + 9899 + ], + [ + 9768, + 9779, + 9883 + ], + [ + 9793, + 9800, + 9865 + ], + [ + 9810, + 9819, + 9842 + ], + [ + 9839, + 9847, + 9813 + ], + [ + 9871, + 9879, + 9553 + ], + [ + 9888, + 9894, + 9537 + ], + [ + 9896, + 9902, + 9749 + ], + [ + 9904, + 9911, + 9523 + ], + [ + 9912, + 9919, + 9736 + ], + [ + 9936, + 9942, + 9489 + ], + [ + 9959, + 9970, + 9939 + ], + [ + 9969, + 9975, + 10183 + ], + [ + 9991, + 10000, + 9988 + ], + [ + 10018, + 10025, + 9682 + ], + [ + 10027, + 10033, + 10143 + ], + [ + 10034, + 10042, + 9654 + ], + [ + 10041, + 10043, + 9676 + ], + [ + 10045, + 10052, + 10126 + ], + [ + 10053, + 10058, + 9637 + ], + [ + 10068, + 10071, + 9605 + ], + [ + 10085, + 10091, + 10075 + ], + [ + 10093, + 10099, + 10062 + ], + [ + 10093, + 10099, + 10075 + ], + [ + 10123, + 10130, + 10048 + ], + [ + 10140, + 10149, + 10030 + ], + [ + 10166, + 10174, + 10013 + ], + [ + 10175, + 10182, + 10156 + ], + [ + 10199, + 10203, + 9419 + ], + [ + 10200, + 10206, + 7276 + ], + [ + 10328, + 10334, + 10325 + ], + [ + 10344, + 10355, + 10542 + ], + [ + 10365, + 10376, + 10526 + ], + [ + 10424, + 10432, + 10394 + ], + [ + 10433, + 10440, + 10394 + ], + [ + 10442, + 10450, + 10414 + ], + [ + 10451, + 10459, + 10414 + ], + [ + 10478, + 10485, + 10454 + ], + [ + 10515, + 10522, + 10977 + ], + [ + 10523, + 10530, + 10368 + ], + [ + 10539, + 10546, + 10347 + ], + [ + 10547, + 10554, + 10947 + ], + [ + 10555, + 10562, + 10325 + ], + [ + 10572, + 10580, + 10558 + ], + [ + 10602, + 10610, + 10584 + ], + [ + 10611, + 10619, + 10608 + ], + [ + 10645, + 10654, + 10630 + ], + [ + 10669, + 10675, + 11141 + ], + [ + 10685, + 10692, + 10965 + ], + [ + 10693, + 10704, + 11121 + ], + [ + 10713, + 10721, + 10696 + ], + [ + 10751, + 10758, + 10749 + ], + [ + 10781, + 10786, + 10777 + ], + [ + 10876, + 10887, + 10909 + ], + [ + 10915, + 10926, + 10575 + ], + [ + 10974, + 10982, + 10963 + ], + [ + 10990, + 10998, + 10498 + ], + [ + 11021, + 11030, + 11028 + ], + [ + 11033, + 11041, + 11002 + ], + [ + 11042, + 11056, + 11011 + ], + [ + 11058, + 11066, + 11011 + ], + [ + 11067, + 11075, + 11024 + ], + [ + 11078, + 11082, + 11036 + ], + [ + 11091, + 11093, + 10708 + ], + [ + 11096, + 11105, + 10719 + ], + [ + 11118, + 11127, + 10696 + ], + [ + 11155, + 11162, + 11229 + ], + [ + 11163, + 11171, + 10640 + ], + [ + 11182, + 11187, + 11179 + ], + [ + 11253, + 11257, + 10339 + ], + [ + 11284, + 11293, + 11281 + ], + [ + 11292, + 11295, + 12186 + ], + [ + 11314, + 11318, + 11300 + ], + [ + 11344, + 11355, + 11624 + ], + [ + 11354, + 11359, + 11603 + ], + [ + 11374, + 11383, + 11594 + ], + [ + 11394, + 11406, + 11576 + ], + [ + 11417, + 11425, + 11560 + ], + [ + 11460, + 11469, + 11467 + ], + [ + 11527, + 11531, + 11516 + ], + [ + 11532, + 11539, + 12007 + ], + [ + 11548, + 11556, + 11987 + ], + [ + 11557, + 11564, + 11420 + ], + [ + 11566, + 11572, + 11966 + ], + [ + 11573, + 11581, + 11397 + ], + [ + 11582, + 11590, + 11944 + ], + [ + 11621, + 11632, + 11347 + ], + [ + 11668, + 11675, + 12160 + ], + [ + 11685, + 11694, + 12144 + ], + [ + 11703, + 11710, + 11968 + ], + [ + 11745, + 11754, + 11726 + ], + [ + 11787, + 11790, + 11780 + ], + [ + 11791, + 11803, + 11773 + ], + [ + 11805, + 11814, + 11974 + ], + [ + 11829, + 11836, + 11952 + ], + [ + 11875, + 11883, + 11910 + ], + [ + 11884, + 11891, + 11881 + ], + [ + 11907, + 11913, + 11899 + ], + [ + 11941, + 11948, + 11585 + ], + [ + 11949, + 11961, + 11834 + ], + [ + 11963, + 11969, + 11569 + ], + [ + 11971, + 11980, + 11808 + ], + [ + 11979, + 11983, + 5719 + ], + [ + 11979, + 11983, + 5720 + ], + [ + 11984, + 11992, + 11551 + ], + [ + 12000, + 12002, + 5719 + ], + [ + 12004, + 12011, + 11535 + ], + [ + 12012, + 12021, + 12020 + ], + [ + 12052, + 12055, + 12020 + ], + [ + 12056, + 12070, + 12015 + ], + [ + 12072, + 12081, + 12027 + ], + [ + 12082, + 12087, + 12040 + ], + [ + 12088, + 12092, + 12059 + ], + [ + 12088, + 12092, + 12075 + ], + [ + 12113, + 12124, + 12296 + ], + [ + 12125, + 12132, + 11714 + ], + [ + 12134, + 12140, + 12275 + ], + [ + 12141, + 12147, + 11688 + ], + [ + 12149, + 12156, + 12256 + ], + [ + 12157, + 12165, + 11671 + ], + [ + 12172, + 12177, + 11653 + ], + [ + 12175, + 12178, + 11624 + ], + [ + 12194, + 12202, + 12169 + ], + [ + 12209, + 12214, + 12223 + ], + [ + 12227, + 12237, + 12197 + ], + [ + 12240, + 12250, + 12206 + ], + [ + 12253, + 12260, + 12152 + ], + [ + 12272, + 12281, + 12137 + ], + [ + 12293, + 12301, + 12116 + ], + [ + 174, + 177, + 178 + ], + [ + 272, + 275, + 262 + ], + [ + 675, + 678, + 679 + ], + [ + 711, + 713, + 709 + ], + [ + 866, + 868, + 864 + ], + [ + 1175, + 1177, + 1173 + ], + [ + 1512, + 1514, + 1518 + ], + [ + 1628, + 1630, + 979 + ], + [ + 1687, + 1690, + 1199 + ], + [ + 1865, + 1868, + 1863 + ], + [ + 1900, + 1903, + 1875 + ], + [ + 1909, + 1911, + 1926 + ], + [ + 1971, + 1973, + 1974 + ], + [ + 2181, + 2183, + 2867 + ], + [ + 2333, + 2336, + 2331 + ], + [ + 2558, + 2560, + 2548 + ], + [ + 2614, + 2616, + 2719 + ], + [ + 2733, + 2736, + 2737 + ], + [ + 2785, + 2787, + 2757 + ], + [ + 2785, + 2787, + 2760 + ], + [ + 2793, + 2795, + 2791 + ], + [ + 2812, + 2815, + 2819 + ], + [ + 3146, + 3148, + 3157 + ], + [ + 3329, + 3331, + 3316 + ], + [ + 3543, + 3545, + 3546 + ], + [ + 3665, + 3667, + 3663 + ], + [ + 3705, + 3708, + 3391 + ], + [ + 3787, + 3789, + 3790 + ], + [ + 3826, + 3828, + 3793 + ], + [ + 3826, + 3828, + 3829 + ], + [ + 4163, + 4165, + 4166 + ], + [ + 4366, + 4369, + 4370 + ], + [ + 4615, + 4618, + 4619 + ], + [ + 4615, + 4618, + 4625 + ], + [ + 4642, + 4645, + 4640 + ], + [ + 5017, + 5019, + 5020 + ], + [ + 5383, + 5385, + 5837 + ], + [ + 6280, + 6282, + 7155 + ], + [ + 6336, + 6339, + 6340 + ], + [ + 6389, + 6391, + 6180 + ], + [ + 6495, + 6497, + 9186 + ], + [ + 6569, + 6571, + 6567 + ], + [ + 7028, + 7030, + 7026 + ], + [ + 7071, + 7074, + 6999 + ], + [ + 7674, + 7676, + 7680 + ], + [ + 7775, + 7778, + 7760 + ], + [ + 7849, + 7852, + 7361 + ], + [ + 8062, + 8065, + 8037 + ], + [ + 8071, + 8073, + 8088 + ], + [ + 8088, + 8090, + 8071 + ], + [ + 8495, + 8498, + 8493 + ], + [ + 8712, + 8714, + 8696 + ], + [ + 8720, + 8722, + 8934 + ], + [ + 8776, + 8778, + 8881 + ], + [ + 8947, + 8949, + 8919 + ], + [ + 8947, + 8949, + 8922 + ], + [ + 9056, + 9059, + 9084 + ], + [ + 9133, + 9135, + 6518 + ], + [ + 9705, + 9707, + 9679 + ], + [ + 9827, + 9829, + 9825 + ], + [ + 10106, + 10109, + 10104 + ], + [ + 10283, + 10286, + 10281 + ], + [ + 10325, + 10327, + 10328 + ], + [ + 10438, + 10441, + 10394 + ], + [ + 10491, + 10494, + 10489 + ], + [ + 10674, + 10676, + 10672 + ], + [ + 10804, + 10807, + 10802 + ], + [ + 10903, + 10905, + 10894 + ], + [ + 10965, + 10967, + 10802 + ], + [ + 11179, + 11181, + 11182 + ], + [ + 11281, + 11283, + 11284 + ], + [ + 11545, + 11547, + 11999 + ], + [ + 12130, + 12133, + 12290 + ] + ] + } +} \ No newline at end of file diff --git a/tests/structure/data/misc/hbond.py b/tests/structure/data/misc/hbond.py new file mode 100644 index 000000000..5cd7828b9 --- /dev/null +++ b/tests/structure/data/misc/hbond.py @@ -0,0 +1,48 @@ +import json +import warnings +from pathlib import Path +from tempfile import NamedTemporaryFile +import mdtraj +import biotite.structure as struc +import biotite.structure.io.pdb as pdb +import biotite.structure.io.pdbx as pdbx + +PDB_IDS = ["1l2y", "1gya", "1igy"] +STRUCTURE_DIR = Path(__file__).parents[1] +OUTPUT_FILE = Path(__file__).parent / "hbond.json" + + +def compute_mdtraj_hbonds(bcif_path, use_all_models): + pdbx_file = pdbx.BinaryCIFFile.read(bcif_path) + model = None if use_all_models else 1 + atoms = pdbx.get_structure(pdbx_file, model=model) + # Only consider amino acids for consistency + # with bonded hydrogen detection in MDTraj + atoms = atoms[..., struc.filter_amino_acids(atoms)] + + temp = NamedTemporaryFile("w+", suffix=".pdb") + pdb_file = pdb.PDBFile() + pdb_file.set_structure(atoms) + pdb_file.write(temp.name) + + # Compute hbonds with MDTraj + # Ignore warning about dummy unit cell vector + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + traj = mdtraj.load(temp.name) + temp.close() + return mdtraj.baker_hubbard(traj, freq=0, periodic=False).tolist() + + +if __name__ == "__main__": + data = {} + for pdb_id in PDB_IDS: + bcif_path = STRUCTURE_DIR / f"{pdb_id}.bcif" + data_for_pdb_id = {} + for use_all_models in [False, True]: + key = "single_model" if not use_all_models else "all_models" + data_for_pdb_id[key] = compute_mdtraj_hbonds(bcif_path, use_all_models) + data[pdb_id] = data_for_pdb_id + + with open(OUTPUT_FILE, "w") as file: + json.dump(data, file, indent=4) diff --git a/tests/structure/data/misc/rdf.json b/tests/structure/data/misc/rdf.json new file mode 100644 index 000000000..addaf7470 --- /dev/null +++ b/tests/structure/data/misc/rdf.json @@ -0,0 +1,206 @@ +{ + "bins": [ + 0.04999999701976776, + 0.14999999105930328, + 0.2499999850988388, + 0.3499999940395355, + 0.45000001788139343, + 0.550000011920929, + 0.6499999761581421, + 0.75, + 0.8500000238418579, + 0.949999988079071, + 1.0500000715255737, + 1.149999976158142, + 1.25, + 1.3499999046325684, + 1.4500000476837158, + 1.5499999523162842, + 1.6499998569488525, + 1.7500001192092896, + 1.850000023841858, + 1.9499999284744263, + 2.049999952316284, + 2.1500000953674316, + 2.25, + 2.3499999046325684, + 2.450000047683716, + 2.549999952316284, + 2.6499998569488525, + 2.75, + 2.8499999046325684, + 2.950000286102295, + 3.0500001907348633, + 3.1500000953674316, + 3.25, + 3.3500001430511475, + 3.450000047683716, + 3.5500001907348633, + 3.6500000953674316, + 3.75, + 3.8499999046325684, + 3.9499998092651367, + 4.050000190734863, + 4.150000095367432, + 4.25, + 4.349999904632568, + 4.449999809265137, + 4.549999713897705, + 4.650000095367432, + 4.75, + 4.850000381469727, + 4.949999809265137, + 5.050000190734863, + 5.149999618530273, + 5.25, + 5.34999942779541, + 5.450000286102295, + 5.550000190734863, + 5.650000095367432, + 5.75, + 5.849999904632568, + 5.950000286102295, + 6.050000190734863, + 6.150000095367432, + 6.25, + 6.349999904632568, + 6.449999809265137, + 6.549999713897705, + 6.650000095367432, + 6.75, + 6.849999904632568, + 6.949999809265137, + 7.049999713897705, + 7.15000057220459, + 7.25, + 7.350000381469727, + 7.449999809265137, + 7.550000190734863, + 7.649999618530273, + 7.75, + 7.84999942779541, + 7.950000286102295, + 8.050000190734863, + 8.149999618530273, + 8.25, + 8.34999942779541, + 8.450000762939453, + 8.550000190734863, + 8.649999618530273, + 8.75, + 8.850000381469727, + 8.949999809265137, + 9.049999237060547, + 9.15000057220459, + 9.25, + 9.350000381469727, + 9.449999809265137, + 9.550000190734863, + 9.65000057220459, + 9.75, + 9.850000381469727, + 9.949999809265137 + ], + "g_r": [ + 7115.77344579949, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.251466531019467, + 1.5680429539354408, + 0.9732970682305122, + 1.8168674865802397, + 0.8498478023584505, + 0.0, + 0.7484753641147244, + 0.7044630563284381, + 0.6642184426663381, + 0.0, + 0.2967131811396059, + 0.0, + 1.0667537505986384, + 1.0134240564818529, + 1.928002995221023, + 0.6885796652241579, + 0.656556711389155, + 1.4623453686090406, + 0.7984947351998619, + 0.19094503567053633, + 0.5484643066365387, + 0.525615222237418, + 1.0083266260672727, + 1.4520015675218338, + 0.6200309283425397, + 0.44713932593532835, + 0.717113611149751, + 0.6905540436763368, + 0.5323587584120613, + 0.256674381231009, + 0.6191725383903437, + 0.35869394922433995, + 0.8085850649261233, + 0.7816309435905584, + 0.8640107711303713, + 0.6271063129808176, + 0.7084002683416238, + 0.7843034005708507, + 0.38008543781101156, + 0.5528506608243846, + 0.44696006491342005, + 0.6073409661031577, + 0.4212408256701984, + 0.9820959602582018, + 0.874898538922559, + 0.38663231855574515, + 0.827294263526215, + 0.36587933465801015, + 0.7122478113976795, + 0.34675246941954174, + 0.6754937672259362, + 1.0530775114574136, + 0.8981111971831245, + 0.31273751863613497, + 0.6710342154649391, + 0.4761234689153867, + 0.6388960835381869, + 0.7370802264530467, + 0.3875487773417374, + 0.7030027679710801, + 0.3170034237516155, + 0.6712308665919907, + 0.5047294802746184, + 0.5428664698460439, + 0.2895983054916858, + 0.42495886935856375, + 0.5082236958057389, + 0.4521913367713196, + 0.3984049589875063, + 0.433450190365297, + 0.4669624198102246, + 0.49902019556747357, + 0.4889393171665801, + 0.31944036033673 + ] +} \ No newline at end of file diff --git a/tests/structure/data/misc/rdf.py b/tests/structure/data/misc/rdf.py new file mode 100644 index 000000000..fcba1934f --- /dev/null +++ b/tests/structure/data/misc/rdf.py @@ -0,0 +1,31 @@ +import itertools +import json +from pathlib import Path +import mdtraj +import numpy as np + +TEST_FILE = Path(__file__).parents[1] / "waterbox.gro" +OUTPUT_FILE = Path(__file__).parent / "rdf.json" +INTERVAL = [0, 10] +N_BINS = 100 + + +if __name__ == "__main__": + traj = mdtraj.load(TEST_FILE) + ow = [a.index for a in traj.topology.atoms if a.name == "O"] + pairs = itertools.product([ow[0]], ow) + mdtraj_bins, mdtraj_g_r = mdtraj.compute_rdf( + # Note the conversion from Angstrom to nm + traj, + list(pairs), + r_range=np.array(INTERVAL) / 10, + n_bins=N_BINS, + periodic=False, + ) + + with open(OUTPUT_FILE, "w") as file: + json.dump( + {"bins": (mdtraj_bins * 10).tolist(), "g_r": mdtraj_g_r.tolist()}, + file, + indent=4, + ) diff --git a/tests/structure/data/misc/sasa.json b/tests/structure/data/misc/sasa.json new file mode 100644 index 000000000..d9bdc7aa0 --- /dev/null +++ b/tests/structure/data/misc/sasa.json @@ -0,0 +1,2286 @@ +{ + "1l2y": [ + 1.421665072441101, + 0.0, + 0.19322051107883453, + 0.5785878300666809, + 0.7970346212387085, + 6.4004292488098145, + 25.800731658935547, + 13.801087379455566, + 9.14047622680664, + 12.555411338806152, + 13.302959442138672, + 0.8664763569831848, + 12.385514259338379, + 15.426677703857422, + 26.43602180480957, + 27.982088088989258, + 1.0717167854309082, + 0.0, + 0.0, + 2.250063896179199, + 1.352543592453003, + 0.0, + 0.16906794905662537, + 6.738564968109131, + 0.9514249563217163, + 10.482664108276367, + 7.050738334655762, + 14.611169815063477, + 2.973203182220459, + 0.0, + 1.563055396080017, + 11.451079368591309, + 20.132831573486328, + 16.61595916748047, + 21.950733184814453, + 0.0, + 0.0, + 0.0, + 0.0, + 1.6665269136428833, + 2.874155044555664, + 5.796615123748779, + 6.110598564147949, + 6.424582004547119, + 6.69025993347168, + 3.985172986984253, + 21.193458557128906, + 0.16989731788635254, + 0.0, + 7.305584907531738, + 8.273999214172363, + 1.7329527139663696, + 2.633408546447754, + 5.045950412750244, + 7.543440818786621, + 6.6429853439331055, + 0.0, + 0.0, + 0.0, + 0.3642960488796234, + 0.0, + 0.024152563884854317, + 6.207208633422852, + 6.134750843048096, + 0.0, + 0.016989732161164284, + 4.451309680938721, + 0.0, + 9.055527687072754, + 10.278788566589355, + 14.050509452819824, + 21.356094360351562, + 16.12325668334961, + 2.684377670288086, + 25.603527069091797, + 0.21871770918369293, + 0.0, + 0.6038141250610352, + 3.1929478645324707, + 0.5072038173675537, + 2.2461884021759033, + 3.429663896560669, + 9.750276565551758, + 11.220218658447266, + 0.0, + 0.016989732161164284, + 0.5606611967086792, + 8.647773742675781, + 19.70808982849121, + 5.147888660430908, + 20.472627639770508, + 28.06703758239746, + 0.2405894696712494, + 0.024152563884854317, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 3.414936065673828, + 5.028960704803467, + 0.0, + 5.283806800842285, + 0.0, + 0.0, + 0.08494865894317627, + 2.5144803524017334, + 7.339563846588135, + 0.0, + 0.0, + 0.0724576860666275, + 9.578843116760254, + 0.57966148853302, + 0.0, + 5.989835739135742, + 2.8500025272369385, + 0.0, + 0.0, + 1.5290758609771729, + 11.196233749389648, + 0.0, + 4.63819694519043, + 19.70808982849121, + 10.56761360168457, + 17.941158294677734, + 0.0, + 9.752105712890625, + 0.65615314245224, + 0.024152563884854317, + 0.0, + 28.2436580657959, + 2.3669509887695312, + 0.0, + 1.1834754943847656, + 0.31398332118988037, + 1.5966392755508423, + 0.0, + 15.392696380615234, + 13.965560913085938, + 9.429301261901855, + 2.59942889213562, + 0.0, + 5.1139092445373535, + 13.302959442138672, + 0.0, + 8.630784034729004, + 24.12541961669922, + 9.769096374511719, + 21.509000778198242, + 0.0, + 0.0, + 1.7148319482803345, + 15.19328784942627, + 0.0, + 3.671189785003662, + 8.46452522277832, + 23.7435302734375, + 0.0, + 8.902619361877441, + 0.0, + 0.0, + 0.17497415840625763, + 1.9563575983047485, + 2.197883367538452, + 7.478783130645752, + 0.0, + 20.761451721191406, + 6.676964282989502, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.5460655689239502, + 0.0, + 0.0, + 0.144915372133255, + 19.58626937866211, + 2.270340919494629, + 4.178393363952637, + 1.5216114521026611, + 4.077535629272461, + 12.572402000427246, + 22.69828224182129, + 23.717666625976562, + 10.431695938110352, + 11.434089660644531, + 4.043556213378906, + 0.0, + 0.0, + 0.7970346212387085, + 28.457950592041016, + 2.705087184906006, + 24.66498565673828, + 0.0, + 11.366130828857422, + 6.048344612121582, + 17.363506317138672, + 23.42884063720703, + 0.0, + 0.0, + 2.8500025272369385, + 8.528813362121582, + 0.0, + 6.985912322998047, + 0.0, + 8.970579147338867, + 0.0, + 0.0, + 1.3591785430908203, + 2.1215617656707764, + 2.7292397022247314, + 3.47796893119812, + 34.3081169128418, + 3.941617965698242, + 22.98710823059082, + 18.99452018737793, + 1.771613359451294, + 0.0, + 2.5360190868377686, + 1.1143172979354858, + 0.700424313545227, + 0.0, + 1.0144076347351074, + 0.0, + 2.6084768772125244, + 8.704964637756348, + 5.708531856536865, + 0.47571247816085815, + 2.055757522583008, + 6.846862316131592, + 2.293613910675049, + 0.0, + 0.0, + 0.152907595038414, + 11.111285209655762, + 0.0, + 9.327363014221191, + 25.671485900878906, + 9.157464981079102, + 25.603527069091797, + 0.5467942357063293, + 0.024152563884854317, + 0.0, + 6.664474964141846, + 2.3669509887695312, + 3.550426721572876, + 2.0046627521514893, + 7.8492560386657715, + 8.902619361877441, + 20.71048355102539, + 15.664532661437988, + 24.34628677368164, + 11.841843605041504, + 16.887794494628906, + 0.0, + 0.0, + 0.0, + 8.87168025970459, + 0.0, + 3.4055113792419434, + 2.1012728214263916, + 0.0, + 2.888254404067993, + 0.0, + 14.798057556152344, + 19.91196632385254, + 10.006952285766602, + 7.73032808303833, + 0.0, + 0.0, + 0.04830512776970863, + 5.185861587524414, + 2.4877140522003174, + 0.0, + 0.0, + 10.924397468566895, + 3.975597381591797, + 20.744462966918945, + 0.0, + 1.2912195920944214, + 0.0, + 0.9174454808235168, + 0.0, + 0.0, + 1.1351704597473145, + 35.67958450317383, + 3.0190703868865967, + 28.6722412109375, + 39.57969665527344, + 7.798287391662598, + 9.497260093688965, + 16.666927337646484, + 21.831806182861328, + 16.887794494628906 + ], + "1gya": [ + 1.6403827667236328, + 0.0, + 0.9419499635696411, + 17.314777374267578, + 0.0, + 0.6762717962265015, + 0.12076281756162643, + 1.4733062982559204, + 1.8372286558151245, + 22.20557975769043, + 18.63773536682129, + 24.906946182250977, + 9.905014038085938, + 8.88563060760498, + 6.184262275695801, + 5.368755340576172, + 12.300565719604492, + 11.9947509765625, + 13.77867317199707, + 16.734886169433594, + 17.720291137695312, + 20.472627639770508, + 26.63990020751953, + 27.897140502929688, + 0.4155636429786682, + 0.0, + 0.0, + 8.978826522827148, + 0.8453397154808044, + 1.2076282501220703, + 3.791952610015869, + 35.67958450317383, + 42.922645568847656, + 7.6113996505737305, + 7.543440818786621, + 21.814815521240234, + 10.839448928833008, + 11.026336669921875, + 23.598737716674805, + 0.0, + 0.0, + 0.7245768904685974, + 19.843420028686523, + 0.0, + 1.352543592453003, + 0.8211871385574341, + 4.4440717697143555, + 1.6140245199203491, + 5.3007965087890625, + 10.21082878112793, + 7.288595199584961, + 16.27616310119629, + 15.052902221679688, + 0.0, + 15.409687042236328, + 3.941617965698242, + 19.79303741455078, + 4.689166069030762, + 0.0, + 0.0, + 0.0, + 6.064457893371582, + 0.0, + 12.72893238067627, + 4.0093255043029785, + 1.2912195920944214, + 7.764307498931885, + 0.0, + 14.52622127532959, + 15.001933097839355, + 23.462820053100586, + 17.448455810546875, + 0.0, + 0.0, + 0.8453397154808044, + 2.014342784881592, + 0.86949223279953, + 5.1686482429504395, + 17.850505828857422, + 13.735471725463867, + 1.9877986907958984, + 6.558036804199219, + 15.681523323059082, + 8.664763450622559, + 7.220636367797852, + 27.455408096313477, + 0.0, + 0.0, + 0.0, + 19.029109954833984, + 2.6809346675872803, + 2.446521520614624, + 0.0, + 1.6480039358139038, + 13.404898643493652, + 0.27183571457862854, + 0.0, + 0.0, + 0.6038141250610352, + 0.3642960488796234, + 0.0, + 0.0, + 0.0, + 4.154240608215332, + 0.0, + 2.9052441120147705, + 0.0, + 9.446290969848633, + 0.0, + 0.0, + 3.9076383113861084, + 0.08494865894317627, + 19.928955078125, + 11.892812728881836, + 11.943780899047852, + 0.5249224901199341, + 0.0, + 0.0, + 15.214717864990234, + 0.8453397154808044, + 0.700424313545227, + 5.386021614074707, + 19.864849090576172, + 50.079994201660156, + 5.1648783683776855, + 2.3275933265686035, + 13.965560913085938, + 0.1019383892416954, + 9.157464981079102, + 18.671714782714844, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 9.600273132324219, + 0.0, + 0.0, + 4.757124900817871, + 0.0, + 0.0, + 0.7475482225418091, + 0.0, + 0.0, + 0.0, + 0.0, + 0.45889872312545776, + 2.721505880355835, + 1.0627127885818481, + 3.0190703868865967, + 8.429244041442871, + 1.835594892501831, + 3.4119958877563477, + 3.598731756210327, + 0.024152563884854317, + 6.95593786239624, + 1.1834754943847656, + 6.907632827758789, + 5.504673480987549, + 0.0, + 14.764076232910156, + 8.71573257446289, + 12.198627471923828, + 15.392696380615234, + 0.0, + 17.43146514892578, + 1.223260760307312, + 20.455636978149414, + 0.0, + 0.144915372133255, + 1.352543592453003, + 2.2072055339813232, + 0.0, + 4.349371433258057, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.8500025272369385, + 0.0, + 0.0, + 4.111515045166016, + 8.409917831420898, + 13.625764846801758, + 0.0, + 0.0, + 0.09661025553941727, + 25.200716018676758, + 0.024152563884854317, + 0.0, + 0.0, + 4.275003910064697, + 3.601823329925537, + 0.0, + 11.162254333496094, + 9.46328067779541, + 0.0, + 0.0, + 4.927022457122803, + 0.0, + 20.28573989868164, + 16.76886558532715, + 7.815276622772217, + 0.32807657122612, + 2.801697254180908, + 2.1737308502197266, + 34.800987243652344, + 0.0, + 8.868639945983887, + 17.040700912475586, + 0.0, + 0.0, + 1.9805101156234741, + 1.2428925037384033, + 0.0, + 0.2173730581998825, + 2.63262939453125, + 2.250063896179199, + 9.01116943359375, + 0.0, + 6.133293151855469, + 0.18688705563545227, + 0.0, + 7.73032808303833, + 13.591785430908203, + 21.610939025878906, + 15.477645874023438, + 0.0, + 0.0, + 0.0, + 6.600186824798584, + 0.024152563884854317, + 0.0, + 13.71467399597168, + 0.0, + 2.021778106689453, + 0.0, + 0.27183571457862854, + 12.810258865356445, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 3.3330538272857666, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 3.262028455734253, + 3.4319260120391846, + 17.92416763305664, + 0.0, + 0.0, + 0.33813589811325073, + 3.642960548400879, + 0.0, + 5.023733139038086, + 13.328949928283691, + 4.549328327178955, + 0.152907595038414, + 0.0, + 0.0, + 2.497490644454956, + 0.0, + 26.07923698425293, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5313563942909241, + 7.200204372406006, + 0.38644102215766907, + 0.9902550578117371, + 0.0, + 34.77955627441406, + 5.079929828643799, + 0.0, + 13.642755508422852, + 5.0119709968566895, + 0.0, + 0.0, + 0.45889872312545776, + 5.228719711303711, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.024152563884854317, + 24.814990997314453, + 1.1110179424285889, + 0.0, + 0.0, + 0.0, + 4.077535629272461, + 0.6965789794921875, + 0.0, + 0.0, + 0.4587227404117584, + 0.0, + 0.0, + 0.04830512776970863, + 3.1639859676361084, + 25.7364444732666, + 0.6521192193031311, + 26.057880401611328, + 0.0, + 8.324968338012695, + 1.410147786140442, + 7.254615306854248, + 22.324508666992188, + 0.6124095320701599, + 0.0, + 0.0, + 2.1429178714752197, + 0.48305127024650574, + 0.024152563884854317, + 0.0, + 2.270340919494629, + 0.0, + 0.0, + 0.0, + 0.0, + 9.123486518859863, + 11.349141120910645, + 10.941388130187988, + 0.0, + 20.863391876220703, + 0.0, + 2.820295572280884, + 0.0, + 0.0, + 0.0, + 0.6762717962265015, + 22.307775497436523, + 1.280085802078247, + 1.1110179424285889, + 3.671189785003662, + 17.636213302612305, + 15.507085800170898, + 6.031354904174805, + 5.929416656494141, + 12.623371124267578, + 18.99452018737793, + 9.46328067779541, + 18.331920623779297, + 23.530778884887695, + 28.084028244018555, + 0.21871770918369293, + 0.28983074426651, + 0.0, + 14.078969955444336, + 1.1351704597473145, + 0.0, + 4.2721638679504395, + 0.7487294673919678, + 0.0, + 11.943780899047852, + 12.113678932189941, + 1.749942421913147, + 0.0, + 0.16989731788635254, + 3.8226897716522217, + 0.0, + 0.3397946357727051, + 0.10935885459184647, + 0.0, + 0.26567819714546204, + 20.40057945251465, + 0.8211871385574341, + 11.014598846435547, + 0.3907638490200043, + 5.810488224029541, + 24.07444953918457, + 8.494865417480469, + 27.42142677307129, + 0.0, + 0.0, + 0.0, + 27.043624877929688, + 1.0144076347351074, + 1.2076282501220703, + 8.55024242401123, + 47.915645599365234, + 1.0193839073181152, + 13.336939811706543, + 16.463050842285156, + 15.885398864746094, + 0.0, + 0.0, + 0.5313563942909241, + 0.0, + 0.45889872312545776, + 3.7677998542785645, + 27.90079116821289, + 39.1511116027832, + 0.0, + 1.5460655689239502, + 12.67434024810791, + 8.1720609664917, + 0.0, + 0.024152563884854317, + 0.0, + 16.307605743408203, + 0.0, + 0.0, + 3.47796893119812, + 0.0, + 0.0, + 0.0509691946208477, + 0.7135687470436096, + 0.0, + 0.0, + 2.548459768295288, + 6.676964282989502, + 4.179473876953125, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 6.535899639129639, + 0.0, + 2.1737308502197266, + 5.400153160095215, + 17.829076766967773, + 0.0, + 5.130898952484131, + 0.0, + 0.152907595038414, + 0.0, + 0.0, + 0.0, + 0.0, + 0.86949223279953, + 3.5745792388916016, + 1.1357464790344238, + 6.8787665367126465, + 0.0, + 0.0, + 2.633408546447754, + 2.6673879623413086, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0724576860666275, + 0.0, + 1.2076282501220703, + 0.0, + 0.0, + 0.0, + 5.810488224029541, + 0.016989732161164284, + 0.0, + 0.0, + 0.0, + 0.0, + 1.6649937629699707, + 5.232837200164795, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0144076347351074, + 0.7245768904685974, + 0.08748707920312881, + 0.0, + 0.0, + 0.03397946432232857, + 0.0, + 0.0, + 0.0, + 9.837055206298828, + 0.934435248374939, + 19.810028076171875, + 0.8155071139335632, + 5.3007965087890625, + 7.288595199584961, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.03397946432232857, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.352543592453003, + 24.643556594848633, + 13.328949928283691, + 0.0, + 0.0, + 0.8664763569831848, + 0.0, + 0.0, + 0.42474329471588135, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.3622884452342987, + 0.19322051107883453, + 0.17497415840625763, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.434319972991943, + 9.072516441345215, + 5.385745048522949, + 0.0, + 6.541046619415283, + 16.836824417114258, + 0.13591785728931427, + 0.0, + 0.0, + 0.09661025553941727, + 18.51481056213379, + 0.0, + 0.04285835847258568, + 0.5555089712142944, + 0.0, + 4.026566505432129, + 15.290759086608887, + 0.0, + 0.0, + 0.016989732161164284, + 5.2498273849487305, + 0.0, + 0.0, + 0.0, + 23.93639373779297, + 0.0, + 23.529239654541016, + 0.0, + 7.934205055236816, + 0.0, + 0.7305585145950317, + 9.6331787109375, + 0.0, + 0.0, + 0.0, + 2.292922019958496, + 0.700424313545227, + 4.854665279388428, + 38.67966842651367, + 42.87978744506836, + 0.0, + 4.977991580963135, + 9.53123950958252, + 1.274229884147644, + 0.0, + 0.0, + 0.0, + 5.528728485107422, + 0.12076281756162643, + 0.0, + 1.4250012636184692, + 2.560171604156494, + 1.7497416734695435, + 0.0, + 0.22086653113365173, + 3.363966941833496, + 17.448455810546875, + 8.121091842651367, + 0.0, + 11.111285209655762, + 21.64491844177246, + 23.904552459716797, + 16.021316528320312, + 17.05769157409668, + 28.13499641418457, + 17.49942398071289, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5072038173675537, + 1.3766961097717285, + 1.1834754943847656, + 1.6622545719146729, + 0.0, + 0.9684147238731384, + 0.0, + 0.0, + 13.218010902404785, + 5.759519577026367, + 3.839679479598999, + 17.70330047607422, + 2.7693264484405518, + 10.856439590454102, + 17.2785587310791, + 22.358488082885742, + 27.77821159362793, + 0.0, + 0.0, + 0.0, + 4.885853290557861, + 0.0, + 0.9661025404930115, + 1.352543592453003, + 1.73898446559906, + 1.771613359451294, + 6.031354904174805, + 0.0, + 0.0, + 2.973203182220459, + 7.6113996505737305, + 4.383350849151611, + 18.99452018737793, + 3.5848333835601807, + 2.59942889213562, + 10.550622940063477, + 18.229982376098633, + 22.018693923950195, + 27.829181671142578, + 0.0, + 0.0, + 0.0, + 1.8214802742004395, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.4611170291900635, + 0.8155071139335632, + 0.0, + 0.0, + 0.1019383892416954, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.04285835847258568, + 0.0, + 0.0, + 0.33813589811325073, + 18.536239624023438, + 4.593071937561035, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.9052441120147705, + 20.829410552978516, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0724576860666275, + 4.419919013977051, + 0.0, + 7.125005722045898, + 0.7728820443153381, + 4.661444664001465, + 0.0, + 0.0, + 0.0, + 0.0, + 4.689166069030762, + 0.0, + 19.928955078125, + 0.0, + 13.863621711730957, + 0.0, + 0.0, + 0.0, + 0.0, + 0.38644102215766907, + 0.024152563884854317, + 0.7245768904685974, + 2.580868721008301, + 8.091108322143555, + 13.932317733764648, + 16.294469833374023, + 0.0, + 0.0, + 0.16989731788635254, + 0.0, + 2.8033056259155273, + 11.366130828857422, + 0.4587227404117584, + 1.1553016901016235, + 12.385514259338379, + 1.5460655689239502, + 24.533172607421875, + 22.613332748413086, + 24.533172607421875, + 0.0, + 0.0, + 1.3042384386062622, + 0.02142917923629284, + 0.0, + 0.0, + 1.231780767440796, + 1.7872896194458008, + 1.7059979438781738, + 0.1019383892416954, + 12.283576011657715, + 0.9174454808235168, + 0.7815276980400085, + 6.252221584320068, + 0.0, + 0.9514249563217163, + 0.2888254225254059, + 10.329756736755371, + 19.40227508544922, + 6.150282859802246, + 22.103641510009766, + 26.045259475708008, + 0.5030507445335388, + 0.16906794905662537, + 1.352543592453003, + 19.13625717163086, + 0.144915372133255, + 0.7245768904685974, + 6.617802619934082, + 44.1655387878418, + 48.237083435058594, + 0.9004558324813843, + 1.1892812252044678, + 14.458261489868164, + 8.223030090332031, + 12.096689224243164, + 18.077075958251953, + 0.0, + 0.0, + 3.0432229042053223, + 22.779216766357422, + 0.0, + 0.5072038173675537, + 1.7148319482803345, + 0.9177974462509155, + 1.6622545719146729, + 0.0, + 6.167273044586182, + 0.0, + 0.0, + 10.295777320861816, + 15.511626243591309, + 21.610939025878906, + 19.14742660522461, + 23.07205581665039, + 1.0363736152648926, + 18.212993621826172, + 28.10101890563965, + 7.152677536010742, + 0.34994831681251526, + 0.0, + 0.024152563884854317, + 0.47144195437431335, + 0.0724576860666275, + 0.700424313545227, + 6.376276969909668, + 37.929649353027344, + 48.1085090637207, + 0.0, + 8.03614330291748, + 3.1091208457946777, + 0.526681661605835, + 10.975366592407227, + 15.885398864746094, + 0.04374353960156441, + 0.0, + 0.12076281756162643, + 19.800561904907227, + 0.0, + 0.15000426769256592, + 3.8161051273345947, + 5.385745048522949, + 0.3907638490200043, + 0.0, + 0.305815190076828, + 12.946176528930664, + 6.592015743255615, + 5.368755340576172, + 0.0, + 0.0, + 0.0, + 0.06428753584623337, + 0.38644102215766907, + 2.1012728214263916, + 0.3622884452342987, + 10.554669380187988, + 0.04830512776970863, + 7.970345973968506, + 0.86949223279953, + 0.0, + 6.558036804199219, + 11.586997985839844, + 11.196233749389648, + 0.0, + 20.897371292114258, + 0.0, + 6.693954944610596, + 0.0, + 0.0, + 0.0, + 0.0, + 14.74327564239502, + 0.0, + 0.024152563884854317, + 0.0, + 0.8936448097229004, + 1.5528956651687622, + 6.948800563812256, + 0.016989732161164284, + 0.5096919536590576, + 0.0, + 0.32280489802360535, + 7.509461879730225, + 5.283806800842285, + 0.016989732161164284, + 6.337170124053955, + 20.7784423828125, + 0.526681661605835, + 19.725078582763672, + 20.438648223876953, + 0.5030507445335388, + 0.024152563884854317, + 0.0, + 25.05071258544922, + 0.5313563942909241, + 0.0, + 1.8597474098205566, + 37.32963180541992, + 10.500297546386719, + 2.6164186000823975, + 17.091670989990234, + 10.720520973205566, + 19.35130500793457, + 1.6140245199203491, + 0.0, + 0.0, + 0.0, + 2.439409017562866, + 0.3857252299785614, + 0.0, + 0.0, + 0.04830512776970863, + 1.038560152053833, + 1.7278698682785034, + 0.0, + 8.766701698303223, + 0.0, + 6.812882423400879, + 3.2280490398406982, + 0.0, + 11.722914695739746, + 1.9198397397994995, + 13.42188835144043, + 21.593948364257812, + 21.57695960998535, + 24.669092178344727, + 28.084028244018555, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.588986873626709, + 34.00810623168945, + 28.07222557067871, + 1.8518807888031006, + 0.5946406126022339, + 7.305584907531738, + 0.0, + 0.13123062252998352, + 0.0, + 0.0, + 0.0, + 0.0, + 2.0786304473876953, + 0.0, + 1.6310142278671265, + 0.0, + 13.77867317199707, + 6.948800563812256, + 0.0, + 10.686541557312012, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.2944934368133545, + 0.0, + 0.9661025404930115, + 0.0, + 0.0, + 2.592930793762207, + 0.0, + 0.0, + 0.0, + 0.0, + 0.016989732161164284, + 0.0, + 9.6331787109375, + 0.0, + 0.06795892864465714, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.1351704597473145, + 0.24152563512325287, + 0.10935885459184647, + 0.0, + 0.0, + 0.0, + 0.0, + 1.6989731788635254, + 0.0, + 6.982780456542969, + 11.57000732421875, + 0.0, + 1.4611170291900635, + 15.222799301147461, + 0.0, + 2.6164186000823975, + 0.0, + 0.0, + 0.0, + 2.1429178714752197, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.06795892864465714, + 0.06795892864465714, + 0.0, + 0.0509691946208477, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.700424313545227, + 0.407154381275177, + 0.0, + 0.0, + 1.4491537809371948, + 0.0, + 0.2173730581998825, + 0.31398332118988037, + 0.6762717962265015, + 0.0, + 6.031354904174805, + 0.0, + 0.0, + 8.477876663208008, + 0.0, + 2.1407063007354736, + 0.0, + 0.0, + 1.3560497760772705, + 0.19322051107883453, + 0.04830512776970863, + 15.943309783935547, + 0.09661025553941727, + 0.024152563884854317, + 1.038560152053833, + 1.8597474098205566, + 1.5310238599777222, + 1.8009116649627686, + 7.050738334655762, + 14.696118354797363, + 7.339563846588135, + 10.227818489074707, + 10.95837688446045, + 14.594179153442383, + 20.387678146362305, + 24.70306968688965, + 20.234771728515625, + 20.999309539794922, + 28.253925323486328, + 18.85860252380371, + 0.1968459188938141, + 0.0, + 0.024152563884854317, + 3.2358062267303467, + 0.6038141250610352, + 0.0, + 0.0, + 0.0, + 0.5946406126022339, + 16.327133178710938, + 10.95837688446045, + 15.78346061706543, + 13.251991271972656, + 0.0, + 0.024152563884854317, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.3567843735218048, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.04830512776970863, + 0.0, + 0.0, + 0.26246124505996704, + 0.0, + 0.0, + 0.0, + 0.0, + 0.16989731788635254, + 4.281412601470947, + 0.0, + 0.0, + 0.0, + 0.0, + 0.8155071139335632, + 4.791104316711426, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.08571671694517136, + 0.0, + 0.24152563512325287, + 1.594069242477417, + 1.9805101156234741, + 0.0, + 0.0, + 0.0, + 0.0, + 0.27183571457862854, + 6.541046619415283, + 0.06795892864465714, + 1.749942421913147, + 22.57935333251953, + 20.166810989379883, + 13.336939811706543, + 0.0, + 0.44173306226730347, + 2.10672664642334, + 0.0, + 0.0, + 0.0, + 0.0, + 0.9419499635696411, + 2.9224600791931152, + 3.2370221614837646, + 7.89788818359375, + 9.395346641540527, + 3.4776113033294678, + 0.47571247816085815, + 2.259634494781494, + 5.657580852508545, + 3.4659054279327393, + 23.717666625976562, + 1.1383121013641357, + 24.652101516723633, + 9.225424766540527, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5313563942909241, + 1.8809722661972046, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.5290758609771729, + 4.859063625335693, + 6.405129432678223, + 5.674570560455322, + 17.907176971435547, + 26.7248477935791, + 27.676273345947266, + 0.0, + 0.0, + 0.0, + 6.900195598602295, + 0.0, + 17.979080200195312, + 5.072038173675537, + 0.0, + 0.0, + 11.57000732421875, + 0.0, + 13.370919227600098, + 25.807403564453125, + 15.477645874023438, + 0.15310238301753998, + 0.144915372133255, + 0.0, + 18.257661819458008, + 1.9805101156234741, + 4.588986873626709, + 28.2436580657959, + 6.257320404052734, + 0.0, + 11.332151412963867, + 22.528385162353516, + 10.975366592407227, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.41059356927871704, + 5.614445209503174, + 7.0930585861206055, + 0.0, + 5.215847969055176, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 5.335865497589111, + 0.0, + 0.0, + 1.5457640886306763, + 22.54349708557129, + 7.173940658569336, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.446521520614624, + 25.110822677612305, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.280085802078247, + 24.66498565673828, + 0.3857252299785614, + 0.0, + 0.8155071139335632, + 1.8348909616470337, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.144915372133255, + 0.19322051107883453, + 3.3813588619232178, + 0.0, + 0.0, + 0.3907638490200043, + 9.14047622680664, + 1.2062709331512451, + 0.0, + 0.0, + 15.239789962768555, + 9.293383598327637, + 25.960311889648438, + 0.8155071139335632, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.25715014338493347, + 0.0, + 0.0, + 0.12076281756162643, + 0.0, + 1.5310238599777222, + 0.0, + 0.0, + 0.0, + 0.0, + 0.27183571457862854, + 0.0, + 2.361572742462158, + 5.130898952484131, + 4.094525337219238, + 0.0, + 21.661909103393555, + 20.35369873046875, + 12.759288787841797, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.3500382900238037, + 0.0, + 0.0, + 0.0, + 0.4927022457122803, + 1.7669322490692139, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.758054733276367, + 0.04830512776970863, + 6.545344829559326, + 2.270340919494629, + 17.636213302612305, + 0.0, + 0.0, + 0.27183571457862854, + 0.0, + 0.0, + 2.055757522583008, + 0.0, + 15.851420402526855, + 2.8542749881744385, + 0.0, + 0.0, + 0.0, + 1.585759162902832, + 0.09661025553941727, + 0.0, + 0.0, + 20.486295700073242, + 0.0, + 0.0, + 0.37377411127090454, + 4.672176361083984, + 0.0, + 0.0, + 0.24152563512325287, + 28.05079460144043, + 0.0, + 6.107316017150879, + 2.0529677867889404, + 0.0, + 0.08494865894317627, + 17.040700912475586, + 13.795661926269531, + 0.0, + 18.246973037719727, + 5.725539684295654, + 0.0, + 0.0, + 0.41059356927871704, + 27.686500549316406, + 0.0, + 0.48305127024650574, + 0.6521192193031311, + 0.5072038173675537, + 1.6841262578964233, + 0.0, + 10.108890533447266, + 0.0, + 1.2572401762008667, + 14.186427116394043, + 8.817670822143555, + 17.023710250854492, + 21.186195373535156, + 17.43146514892578, + 0.11892811954021454, + 22.596343994140625, + 24.940927505493164, + 19.14742660522461, + 0.0, + 2.0046627521514893, + 0.19322051107883453, + 21.386320114135742, + 0.0, + 10.95837688446045, + 17.482433319091797, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0724576860666275, + 0.0, + 0.9661025404930115, + 0.9177974462509155, + 1.6403827667236328, + 0.0, + 1.868870496749878, + 0.8834661245346069, + 0.13591785728931427, + 7.475481986999512, + 1.342188835144043, + 14.916984558105469, + 14.849026679992676, + 12.878216743469238, + 6.829872131347656, + 20.48961639404297, + 28.21994400024414, + 26.28311538696289, + 0.0, + 0.0, + 0.0, + 11.893195152282715, + 0.0, + 3.0432229042053223, + 11.10031509399414, + 7.939452648162842, + 2.4295318126678467, + 0.0, + 9.089506149291992, + 0.0, + 6.5070672035217285, + 17.227588653564453, + 0.0, + 0.0, + 0.0, + 26.593610763549805, + 0.0, + 0.16906794905662537, + 0.0, + 0.0, + 10.499654769897461, + 17.51641273498535, + 0.0, + 1.1383121013641357, + 2.5824391841888428, + 0.0509691946208477, + 3.2280490398406982, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.5699167251586914, + 0.0, + 0.0, + 6.473088264465332, + 3.2960078716278076, + 0.0, + 0.0, + 6.880841255187988, + 12.538422584533691, + 0.0, + 0.22086653113365173, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 6.557329177856445, + 0.0, + 0.3622884452342987, + 4.878818035125732, + 30.793729782104492, + 46.26559829711914, + 2.6164186000823975, + 0.0, + 0.03397946432232857, + 0.0, + 6.5070672035217285, + 1.393157958984375, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.6665269136428833, + 0.15310238301753998, + 0.0, + 1.8858602046966553, + 0.0, + 0.0, + 0.0, + 0.016989732161164284, + 2.633408546447754, + 0.0, + 0.23785623908042908, + 3.024172306060791, + 4.111515045166016, + 21.06726837158203, + 11.841843605041504, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.024152563884854317, + 1.5699167251586914, + 4.250851154327393, + 0.0, + 0.0, + 0.32280489802360535, + 0.23785623908042908, + 3.448915481567383, + 2.973203182220459, + 1.6480039358139038, + 16.4290714263916, + 0.3567843735218048, + 16.64993667602539, + 27.2005615234375, + 0.0, + 0.0, + 0.0, + 0.10714589059352875, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.03397946432232857, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.86949223279953, + 2.994917869567871, + 12.664644241333008, + 23.057798385620117, + 0.0, + 0.0, + 6.014365196228027, + 2.9052441120147705, + 0.0, + 0.0, + 0.0, + 0.9643130898475647, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.700424313545227, + 1.5528956651687622, + 0.0, + 0.0, + 0.0, + 1.3251991271972656, + 7.373543739318848, + 0.0, + 0.0, + 0.0, + 3.975597381591797, + 3.9076383113861084, + 11.162254333496094, + 18.094064712524414, + 27.659284591674805, + 0.0, + 0.0, + 0.0, + 12.428924560546875, + 0.0, + 0.0, + 0.8453397154808044, + 0.0, + 2.9562134742736816, + 0.0, + 9.242413520812988, + 0.0, + 0.0509691946208477, + 2.735347032546997, + 0.0, + 0.611630380153656, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 9.235976219177246, + 0.16906794905662537, + 0.12076281756162643, + 0.0, + 32.52949523925781, + 0.8311272859573364, + 0.0, + 2.242644786834717, + 3.346977472305298, + 0.0, + 0.0, + 16.412080764770508, + 1.7159630060195923, + 9.88802433013916, + 0.0, + 0.04830512776970863, + 2.1737308502197266, + 28.56509780883789, + 0.31398332118988037, + 0.09661025553941727, + 6.255514144897461, + 45.001277923583984, + 47.122764587402344, + 5.368755340576172, + 8.002163887023926, + 8.46088695526123, + 20.251760482788086, + 2.0387678146362305, + 13.319950103759766, + 0.0, + 0.0, + 2.801697254180908, + 28.307947158813477, + 0.024152563884854317, + 2.0046627521514893, + 0.5072038173675537, + 2.5152535438537598, + 5.096190929412842, + 13.604241371154785, + 14.522854804992676, + 26.915050506591797, + 1.0873428583145142, + 9.70113754272461, + 12.402504920959473, + 0.0, + 8.817670822143555, + 22.05267333984375, + 21.831806182861328, + 0.7815276980400085, + 7.220636367797852, + 14.390303611755371, + 24.380264282226562, + 19.725078582763672, + 24.465213775634766, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.04830512776970863, + 0.41059356927871704, + 1.6182217597961426, + 0.0, + 9.771706581115723, + 5.485869884490967, + 1.2857507467269897, + 13.971824645996094, + 33.365230560302734, + 0.0, + 6.3201799392700195, + 0.6625995635986328, + 6.116303443908691, + 0.2888254225254059, + 3.6358025074005127, + 0.0, + 16.734886169433594, + 0.0, + 13.795661926269531, + 0.0, + 7.237626075744629, + 19.164417266845703, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.04830512776970863, + 0.6762717962265015, + 2.946612596511841, + 0.0, + 4.864423751831055, + 4.47869873046875, + 2.8715100288391113, + 19.221973419189453, + 18.257661819458008, + 1.0363736152648926, + 13.302959442138672, + 6.031354904174805, + 7.050738334655762, + 11.57000732421875, + 15.120861053466797, + 1.1553016901016235, + 4.706155776977539, + 19.589160919189453, + 0.0, + 0.6795892715454102, + 1.1722915172576904, + 19.894977569580078, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.28983074426651, + 14.078969955444336, + 0.25715014338493347, + 19.993423461914062, + 0.5785878300666809, + 0.1928626149892807, + 7.033749103546143, + 0.2888254225254059, + 5.334775924682617, + 8.121091842651367, + 11.63796615600586, + 10.057921409606934, + 13.982549667358398, + 11.417099952697754, + 17.533403396606445, + 0.04830512776970863, + 0.0, + 0.0, + 0.0, + 0.0, + 0.6521192193031311, + 16.693330764770508, + 1.671475887298584, + 8.357379913330078, + 8.185946464538574, + 1.5000425577163696, + 13.812652587890625, + 0.18688705563545227, + 0.0, + 4.8760528564453125, + 6.252221584320068, + 6.592015743255615, + 17.295547485351562, + 17.720291137695312, + 6.133293151855469, + 0.0, + 0.0, + 0.0, + 0.0, + 0.09661025553941727, + 1.6182217597961426, + 0.0, + 22.607784271240234, + 24.23640251159668, + 1.5000425577163696, + 26.07931137084961, + 0.0, + 4.281412601470947, + 10.873428344726562, + 9.871034622192383, + 7.968184471130371, + 21.763845443725586, + 17.363506317138672, + 16.63294792175293, + 16.191213607788086, + 27.829181671142578, + 0.0, + 0.024152563884854317, + 0.024152563884854317, + 0.0, + 0.0, + 0.9177974462509155, + 27.00076675415039, + 23.10065460205078, + 24.793560028076172, + 4.178689956665039, + 16.97191047668457, + 10.46567440032959, + 12.06270980834961, + 11.485058784484863, + 11.060315132141113, + 5.776508808135986, + 18.85860252380371, + 17.329526901245117, + 11.094295501708984, + 18.043094635009766, + 16.242183685302734, + 20.268749237060547, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.0046627521514893, + 0.1714334338903427, + 19.843420028686523, + 13.328949928283691, + 0.7285920977592468, + 25.47929573059082, + 0.06795892864465714, + 12.249597549438477, + 13.540816307067871, + 0.0, + 1.4611170291900635, + 6.405129432678223, + 12.41949462890625, + 1.9538191556930542, + 0.0, + 12.368524551391602, + 12.385514259338379, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 2.511866569519043, + 0.0, + 24.900705337524414, + 23.550668716430664, + 0.3642960488796234, + 26.97933578491211, + 0.934435248374939, + 6.863852024078369, + 11.57000732421875, + 5.674570560455322, + 3.363966941833496, + 19.657119750976562, + 4.8760528564453125, + 10.89041805267334, + 17.975135803222656, + 11.977761268615723, + 0.0, + 0.024152563884854317, + 0.0, + 0.0, + 0.0, + 2.028815269470215, + 28.393661499023438, + 18.600528717041016, + 24.300689697265625, + 6.514470100402832, + 30.42943572998047, + 13.065104484558105, + 11.57000732421875, + 8.155071258544922, + 10.652562141418457, + 2.208665132522583, + 20.523595809936523, + 17.720291137695312, + 11.043325424194336, + 24.56715202331543, + 16.972742080688477, + 12.980155944824219 + ] +} \ No newline at end of file diff --git a/tests/structure/data/misc/sasa.py b/tests/structure/data/misc/sasa.py new file mode 100644 index 000000000..f815b429e --- /dev/null +++ b/tests/structure/data/misc/sasa.py @@ -0,0 +1,32 @@ +import json +import warnings +from pathlib import Path +import mdtraj +from biotite.structure.info.radii import _SINGLE_RADII as SINGLE_RADII + +PDB_IDS = ["1l2y", "1gya"] +STRUCTURE_DIR = Path(__file__).parents[1] +OUTPUT_FILE = Path(__file__).parent / "sasa.json" + + +def compute_mdtraj_sasa(pdb_path): + # Use the same atom radii as Biotite + radii = { + element.capitalize(): radius / 10 for element, radius in SINGLE_RADII.items() + } + # Ignore warning about dummy unit cell vector + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + traj = mdtraj.load(pdb_path) + # Conversion from nm^2 to A^2 + return mdtraj.shrake_rupley(traj, change_radii=radii, n_sphere_points=5000)[0] * 100 + + +if __name__ == "__main__": + data = {} + for pdb_id in PDB_IDS: + pdb_path = STRUCTURE_DIR / f"{pdb_id}.pdb" + data[pdb_id] = compute_mdtraj_sasa(pdb_path).tolist() + + with open(OUTPUT_FILE, "w") as file: + json.dump(data, file, indent=4) diff --git a/tests/structure/data/molecules/create_v3000_sdf.py b/tests/structure/data/molecules/create_v3000_sdf.py index dc313722f..9630e71a1 100644 --- a/tests/structure/data/molecules/create_v3000_sdf.py +++ b/tests/structure/data/molecules/create_v3000_sdf.py @@ -11,4 +11,4 @@ writer.SetForceV3000(True) for molecule in supplier: writer.write(molecule) - writer.close() \ No newline at end of file + writer.close() diff --git a/tests/structure/test_atoms.py b/tests/structure/test_atoms.py index 93a94ac65..e4b80439f 100644 --- a/tests/structure/test_atoms.py +++ b/tests/structure/test_atoms.py @@ -10,105 +10,112 @@ @pytest.fixture def atom_list(): - chain_id = ["A","A","B","B","B"] - res_id = [1,1,1,1,2] + chain_id = ["A", "A", "B", "B", "B"] + res_id = [1, 1, 1, 1, 2] ins_code = [""] * 5 - res_name = ["ALA","ALA","PRO","PRO","MSE"] + res_name = ["ALA", "ALA", "PRO", "PRO", "MSE"] hetero = [False, False, False, False, True] atom_name = ["N", "CA", "O", "CA", "SE"] - element = ["N","C","O","C","SE"] + element = ["N", "C", "O", "C", "SE"] atom_list = [] for i in range(5): - atom_list.append(struc.Atom([i,i,i], - chain_id = chain_id[i], - res_id = res_id[i], - ins_code = ins_code[i], - res_name = res_name[i], - hetero = hetero[i], - atom_name = atom_name[i], - element = element[i])) + atom_list.append( + struc.Atom( + [i, i, i], + chain_id=chain_id[i], + res_id=res_id[i], + ins_code=ins_code[i], + res_name=res_name[i], + hetero=hetero[i], + atom_name=atom_name[i], + element=element[i], + ) + ) return atom_list + @pytest.fixture def atom(atom_list): return atom_list[2] + @pytest.fixture def array(atom_list): return struc.array(atom_list) + @pytest.fixture def stack(array): return struc.stack([array, array.copy(), array.copy()]) + @pytest.fixture def array_box(): - return np.array([ - [1,0,0], - [0,2,0], - [0,0,3] - ]) + return np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]]) + @pytest.fixture def stack_box(stack, array_box): return np.array([array_box] * stack.stack_depth()) + def test_shape(array, stack): assert array.shape == (5,) assert stack.shape == (3, 5) + def test_access(array): - chain_id = ["A","A","B","B","B"] - assert array.coord.shape == (5,3) + chain_id = ["A", "A", "B", "B", "B"] + assert array.coord.shape == (5, 3) assert array.chain_id.tolist() == chain_id assert array.get_annotation("chain_id").tolist() == chain_id array.add_annotation("test1", dtype=int) - assert array.test1.tolist() == [0,0,0,0,0] + assert array.test1.tolist() == [0, 0, 0, 0, 0] with pytest.raises(IndexError): - array.set_annotation("test2", np.array([0,1,2,3])) + array.set_annotation("test2", np.array([0, 1, 2, 3])) def test_modification(atom, array, stack): new_atom = atom new_atom.chain_id = "C" del array[2] - assert array.chain_id.tolist() == ["A","A","B","B"] + assert array.chain_id.tolist() == ["A", "A", "B", "B"] array[-1] = new_atom - assert array.chain_id.tolist() == ["A","A","B","C"] + assert array.chain_id.tolist() == ["A", "A", "B", "C"] del stack[1] assert stack.stack_depth() == 2 def test_array_indexing(atom, array): filtered_array = array[array.chain_id == "B"] - assert filtered_array.res_name.tolist() == ["PRO","PRO","MSE"] + assert filtered_array.res_name.tolist() == ["PRO", "PRO", "MSE"] assert atom == filtered_array[0] - filtered_array = array[[0,2,4]] - assert filtered_array.element.tolist() == ["N","O","SE"] + filtered_array = array[[0, 2, 4]] + assert filtered_array.element.tolist() == ["N", "O", "SE"] def test_stack_indexing(stack): with pytest.raises(IndexError): stack[5] filtered_stack = stack[0] - assert type(filtered_stack) == struc.AtomArray + assert isinstance(filtered_stack, struc.AtomArray) filtered_stack = stack[0:2, stack.res_name == "PRO"] - assert filtered_stack.atom_name.tolist() == ["O","CA"] - filtered_stack = stack[np.array([True,False,True])] + assert filtered_stack.atom_name.tolist() == ["O", "CA"] + filtered_stack = stack[np.array([True, False, True])] assert filtered_stack.stack_depth() == 2 assert filtered_stack.array_length() == 5 - filtered_stack = stack[:,0] + filtered_stack = stack[:, 0] assert filtered_stack.stack_depth() == 3 assert filtered_stack.array_length() == 1 - + def test_concatenation(array, stack): concat_array = array[2:] + array[:2] - assert concat_array.chain_id.tolist() == ["B","B","B","A","A"] - assert concat_array.coord.shape == (5,3) - concat_stack = stack[:,2:] + stack[:,:2] - assert concat_array.chain_id.tolist() == ["B","B","B","A","A"] - assert concat_stack.coord.shape == (3,5,3) + assert concat_array.chain_id.tolist() == ["B", "B", "B", "A", "A"] + assert concat_array.coord.shape == (5, 3) + concat_stack = stack[:, 2:] + stack[:, :2] + assert concat_array.chain_id.tolist() == ["B", "B", "B", "A", "A"] + assert concat_stack.coord.shape == (3, 5, 3) def test_comparison(array): @@ -129,23 +136,26 @@ def test_bonds(array): with pytest.raises(ValueError): # Expect a BondList with array length as atom count array.bonds = struc.BondList(13) - array.bonds = struc.BondList(5, np.array([(0,1),(0,2),(2,3),(2,4)])) - assert array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0], - [2, 3, 0], - [2, 4, 0],] + array.bonds = struc.BondList(5, np.array([(0, 1), (0, 2), (2, 3), (2, 4)])) + assert array.bonds.as_array().tolist() == [ + [0, 1, 0], + [0, 2, 0], + [2, 3, 0], + [2, 4, 0], + ] filtered_array = array[array.chain_id == "B"] - assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0]] + assert filtered_array.bonds.as_array().tolist() == [[0, 1, 0], [0, 2, 0]] concat_array = array + array - assert concat_array.bonds.as_array().tolist() == [[0, 1, 0], - [0, 2, 0], - [2, 3, 0], - [2, 4, 0], - [5, 6, 0], - [5, 7, 0], - [7, 8, 0], - [7, 9, 0]] + assert concat_array.bonds.as_array().tolist() == [ + [0, 1, 0], + [0, 2, 0], + [2, 3, 0], + [2, 4, 0], + [5, 6, 0], + [5, 7, 0], + [7, 8, 0], + [7, 9, 0], + ] def test_box(array, stack, array_box, stack_box): @@ -193,4 +203,4 @@ def test_pickle(atom, array, stack): assert test_array == array test_stack = pickle.loads(pickle.dumps(stack)) - assert test_stack == stack \ No newline at end of file + assert test_stack == stack diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py index d0b554f27..b11b78ce5 100644 --- a/tests/structure/test_basepairs.py +++ b/tests/structure/test_basepairs.py @@ -2,23 +2,22 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest import json -import warnings +from os.path import join import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from biotite.structure.info import residue -from biotite.structure.residues import get_residue_masks -from biotite.structure.hbond import hbond -from os.path import join -from ..util import data_dir + # For ``base_pairs_edge()`` differences to a reference can be ambiguous # as the number hydrogen bonds between two different edges can be equal. # In order to distinguish ambiguously identified edges from wrongfully # identified edges the full edge matrix, listing the number of hydrogen # bonds for each edge has to be considered. from biotite.structure.basepairs import _get_edge_matrix +from biotite.structure.info import residue +from biotite.structure.residues import get_residue_masks +from tests.util import data_dir def reversed_iterator(iter): @@ -30,9 +29,7 @@ def reversed_iterator(iter): @pytest.fixture def nuc_sample_array(): - return strucio.load_structure( - join(data_dir("structure"), "base_pairs", "1qxb.cif") - ) + return strucio.load_structure(join(data_dir("structure"), "base_pairs", "1qxb.cif")) @pytest.fixture @@ -40,11 +37,10 @@ def basepairs(nuc_sample_array): """ Generate a test output for the base_pairs function. """ - residue_indices, residue_names = struc.residues.get_residues( - nuc_sample_array - )[0:24] + residue_indices, residue_names = struc.residues.get_residues(nuc_sample_array)[0:24] return np.vstack((residue_indices[:12], np.flip(residue_indices)[:12])).T + def check_residue_starts(computed_starts, nuc_sample_array): """ Assert that computed starts are residue starts. @@ -53,6 +49,7 @@ def check_residue_starts(computed_starts, nuc_sample_array): for start in computed_starts.flatten(): assert start in residue_starts + def check_output(computed_basepairs, basepairs): """ Check the output of base_pairs. @@ -60,16 +57,17 @@ def check_output(computed_basepairs, basepairs): # Check if base pairs are unique in computed_basepairs seen = set() - assert (not any( - (base1, base2) in seen) or (base2, base1 in seen) - or seen.add((base1, base2)) for base1, base2 in computed_basepairs - ) + assert ( + not any((base1, base2) in seen) + or (base2, base1 in seen) + or seen.add((base1, base2)) + for base1, base2 in computed_basepairs + ) # Check if the right number of base pairs is in computed_base pairs - assert(len(computed_basepairs) == len(basepairs)) + assert len(computed_basepairs) == len(basepairs) # Check if the right base pairs are in computed_basepairs for comp_basepair in computed_basepairs: - assert ((comp_basepair in basepairs) \ - or (comp_basepair in np.flip(basepairs))) + assert (comp_basepair in basepairs) or (comp_basepair in np.flip(basepairs)) @pytest.mark.parametrize("unique_bool", [False, True]) @@ -102,16 +100,12 @@ def test_base_pairs_reverse(nuc_sample_array, basepairs, unique_bool): # Reverse sequence of residues in nuc_sample_array reversed_nuc_sample_array = struc.AtomArray(0) - for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)): - reversed_nuc_sample_array = reversed_nuc_sample_array + residue + for res in reversed_iterator(struc.residue_iter(nuc_sample_array)): + reversed_nuc_sample_array = reversed_nuc_sample_array + res - computed_basepairs = struc.base_pairs( - reversed_nuc_sample_array, unique=unique_bool - ) + computed_basepairs = struc.base_pairs(reversed_nuc_sample_array, unique=unique_bool) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) - check_output( - reversed_nuc_sample_array[computed_basepairs].res_id, basepairs - ) + check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs) def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs): @@ -123,14 +117,13 @@ def test_base_pairs_reverse_no_hydrogen(nuc_sample_array, basepairs): nuc_sample_array = nuc_sample_array[nuc_sample_array.element != "H"] # Reverse sequence of residues in nuc_sample_array reversed_nuc_sample_array = struc.AtomArray(0) - for residue in reversed_iterator(struc.residue_iter(nuc_sample_array)): - reversed_nuc_sample_array = reversed_nuc_sample_array + residue + for res in reversed_iterator(struc.residue_iter(nuc_sample_array)): + reversed_nuc_sample_array = reversed_nuc_sample_array + res computed_basepairs = struc.base_pairs(reversed_nuc_sample_array) check_residue_starts(computed_basepairs, reversed_nuc_sample_array) - check_output( - reversed_nuc_sample_array[computed_basepairs].res_id, basepairs - ) + check_output(reversed_nuc_sample_array[computed_basepairs].res_id, basepairs) + def test_base_pairs_incomplete_structure(nuc_sample_array): """ @@ -142,14 +135,15 @@ def test_base_pairs_incomplete_structure(nuc_sample_array): """ nuc_sample_array = nuc_sample_array[ - ~ np.isin( + ~np.isin( nuc_sample_array.atom_name, - ['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9', 'O2'] + ["N1", "C2", "N3", "C4", "C5", "C6", "N7", "C8", "N9", "O2"], ) ] with pytest.warns(struc.IncompleteStructureWarning): assert len(struc.base_pairs(nuc_sample_array)) == 0 + @pytest.mark.parametrize("seed", range(10)) def test_base_pairs_reordered(nuc_sample_array, seed): """ @@ -160,52 +154,49 @@ def test_base_pairs_reordered(nuc_sample_array, seed): nuc_sample_array_reordered = struc.AtomArray(0) np.random.seed(seed) - for residue in struc.residue_iter(nuc_sample_array): - bound = residue.array_length() - indices = np.random.choice( - np.arange(bound), bound,replace=False - ) - nuc_sample_array_reordered += residue[..., indices] + for res in struc.residue_iter(nuc_sample_array): + bound = res.array_length() + indices = np.random.choice(np.arange(bound), bound, replace=False) + nuc_sample_array_reordered += res[..., indices] - assert(np.all( + assert np.all( struc.base_pairs(nuc_sample_array) == struc.base_pairs(nuc_sample_array_reordered) - )) + ) def test_map_nucleotide(): - """Test the function map_nucleotide with some examples. - """ - pyrimidines = ['C', 'T', 'U'] - purines = ['A', 'G'] + """Test the function map_nucleotide with some examples.""" + pyrimidines = ["C", "T", "U"] + purines = ["A", "G"] # Test that the standard bases are correctly identified - assert struc.map_nucleotide(residue('U')) == ('U', True) - assert struc.map_nucleotide(residue('A')) == ('A', True) - assert struc.map_nucleotide(residue('T')) == ('T', True) - assert struc.map_nucleotide(residue('G')) == ('G', True) - assert struc.map_nucleotide(residue('C')) == ('C', True) + assert struc.map_nucleotide(residue("U")) == ("U", True) + assert struc.map_nucleotide(residue("A")) == ("A", True) + assert struc.map_nucleotide(residue("T")) == ("T", True) + assert struc.map_nucleotide(residue("G")) == ("G", True) + assert struc.map_nucleotide(residue("C")) == ("C", True) # Test that some non_standard nucleotides are mapped correctly to # pyrimidine/purine references - psu_tuple = struc.map_nucleotide(residue('PSU')) + psu_tuple = struc.map_nucleotide(residue("PSU")) assert psu_tuple[0] in pyrimidines - assert psu_tuple[1] == False + assert psu_tuple[1] is False - psu_tuple = struc.map_nucleotide(residue('3MC')) + psu_tuple = struc.map_nucleotide(residue("3MC")) assert psu_tuple[0] in pyrimidines - assert psu_tuple[1] == False + assert psu_tuple[1] is False - i_tuple = struc.map_nucleotide(residue('I')) + i_tuple = struc.map_nucleotide(residue("I")) assert i_tuple[0] in purines - assert i_tuple[1] == False + assert i_tuple[1] is False - m7g_tuple = struc.map_nucleotide(residue('M7G')) + m7g_tuple = struc.map_nucleotide(residue("M7G")) assert m7g_tuple[0] in purines - assert m7g_tuple[1] == False + assert m7g_tuple[1] is False with pytest.warns(struc.IncompleteStructureWarning): - assert struc.map_nucleotide(residue('ALA')) == (None, False) + assert struc.map_nucleotide(residue("ALA")) == (None, False) def get_reference(pdb_id, suffix): @@ -218,12 +209,13 @@ def get_reference(pdb_id, suffix): ) with open( - join(data_dir("structure"), "base_pairs", f"{pdb_id}_{suffix}.json" - ), "r") as file: + join(data_dir("structure"), "base_pairs", f"{pdb_id}_{suffix}.json"), "r" + ) as file: reference = np.array(json.load(file)) return structure, reference + def get_reference_index(pair, array): """ Get the index of the row in a reference array, where the first two @@ -236,10 +228,7 @@ def get_reference_index(pair, array): return None - -def check_edge_plausibility( - reference_structure, pair, reference_edges, output_edges -): +def check_edge_plausibility(reference_structure, pair, reference_edges, output_edges): """ Checks if the difference to a reference edge is at least ambiguous. A difference is defined as ambiguous, if the number of hydrogen @@ -280,8 +269,9 @@ def test_base_pairs_edge(pdb_id): pair_res_ids = reference_structure[pair].res_id index = get_reference_index(pair_res_ids, reference_edges) if index is not None: - pair_reference_edges = [ - reference_edges[index, 2], reference_edges[index, 3] + pair_reference_edges = [ + reference_edges[index, 2], + reference_edges[index, 3], ] check_edge_plausibility( reference_structure, pair, pair_reference_edges, pair_edges @@ -309,9 +299,7 @@ def test_base_pairs_glycosidic_bond(pdb_id): pair_res_ids = reference_structure[pair].res_id index = get_reference_index(pair_res_ids, reference_gly_bonds) if index is not None: - reference_orientation = struc.GlycosidicBond( - reference_gly_bonds[index, 2] - ) + reference_orientation = struc.GlycosidicBond(reference_gly_bonds[index, 2]) assert reference_orientation == pair_orientation @@ -333,7 +321,7 @@ def test_base_stacking(): # stacked. expected_stackings = [] for i in range(1, 24): - expected_stackings.append([i, i+1]) + expected_stackings.append([i, i + 1]) # Due to distortions in the helix not all adjacent bases have a # geometry that meets the criteria of `base_stacking`. @@ -353,5 +341,3 @@ def test_base_stacking(): # Assert the stacking interactions are correct for interaction in helix[stacking].res_id: assert list(interaction) in expected_stackings - - diff --git a/tests/structure/test_bonds.py b/tests/structure/test_bonds.py index a5474ffde..d5c8b3508 100644 --- a/tests/structure/test_bonds.py +++ b/tests/structure/test_bonds.py @@ -9,7 +9,7 @@ import biotite.structure.info as info import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx -from ..util import data_dir +from tests.util import data_dir def generate_random_bond_list(atom_count, bond_count, seed=0): @@ -23,20 +23,22 @@ def generate_random_bond_list(atom_count, bond_count, seed=0): # Clip bond types to allowed BondType values bonds[:, 2] %= len(struc.BondType) # Remove bonds of atoms to itself - bonds = bonds[bonds[:,0] != bonds[:,1]] + bonds = bonds[bonds[:, 0] != bonds[:, 1]] assert len(bonds) > 0 return struc.BondList(atom_count, bonds) @pytest.fixture( - params=[False, True] # as_negative + params=[False, True] # as_negative ) def bond_list(request): """ A toy :class:`BondList`. """ as_negative = request.param - bond_array = np.array([(0,1),(2,1),(3,1),(3,4),(3,1),(1,2),(4,0),(6,4)]) + bond_array = np.array( + [(0, 1), (2, 1), (3, 1), (3, 4), (3, 1), (1, 2), (4, 0), (6, 4)] + ) if as_negative: return struc.BondList(7, -7 + bond_array) else: @@ -48,12 +50,14 @@ def test_creation(bond_list): Test creating a :class:`BondList` on a known example. """ # Test includes redundancy removal and max bonds calculation - assert bond_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0]] + assert bond_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + ] assert bond_list._max_bonds_per_atom == 3 assert bond_list._atom_count == 7 @@ -65,46 +69,44 @@ def test_invalid_creation(): """ # Test invalid input shapes with pytest.raises(ValueError): - struc.BondList( - 5, - np.array([ - [1,2,3,4] - ]) - ) + struc.BondList(5, np.array([[1, 2, 3, 4]])) with pytest.raises(ValueError): - struc.BondList( - 5, - np.array([1,2]) - ) + struc.BondList(5, np.array([1, 2])) # Test invalid atom indices with pytest.raises(IndexError): struc.BondList( 5, - np.array([ - [1,2], - # 5 is an invalid index for an atom count of 5 - [5,2] - ]) + np.array( + [ + [1, 2], + # 5 is an invalid index for an atom count of 5 + [5, 2], + ] + ), ) with pytest.raises(IndexError): struc.BondList( 5, - np.array([ - # Index -6 is invalid for an atom count of 5 - [-6,3], - [3,4] - ]) + np.array( + [ + # Index -6 is invalid for an atom count of 5 + [-6, 3], + [3, 4], + ] + ), ) # Test invalid BondType with pytest.raises(ValueError): struc.BondList( 5, - np.array([ - # BondType '8' does not exist - [1,2,8] - ]) + np.array( + [ + # BondType '8' does not exist + [1, 2, 8] + ] + ), ) @@ -126,25 +128,21 @@ def test_modification(bond_list): # Not in list -> Do nothing bond_list.remove_bond(0, 3) # Remove mutliple bonds, one of them is not in list - bond_list.remove_bonds(struc.BondList(10, np.array([(1,0),(1,2),(8,9)]))) - assert bond_list.as_array().tolist() == [[1, 3, 1], - [3, 4, 0], - [4, 6, 0], - [1, 4, 0]] + bond_list.remove_bonds(struc.BondList(10, np.array([(1, 0), (1, 2), (8, 9)]))) + assert bond_list.as_array().tolist() == [[1, 3, 1], [3, 4, 0], [4, 6, 0], [1, 4, 0]] def test_add_two_bond_list(): """ Test adding two `BondList` objects. """ - bond_list1 = struc.BondList(2, np.array([(0,1)])) # max_bond_per_atom=1 - bond_list2 = struc.BondList(3, np.array([(0,1),(0,2)])) # max_bond_per_atom=2 + bond_list1 = struc.BondList(2, np.array([(0, 1)])) # max_bond_per_atom=1 + bond_list2 = struc.BondList(3, np.array([(0, 1), (0, 2)])) # max_bond_per_atom=2 added_list = bond_list1 + bond_list2 assert added_list._max_bonds_per_atom == 2 assert added_list.get_bonds(2)[0].tolist() == [3, 4] - assert added_list.as_array().tolist() == [[0, 1, 0], - [2, 3, 0], - [2, 4, 0]] + assert added_list.as_array().tolist() == [[0, 1, 0], [2, 3, 0], [2, 4, 0]] + def test_contains(bond_list): """ @@ -185,29 +183,33 @@ def test_merge(bond_list): """ Test merging two `BondList` objects on a known example. """ - merged_list = struc.BondList(8, np.array([(4,6),(6,7)])).merge(bond_list) - assert merged_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0], - [6, 7, 0]] + merged_list = struc.BondList(8, np.array([(4, 6), (6, 7)])).merge(bond_list) + assert merged_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + [6, 7, 0], + ] def test_concatenation(bond_list): """ Test concatenation of two `BondList` objects on a known example. """ - bond_list += struc.BondList(3, np.array([(0,1,2),(1,2,2)])) - assert bond_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0], - [3, 4, 0], - [0, 4, 0], - [4, 6, 0], - [7, 8, 2], - [8, 9, 2]] + bond_list += struc.BondList(3, np.array([(0, 1, 2), (1, 2, 2)])) + assert bond_list.as_array().tolist() == [ + [0, 1, 0], + [1, 2, 0], + [1, 3, 0], + [3, 4, 0], + [0, 4, 0], + [4, 6, 0], + [7, 8, 2], + [8, 9, 2], + ] assert bond_list._max_bonds_per_atom == 3 assert bond_list._atom_count == 10 @@ -219,30 +221,27 @@ def test_indexing(bond_list): sub_list = bond_list[:] assert sub_list.as_array().tolist() == bond_list.as_array().tolist() sub_list = bond_list[::-1] - assert sub_list.as_array().tolist() == [[5, 6, 0], - [4, 5, 0], - [3, 5, 0], - [2, 3, 0], - [2, 6, 0], - [0, 2, 0]] + assert sub_list.as_array().tolist() == [ + [5, 6, 0], + [4, 5, 0], + [3, 5, 0], + [2, 3, 0], + [2, 6, 0], + [0, 2, 0], + ] sub_list = bond_list[1:6:2] assert sub_list.as_array().tolist() == [[0, 1, 0]] sub_list = bond_list[:4] - assert sub_list.as_array().tolist() == [[0, 1, 0], - [1, 2, 0], - [1, 3, 0]] + assert sub_list.as_array().tolist() == [[0, 1, 0], [1, 2, 0], [1, 3, 0]] sub_list = bond_list[2:] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [2, 4, 0]] + assert sub_list.as_array().tolist() == [[1, 2, 0], [2, 4, 0]] - sub_list = bond_list[[0,3,4]] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [0, 2, 0]] + sub_list = bond_list[[0, 3, 4]] + assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0]] + + sub_list = bond_list[np.array([True, False, False, True, True, False, True])] + assert sub_list.as_array().tolist() == [[1, 2, 0], [0, 2, 0], [2, 3, 0]] - sub_list = bond_list[np.array([True,False,False,True,True,False,True])] - assert sub_list.as_array().tolist() == [[1, 2, 0], - [0, 2, 0], - [2, 3, 0]] def test_get_all_bonds(): """ @@ -261,17 +260,13 @@ def test_get_all_bonds(): assert (bond_types != -1).all(axis=1).any(axis=0) test_bonds = [ - ( - bonded_i[bonded_i != -1].tolist(), - bond_type[bond_type != -1].tolist() - ) + (bonded_i[bonded_i != -1].tolist(), bond_type[bond_type != -1].tolist()) for bonded_i, bond_type in zip(bonds, bond_types) ] ref_bonds = [bond_list.get_bonds(i) for i in range(ATOM_COUNT)] ref_bonds = [ - (bonded_i.tolist(), bond_type.tolist()) - for bonded_i, bond_type in ref_bonds + (bonded_i.tolist(), bond_type.tolist()) for bonded_i, bond_type in ref_bonds ] assert test_bonds == ref_bonds @@ -330,9 +325,9 @@ def test_sorted_array_indexing(): # Create a sorted array of random indices for the BondList # Indices may not occur multiple times -> 'replace=False' - index_array = np.sort(np.random.choice( - np.arange(ATOM_COUNT), INDEX_SIZE, replace=False - )) + index_array = np.sort( + np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False) + ) test_bonds = bonds[index_array] # Create a boolean mask that indexes the same elements as the array @@ -363,15 +358,13 @@ def test_unsorted_array_indexing(): # Create random bonds between the reference integers bonds = np.random.randint(ATOM_COUNT, size=(BOND_COUNT, 2)) # Remove bonds of elements to itself - bonds = bonds[bonds[:,0] != bonds[:,1]] + bonds = bonds[bonds[:, 0] != bonds[:, 1]] assert len(bonds) > 0 bonds = struc.BondList(ATOM_COUNT, bonds) # Create an unsorted array of random indices for the BondList # Indices should be unsorted -> 'replace=False' - unsorted_index = np.random.choice( - np.arange(ATOM_COUNT), INDEX_SIZE, replace=False - ) + unsorted_index = np.random.choice(np.arange(ATOM_COUNT), INDEX_SIZE, replace=False) test_bonds = bonds[unsorted_index] # Create a sorted variant of the index array @@ -385,14 +378,18 @@ def test_unsorted_array_indexing(): # Get the 'atoms', in this case integers, that are connected with a bond # Use a set for simpler comparison between the sorted and unsorted variant # Omit the bond type -> 'bonds.as_array()[:, :2]' - test_integer_pairs = set([ - frozenset((unsorted_indexed_integers[i], unsorted_indexed_integers[j])) - for i, j in test_bonds.as_array()[:, :2] - ]) - ref_integer_pairs = set([ - frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j])) - for i, j in ref_bonds.as_array()[:, :2] - ]) + test_integer_pairs = set( + [ + frozenset((unsorted_indexed_integers[i], unsorted_indexed_integers[j])) + for i, j in test_bonds.as_array()[:, :2] + ] + ) + ref_integer_pairs = set( + [ + frozenset((sorted_indexed_integers[i], sorted_indexed_integers[j])) + for i, j in ref_bonds.as_array()[:, :2] + ] + ) # The BondList entries should be different, # since they point to different positions in the reference array @@ -415,18 +412,21 @@ def test_atom_array_consistency(): array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0] ca = array[array.atom_name == "CA"] # Just for testing, does not reflect real bonds - bond_list = struc.BondList(ca.array_length(), - np.array([(0,1),(2,8),(5,15),(1,5),(0,9),(3,18),(2,9)]) + bond_list = struc.BondList( + ca.array_length(), + np.array([(0, 1), (2, 8), (5, 15), (1, 5), (0, 9), (3, 18), (2, 9)]), ) ca.bonds = bond_list - ref_ids = ca.res_id[bond_list.as_array()[:,:2].flatten()] + ref_ids = ca.res_id[bond_list.as_array()[:, :2].flatten()] # Some random boolean mask as index, # but all bonded atoms are included - mask = np.array([1,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,1,1], dtype=bool) + mask = np.array( + [1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=bool + ) masked_ca = ca[mask] - test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:,:2].flatten()] + test_ids = masked_ca.res_id[masked_ca.bonds.as_array()[:, :2].flatten()] # The bonds, should always point to the same atoms (same res_id), # irrespective of indexing @@ -442,9 +442,7 @@ def test_method_consistency(periodic): THRESHOLD_PERCENTAGE = 0.99 # Structure with peptide, nucleotide, small molecules and water - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "5ugo.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "5ugo.bcif")) atoms = pdbx.get_structure(pdbx_file, model=1) if periodic: # Add large dummy box to test parameter @@ -454,22 +452,22 @@ def test_method_consistency(periodic): bonds_from_names = struc.connect_via_residue_names(atoms) bonds_from_names.remove_bond_order() - bonds_from_distances = struc.connect_via_distances( - atoms, periodic=periodic - ) + bonds_from_distances = struc.connect_via_distances(atoms, periodic=periodic) # The distance based method may not detect all bonds assert bonds_from_distances.as_set().issubset(bonds_from_names.as_set()) - assert len(bonds_from_distances.as_array()) \ + assert ( + len(bonds_from_distances.as_array()) >= len(bonds_from_names.as_array()) * THRESHOLD_PERCENTAGE + ) def test_find_connected(bond_list): """ Find all connected atoms to an atom in a known example. """ - for index in (0,1,2,3,4,6): - assert struc.find_connected(bond_list, index).tolist() == [0,1,2,3,4,6] + for index in (0, 1, 2, 3, 4, 6): + assert struc.find_connected(bond_list, index).tolist() == [0, 1, 2, 3, 4, 6] assert struc.find_connected(bond_list, 5).tolist() == [5] @@ -498,7 +496,7 @@ def test_find_connected(bond_list): ("C17", "C22"), ]), ] -) +) # fmt: skip def test_find_rotatable_bonds(res_name, expected_bonds): """ Check the :func:`find_rotatable_bonds()` function based on @@ -513,11 +511,9 @@ def test_find_rotatable_bonds(res_name, expected_bonds): rotatable_bonds = struc.find_rotatable_bonds(molecule.bonds) test_bond_set = set() for i, j, _ in rotatable_bonds.as_array(): - test_bond_set.add( - tuple(sorted((molecule.atom_name[i], molecule.atom_name[j]))) - ) + test_bond_set.add(tuple(sorted((molecule.atom_name[i], molecule.atom_name[j])))) # Compare with reference bonded atom names assert test_bond_set == ref_bond_set # All rotatable bonds must be single bonds - assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) \ No newline at end of file + assert np.all(rotatable_bonds.as_array()[:, 2] == struc.BondType.SINGLE) diff --git a/tests/structure/test_box.py b/tests/structure/test_box.py index 513f9cd35..78e8728b2 100644 --- a/tests/structure/test_box.py +++ b/tests/structure/test_box.py @@ -2,18 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join import itertools import warnings +from os.path import join import numpy as np import pytest import biotite.structure as struc import biotite.structure.io.pdbx as pdbx from biotite.structure.io import load_structure -from ..util import data_dir, cannot_import - +from tests.util import data_dir -SAMPLE_BOXES = [ +SAMPLE_CELLS = [ (1, 1, 1, 90, 90, 90), (9, 5, 2, 90, 90, 90), (5, 5, 8, 90, 90, 120), @@ -21,86 +20,69 @@ (2, 4, 6, 100, 110, 120), (9, 9, 9, 90, 90, 170), (9, 8, 7, 50, 80, 50), -] +] # fmt: skip SAMPLE_COORD = [ ( 1, 1, 1), ( 5, 10, 20), (-1, 5, 8), ( 3, 1, 54) -] - - - -# Ignore warning about dummy unit cell vector -@pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "len_a, len_b, len_c, alpha, beta, gamma", SAMPLE_BOXES -) -def test_box_vector_calculation(len_a, len_b, len_c, alpha, beta, gamma): - box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) - ) +] # fmt: skip - from mdtraj.utils import lengths_and_angles_to_box_vectors - ref_box = np.stack( - lengths_and_angles_to_box_vectors( - len_a, len_b, len_c, alpha, beta, gamma - ) - ) - assert np.allclose(box, ref_box) - assert struc.unitcell_from_vectors(box) == pytest.approx( - (len_a, len_b, len_c, - alpha * 2*np.pi / 360, beta * 2*np.pi / 360, gamma * 2*np.pi / 360) - ) +@pytest.mark.parametrize("ref_cell", SAMPLE_CELLS) +def test_box_vector_conversion(ref_cell): + """ + Converting a unit cell into box vectors and back should restore the same + unit cell. + """ + len_a, len_b, len_c, alpha, beta, gamma = ref_cell + alpha, beta, gamma = [np.deg2rad(angle) for angle in (alpha, beta, gamma)] + box = struc.vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma) + test_cell = struc.unitcell_from_vectors(box) + assert test_cell == pytest.approx((len_a, len_b, len_c, alpha, beta, gamma)) def test_volume(): - # Very rudimentary test - box = np.array([ - [5,0,0], - [0,8,0], - [0,0,2], - ]) + """ + Test the volume calculation of a simple orthorhombic box. + """ + box = np.array( + [ + [5, 0, 0], + [0, 8, 0], + [0, 0, 2], + ] + ) assert struc.box_volume(box) == pytest.approx(80) boxes = np.stack([box, box]) - assert struc.box_volume(boxes) == pytest.approx(80,80) + assert struc.box_volume(boxes) == pytest.approx(80, 80) @pytest.mark.parametrize( "len_a, len_b, len_c, alpha, beta, gamma, x, y,z", - [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)] + [box + coord for box, coord in itertools.product(SAMPLE_CELLS, SAMPLE_COORD)], ) def test_move_into_box(len_a, len_b, len_c, alpha, beta, gamma, x, y, z): box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) + len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) ) - coord = np.array([x,y,z]) + coord = np.array([x, y, z]) moved_coord = struc.move_inside_box(coord, box) fractions = struc.coord_to_fraction(moved_coord, box) - assert ((fractions >= 0) & (fractions <=1)).all() + assert ((fractions >= 0) & (fractions <= 1)).all() @pytest.mark.parametrize( "len_a, len_b, len_c, alpha, beta, gamma, x, y,z", - [box+coord for box, coord in itertools.product(SAMPLE_BOXES, SAMPLE_COORD)] + [box + coord for box, coord in itertools.product(SAMPLE_CELLS, SAMPLE_COORD)], ) -def test_conversion_to_fraction(len_a, len_b, len_c, - alpha, beta, gamma, - x, y, z): +def test_conversion_to_fraction(len_a, len_b, len_c, alpha, beta, gamma, x, y, z): box = struc.vectors_from_unitcell( - len_a, len_b, len_c, - np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) + len_a, len_b, len_c, np.deg2rad(alpha), np.deg2rad(beta), np.deg2rad(gamma) ) - coord = np.array([x,y,z]) + coord = np.array([x, y, z]) fractions = struc.coord_to_fraction(coord, box) if struc.is_orthogonal(box): @@ -119,12 +101,11 @@ def test_conversion_to_fraction(len_a, len_b, len_c, def test_repeat_box(multi_model): model = None if multi_model else 1 array = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), - model=model + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), model=model ) repeat_array, _ = struc.repeat_box(array) assert repeat_array.array_length() == array.array_length() * 27 - assert repeat_array[..., :array.array_length()] == array + assert repeat_array[..., : array.array_length()] == array @pytest.mark.parametrize("multi_model", [True, False]) @@ -135,14 +116,12 @@ def test_remove_pbc_unsegmented(multi_model): """ model = None if multi_model else 1 ref_array = load_structure( - join(data_dir("structure"), "3o5r.bcif"), - model=model, - include_bonds=True + join(data_dir("structure"), "3o5r.bcif"), model=model, include_bonds=True ) # Center structure in box centroid = struc.centroid(ref_array) box_center = np.diag(ref_array.box) / 2 - ref_array = struc.translate(ref_array, box_center-centroid) + ref_array = struc.translate(ref_array, box_center - centroid) test_array = struc.remove_pbc(ref_array) assert ref_array.equal_annotation_categories(test_array) @@ -150,11 +129,7 @@ def test_remove_pbc_unsegmented(multi_model): @pytest.mark.parametrize( - "multi_model, seed", - itertools.product( - [False, True], - range(10) - ) + "multi_model, seed", itertools.product([False, True], range(10)) ) def test_remove_pbc_restore(multi_model, seed): BUFFER = 5 @@ -162,14 +137,12 @@ def test_remove_pbc_restore(multi_model, seed): def get_distance_matrices(array): if isinstance(array, struc.AtomArray): matrix = struc.distance( - array.coord[:, np.newaxis, :], - array.coord[np.newaxis, :, :], - box=None + array.coord[:, np.newaxis, :], array.coord[np.newaxis, :, :], box=None ) matrix_pbc = struc.distance( array.coord[:, np.newaxis, :], array.coord[np.newaxis, :, :], - box=array.box + box=array.box, ) elif isinstance(array, struc.AtomArrayStack): matrices = [get_distance_matrices(model) for model in array] @@ -177,9 +150,7 @@ def get_distance_matrices(array): matrix_pbc = np.stack([m[1] for m in matrices]) return matrix, matrix_pbc - stack = load_structure( - join(data_dir("structure"), "1l2y.bcif"), include_bonds=True - ) + stack = load_structure(join(data_dir("structure"), "1l2y.bcif"), include_bonds=True) # Only consider a single molecule # -> remove all other atoms (in this case some unbound hydrogen) @@ -188,10 +159,12 @@ def get_distance_matrices(array): stack = stack[..., largest_mask] # Create a relatively tight box around the protein - stack.box = np.array([ - np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER) - for coord in stack.coord - ]) + stack.box = np.array( + [ + np.diag(np.max(coord, axis=0) - np.min(coord, axis=0) + BUFFER) + for coord in stack.coord + ] + ) stack.coord -= np.min(stack.coord, axis=-2)[:, np.newaxis, :] + BUFFER / 2 if multi_model: array = stack @@ -203,8 +176,7 @@ def get_distance_matrices(array): np.random.seed(seed) size = (array.stack_depth(), 3) if isinstance(array, struc.AtomArrayStack) else 3 translation_vector = np.sum( - np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box, - axis=-2 + np.random.uniform(-5, 5, size)[:, np.newaxis] * array.box, axis=-2 )[..., np.newaxis, :] # Move atoms over periodic boundary... array = struc.translate(array, translation_vector) @@ -226,10 +198,7 @@ def get_distance_matrices(array): # The centroid of the structure should be inside the box dimensions centroid = struc.centroid(array) - assert np.all( - (centroid > np.zeros(3)) & - (centroid < np.sum(array.box, axis=-2)) - ) + assert np.all((centroid > np.zeros(3)) & (centroid < np.sum(array.box, axis=-2))) @pytest.mark.parametrize("multi_model", [True, False]) @@ -249,4 +218,4 @@ def test_remove_pbc_selection(multi_model): # A warning due to a zero-division (centroid of empty list of # atoms) is raised here warnings.simplefilter("ignore") - assert struc.remove_pbc(array, select_none) == array \ No newline at end of file + assert struc.remove_pbc(array, select_none) == array diff --git a/tests/structure/test_celllist.py b/tests/structure/test_celllist.py index 13267ce74..8ddef530b 100644 --- a/tests/structure/test_celllist.py +++ b/tests/structure/test_celllist.py @@ -2,13 +2,13 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join import itertools +from os.path import join import numpy as np import pytest import biotite.structure as struc import biotite.structure.io as strucio -from ..util import data_dir +from tests.util import data_dir # Result should be independent of cell size @@ -19,28 +19,20 @@ def test_get_atoms(cell_size): with known solutions. """ array = struc.AtomArray(length=5) - array.coord = np.array([[0,0,i] for i in range(5)]) + array.coord = np.array([[0, 0, i] for i in range(5)]) cell_list = struc.CellList(array, cell_size=cell_size) - assert cell_list.get_atoms(np.array([0,0,0.1]), 1).tolist() == [0,1] - assert cell_list.get_atoms(np.array([0,0,1.1]), 1).tolist() == [1,2] - assert cell_list.get_atoms(np.array([0,0,1.1]), 2).tolist() == [0,1,2,3] + assert cell_list.get_atoms(np.array([0, 0, 0.1]), 1).tolist() == [0, 1] + assert cell_list.get_atoms(np.array([0, 0, 1.1]), 1).tolist() == [1, 2] + assert cell_list.get_atoms(np.array([0, 0, 1.1]), 2).tolist() == [0, 1, 2, 3] # Multiple positions - pos = np.array([[0,0,0.1], - [0,0,1.1], - [0,0,4.1]]) - expected_indices = [0, 1, 2, - 0, 1, 2, 3, - 3, 4] + pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]]) + expected_indices = [0, 1, 2, 0, 1, 2, 3, 3, 4] indices = cell_list.get_atoms(pos, 2) assert indices[indices != -1].tolist() == expected_indices # Multiple positions and multiple radii - pos = np.array([[0,0,0.1], - [0,0,1.1], - [0,0,4.1]]) + pos = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]]) rad = np.array([1.0, 2.0, 3.0]) - expected_indices = [0, 1, - 0, 1, 2, 3, - 2, 3, 4] + expected_indices = [0, 1, 0, 1, 2, 3, 2, 3, 4] indices = cell_list.get_atoms(pos, rad) assert indices[indices != -1].tolist() == expected_indices @@ -52,7 +44,7 @@ def test_get_atoms(cell_size): [2, 5, 10], [False, True], [False, True], - ) + ), ) def test_adjacency_matrix(cell_size, threshold, periodic, use_selection): """ @@ -64,9 +56,7 @@ def test_adjacency_matrix(cell_size, threshold, periodic, use_selection): if periodic: # Create an orthorhombic box # with the outer coordinates as bounds - array.box = np.diag( - np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2) - ) + array.box = np.diag(np.max(array.coord, axis=-2) - np.min(array.coord, axis=-2)) if use_selection: np.random.seed(0) @@ -83,17 +73,14 @@ def test_adjacency_matrix(cell_size, threshold, periodic, use_selection): distance = struc.index_distance( array, np.stack( - [ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], - axis=-1 + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=-1, ), - periodic + periodic, ) distance = np.reshape(distance, (length, length)) # Create adjacency matrix from distance matrix - exp_matrix = (distance <= threshold) + exp_matrix = distance <= threshold if use_selection: # Set rows and columns to False for filtered out atoms exp_matrix[~selection, :] = False @@ -145,12 +132,10 @@ def test_empty_coordinates(): array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) cell_list = struc.CellList(array, cell_size=10) - for method in ( - struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells - ): + for method in (struc.CellList.get_atoms, struc.CellList.get_atoms_in_cells): indices = method(cell_list, np.array([]), 1, as_mask=False) mask = method(cell_list, np.array([]), 1, as_mask=True) assert len(indices) == 0 assert len(mask) == 0 assert indices.dtype == np.int32 - assert mask.dtype == bool \ No newline at end of file + assert mask.dtype == bool diff --git a/tests/structure/test_chains.py b/tests/structure/test_chains.py index ffd5f682b..3c7daa782 100644 --- a/tests/structure/test_chains.py +++ b/tests/structure/test_chains.py @@ -2,18 +2,19 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io as strucio -import numpy as np from os.path import join -from ..util import data_dir +import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir @pytest.fixture def array(): return strucio.load_structure(join(data_dir("structure"), "1igy.bcif")) + def test_get_chain_starts(array): """ Compare :func:`test_get_chain_starts()` with :func:`np.unique` in a @@ -24,6 +25,7 @@ def test_get_chain_starts(array): # All first occurences of a chain id are automatically chain starts assert set(ref_starts).issubset(set(test_starts)) + def test_get_chain_starts_same_id(array): """ Expect correct number of chains in a case where two successive @@ -34,18 +36,20 @@ def test_get_chain_starts_same_id(array): merged = array + array assert struc.get_chain_starts(merged).tolist() == [0, array.array_length()] + def test_apply_chain_wise(array): data = struc.apply_chain_wise(array, np.ones(len(array)), np.sum) assert data.tolist() == [ - len(array[array.chain_id == chain_id]) - for chain_id in np.unique(array.chain_id) + len(array[array.chain_id == chain_id]) for chain_id in np.unique(array.chain_id) ] + def test_spread_chain_wise(array): input_data = np.unique(array.chain_id) output_data = struc.spread_chain_wise(array, input_data) assert output_data.tolist() == array.chain_id.tolist() + def test_get_chain_masks(array): SAMPLE_SIZE = 100 np.random.seed(0) @@ -55,26 +59,29 @@ def test_get_chain_masks(array): ref_mask = array.chain_id == array.chain_id[index] assert test_mask.tolist() == ref_mask.tolist() + def test_get_chain_starts_for(array): SAMPLE_SIZE = 100 np.random.seed(0) indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE) ref_starts = np.array( - [np.where(mask)[0][0] for mask - in struc.get_chain_masks(array, indices)] + [np.where(mask)[0][0] for mask in struc.get_chain_masks(array, indices)] ) test_starts = struc.get_chain_starts_for(array, indices) assert test_starts.tolist() == ref_starts.tolist() + def test_get_chains(array): assert struc.get_chains(array).tolist() == ["A", "B", "C", "D", "E", "F"] + def test_get_chain_count(array): assert struc.get_chain_count(array) == 6 + def test_chain_iter(array): n = 0 for chain in struc.get_chains(array): n += 1 assert isinstance(array, struc.AtomArray) - assert n == 6 \ No newline at end of file + assert n == 6 diff --git a/tests/structure/test_charges.py b/tests/structure/test_charges.py index 4d85d411b..35a99f11a 100644 --- a/tests/structure/test_charges.py +++ b/tests/structure/test_charges.py @@ -3,13 +3,9 @@ # information. import warnings -import pytest import numpy as np -from biotite.structure import Atom -from biotite.structure import array -from biotite.structure import BondList -from biotite.structure import partial_charges - +import pytest +from biotite.structure import Atom, BondList, array, partial_charges # Test the partial charge of carbon in the molecules given in table # 3 of the Gasteiger-Marsili publication @@ -19,236 +15,236 @@ # the relevant information is the BondList # Creating atoms to build molecules with -carbon = Atom([0, 0, 0], element="C") +carbon = Atom([0, 0, 0], element="C") hydrogen = Atom([0, 0, 0], element="H") -oxygen = Atom([0, 0, 0], element="O") +oxygen = Atom([0, 0, 0], element="O") nitrogen = Atom([0, 0, 0], element="N") fluorine = Atom([0, 0, 0], element="F") -sulfur = Atom([0, 0, 0], element="S") +sulfur = Atom([0, 0, 0], element="S") # Building molecules methane = array([carbon, hydrogen, hydrogen, hydrogen, hydrogen]) methane.bonds = BondList( - methane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + methane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]) ) mol_length = methane.array_length() methane.charge = np.array([0] * mol_length) ethane = array( - [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, - hydrogen] + [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) ethane.bonds = BondList( ethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = ethane.array_length() ethane.charge = np.array([0] * mol_length) -ethylene = array( - [carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen] -) +ethylene = array([carbon, carbon, hydrogen, hydrogen, hydrogen, hydrogen]) ethylene.bonds = BondList( ethylene.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1], [1,4,1], [1,5,1]]) + np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1], [1, 4, 1], [1, 5, 1]]), ) mol_length = ethylene.array_length() ethylene.charge = np.array([0] * mol_length) -acetylene = array( - [carbon, carbon, hydrogen, hydrogen] -) +acetylene = array([carbon, carbon, hydrogen, hydrogen]) acetylene.bonds = BondList( - acetylene.array_length(), - np.array([[0,1,3], [0,2,1], [1,3,1]]) + acetylene.array_length(), np.array([[0, 1, 3], [0, 2, 1], [1, 3, 1]]) ) mol_length = acetylene.array_length() acetylene.charge = np.array([0] * mol_length) -fluoromethane = array( - [carbon, fluorine, hydrogen, hydrogen, hydrogen] -) +fluoromethane = array([carbon, fluorine, hydrogen, hydrogen, hydrogen]) fluoromethane.bonds = BondList( - fluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + fluoromethane.array_length(), np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]) ) mol_length = fluoromethane.array_length() fluoromethane.charge = np.array([0] * mol_length) -difluoromethane = array( - [carbon, fluorine, fluorine, hydrogen, hydrogen] -) +difluoromethane = array([carbon, fluorine, fluorine, hydrogen, hydrogen]) difluoromethane.bonds = BondList( difluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = difluoromethane.array_length() difluoromethane.charge = np.array([0] * mol_length) -trifluoromethane = array( - [carbon, fluorine, fluorine, fluorine, hydrogen] -) +trifluoromethane = array([carbon, fluorine, fluorine, fluorine, hydrogen]) trifluoromethane.bonds = BondList( trifluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = trifluoromethane.array_length() trifluoromethane.charge = np.array([0] * mol_length) -tetrafluoromethane = array( - [carbon, fluorine, fluorine, fluorine, fluorine] -) +tetrafluoromethane = array([carbon, fluorine, fluorine, fluorine, fluorine]) tetrafluoromethane.bonds = BondList( tetrafluoromethane.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1]]), ) mol_length = tetrafluoromethane.array_length() tetrafluoromethane.charge = np.array([0] * mol_length) fluoroethane = array( - [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, fluorine, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) fluoroethane.bonds = BondList( fluoroethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = fluoroethane.array_length() fluoroethane.charge = np.array([0] * mol_length) trifluoroethane = array( - [carbon, carbon, fluorine, fluorine, fluorine, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, fluorine, fluorine, fluorine, hydrogen, hydrogen, hydrogen] ) trifluoroethane.bonds = BondList( trifluoroethane.array_length(), - np.array([ - [0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1], [1,6,1], [1,7,1] - ]) + np.array( + [[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1], [1, 6, 1], [1, 7, 1]] + ), ) mol_length = trifluoroethane.array_length() trifluoroethane.charge = np.array([0] * mol_length) -methanole = array( - [carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen] -) +methanole = array([carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]) methanole.bonds = BondList( methanole.array_length(), - np.array([[0,1,1], [0,2,1], [0,3,1], [0,4,1], [1,5,1]]) + np.array([[0, 1, 1], [0, 2, 1], [0, 3, 1], [0, 4, 1], [1, 5, 1]]), ) mol_length = methanole.array_length() methanole.charge = np.array([0] * mol_length) dimethyl_ether = array( - [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen] + [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen, hydrogen] ) dimethyl_ether.bonds = BondList( dimethyl_ether.array_length(), - np.array([ - [0,2,1], [1,2,1], [0,3,1], [0,4,1], [0,5,1], [1,6,1], [1,7,1], - [1,8,1] - ]) + np.array( + [ + [0, 2, 1], + [1, 2, 1], + [0, 3, 1], + [0, 4, 1], + [0, 5, 1], + [1, 6, 1], + [1, 7, 1], + [1, 8, 1], + ] + ), ) mol_length = dimethyl_ether.array_length() dimethyl_ether.charge = np.array([0] * mol_length) -formaldehyde = array( - [carbon, oxygen, hydrogen, hydrogen] -) +formaldehyde = array([carbon, oxygen, hydrogen, hydrogen]) formaldehyde.bonds = BondList( - formaldehyde.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1]]) + formaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]]) ) mol_length = formaldehyde.array_length() formaldehyde.charge = np.array([0] * mol_length) -acetaldehyde = array( - [carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen] -) +acetaldehyde = array([carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, hydrogen]) acetaldehyde.bonds = BondList( acetaldehyde.array_length(), - np.array([[0,1,1], [1,2,2], [0,3,1], [0,4,1], [0,5,1], [1,6,1]]) + np.array([[0, 1, 1], [1, 2, 2], [0, 3, 1], [0, 4, 1], [0, 5, 1], [1, 6, 1]]), ) mol_length = acetaldehyde.array_length() acetaldehyde.charge = np.array([0] * mol_length) acetone = array( - [carbon, carbon, carbon, oxygen, hydrogen, hydrogen, hydrogen, - hydrogen, hydrogen, hydrogen] + [ + carbon, + carbon, + carbon, + oxygen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + ] ) acetone.bonds = BondList( acetone.array_length(), - np.array([ - [0,1,1], [1,2,1], [1,3,2], [0,4,1], [0,5,1], [0,6,1], [2,7,1], - [2,8,1], [2,9,1] - ]) + np.array( + [ + [0, 1, 1], + [1, 2, 1], + [1, 3, 2], + [0, 4, 1], + [0, 5, 1], + [0, 6, 1], + [2, 7, 1], + [2, 8, 1], + [2, 9, 1], + ] + ), ) mol_length = acetone.array_length() acetone.charge = np.array([0] * mol_length) -hydrogen_cyanide = array( - [carbon, nitrogen, hydrogen] -) +hydrogen_cyanide = array([carbon, nitrogen, hydrogen]) hydrogen_cyanide.bonds = BondList( - hydrogen_cyanide.array_length(), - np.array([[0,1,3], [0,2,1]]) + hydrogen_cyanide.array_length(), np.array([[0, 1, 3], [0, 2, 1]]) ) mol_length = hydrogen_cyanide.array_length() hydrogen_cyanide.charge = np.array([0] * mol_length) -acetonitrile = array( - [carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen] -) +acetonitrile = array([carbon, carbon, nitrogen, hydrogen, hydrogen, hydrogen]) acetonitrile.bonds = BondList( acetonitrile.array_length(), - np.array([[0,1,1], [1,2,3], [0,3,1], [0,4,1], [0,5,1]]) + np.array([[0, 1, 1], [1, 2, 3], [0, 3, 1], [0, 4, 1], [0, 5, 1]]), ) mol_length = acetonitrile.array_length() acetonitrile.charge = np.array([0] * mol_length) + # For this purpose, parametrization via pytest is performed -@pytest.mark.parametrize("molecule, expected_results", [ - (methane, (-0.078,)), - (ethane, (-0.068, -0.068)), - (ethylene, (-0.106, -0.106)), - (acetylene, (-0.122, -0.122)), - (fluoromethane, (0.079,)), - (difluoromethane, (0.23,)), - (trifluoromethane, (0.38,)), - (tetrafluoromethane, (0.561,)), - (fluoroethane, (0.087, -0.037)), - (trifluoroethane, (0.387, 0.039)), - (methanole, (0.033,)), - (dimethyl_ether, (0.036, 0.036)), - (formaldehyde, (0.115,)), - (acetaldehyde, (-0.009, 0.123)), - (acetone, (-0.006, 0.131, -0.006)), - (hydrogen_cyanide, (0.051,)), - (acetonitrile, (0.023, 0.06)) -]) +@pytest.mark.parametrize( + "molecule, expected_results", + [ + (methane, (-0.078,)), + (ethane, (-0.068, -0.068)), + (ethylene, (-0.106, -0.106)), + (acetylene, (-0.122, -0.122)), + (fluoromethane, (0.079,)), + (difluoromethane, (0.23,)), + (trifluoromethane, (0.38,)), + (tetrafluoromethane, (0.561,)), + (fluoroethane, (0.087, -0.037)), + (trifluoroethane, (0.387, 0.039)), + (methanole, (0.033,)), + (dimethyl_ether, (0.036, 0.036)), + (formaldehyde, (0.115,)), + (acetaldehyde, (-0.009, 0.123)), + (acetone, (-0.006, 0.131, -0.006)), + (hydrogen_cyanide, (0.051,)), + (acetonitrile, (0.023, 0.06)), + ], +) def test_partial_charges(molecule, expected_results): """ Test whether the partial charges of the carbon atoms comprised in @@ -257,29 +253,33 @@ def test_partial_charges(molecule, expected_results): within a certain tolerance range. """ charges = partial_charges(molecule) - assert charges[molecule.element == "C"].tolist() == \ - pytest.approx(expected_results, abs=1e-2) - - -@pytest.mark.parametrize("molecule", [ - methane, - ethane, - ethylene, - acetylene, - fluoromethane, - difluoromethane, - trifluoromethane, - tetrafluoromethane, - fluoroethane, - trifluoroethane, - methanole, - dimethyl_ether, - formaldehyde, - acetaldehyde, - acetone, - hydrogen_cyanide, - acetonitrile -]) + assert charges[molecule.element == "C"].tolist() == pytest.approx( + expected_results, abs=1e-2 + ) + + +@pytest.mark.parametrize( + "molecule", + [ + methane, + ethane, + ethylene, + acetylene, + fluoromethane, + difluoromethane, + trifluoromethane, + tetrafluoromethane, + fluoroethane, + trifluoroethane, + methanole, + dimethyl_ether, + formaldehyde, + acetaldehyde, + acetone, + hydrogen_cyanide, + acetonitrile, + ], +) def test_total_charge_zero(molecule): """ In the case of the 17 molecules given in table 3, it is verified @@ -302,14 +302,8 @@ def test_pos_formal_charge(): pos_methane = methane.copy() pos_methane.charge = np.array([1, 0, 0, 0, 0]) - ref_carb_part_charge = partial_charges( - methane, - iteration_step_num=6 - )[0] - pos_carb_part_charge = partial_charges( - pos_methane, - iteration_step_num=6 - )[0] + ref_carb_part_charge = partial_charges(methane, iteration_step_num=6)[0] + pos_carb_part_charge = partial_charges(pos_methane, iteration_step_num=6)[0] assert pos_carb_part_charge < 1 assert pos_carb_part_charge > ref_carb_part_charge @@ -331,16 +325,12 @@ def test_valence_state_not_parametrized(): with pytest.warns( UserWarning, match=( - "Parameters for specific valence states of some atoms " - "are not available" - ) + "Parameters for specific valence states of some atoms " "are not available" + ), ): - thioformaldehyde = array( - [carbon, sulfur, hydrogen, hydrogen] - ) + thioformaldehyde = array([carbon, sulfur, hydrogen, hydrogen]) thioformaldehyde.bonds = BondList( - thioformaldehyde.array_length(), - np.array([[0,1,2], [0,2,1], [0,3,1]]) + thioformaldehyde.array_length(), np.array([[0, 1, 2], [0, 2, 1], [0, 3, 1]]) ) mol_length = thioformaldehyde.array_length() thioformaldehyde.charge = np.array([0] * mol_length) @@ -368,9 +358,7 @@ def test_correct_output_ions(): sodium_array.bonds = BondList(sodium_array.array_length()) with warnings.catch_warnings(): warnings.simplefilter("error") - sodium_charge = partial_charges( - sodium_array, iteration_step_num=1 - )[0] + sodium_charge = partial_charges(sodium_array, iteration_step_num=1)[0] assert sodium_charge == 1 @@ -414,51 +402,72 @@ def test_correct_output_charged_aa(): unspecified bond types throughout the whole AtomArray is raised. """ - glycine_charge = np.array( - [+1, 0, 0, 0, -1, 0, 0, 0, 0, 0] - ) + glycine_charge = np.array([+1, 0, 0, 0, -1, 0, 0, 0, 0, 0]) glycine_with_btype = array( - [nitrogen, carbon, carbon, oxygen, oxygen, hydrogen, hydrogen, - hydrogen, hydrogen, hydrogen] + [ + nitrogen, + carbon, + carbon, + oxygen, + oxygen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + hydrogen, + ] ) glycine_with_btype.charge = glycine_charge glycine_with_btype.bonds = BondList( glycine_with_btype.array_length(), - np.array([ - [0,1,1], [0,5,1], [0,6,1], [0,7,1], [1,2,1], [1,8,1], - [1,9,1], [2,3,2], [2,4,1] - ]) + np.array( + [ + [0, 1, 1], + [0, 5, 1], + [0, 6, 1], + [0, 7, 1], + [1, 2, 1], + [1, 8, 1], + [1, 9, 1], + [2, 3, 2], + [2, 4, 1], + ] + ), ) glycine_without_btype = glycine_with_btype.copy() glycine_without_btype.charge = glycine_charge glycine_without_btype.bonds = BondList( glycine_without_btype.array_length(), - np.array([ - [0,1,0], [0,5,0], [0,6,0], [0,7,0], [1,2,0], [1,8,0], - [1,9,0], [2,3,0], [2,4,0] - ]) + np.array( + [ + [0, 1, 0], + [0, 5, 0], + [0, 6, 0], + [0, 7, 0], + [1, 2, 0], + [1, 8, 0], + [1, 9, 0], + [2, 3, 0], + [2, 4, 0], + ] + ), ) part_charges_with_btype = partial_charges(glycine_with_btype) with pytest.warns(UserWarning, match="Each atom's bond type is 0"): - part_charges_without_btype = partial_charges( - glycine_without_btype - ) + part_charges_without_btype = partial_charges(glycine_without_btype) # Nitrogen of the amino group has the index 0 nitr_charge_with_btype = part_charges_with_btype[0] nitr_charge_without_btype = part_charges_without_btype[0] - assert nitr_charge_with_btype == pytest.approx( - nitr_charge_without_btype, abs=5e-4 - ) + assert nitr_charge_with_btype == pytest.approx(nitr_charge_without_btype, abs=5e-4) # Oxygen of the hydroxyl group in the carboxyl group has the index 2 oxyg_charge_with_btype = part_charges_with_btype[2] oxyg_charge_without_btype = part_charges_without_btype[2] assert oxyg_charge_with_btype < oxyg_charge_without_btype # Assert that difference between the two values is significant - difference_oxyg_charges = abs(oxyg_charge_with_btype - - oxyg_charge_without_btype) - assert difference_oxyg_charges > 3e-2 \ No newline at end of file + difference_oxyg_charges = abs(oxyg_charge_with_btype - oxyg_charge_without_btype) + assert difference_oxyg_charges > 3e-2 diff --git a/tests/structure/test_compare.py b/tests/structure/test_compare.py index a4cf024a1..1895c8ba4 100644 --- a/tests/structure/test_compare.py +++ b/tests/structure/test_compare.py @@ -2,17 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io as strucio from os.path import join import numpy as np import pytest -from ..util import data_dir +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir + @pytest.fixture def stack(): stack = struc.AtomArrayStack(depth=3, length=5) - stack.coord = np.arange(45).reshape((3,5,3)) + stack.coord = np.arange(45).reshape((3, 5, 3)) return stack @@ -20,92 +21,178 @@ def stack(): def test_rmsd(stack, as_coord): if as_coord: stack = stack.coord - assert struc.rmsd(stack[0], stack).tolist() \ - == pytest.approx([0.0, 25.98076211, 51.96152423]) - assert struc.rmsd(stack[0], stack[1]) \ - == pytest.approx(25.9807621135) + assert struc.rmsd(stack[0], stack).tolist() == pytest.approx( + [0.0, 25.98076211, 51.96152423] + ) + assert struc.rmsd(stack[0], stack[1]) == pytest.approx(25.9807621135) @pytest.mark.parametrize("as_coord", [False, True]) def test_rmsf(stack, as_coord): if as_coord: stack = stack.coord - assert struc.rmsf(struc.average(stack), stack).tolist() \ - == pytest.approx([21.21320344] * 5) + assert struc.rmsf(struc.average(stack), stack).tolist() == pytest.approx( + [21.21320344] * 5 + ) + @pytest.fixture def load_stack_superimpose(): - stack = strucio.load_structure(join( - data_dir("structure"), "1l2y.bcif" - )) + stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) # Superimpose with first frame bb_mask = struc.filter_peptide_backbone(stack[0]) supimp, _ = struc.superimpose(stack[0], stack, atom_mask=bb_mask) return stack, supimp + def test_rmsd_gmx(load_stack_superimpose): """ Comparison of RMSD values computed with Biotite with results obtained from GROMACS 2021.5. """ stack, supimp = load_stack_superimpose - rmsd = struc.rmsd(stack[0], supimp)/10 + rmsd = struc.rmsd(stack[0], supimp) / 10 # Gromacs RMSDs -> Without mass-weighting: # echo "Backbone Protein" | \ # gmx rms -s 1l2y.gro -f 1l2y.xtc -o rmsd.xvg -mw no - rmsd_gmx = np.array([ - 0.0005037, 0.1957698, 0.2119313, 0.2226127, 0.184382, - 0.2210998, 0.2712815, 0.1372861, 0.2348654, 0.1848784, - 0.1893576, 0.2500543, 0.1946374, 0.2101624, 0.2180645, - 0.1836762, 0.1681345, 0.2363865, 0.2287371, 0.2546207, - 0.1604872, 0.2167119, 0.2176063, 0.2069806, 0.2535706, - 0.2682233, 0.2252388, 0.2419151, 0.2343987, 0.1902994, - 0.2334525, 0.2010523, 0.215444, 0.1786632, 0.2652018, - 0.174061, 0.2591569, 0.2602662 - ]) + rmsd_gmx = np.array( + [ + 0.0005037, + 0.1957698, + 0.2119313, + 0.2226127, + 0.184382, + 0.2210998, + 0.2712815, + 0.1372861, + 0.2348654, + 0.1848784, + 0.1893576, + 0.2500543, + 0.1946374, + 0.2101624, + 0.2180645, + 0.1836762, + 0.1681345, + 0.2363865, + 0.2287371, + 0.2546207, + 0.1604872, + 0.2167119, + 0.2176063, + 0.2069806, + 0.2535706, + 0.2682233, + 0.2252388, + 0.2419151, + 0.2343987, + 0.1902994, + 0.2334525, + 0.2010523, + 0.215444, + 0.1786632, + 0.2652018, + 0.174061, + 0.2591569, + 0.2602662, + ] + ) assert np.allclose(rmsd, rmsd_gmx, atol=1e-03) + def test_rmspd_gmx(load_stack_superimpose): """ Comparison of the RMSPD computed with Biotite with results obtained from GROMACS 2021.5. """ stack, _ = load_stack_superimpose - rmspd = struc.rmspd(stack[0], stack)/10 + rmspd = struc.rmspd(stack[0], stack) / 10 # Gromacs RMSDist: # echo "Protein" | \ # gmx rmsdist -f 1l2y.xtc -s 1l2y.gro -o rmsdist.xvg -sumh no -pbc no - rmspd_gmx = np.array([ - 0.000401147, 0.125482, 0.138913, 0.138847, 0.113917, - 0.132915, 0.173084, 0.103089, 0.156309, 0.114694, - 0.12964, 0.15875, 0.12876, 0.128983, 0.137031, - 0.126059, 0.106726, 0.154244, 0.144405, 0.174041, - 0.10417, 0.130936, 0.141216, 0.125559, 0.171342, - 0.165306, 0.137616, 0.154447, 0.146337, 0.116433, - 0.154976, 0.128477, 0.150537, 0.111494, 0.173234, - 0.116638, 0.169524, 0.15953 - ]) + rmspd_gmx = np.array( + [ + 0.000401147, + 0.125482, + 0.138913, + 0.138847, + 0.113917, + 0.132915, + 0.173084, + 0.103089, + 0.156309, + 0.114694, + 0.12964, + 0.15875, + 0.12876, + 0.128983, + 0.137031, + 0.126059, + 0.106726, + 0.154244, + 0.144405, + 0.174041, + 0.10417, + 0.130936, + 0.141216, + 0.125559, + 0.171342, + 0.165306, + 0.137616, + 0.154447, + 0.146337, + 0.116433, + 0.154976, + 0.128477, + 0.150537, + 0.111494, + 0.173234, + 0.116638, + 0.169524, + 0.15953, + ] + ) assert np.allclose(rmspd, rmspd_gmx, atol=1e-03) + def test_rmsf_gmx(load_stack_superimpose): """ Comparison of RMSF values computed with Biotite with results obtained from GROMACS 2021.5. """ stack, supimp = load_stack_superimpose - ca_mask = ((stack[0].atom_name == "CA") & (stack[0].element == "C")) - rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask])/10 + ca_mask = (stack[0].atom_name == "CA") & (stack[0].element == "C") + rmsf = struc.rmsf(struc.average(supimp[:, ca_mask]), supimp[:, ca_mask]) / 10 # Gromacs RMSF: # echo "C-alpha" | gmx rmsf -s 1l2y.gro -f 1l2y.xtc -o rmsf.xvg -res - rmsf_gmx = np.array([ - 0.1379, 0.036, 0.0261, 0.0255, 0.029, 0.0204, 0.0199, - 0.0317, 0.0365, 0.0249, 0.0269, 0.032, 0.0356, 0.0446, - 0.059, 0.037, 0.0331, 0.0392, 0.0403, 0.0954 - ]) - - assert np.allclose(rmsf, rmsf_gmx, atol=1e-02) \ No newline at end of file + rmsf_gmx = np.array( + [ + 0.1379, + 0.036, + 0.0261, + 0.0255, + 0.029, + 0.0204, + 0.0199, + 0.0317, + 0.0365, + 0.0249, + 0.0269, + 0.032, + 0.0356, + 0.0446, + 0.059, + 0.037, + 0.0331, + 0.0392, + 0.0403, + 0.0954, + ] + ) + + assert np.allclose(rmsf, rmsf_gmx, atol=1e-02) diff --git a/tests/structure/test_density.py b/tests/structure/test_density.py index bfbb3e1e4..012b5eb02 100644 --- a/tests/structure/test_density.py +++ b/tests/structure/test_density.py @@ -2,10 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -from biotite.structure import Atom import numpy as np import pytest +import biotite.structure as struc +from biotite.structure import Atom + @pytest.fixture def array(): @@ -18,52 +19,56 @@ def array(): atom_list.append(Atom([2.5, 0.5, 1.1])) return struc.array(atom_list) + @pytest.fixture def stack(array): return struc.stack([array, array.copy()]) - def test_density(array, stack): density, (x, y, z) = struc.density(array) assert np.array_equal(x, [0.5, 1.5, 2.5]) assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5]) assert np.array_equal(z, [1.0, 2.0]) assert density.sum() == 6 - assert density[0,2] == 2 - assert density[1,0] == 3 - assert density[1,1] == 1 + assert density[0, 2] == 2 + assert density[1, 0] == 3 + assert density[1, 1] == 1 density, (x, y, z) = struc.density(stack) assert np.array_equal(x, [0.5, 1.5, 2.5]) assert np.array_equal(y, [0.5, 1.5, 2.5, 3.5]) assert np.array_equal(z, [1.0, 2.0]) assert density.sum() == 12 - assert density[0,2] == 4 - assert density[1,0] == 6 - assert density[1,1] == 2 + assert density[0, 2] == 4 + assert density[1, 0] == 6 + assert density[1, 1] == 2 + def test_density_with_bins(array): - bins = np.array([[0, 1, 2, 3],[0, 1, 2, 3],[0, 1, 2, 3]]) + bins = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]) density, (x, y, z) = struc.density(array, bins=bins) - assert np.array_equal(x, [0,1,2,3]) - assert np.array_equal(y, [0,1,2,3]) - assert np.array_equal(z, [0,1,2,3]) + assert np.array_equal(x, [0, 1, 2, 3]) + assert np.array_equal(y, [0, 1, 2, 3]) + assert np.array_equal(z, [0, 1, 2, 3]) assert density.sum() == 6 - assert density[0,2,1] == 2 - assert density[1,1,1] == 1 - assert density[2,0,1] == 3 + assert density[0, 2, 1] == 2 + assert density[1, 1, 1] == 1 + assert density[2, 0, 1] == 3 + def test_density_with_delta(array): density, (x, y, z) = struc.density(array, delta=5.0) assert density.shape == (1, 1, 1) assert density.sum() == 6 - assert density[0,0,0] == 6 + assert density[0, 0, 0] == 6 + def test_density_normalized(array): density, (x, y, z) = struc.density(array, density=True) assert np.abs(density.sum() - 1.0) < 0.0001 - assert np.abs(density[0,2] - 2.0/6.0) < 0.0001 + assert np.abs(density[0, 2] - 2.0 / 6.0) < 0.0001 + def test_density_weights(array, stack): # assign weights to coordinates @@ -74,15 +79,15 @@ def test_density_weights(array, stack): assert density.sum() == atomic_weights.sum() assert density[0, 2] == atomic_weights[0] + atomic_weights[1] assert density[1, 0] == atomic_weights[3:].sum() - assert density[1,1] == atomic_weights[2] + assert density[1, 1] == atomic_weights[2] # weights should be repeated along stack dimensions and lead to the same # result independent of shape density, (x, y, z) = struc.density(stack, weights=atomic_weights) - density2, (x, y, z) = struc.density(stack, - weights=np.array([atomic_weights, atomic_weights])) + density2, (x, y, z) = struc.density( + stack, weights=np.array([atomic_weights, atomic_weights]) + ) assert density.sum() == density2.sum() - assert density[0,2] == density2[0,2] - assert density[1,0] == density2[1,0] - assert density[1,1] == density2[1,1] - + assert density[0, 2] == density2[0, 2] + assert density[1, 0] == density2[1, 0] + assert density[1, 1] == density2[1, 1] diff --git a/tests/structure/test_dotbracket.py b/tests/structure/test_dotbracket.py index 4e6827cd7..cbe5ef2e6 100644 --- a/tests/structure/test_dotbracket.py +++ b/tests/structure/test_dotbracket.py @@ -2,12 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest +from os.path import join import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from os.path import join -from ..util import data_dir +from tests.util import data_dir @pytest.fixture @@ -15,11 +15,10 @@ def nuc_sample_array(): """ Sample structure. """ - nuc_sample_array = strucio.load_structure( - join(data_dir("structure"), "4p5j.cif") - ) + nuc_sample_array = strucio.load_structure(join(data_dir("structure"), "4p5j.cif")) return nuc_sample_array[struc.filter_nucleotides(nuc_sample_array)] + @pytest.fixture def expected_output(): """ @@ -29,47 +28,51 @@ def expected_output(): ".[(((((.[{...)))))(((((((.......)))))))...(((((]}.)..))))[[[...((((((" "]]].].))))))(.)", ".[(((((.{[...)))))(((((((.......)))))))...(((((}].)..))))[[[...((((((" - "]]].].))))))(.)" + "]]].].))))))(.)", ] + @pytest.fixture def basepair_residue_positions(): """ The base pairs in the sample array by their residue postions. """ return np.array( - [[1, 73], - [2, 17], - [3, 16], - [4, 15], - [5, 14], - [6, 13], - [8, 47], - [9, 48], - [18, 38], - [19, 37], - [20, 36], - [21, 35], - [22, 34], - [23, 33], - [24, 32], - [42, 56], - [43, 55], - [44, 54], - [45, 53], - [46, 50], - [57, 71], - [58, 70], - [59, 69], - [63, 80], - [64, 79], - [65, 78], - [66, 77], - [67, 76], - [68, 75], - [81, 83]] + [ + [1, 73], + [2, 17], + [3, 16], + [4, 15], + [5, 14], + [6, 13], + [8, 47], + [9, 48], + [18, 38], + [19, 37], + [20, 36], + [21, 35], + [22, 34], + [23, 33], + [24, 32], + [42, 56], + [43, 55], + [44, 54], + [45, 53], + [46, 50], + [57, 71], + [58, 70], + [59, 69], + [63, 80], + [64, 79], + [65, 78], + [66, 77], + [67, 76], + [68, 75], + [81, 83], + ] ) + def verify_dot_bracket_notation(output, expected_output): """ Ensure that the dot_bracket notation matches a reference. @@ -82,6 +85,7 @@ def verify_dot_bracket_notation(output, expected_output): unique_solutions = set(output) assert len(output) == len(unique_solutions) + def test_dot_bracket_from_structure(nuc_sample_array, expected_output): """ Check the output of ``dot_bracket_from_structure()``. @@ -89,22 +93,20 @@ def test_dot_bracket_from_structure(nuc_sample_array, expected_output): output = struc.dot_bracket_from_structure(nuc_sample_array) verify_dot_bracket_notation(output, expected_output) + def test_dot_bracket(basepair_residue_positions, expected_output): """ Check the output of ``dot_bracket()``. """ - output = struc.dot_bracket( - basepair_residue_positions, len(expected_output[0]) - ) + output = struc.dot_bracket(basepair_residue_positions, len(expected_output[0])) verify_dot_bracket_notation(output, expected_output) -def test_base_pairs_from_dot_bracket( - basepair_residue_positions, expected_output -): + +def test_base_pairs_from_dot_bracket(basepair_residue_positions, expected_output): """ Ensure that the base pairs are correctly extracted from the DBL-notation """ for notation in expected_output: test_residue_positions = struc.base_pairs_from_dot_bracket(notation) - assert np.all(test_residue_positions == basepair_residue_positions) \ No newline at end of file + assert np.all(test_residue_positions == basepair_residue_positions) diff --git a/tests/structure/test_filter.py b/tests/structure/test_filter.py index 996593831..5bac8099c 100644 --- a/tests/structure/test_filter.py +++ b/tests/structure/test_filter.py @@ -2,110 +2,125 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io as strucio -import numpy as np from os.path import join -from ..util import data_dir +import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir + @pytest.fixture def canonical_sample_protein(): - return strucio.load_structure( - join(data_dir("structure"), "3o5r.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) + @pytest.fixture def sample_protein(): - return strucio.load_structure( - join(data_dir("structure"), "5eil.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "5eil.bcif")) + @pytest.fixture def canonical_sample_nucleotide(): - return strucio.load_structure( - join(data_dir("structure"), "5ugo.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "5ugo.bcif")) + @pytest.fixture def sample_nucleotide(): - return strucio.load_structure( - join(data_dir("structure"), "4p5j.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "4p5j.bcif")) + @pytest.fixture def sample_carbohydrate(): - return strucio.load_structure( - join(data_dir("structure"), "2d0f.bcif") - ) + return strucio.load_structure(join(data_dir("structure"), "2d0f.bcif")) + @pytest.fixture def all_atloc_structure(): return strucio.load_structure( join(data_dir("structure"), "1o1z.bcif"), - extra_fields = ["occupancy"], - altloc="all" + extra_fields=["occupancy"], + altloc="all", ) + def test_solvent_filter(canonical_sample_protein): - assert len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)]) == 287 + assert ( + len(canonical_sample_protein[struc.filter_solvent(canonical_sample_protein)]) + == 287 + ) + def test_canonical_amino_acid_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[ - struc.filter_canonical_amino_acids(canonical_sample_protein) - ]) == 982 + len( + canonical_sample_protein[ + struc.filter_canonical_amino_acids(canonical_sample_protein) + ] + ) + == 982 ) + def test_amino_acid_filter(sample_protein): assert ( - struc.get_residue_count((sample_protein[ - struc.filter_amino_acids(sample_protein) - ])) == - struc.get_residue_count((sample_protein[ - struc.filter_canonical_amino_acids(sample_protein) - ])) + 3 + struc.get_residue_count( + (sample_protein[struc.filter_amino_acids(sample_protein)]) + ) + == struc.get_residue_count( + (sample_protein[struc.filter_canonical_amino_acids(sample_protein)]) + ) + + 3 ) + def test_canonical_nucleotide_filter(canonical_sample_nucleotide): assert ( - len(canonical_sample_nucleotide[ - struc.filter_canonical_nucleotides(canonical_sample_nucleotide) - ]) == 651 + len( + canonical_sample_nucleotide[ + struc.filter_canonical_nucleotides(canonical_sample_nucleotide) + ] + ) + == 651 ) + def test_nucleotide_filter(sample_nucleotide): assert ( - struc.get_residue_count((sample_nucleotide[ - struc.filter_nucleotides(sample_nucleotide) - ])) == - struc.get_residue_count((sample_nucleotide[ - struc.filter_canonical_nucleotides(sample_nucleotide) - ])) + 1 + struc.get_residue_count( + (sample_nucleotide[struc.filter_nucleotides(sample_nucleotide)]) + ) + == struc.get_residue_count( + (sample_nucleotide[struc.filter_canonical_nucleotides(sample_nucleotide)]) + ) + + 1 ) + def test_carbohydrate_filter(sample_carbohydrate): assert ( - struc.get_residue_count((sample_carbohydrate[ - struc.filter_carbohydrates(sample_carbohydrate) - ])) == 8 + struc.get_residue_count( + (sample_carbohydrate[struc.filter_carbohydrates(sample_carbohydrate)]) + ) + == 8 ) def test_peptide_backbone_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[ - struc.filter_peptide_backbone(canonical_sample_protein) - ]) == 384 + len( + canonical_sample_protein[ + struc.filter_peptide_backbone(canonical_sample_protein) + ] + ) + == 384 ) def test_phosphate_backbone_filter(canonical_sample_nucleotide): # take a chain D with five canonical nucleotides # => there should be 5 x 6 = 30 backbone atoms - chain_d = canonical_sample_nucleotide[ - canonical_sample_nucleotide.chain_id == 'D' - ] + chain_d = canonical_sample_nucleotide[canonical_sample_nucleotide.chain_id == "D"] assert len(chain_d[struc.filter_phosphate_backbone(chain_d)]) == 30 @@ -139,39 +154,45 @@ def test_polymer_filter(canonical_sample_nucleotide, sample_carbohydrate): a = canonical_sample_nucleotide # Check for nucleotide filtering - a_nuc = a[struc.filter_polymer(a, pol_type='n')] + a_nuc = a[struc.filter_polymer(a, pol_type="n")] # Take three nucleic acids chains and remove solvent => the result should # encompass all nucleotide polymer atoms, which is exactly the output of the # `filter_polymer()`. In the structure file, the filtered atoms are 1-651. - a_nuc_manual = a[np.isin(a.chain_id, ['D', 'P', 'T']) & ~struc.filter_solvent(a)] + a_nuc_manual = a[np.isin(a.chain_id, ["D", "P", "T"]) & ~struc.filter_solvent(a)] assert len(a_nuc) == len(a_nuc_manual) == 651 - assert set(a_nuc.chain_id) == {'D', 'P', 'T'} + assert set(a_nuc.chain_id) == {"D", "P", "T"} # chain D should be absent - a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type='n')] - assert set(a_nuc.chain_id) == {'P', 'T'} + a_nuc = a_nuc[struc.filter_polymer(a_nuc, min_size=6, pol_type="n")] + assert set(a_nuc.chain_id) == {"P", "T"} # Single protein chain A: residues 10-335 - a_pep = a[struc.filter_polymer(a, pol_type='p')] - assert len(a_pep) == len(a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == 'A')]) + a_pep = a[struc.filter_polymer(a, pol_type="p")] + assert len(a_pep) == len( + a[(a.res_id >= 10) & (a.res_id <= 335) & (a.chain_id == "A")] + ) # Chain B has five carbohydrate residues # Chain C has four # => Only chain B is selected a = sample_carbohydrate - a_carb = a[struc.filter_polymer(a, min_size=4, pol_type='carb')] - assert set(a_carb.chain_id) == {'B'} + a_carb = a[struc.filter_polymer(a, min_size=4, pol_type="carb")] + assert set(a_carb.chain_id) == {"B"} assert struc.get_residue_count(a_carb) == 5 def test_intersection_filter(canonical_sample_protein): assert ( - len(canonical_sample_protein[:200][ - struc.filter_intersection( - canonical_sample_protein[:200],canonical_sample_protein[100:] - ) - ]) == 100 + len( + canonical_sample_protein[:200][ + struc.filter_intersection( + canonical_sample_protein[:200], canonical_sample_protein[100:] + ) + ] + ) + == 100 ) + @pytest.mark.parametrize("filter_func", ["first", "occupancy"]) def test_filter_altloc(all_atloc_structure, filter_func): """ @@ -183,21 +204,22 @@ def test_filter_altloc(all_atloc_structure, filter_func): all_atloc_structure.chain_id, all_atloc_structure.res_id, all_atloc_structure.ins_code, - all_atloc_structure.atom_name + all_atloc_structure.atom_name, ): ref_atom_set.add(atom_tuple) if filter_func == "first": - filtered_structure = all_atloc_structure[struc.filter_first_altloc( - all_atloc_structure, - all_atloc_structure.altloc_id - )] + filtered_structure = all_atloc_structure[ + struc.filter_first_altloc( + all_atloc_structure, all_atloc_structure.altloc_id + ) + ] elif filter_func == "occupancy": filtered_structure = all_atloc_structure[ struc.filter_highest_occupancy_altloc( all_atloc_structure, all_atloc_structure.altloc_id, - all_atloc_structure.occupancy + all_atloc_structure.occupancy, ) ] @@ -206,7 +228,7 @@ def test_filter_altloc(all_atloc_structure, filter_func): filtered_structure.chain_id, filtered_structure.res_id, filtered_structure.ins_code, - filtered_structure.atom_name + filtered_structure.atom_name, ): try: # No atom should be present twice @@ -230,10 +252,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure): all_atloc_structure.occupancy[all_atloc_structure.altloc_id == "B"] = 1.0 # filter_first_altloc - filtered_structure = all_atloc_structure[struc.filter_first_altloc( - all_atloc_structure, - all_atloc_structure.altloc_id - )] + filtered_structure = all_atloc_structure[ + struc.filter_first_altloc(all_atloc_structure, all_atloc_structure.altloc_id) + ] ref_occupancy_sum = np.average(filtered_structure.occupancy) # filter_highest_occupancy_altloc @@ -241,9 +262,9 @@ def test_filter_highest_occupancy_altloc(all_atloc_structure): struc.filter_highest_occupancy_altloc( all_atloc_structure, all_atloc_structure.altloc_id, - all_atloc_structure.occupancy + all_atloc_structure.occupancy, ) ] test_occupancy_sum = np.average(filtered_structure.occupancy) - assert test_occupancy_sum > ref_occupancy_sum \ No newline at end of file + assert test_occupancy_sum > ref_occupancy_sum diff --git a/tests/structure/test_generalio.py b/tests/structure/test_generalio.py index 1855344b3..d98781d59 100644 --- a/tests/structure/test_generalio.py +++ b/tests/structure/test_generalio.py @@ -2,33 +2,25 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile -import biotite.structure as struc -import biotite.structure.io as strucio import glob import os from os.path import join, splitext -from ..util import data_dir, cannot_import +from tempfile import NamedTemporaryFile import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("structure"), "1l2y.*")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "1l2y.*"))) def test_loading(path): """ Just check if :func:`load_structure()` does not raise an exception and returns an object of appropriate type. """ suffix = splitext(path)[1] - if suffix in [".trr", ".xtc", ".tng", ".dcd", ".netcdf"]: - template = strucio.load_structure( - join(data_dir("structure"), "1l2y.bcif") - ) + if suffix in [".trr", ".xtc", ".dcd", ".netcdf"]: + template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) array = strucio.load_structure(path, template) else: array = strucio.load_structure(path) @@ -40,10 +32,6 @@ def test_loading(path): assert isinstance(array, struc.AtomArrayStack) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) def test_loading_template_with_trj(): """ Check if :func:`load_structure()` using a trajectory file does not @@ -57,10 +45,6 @@ def test_loading_template_with_trj(): assert len(stack) > 1 -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) def test_loading_with_extra_args(): """ Check if :func:`load_structure()` witt optional arguments does not @@ -74,9 +58,7 @@ def test_loading_with_extra_args(): assert "b_factor" in structure.get_annotation_categories() # test if arguments are passed to read for trajectories - stack = strucio.load_structure( - trajectory, template=structure[0], start=5, stop=6 - ) + stack = strucio.load_structure(trajectory, template=structure[0], start=5, stop=6) assert len(stack) == 1 # loading should fail with wrong arguments @@ -88,16 +70,9 @@ def test_loading_with_extra_args(): assert stack.shape[1] == 2 -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "suffix", - [ - "pdb", "pdbx", "cif", "bcif", "gro", "mmtf", "trr", "xtc", "tng", - "dcd", "netcdf" - ] + ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "dcd", "netcdf"], ) def test_saving(suffix): """ @@ -107,15 +82,15 @@ def test_saving(suffix): """ path = join(data_dir("structure"), "1l2y.bcif") ref_array = strucio.load_structure(path) - if suffix in ("trr", "xtc", "tng", "dcd", "netcdf"): + if suffix in ("trr", "xtc", "dcd", "netcdf"): # Reading a trajectory file requires a template template = path else: template = None temp = NamedTemporaryFile("w", suffix=f".{suffix}", delete=False) - strucio.save_structure(temp.name, ref_array) temp.close() + strucio.save_structure(temp.name, ref_array) test_array = strucio.load_structure(temp.name, template) os.remove(temp.name) @@ -124,23 +99,18 @@ def test_saving(suffix): if category == "chain_id" and suffix == "gro": # The chain ID is not written to GRO files continue - assert test_array.get_annotation(category).tolist() \ - == ref_array.get_annotation(category).tolist() + assert ( + test_array.get_annotation(category).tolist() + == ref_array.get_annotation(category).tolist() + ) assert test_array.coord.flatten().tolist() == pytest.approx( - ref_array.coord.flatten().tolist(), abs=1e-2 + ref_array.coord.flatten().tolist(), abs=1e-2 ) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "suffix", - [ - "pdb", "pdbx", "cif", "bcif", "gro", "mmtf", "trr", "xtc", "tng", - "dcd", "netcdf" - ] + ["pdb", "pdbx", "cif", "bcif", "gro", "trr", "xtc", "dcd", "netcdf"], ) def test_saving_with_extra_args(suffix): """ @@ -150,9 +120,7 @@ def test_saving_with_extra_args(suffix): array = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) temp = NamedTemporaryFile("w+", suffix=f".{suffix}") with pytest.raises(TypeError): - strucio.save_structure( - temp.name, array, answer=42 - ) + strucio.save_structure(temp.name, array, answer=42) temp.close() diff --git a/tests/structure/test_geometry.py b/tests/structure/test_geometry.py index 3239d43b4..dc5ecf1af 100644 --- a/tests/structure/test_geometry.py +++ b/tests/structure/test_geometry.py @@ -2,9 +2,7 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile import itertools -import glob from os.path import join import numpy as np import numpy.random as random @@ -12,34 +10,33 @@ import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx -from ..util import data_dir, cannot_import +from tests.util import data_dir def test_distance(): - coord1 = struc.coord([0,1,1]) - coord2 = struc.coord([0,2,2]) + coord1 = struc.coord([0, 1, 1]) + coord2 = struc.coord([0, 2, 2]) assert struc.distance(coord1, coord2) == pytest.approx(np.sqrt(2)) def test_centroid(): - coord = struc.coord([[1,1,1],[0,-1,-1],[-1,0,0]]) - assert struc.centroid(coord).tolist() == [0,0,0] + coord = struc.coord([[1, 1, 1], [0, -1, -1], [-1, 0, 0]]) + assert struc.centroid(coord).tolist() == [0, 0, 0] def test_angle(): - coord1 = struc.coord([0,0,1]) - coord2 = struc.coord([0,0,0]) - coord3 = struc.coord([0,1,1]) - assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25*np.pi) + coord1 = struc.coord([0, 0, 1]) + coord2 = struc.coord([0, 0, 0]) + coord3 = struc.coord([0, 1, 1]) + assert struc.angle(coord1, coord2, coord3) == pytest.approx(0.25 * np.pi) def test_dihedral(): - coord1 = struc.coord([-0.5,-1,0]) - coord2 = struc.coord([0,0,0]) - coord3 = struc.coord([1,0,0]) - coord4 = struc.coord([0,0,-1]) - assert struc.dihedral(coord1, coord2, coord3, coord4) \ - == pytest.approx(0.5*np.pi) + coord1 = struc.coord([-0.5, -1, 0]) + coord2 = struc.coord([0, 0, 0]) + coord3 = struc.coord([1, 0, 0]) + coord4 = struc.coord([0, 0, -1]) + assert struc.dihedral(coord1, coord2, coord3, coord4) == pytest.approx(0.5 * np.pi) @pytest.mark.parametrize("multiple_chains", [False, True]) @@ -55,17 +52,18 @@ def test_dihedral_backbone_general(multiple_chains): array = stack[0] # Test array phi, psi, omega = struc.dihedral_backbone(array) - assert phi.shape == (n_res,) - assert psi.shape == (n_res,) + assert phi.shape == (n_res,) + assert psi.shape == (n_res,) assert omega.shape == (n_res,) _assert_plausible_omega(omega) # Test stack phi, psi, omega = struc.dihedral_backbone(stack) - assert phi.shape == (n_models, n_res) - assert psi.shape == (n_models, n_res) + assert phi.shape == (n_models, n_res) + assert psi.shape == (n_models, n_res) assert omega.shape == (n_models, n_res) _assert_plausible_omega(omega) + def _assert_plausible_omega(omega): # Remove nan values omega = omega.flatten() @@ -74,49 +72,42 @@ def _assert_plausible_omega(omega): assert omega.tolist() == pytest.approx([np.pi] * len(omega), rel=0.6) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "file_name", glob.glob(join(data_dir("structure"), "*.bcif")) -) -def test_dihedral_backbone_result(file_name): - import mdtraj - - if "5eil" in file_name: - # Structure contains non-canonical amino acid - # with missing backbone atoms - pytest.skip("Structure contains non-canonical amino acid") - - pdbx_file = pdbx.BinaryCIFFile.read(file_name) - array = pdbx.get_structure(pdbx_file, model=1) - array = array[struc.filter_amino_acids(array)] - if array.array_length() == 0: - # Structure contains no protein - # -> determination of backbone angles makes no sense - return - - for chain in struc.chain_iter(array): - print("Chain: ", chain.chain_id[0]) - if len(struc.check_res_id_continuity(chain)) != 0: - # Do not test discontinuous chains - return - test_phi, test_psi, test_ome = struc.dihedral_backbone(chain) - - temp = NamedTemporaryFile("w+", suffix=".pdb") - strucio.save_structure(temp.name, chain) - traj = mdtraj.load(temp.name) - temp.close() - _, ref_phi = mdtraj.compute_phi(traj) - _, ref_psi = mdtraj.compute_psi(traj) - _, ref_ome = mdtraj.compute_omega(traj) - ref_phi, ref_psi, ref_ome = ref_phi[0], ref_psi[0], ref_ome[0] - - assert test_phi[1: ] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3) - assert test_psi[:-1] == pytest.approx(ref_psi, abs=1e-5, rel=5e-3) - assert test_ome[:-1] == pytest.approx(ref_ome, abs=1e-5, rel=5e-3) - +def test_dihedral_backbone_consistency(): + """ + Check if the computed dihedral angles are equal to the reference computed with + MDTraj. + """ + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) + atoms = pdbx.get_structure(pdbx_file, model=1) + + test_phi, test_psi, test_ome = struc.dihedral_backbone(atoms) + + ref_dihedrals = np.array([ + [np.nan, -0.980, 3.085], + [-0.768, -0.896, 3.122], + [-1.160, -0.539, 3.053], + [-1.138, -0.802, 3.069], + [-1.130, -0.530, 2.994], + [-1.276, -0.758, 3.098], + [-1.132, -0.755, -3.122], + [-1.039, -0.449, 3.042], + [-1.361, -0.154, -3.112], + [ 1.934, 0.141, 3.128], + [ 0.964, -2.171, 3.106], + [-1.012, -0.502, -3.083], + [-1.428, 0.334, -3.078], + [-2.165, 0.234, 3.067], + [ 1.186, 0.440, -3.013], + [-2.512, 2.292, -3.141], + [-1.223, 2.794, 3.110], + [-1.213, 2.542, 3.077], + [-1.349, 2.168, 2.996], + [-1.363, np.nan, np.nan], + ]) # fmt: skip + + assert test_phi == pytest.approx(ref_dihedrals[:, 0], abs=1e-3, nan_ok=True) + assert test_psi == pytest.approx(ref_dihedrals[:, 1], abs=1e-3, nan_ok=True) + assert test_ome == pytest.approx(ref_dihedrals[:, 2], abs=1e-3, nan_ok=True) def test_index_distance_non_periodic(): @@ -126,72 +117,32 @@ def test_index_distance_non_periodic(): """ array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) ref_dist = struc.distance( - array.coord[np.newaxis, :, :], - array.coord[:, np.newaxis, :] + array.coord[np.newaxis, :, :], array.coord[:, np.newaxis, :] ).flatten() length = array.array_length() dist = struc.index_distance( array, - indices = np.stack([ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) + indices=np.stack( + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=1, + ), ) assert np.allclose(dist, ref_dist) @pytest.mark.parametrize( - "shift", [ - np.array([10, 20, 30]), - np.array([-8, 12, 28]), - np.array([ 0, 99, 54]) - ] -) -def test_index_distance_periodic_orthogonal(shift): - """ - The PBC aware computation, should give the same results, - irrespective of which atoms are centered in the box - """ - array = strucio.load_structure(join(data_dir("structure"), "3o5r.bcif")) - # Use a box based on the boundaries of the structure - # '+1' to add a margin - array.box = np.diag( - np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1 - ) - - length = array.array_length() - dist_indices = np.stack([ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) - ref_dist = struc.index_distance(array, dist_indices, periodic=True) - - array.coord += shift - array.coord = struc.move_inside_box(array.coord, array.box) - dist = struc.index_distance(array, dist_indices, periodic=True) - assert np.allclose(dist, ref_dist, atol=1e-5) - - -@pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize( - "shift, angles", itertools.product( - [ - np.array([10, 20, 30]), - np.array([-8, 12, 28]), - np.array([ 0, 99, 54]) - ], - [ - np.array([ 50, 90, 90]), - np.array([ 90, 90, 120]), - np.array([ 60, 60, 60]) - ] - ) + "shift, angles", + itertools.product( + [np.array([10, 20, 30]), np.array([-8, 12, 28]), np.array([0, 99, 54])], + [ + np.array([90, 90, 90]), + np.array([50, 90, 90]), + np.array([90, 90, 120]), + np.array([60, 60, 60]), + ], + ), ) -def test_index_distance_periodic_triclinic(shift, angles): +def test_index_distance_periodic(shift, angles): """ The PBC aware computation, should give the same results, irrespective of which atoms are centered in the box @@ -202,31 +153,21 @@ def test_index_distance_periodic_triclinic(shift, angles): boundaries = np.max(array.coord, axis=0) - np.min(array.coord, axis=0) + 1 angles = np.deg2rad(angles) array.box = struc.vectors_from_unitcell( - boundaries[0], boundaries[1], boundaries[2], - angles[0], angles[1], angles[2] + boundaries[0], boundaries[1], boundaries[2], angles[0], angles[1], angles[2] ) length = array.array_length() - dist_indices = np.stack([ - np.repeat(np.arange(length), length), - np.tile(np.arange(length), length) - ], axis=1) + # All-to-all indices + dist_indices = np.stack( + [np.repeat(np.arange(length), length), np.tile(np.arange(length), length)], + axis=1, + ) # index_distance() creates a large ndarray try: ref_dist = struc.index_distance(array, dist_indices, periodic=True) except MemoryError: pytest.skip("Not enough memory") - # Compare with MDTraj - import mdtraj - traj = mdtraj.load(join(data_dir("structure"), "3o5r.pdb")) - # Angstrom to Nanometers - traj.unitcell_vectors = array.box[np.newaxis, :, :] / 10 - # Nanometers to Angstrom - mdtraj_dist = mdtraj.compute_distances(traj, dist_indices)[0] * 10 - ind = np.where(~np.isclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3))[0] - assert np.allclose(ref_dist, mdtraj_dist, atol=2e-5, rtol=1e-3) - # Compare with shifted variant array.coord += shift array.coord = struc.move_inside_box(array.coord, array.box) @@ -249,38 +190,35 @@ def test_index_functions(): samples = (array, stack, struc.coord(array), struc.coord(stack)) # Generate random indices random.seed(42) - indices = random.randint(array.array_length(), size=(100,4), dtype=int) + indices = random.randint(array.array_length(), size=(100, 4), dtype=int) for sample in samples: if isinstance(sample, np.ndarray): - atoms1 = sample[..., indices[:,0], :] - atoms2 = sample[..., indices[:,1], :] - atoms3 = sample[..., indices[:,2], :] - atoms4 = sample[..., indices[:,3], :] + atoms1 = sample[..., indices[:, 0], :] + atoms2 = sample[..., indices[:, 1], :] + atoms3 = sample[..., indices[:, 2], :] + atoms4 = sample[..., indices[:, 3], :] else: - atoms1 = sample[..., indices[:,0]] - atoms2 = sample[..., indices[:,1]] - atoms3 = sample[..., indices[:,2]] - atoms4 = sample[..., indices[:,3]] + atoms1 = sample[..., indices[:, 0]] + atoms2 = sample[..., indices[:, 1]] + atoms3 = sample[..., indices[:, 2]] + atoms4 = sample[..., indices[:, 3]] assert np.allclose( struc.displacement(atoms1, atoms2), - struc.index_displacement(sample, indices[:,:2]), - atol=1e-5 + struc.index_displacement(sample, indices[:, :2]), + atol=1e-5, ) assert np.allclose( struc.distance(atoms1, atoms2), - struc.index_distance(sample, indices[:,:2]), - atol=1e-5 + struc.index_distance(sample, indices[:, :2]), + atol=1e-5, ) assert np.allclose( struc.angle(atoms1, atoms2, atoms3), - struc.index_angle(sample, indices[:,:3]), - atol=1e-5 + struc.index_angle(sample, indices[:, :3]), + atol=1e-5, ) assert np.allclose( struc.dihedral(atoms1, atoms2, atoms3, atoms4), - struc.index_dihedral(sample, indices[:,:4]), - atol=1e-5 + struc.index_dihedral(sample, indices[:, :4]), + atol=1e-5, ) - - - diff --git a/tests/structure/test_gro.py b/tests/structure/test_gro.py index 02faf6f68..806c65a0d 100644 --- a/tests/structure/test_gro.py +++ b/tests/structure/test_gro.py @@ -2,18 +2,18 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import TemporaryFile import glob import itertools from os.path import join, splitext +from tempfile import TemporaryFile +import numpy as np import pytest from pytest import approx -import numpy as np import biotite import biotite.structure.io.gro as gro import biotite.structure.io.pdb as pdb from biotite.structure import Atom, array -from ..util import data_dir +from tests.util import data_dir def test_get_model_count(): @@ -25,10 +25,7 @@ def test_get_model_count(): @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.gro")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.gro")), [None, 1, -1]), ) def test_array_conversion(path, model): gro_file = gro.GROFile.read(path) @@ -40,8 +37,10 @@ def test_array_conversion(path, model): assert np.allclose(array1.box, array2.box) assert array1.bonds == array2.bonds for category in array1.get_annotation_categories(): - assert array1.get_annotation(category).tolist() == \ - array2.get_annotation(category).tolist() + assert ( + array1.get_annotation(category).tolist() + == array2.get_annotation(category).tolist() + ) assert array1.coord.tolist() == array2.coord.tolist() @@ -58,20 +57,17 @@ def test_pdb_consistency(path): assert a1.array_length() == a2.array_length() for category in ["res_id", "res_name", "atom_name"]: - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() + assert ( + a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist() + ) # Mind rounding errors when converting pdb to gro (A -> nm) - assert a1.coord.flatten().tolist() \ - == approx(a2.coord.flatten().tolist(), abs=1e-2) + assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2) @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_pdb_to_gro(path, model): """ @@ -105,20 +101,24 @@ def test_pdb_to_gro(path, model): assert a1.array_length() == a2.array_length() for category in ["res_id", "res_name", "atom_name"]: - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() + assert ( + a1.get_annotation(category).tolist() == a2.get_annotation(category).tolist() + ) # Mind rounding errors when converting pdb to gro (A -> nm) - assert a1.coord.flatten().tolist() \ - == approx(a2.coord.flatten().tolist(), abs=1e-2) + assert a1.coord.flatten().tolist() == approx(a2.coord.flatten().tolist(), abs=1e-2) def test_gro_id_overflow(): # Create an oversized AtomArray where atom_id > 100000 and res_id > 10000 num_atoms = 100005 - atoms = array([Atom([1,2,3], atom_name="CA", element="C", res_name="X", - res_id=i+1) for i in range(num_atoms)]) - atoms.box = np.array([[1,0,0], [0,1,0], [0,0,1]]) + atoms = array( + [ + Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=i + 1) + for i in range(num_atoms) + ] + ) + atoms.box = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) # Write .gro file temp = TemporaryFile("w+") @@ -143,7 +143,7 @@ def test_gro_no_box(): """ # Create an AtomArray - atom = Atom([1,2,3], atom_name="CA", element="C", res_name="X", res_id=1) + atom = Atom([1, 2, 3], atom_name="CA", element="C", res_name="X", res_id=1) atoms = array([atom]) # Write .gro file @@ -151,7 +151,7 @@ def test_gro_no_box(): gro_file = gro.GROFile() gro_file.set_structure(atoms) gro_file.write(temp) - + # Read in file temp.seek(0) gro_file = gro.GROFile.read(temp) @@ -159,4 +159,4 @@ def test_gro_no_box(): s = gro_file.get_structure() # Assert no box with 0 dimension - assert s.box is None \ No newline at end of file + assert s.box is None diff --git a/tests/structure/test_hbond.py b/tests/structure/test_hbond.py index 64d4068f9..834b20386 100644 --- a/tests/structure/test_hbond.py +++ b/tests/structure/test_hbond.py @@ -3,20 +3,19 @@ # information. import itertools -from tempfile import NamedTemporaryFile +import json from os.path import join import numpy as np import pytest import biotite.structure as struc -from biotite.structure.io import load_structure, save_structure -from ..util import data_dir, cannot_import +import biotite.structure.io.pdbx as pdbx +from biotite.structure.io import load_structure +from tests.util import data_dir @pytest.fixture() def stack(request): - stack = load_structure( - join(data_dir("structure"), "1l2y.bcif") - ) + stack = load_structure(join(data_dir("structure"), "1l2y.bcif")) if request.param: # Use connect_via_distances, since 1l2y has invalidly bonded # N-terminal hydrogen atoms @@ -24,62 +23,47 @@ def stack(request): return stack -# Ignore warning about dummy unit cell vector -@pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( - "pdb_id, use_bond_list", itertools.product( - ["1l2y", "1gya", "1igy"], - [False, True] - ) + "pdb_id, use_all_models, use_bond_list", + itertools.product(["1l2y", "1gya", "1igy"], [False, True], [False, True]), ) -def test_hbond_structure(pdb_id, use_bond_list): +def test_hbond_consistency(pdb_id, use_all_models, use_bond_list): """ - Compare hydrogen bond detection with MDTraj + Compare hydrogen bond detection with MDTraj. """ - file_name = join(data_dir("structure"), pdb_id+".bcif") + # Load precomputed hydrogen bond triplets from MDTraj + with open(join(data_dir("structure"), "misc", "hbond.json")) as file: + ref_data = json.load(file) + key = "single_model" if not use_all_models else "all_models" + ref_triplets = ref_data[pdb_id][key] + + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id + ".bcif")) + model = None if use_all_models else 1 + atoms = pdbx.get_structure(pdbx_file, model=model) - array = load_structure(file_name) if use_bond_list: - if isinstance(array, struc.AtomArrayStack): - ref_model = array[0] + if isinstance(atoms, struc.AtomArrayStack): + ref_model = atoms[0] else: - ref_model = array + ref_model = atoms bonds = struc.connect_via_distances(ref_model) bonds = bonds.merge(struc.connect_via_residue_names(ref_model)) - array.bonds = bonds + atoms.bonds = bonds # Only consider amino acids for consistency # with bonded hydrogen detection in MDTraj - array = array[..., struc.filter_amino_acids(array)] - if isinstance(array, struc.AtomArrayStack): + atoms = atoms[..., struc.filter_amino_acids(atoms)] + if isinstance(atoms, struc.AtomArrayStack): # For consistency with MDTraj 'S' cannot be acceptor element # https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/hbond.py#L365 - triplets, mask = struc.hbond(array, acceptor_elements=("O","N")) + test_triplets, mask = struc.hbond(atoms, acceptor_elements=("O", "N")) else: - triplets = struc.hbond(array, acceptor_elements=("O","N")) - - # Save to new pdb file for consistent treatment of inscode/altloc - # im MDTraj - temp = NamedTemporaryFile("w+", suffix=".pdb") - save_structure(temp.name, array) - - # Compare with MDTraj - import mdtraj - traj = mdtraj.load(temp.name) - temp.close() - triplets_ref = mdtraj.baker_hubbard( - traj, freq=0, periodic=False - ) + test_triplets = struc.hbond(atoms, acceptor_elements=("O", "N")) - # Both packages may use different order - # -> use set for comparison - triplets_set = set([tuple(tri) for tri in triplets]) - triplets_ref_set = set([tuple(tri) for tri in triplets_ref]) - assert triplets_set == triplets_ref_set + # MDTraj and Biotite may use different order -> use set for comparison + assert set([tuple(tri) for tri in test_triplets]) == set( + [tuple(tri) for tri in ref_triplets] + ) # Ignore warning about missing BondList, as this is intended @@ -105,7 +89,7 @@ def test_hbond_total_count(stack): """ With the standart Baker & Hubbard criterion, 1l2y should have 28 hydrogen bonds with a frequency > 0.1 - (comparision with MDTraj results) + (comparison with results obtained from MDTraj) """ triplets, mask = struc.hbond(stack) freq = struc.hbond_frequency(mask) @@ -122,28 +106,27 @@ def test_hbond_with_selections(stack): of this boundary should be found. Also, hbond should respect the selection type. """ - selection1 = (stack.res_id == 3) & (stack.atom_name == 'O') # 3TYR BB Ox + selection1 = (stack.res_id == 3) & (stack.atom_name == "O") # 3TYR BB Ox selection2 = stack.res_id == 7 # backbone hbond should be found if selection1/2 type is both - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="both") + triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="both") assert len(triplets) == 1 assert triplets[0][0] == 116 assert triplets[0][2] == 38 # backbone hbond should be found if selection1 is acceptor and # selection2 is donor - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="acceptor") + triplets, mask = struc.hbond( + stack, selection1, selection2, selection1_type="acceptor" + ) assert len(triplets) == 1 assert triplets[0][0] == 116 assert triplets[0][2] == 38 # no hbond should be found, # because the backbone oxygen cannot be a donor - triplets, mask = struc.hbond(stack, selection1, selection2, - selection1_type="donor") + triplets, mask = struc.hbond(stack, selection1, selection2, selection1_type="donor") assert len(triplets) == 0 @@ -164,18 +147,20 @@ def test_hbond_single_selection(stack): def test_hbond_frequency(): - mask = np.array([ - [True, True, True, True, True], # 1.0 - [False, False, False, False, False], # 0.0 - [False, False, False, True, True] # 0.4 - ]).T + mask = np.array( + [ + [True, True, True, True, True], # 1.0 + [False, False, False, False, False], # 0.0 + [False, False, False, True, True], # 0.4 + ] + ).T freq = struc.hbond_frequency(mask) assert not np.isin(False, np.isclose(freq, np.array([1.0, 0.0, 0.4]))) # Ignore warning about missing BondList @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize("translation_vector", [(10,20,30), (-5, 3, 18)]) +@pytest.mark.parametrize("translation_vector", [(10, 20, 30), (-5, 3, 18)]) def test_hbond_periodicity(translation_vector): """ Test whether hydrogen bond identification uses periodic boundary @@ -197,4 +182,4 @@ def test_hbond_periodicity(translation_vector): array.coord = struc.move_inside_box(array.coord, array.box) hbonds = struc.hbond(array, periodic=True) hbonds = set([tuple(triplet) for triplet in hbonds]) - assert ref_hbonds == hbonds \ No newline at end of file + assert ref_hbonds == hbonds diff --git a/tests/structure/test_info.py b/tests/structure/test_info.py index f23c75030..2d823aaf1 100644 --- a/tests/structure/test_info.py +++ b/tests/structure/test_info.py @@ -9,7 +9,7 @@ import biotite.structure as struc import biotite.structure.info as strucinfo from biotite.structure.io import load_structure -from ..util import data_dir +from tests.util import data_dir @pytest.mark.parametrize( @@ -18,7 +18,7 @@ (strucinfo.amino_acid_names, ["ALA", "ARG", "ASN", "ASP"], ["HOH"]), (strucinfo.nucleotide_names, ["A", "C", "G", "U"], ["HOH", "ALA"]), (strucinfo.carbohydrate_names, ["GLC", "RIB"], ["HOH", "ALA"]), - ] + ], ) def test_group_names(function, included, excluded): """ @@ -49,16 +49,16 @@ def test_mass(): ref_masses = [strucinfo.mass(res) for res in struc.residue_iter(array)] # Up to three additional/missing hydrogens are allowed # (protonation state) - mass_diff = np.abs(np.array( - [mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)] - )) + mass_diff = np.abs( + np.array([mass - ref_mass for mass, ref_mass in zip(masses, ref_masses)]) + ) assert (mass_diff // strucinfo.mass("H") <= 3).all() # Check if the mass difference is a multiple of the hydrogen mass multiple_of_h_masses = mass_diff / strucinfo.mass("H") assert np.all(np.round(multiple_of_h_masses, decimals=2) % 1 == 0) -def test_protOr_radii(): +def test_protor_radii(): """ Assert that ProtOr VdW radii (except hydrogen) can be calculated for all atoms in the given structure, since the structure (1GYA) @@ -72,7 +72,7 @@ def test_protOr_radii(): for res_name, atom_name in zip(array.res_name, array.atom_name): radius = strucinfo.vdw_radius_protor(res_name, atom_name) assert isinstance(radius, float) - assert radius != None + assert radius is not None def test_protor_radii_invalid(): @@ -83,7 +83,7 @@ def test_protor_radii_invalid(): # Expect raised exception when a residue does not contain an atom strucinfo.vdw_radius_protor("ALA", "K") # For all other unknown radii expect None - assert strucinfo.vdw_radius_protor("HOH", "O") == None + assert strucinfo.vdw_radius_protor("HOH", "O") is None def test_single_radii(): @@ -105,12 +105,16 @@ def test_link_type(): [ (strucinfo.amino_acid_names(), True, 0.4), (strucinfo.nucleotide_names(), True, 0.4), - (sorted( - set(strucinfo.all_residues()) - - set(strucinfo.amino_acid_names()) - - set(strucinfo.nucleotide_names()) - ), False, 0.01), - ] + ( + sorted( + set(strucinfo.all_residues()) + - set(strucinfo.amino_acid_names()) + - set(strucinfo.nucleotide_names()) + ), + False, + 0.01, + ), + ], ) def test_one_letter_code(residues, should_have_one_letter, exception_ratio): """ @@ -145,14 +149,13 @@ def test_standardize_order(multi_model, seed): reordered = struc.AtomArray(0) for residue in struc.residue_iter(original): bound = residue.array_length() - indices = np.random.choice( - np.arange(bound), bound,replace=False - ) + indices = np.random.choice(np.arange(bound), bound, replace=False) reordered += residue[..., indices] # Restore the original PDB standard order restored = reordered[..., strucinfo.standardize_order(reordered)] assert restored.shape == original.shape - assert restored[..., restored.element != "H"] \ - == original[..., original.element != "H"] + assert ( + restored[..., restored.element != "H"] == original[..., original.element != "H"] + ) diff --git a/tests/structure/test_integrity.py b/tests/structure/test_integrity.py index b8fbbb89d..c9f92ccc5 100644 --- a/tests/structure/test_integrity.py +++ b/tests/structure/test_integrity.py @@ -2,51 +2,56 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx -import numpy as np from os.path import join -from ..util import data_dir +import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir @pytest.fixture def sample_array(): - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "1l2y.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) return pdbx.get_structure(pdbx_file, model=1) + @pytest.fixture def gapped_sample_array(sample_array): - atom_ids = np.arange(1, sample_array.shape[0]+1) + atom_ids = np.arange(1, sample_array.shape[0] + 1) sample_array.add_annotation("atom_id", dtype=int) sample_array.atom_id = atom_ids sample_array = sample_array[sample_array.res_id != 5] - sample_array = sample_array[(sample_array.res_id != 9) | - (sample_array.atom_name != "N")] + sample_array = sample_array[ + (sample_array.res_id != 9) | (sample_array.atom_name != "N") + ] return sample_array + @pytest.fixture def duplicate_sample_array(sample_array): sample_array[42] = sample_array[10] sample_array[234] = sample_array[123] return sample_array + def test_atom_id_continuity_check(gapped_sample_array): discon = struc.check_atom_id_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] assert discon_array.atom_id.tolist() == [93, 159] + def test_res_id_continuity_check(gapped_sample_array): discon = struc.check_res_id_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] assert discon_array.res_id.tolist() == [6] + def test_linear_continuity_check(gapped_sample_array): # Take the first ASN residue and remove hydrogens asn = gapped_sample_array[ - (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != 'H')] + (gapped_sample_array.res_id == 1) & (gapped_sample_array.element != "H") + ] # The consecutive atom groups are # (1) N, CA, C, O # - break @@ -57,11 +62,13 @@ def test_linear_continuity_check(gapped_sample_array): discon = struc.check_linear_continuity(asn) assert discon.tolist() == [4, 7] + def test_bond_continuity_check(gapped_sample_array): discon = struc.check_backbone_continuity(gapped_sample_array) discon_array = gapped_sample_array[discon] - assert discon_array.res_id.tolist() == [6,9] + assert discon_array.res_id.tolist() == [6, 9] + def test_duplicate_atoms_check(duplicate_sample_array): discon = struc.check_duplicate_atoms(duplicate_sample_array) - assert discon.tolist() == [42,234] \ No newline at end of file + assert discon.tolist() == [42, 234] diff --git a/tests/structure/test_mechanics.py b/tests/structure/test_mechanics.py index 7be195882..34c5d23c3 100644 --- a/tests/structure/test_mechanics.py +++ b/tests/structure/test_mechanics.py @@ -1,25 +1,57 @@ -import biotite.structure as struc -import biotite.structure.io as strucio -import numpy as np from os.path import join -from ..util import data_dir import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir + def test_gyration_radius(): stack = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif")) radii = struc.gyration_radius(stack) # Compare with results from MDTraj - exp_radii = \ - [7.30527532, 7.34189463, 7.21863721, 7.29877736, 7.25389752, 7.22292189, - 7.20646252, 7.27215909, 7.30437723, 7.30455437, 7.37979331, 7.14176259, - 7.20674397, 7.27594995, 7.31665835, 7.29850786, 7.34378951, 7.2642137, - 7.20727158, 7.16336879, 7.3479218, 7.19362027, 7.24841519, 7.29229237, - 7.15243826, 7.31285673, 7.22585756, 7.25467109, 7.3493648, 7.34203588, - 7.3310182, 7.29236536, 7.20527373, 7.33138918, 7.2284936, 7.40374312, - 7.24856173, 7.25581809] + exp_radii = [ + 7.30527532, + 7.34189463, + 7.21863721, + 7.29877736, + 7.25389752, + 7.22292189, + 7.20646252, + 7.27215909, + 7.30437723, + 7.30455437, + 7.37979331, + 7.14176259, + 7.20674397, + 7.27594995, + 7.31665835, + 7.29850786, + 7.34378951, + 7.2642137, + 7.20727158, + 7.16336879, + 7.3479218, + 7.19362027, + 7.24841519, + 7.29229237, + 7.15243826, + 7.31285673, + 7.22585756, + 7.25467109, + 7.3493648, + 7.34203588, + 7.3310182, + 7.29236536, + 7.20527373, + 7.33138918, + 7.2284936, + 7.40374312, + 7.24856173, + 7.25581809, + ] assert radii.tolist() == pytest.approx(exp_radii, abs=2e-2) # Same for atom array instead of stack array = stack[0] radius = struc.gyration_radius(array) - assert radius == pytest.approx(exp_radii[0], abs=2e-2) \ No newline at end of file + assert radius == pytest.approx(exp_radii[0], abs=2e-2) diff --git a/tests/structure/test_mmtf.py b/tests/structure/test_mmtf.py deleted file mode 100644 index b04d16b64..000000000 --- a/tests/structure/test_mmtf.py +++ /dev/null @@ -1,222 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -from tempfile import TemporaryFile -import glob -import itertools -from os.path import join, splitext -import numpy as np -import pytest -from pytest import approx -import biotite -import biotite.structure.info as info -import biotite.structure.io.mmtf as mmtf -import biotite.structure.io.pdbx as pdbx -from ..util import data_dir - - -def test_get_model_count(): - mmtf_file = mmtf.MMTFFile.read(join(data_dir("structure"), "1l2y.mmtf")) - test_model_count = mmtf.get_model_count(mmtf_file) - ref_model_count = mmtf.get_structure(mmtf_file).stack_depth() - assert test_model_count == ref_model_count - - -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("structure"), "*.mmtf")) -) -def test_codecs(path): - mmtf_file = mmtf.MMTFFile.read(path) - for key in mmtf_file: - if mmtf_file.get_codec(key) is not None: - codec = mmtf_file.get_codec(key) - param = mmtf_file.get_param(key) - array1 = mmtf_file[key] - mmtf_file.set_array(key, array1, codec, param) - array2 = mmtf_file[key] - if array1.dtype == np.float32: - if param != 0: - tol = 1/param - else: - tol = 0 - assert np.isclose(array1, array2, atol=tol).all() - else: - assert (array1 == array2).all() - - -@pytest.mark.parametrize( - "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.mmtf")), - [None, 1, -1] - ) -) -def test_array_conversion(path, model): - mmtf_file = mmtf.MMTFFile.read(path) - try: - a1 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True) - except biotite.InvalidFileError: - if model is None: - # The file cannot be parsed into an AtomArrayStack, - # as the models contain different numbers of atoms - # -> skip this test case - return - else: - raise - - mmtf_file = mmtf.MMTFFile() - mmtf.set_structure(mmtf_file, a1) - temp = TemporaryFile("w+b") - mmtf_file.write(temp) - - temp.seek(0) - mmtf_file = mmtf.MMTFFile.read(temp) - temp.close() - a2 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True) - - for category in a1.get_annotation_categories(): - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() - assert a1.coord.flatten().tolist() == \ - approx(a2.coord.flatten().tolist(), abs=1e-3) - assert a1.bonds == a2.bonds - if a1.box is not None: - assert np.allclose(a1.box, a2.box) - - -@pytest.mark.parametrize( - "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.mmtf")), - [None, 1, -1] - ) -) -def test_pdbx_consistency(path, model): - bcif_path = splitext(path)[0] + ".bcif" - mmtf_file = mmtf.MMTFFile.read(path) - try: - a1 = mmtf.get_structure(mmtf_file, model=model) - except biotite.InvalidFileError: - if model is None: - # The file cannot be parsed into an AtomArrayStack, - # as the models contain different numbers of atoms - # -> skip this test case - return - else: - raise - - pdbx_file = pdbx.BinaryCIFFile.read(bcif_path) - a2 = pdbx.get_structure(pdbx_file, model=model) - - # Sometimes mmCIF files can have 'cell' entry - # but corresponding MMTF file has not 'unitCell' entry - # -> Do not assert for dummy entry in mmCIF file - # (all vector elements = {0, 1}) - if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all(): - assert np.allclose(a1.box, a2.box) - assert a2.hetero is not None - # MMTF might assign some residues, that PDBx assigns as 'hetero', - # as 'non-hetero' if they are RNA/DNA or peptide linking - conflict_residues = np.unique( - a1.res_name[a1.hetero != a2.hetero] - ) - for res in conflict_residues: - assert info.link_type(res) in [ - "L-PEPTIDE LINKING", "PEPTIDE LINKING", - "DNA LINKING", "RNA LINKING" - ] - # Test the remaining categories - for category in [ - c for c in a1.get_annotation_categories() if c != "hetero" - ]: - assert a1.get_annotation(category).tolist() == \ - a2.get_annotation(category).tolist() - assert a1.coord.flatten().tolist() == \ - approx(a2.coord.flatten().tolist(), abs=1e-3) - - -@pytest.mark.parametrize( - "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.mmtf")), - [None, 1] - ) -) -def test_pdbx_consistency_assembly(path, model): - """ - Check whether :func:`get_assembly()` gives the same result for the - PDBx/mmCIF and MMTF reader. - """ - mmtf_file = mmtf.MMTFFile.read(path) - try: - test_assembly = mmtf.get_assembly(mmtf_file, model=model) - except biotite.InvalidFileError: - if model is None: - # The file cannot be parsed into an AtomArrayStack, - # as the models contain different numbers of atoms - # -> skip this test case - return - else: - raise - except NotImplementedError: - pytest.skip( - "The limitation of the function does not support this structure" - ) - - bcif_path = splitext(path)[0] + ".bcif" - pdbx_file = pdbx.BinaryCIFFile.read(bcif_path) - ref_assembly = pdbx.get_assembly(pdbx_file, model=model) - - # MMTF might assign some residues, that PDBx assigns as 'hetero', - # as 'non-hetero' if they are RNA/DNA or peptide linking - # -> skip 'hetero' category - for category in [ - c for c in ref_assembly.get_annotation_categories() if c != "hetero" - ]: - assert test_assembly.get_annotation(category).tolist() == \ - ref_assembly.get_annotation(category).tolist() - assert test_assembly.coord.flatten().tolist() == \ - approx(ref_assembly.coord.flatten().tolist(), abs=1e-3) - - -def test_extra_fields(): - path = join(data_dir("structure"), "1l2y.mmtf") - mmtf_file = mmtf.MMTFFile.read(path) - stack1 = mmtf.get_structure( - mmtf_file, - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] - ) - - mmtf_file == mmtf.MMTFFile() - mmtf.set_structure(mmtf_file, stack1) - - stack2 = mmtf.get_structure( - mmtf_file, - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] - ) - - assert stack1.atom_id.tolist() == stack2.atom_id.tolist() - assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist()) - assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist()) - assert stack1.charge.tolist() == stack2.charge.tolist() - - -def test_numpy_objects(): - """ - Test whether the Msgpack encoder is able to handle NumPy values - (e.g. np.float32) properly. - - Only check if no error occurs. - """ - mmtf_file = mmtf.MMTFFile() - mmtf_file["A float"] = np.float32(42.0) - mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)] - mmtf_file["A dictionary"] = {"a": np.int64(1), "b": np.int64(2)} - temp = TemporaryFile("w+b") - mmtf_file.write(temp) - temp.close() diff --git a/tests/structure/test_mol.py b/tests/structure/test_mol.py index 55ce15f04..ce4378e86 100644 --- a/tests/structure/test_mol.py +++ b/tests/structure/test_mol.py @@ -14,16 +14,17 @@ import biotite.structure.io.pdbx as pdbx from biotite.structure.bonds import BondType from biotite.structure.io.mol.ctab import BOND_TYPE_MAPPING_REV -from ..util import data_dir +from tests.util import data_dir def list_v2000_sdf_files(): return [ - path for path - in glob.glob(join(data_dir("structure"), "molecules", "*.sdf")) - if not "v3000" in path + path + for path in glob.glob(join(data_dir("structure"), "molecules", "*.sdf")) + if "v3000" not in path ] + def list_v3000_sdf_files(): return glob.glob(join(data_dir("structure"), "molecules", "*v3000.sdf")) @@ -79,11 +80,16 @@ def test_header_conversion(): list_v2000_sdf_files(), ["V2000", "V3000"], [False, True], - [False, True] - ) + [False, True], + ), ) -def test_structure_conversion(FileClass, path, version, omit_charge, - use_charge_property): +def test_structure_conversion( + FileClass, # noqa: N803 + path, + version, + omit_charge, + use_charge_property, +): """ After reading a file, writing the structure back to a new file and reading it again should give the same structure. @@ -123,9 +129,10 @@ def test_structure_conversion(FileClass, path, version, omit_charge, @pytest.mark.parametrize( "path", [ - file for file in list_v2000_sdf_files() + list_v3000_sdf_files() + file + for file in list_v2000_sdf_files() + list_v3000_sdf_files() if file.split(".")[0] + ".cif" in list_cif_files() - ] + ], ) def test_pdbx_consistency(path): """ @@ -145,20 +152,17 @@ def test_pdbx_consistency(path): test_atoms = mol.get_structure(sdf_file) assert test_atoms.coord.shape == ref_atoms.coord.shape - assert test_atoms.coord.flatten().tolist() \ - == ref_atoms.coord.flatten().tolist() + assert test_atoms.coord.flatten().tolist() == ref_atoms.coord.flatten().tolist() assert test_atoms.element.tolist() == ref_atoms.element.tolist() assert test_atoms.charge.tolist() == ref_atoms.charge.tolist() - assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) \ - == set(tuple(bond) for bond in ref_atoms.bonds.as_array()) + assert set(tuple(bond) for bond in test_atoms.bonds.as_array()) == set( + tuple(bond) for bond in ref_atoms.bonds.as_array() + ) @pytest.mark.parametrize( "v2000_path, v3000_path", - zip( - sorted(list_v2000_sdf_files()), - sorted(list_v3000_sdf_files()) - ) + zip(sorted(list_v2000_sdf_files()), sorted(list_v3000_sdf_files())), ) def test_version_consistency(v2000_path, v3000_path): """ @@ -198,10 +202,7 @@ def test_multi_record_files(): temp.seek(0) sdf_file = mol.SDFile.read(temp) - test_atom_arrays = [ - sdf_file[res_name].get_structure() - for res_name in RES_NAMES - ] + test_atom_arrays = [sdf_file[res_name].get_structure() for res_name in RES_NAMES] assert test_atom_arrays == ref_atom_arrays @@ -210,9 +211,7 @@ def test_metadata_parsing(): """ Check if metadata is parsed correctly based on a known example. """ - sdf_file = mol.SDFile.read( - join(data_dir("structure"), "molecules", "13136.sdf") - ) + sdf_file = mol.SDFile.read(join(data_dir("structure"), "molecules", "13136.sdf")) metadata = sdf_file.record.metadata assert metadata["PUBCHEM_COMPOUND_CID"] == "13136" @@ -224,10 +223,7 @@ def test_metadata_conversion(): """ Writing metadata and reading it again should give the same data. """ - ref_metadata = { - "test_1": "value 1", - "test_2": "value 2\nvalue 3" - } + ref_metadata = {"test_1": "value 1", "test_2": "value 2\nvalue 3"} record = mol.SDRecord(metadata=ref_metadata) sdf_file = mol.SDFile({"Molecule": record}) @@ -236,9 +232,7 @@ def test_metadata_conversion(): temp.seek(0) sdf_file = mol.SDFile.read(temp) - test_metadata = { - key.name: val for key, val in sdf_file.record.metadata.items() - } + test_metadata = {key.name: val for key, val in sdf_file.record.metadata.items()} temp.close() assert test_metadata == ref_metadata @@ -248,18 +242,10 @@ def test_metadata_conversion(): "key_string, ref_key_attributes", [ # Cases from Dalby1992 - ( - "> ", - (None, "MELTING.POINT", None, None) - ), - ( - "> 55 (MD-08974) DT12", - (12, "BOILING.POINT", 55, "MD-08974") - ), - ( - "> DT12 55", (12, None, 55, None) - ), - ] + ("> ", (None, "MELTING.POINT", None, None)), + ("> 55 (MD-08974) DT12", (12, "BOILING.POINT", 55, "MD-08974")), + ("> DT12 55", (12, None, 55, None)), + ], ) def test_metadata_key_parsing(key_string, ref_key_attributes): """ @@ -270,7 +256,7 @@ def test_metadata_key_parsing(key_string, ref_key_attributes): number=number, name=name, registry_internal=registry_internal, - registry_external=registry_external + registry_external=registry_external, ) test_key = mol.Metadata.Key.deserialize(key_string) @@ -292,7 +278,7 @@ def test_structure_bond_type_fallback(path): # the default bond type ref_atoms.bonds.add_bond(0, 1, BondType.QUADRUPLE) updated_bond = ref_atoms.bonds.as_array()[ - np.all(ref_atoms.bonds.as_array()[:,[0,1]] == [0,1], axis=1) + np.all(ref_atoms.bonds.as_array()[:, [0, 1]] == [0, 1], axis=1) ] assert updated_bond.tolist()[0][2] == BondType.QUADRUPLE test_mol_file = mol.MOLFile() @@ -300,21 +286,16 @@ def test_structure_bond_type_fallback(path): # Test bond type fallback to BondType.ANY value (8) in # MolFile.set_structure during mol_file.lines formatting updated_line = [ - mol_line - for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ') + mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ") ].pop() - assert int(updated_line[8]) == \ - BOND_TYPE_MAPPING_REV[BondType.ANY] + assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.ANY] # Test bond type fallback to BondType.SINGLE value (1) in # MolFile.set_structure during mol_file.lines formatting - mol.set_structure(test_mol_file, ref_atoms, - default_bond_type=BondType.SINGLE) + mol.set_structure(test_mol_file, ref_atoms, default_bond_type=BondType.SINGLE) updated_line = [ - mol_line - for mol_line in test_mol_file.lines if mol_line.startswith(' 1 2 ') + mol_line for mol_line in test_mol_file.lines if mol_line.startswith(" 1 2 ") ].pop() - assert int(updated_line[8]) == \ - BOND_TYPE_MAPPING_REV[BondType.SINGLE] + assert int(updated_line[8]) == BOND_TYPE_MAPPING_REV[BondType.SINGLE] @pytest.mark.parametrize("atom_type", ["", " ", "A ", " A"]) @@ -396,4 +377,4 @@ def _delete_charge_property(file): lines = [line for line in lines if not line.startswith("M CHG")] file.seek(0) file.truncate() - file.write("\n".join(lines) + "\n") \ No newline at end of file + file.write("\n".join(lines) + "\n") diff --git a/tests/structure/test_molecules.py b/tests/structure/test_molecules.py index d983f9f83..6880cd8cd 100644 --- a/tests/structure/test_molecules.py +++ b/tests/structure/test_molecules.py @@ -18,26 +18,24 @@ def array(): :class:`AtomArray`. """ MOL_NAMES = [ - "ARG", # Molecule with multiple branches - "TRP", # Molecule with a cycle - "GLC", # Molecule with a cycle + "ARG", # Molecule with multiple branches + "TRP", # Molecule with a cycle + "GLC", # Molecule with a cycle "NA", # A single atom - "ATP" # Larger molecule + "ATP", # Larger molecule ] N_MOLECULES = 20 np.random.seed(0) - + atom_array = struc.AtomArray(0) for i, mol_name in enumerate(np.random.choice(MOL_NAMES, N_MOLECULES)): molecule = info.residue(mol_name) - molecule.res_id[:] = i+1 + molecule.res_id[:] = i + 1 atom_array += molecule - + reordered_indices = np.random.choice( - np.arange(atom_array.array_length()), - atom_array.array_length(), - replace=False + np.arange(atom_array.array_length()), atom_array.array_length(), replace=False ) atom_array = atom_array[reordered_indices] @@ -45,12 +43,7 @@ def array(): @pytest.mark.parametrize( - "as_stack, as_bonds", - [ - (False, False), - (True, False), - (False, True ) - ] + "as_stack, as_bonds", [(False, False), (True, False), (False, True)] ) def test_get_molecule_indices(array, as_stack, as_bonds): """ @@ -59,12 +52,12 @@ def test_get_molecule_indices(array, as_stack, as_bonds): """ if as_stack: array = struc.stack([array]) - + if as_bonds: test_indices = struc.get_molecule_indices(array.bonds) else: test_indices = struc.get_molecule_indices(array) - + seen_atoms = 0 for indices in test_indices: molecule = array[..., indices] @@ -72,20 +65,16 @@ def test_get_molecule_indices(array, as_stack, as_bonds): # -> all atoms from the same molecule assert (molecule.res_id == molecule.res_id[0]).all() # Assert that no atom is missing from the molecule - assert molecule.array_length() \ - == info.residue(molecule.res_name[0]).array_length() + assert ( + molecule.array_length() == info.residue(molecule.res_name[0]).array_length() + ) seen_atoms += molecule.array_length() # Assert that all molecules are fond assert seen_atoms == array.array_length() @pytest.mark.parametrize( - "as_stack, as_bonds", - [ - (False, False), - (True, False), - (False, True ) - ] + "as_stack, as_bonds", [(False, False), (True, False), (False, True)] ) def test_get_molecule_masks(array, as_stack, as_bonds): """ @@ -95,18 +84,18 @@ def test_get_molecule_masks(array, as_stack, as_bonds): """ if as_stack: array = struc.stack([array]) - + if as_bonds: ref_indices = struc.get_molecule_indices(array.bonds) test_masks = struc.get_molecule_masks(array.bonds) else: ref_indices = struc.get_molecule_indices(array) test_masks = struc.get_molecule_masks(array) - + for i in range(len(test_masks)): # Assert that the mask is 'True' for all indices # and that these 'True' values are the only ones in the mask - assert (test_masks[i, ref_indices[i]] == True).all() + assert test_masks[i, ref_indices[i]].all() assert np.count_nonzero(test_masks[i]) == len(ref_indices[i]) @@ -123,4 +112,4 @@ def test_molecule_iter(array, as_stack): test_iterator = struc.molecule_iter(array) for i, molecule in enumerate(test_iterator): - assert molecule == array[..., ref_indices[i]] \ No newline at end of file + assert molecule == array[..., ref_indices[i]] diff --git a/tests/structure/test_pdb.py b/tests/structure/test_pdb.py index d93974f72..3b1ef07a6 100644 --- a/tests/structure/test_pdb.py +++ b/tests/structure/test_pdb.py @@ -2,22 +2,21 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import TemporaryFile -import warnings -import itertools import glob -from os.path import join, splitext +import itertools import sys +import warnings +from os.path import join, splitext +from tempfile import TemporaryFile +import numpy as np import pytest from pytest import approx -import numpy as np import biotite import biotite.structure as struc import biotite.structure.io.pdb as pdb import biotite.structure.io.pdb.hybrid36 as hybrid36 import biotite.structure.io.pdbx as pdbx -import biotite.structure.io as io -from ..util import data_dir +from tests.util import data_dir def test_get_model_count(): @@ -35,17 +34,15 @@ def test_get_model_count(): glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1], [False, True], - [False, True] - ) + [False, True], + ), ) def test_array_conversion(path, model, hybrid36, include_bonds): pdb_file = pdb.PDBFile.read(path) # Test also the thin wrapper around the methods # 'get_structure()' and 'set_structure()' try: - array1 = pdb.get_structure( - pdb_file, model=model, include_bonds=include_bonds - ) + array1 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -58,8 +55,7 @@ def test_array_conversion(path, model, hybrid36, include_bonds): if hybrid36 and (array1.res_id < 0).any(): with pytest.raises( ValueError, - match="Only positive integers can be converted " - "into hybrid-36 notation" + match="Only positive integers can be converted " "into hybrid-36 notation", ): pdb_file = pdb.PDBFile() pdb.set_structure(pdb_file, array1, hybrid36=hybrid36) @@ -68,33 +64,28 @@ def test_array_conversion(path, model, hybrid36, include_bonds): pdb_file = pdb.PDBFile() pdb.set_structure(pdb_file, array1, hybrid36=hybrid36) - array2 = pdb.get_structure( - pdb_file, model=model, include_bonds=include_bonds - ) + array2 = pdb.get_structure(pdb_file, model=model, include_bonds=include_bonds) if array1.box is not None: assert np.allclose(array1.box, array2.box) assert array1.bonds == array2.bonds for category in array1.get_annotation_categories(): - assert array1.get_annotation(category).tolist() == \ - array2.get_annotation(category).tolist() + assert ( + array1.get_annotation(category).tolist() + == array2.get_annotation(category).tolist() + ) assert array1.coord.tolist() == array2.coord.tolist() @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_pdbx_consistency(path, model): bcif_path = splitext(path)[0] + ".bcif" pdbx_file = pdbx.BinaryCIFFile.read(bcif_path) try: - ref_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -134,17 +125,16 @@ def test_pdbx_consistency(path, model): print(file=sys.stderr) raise for category in ref_atoms.get_annotation_categories(): - assert test_atoms.get_annotation(category).tolist() == \ - ref_atoms.get_annotation(category).tolist() + assert ( + test_atoms.get_annotation(category).tolist() + == ref_atoms.get_annotation(category).tolist() + ) assert test_atoms.coord.tolist() == ref_atoms.coord.tolist() @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1]), ) def test_pdbx_consistency_assembly(path, model): """ @@ -168,10 +158,13 @@ def test_pdbx_consistency_assembly(path, model): ref_assembly = pdbx.get_assembly(pdbx_file, model=model) for category in ref_assembly.get_annotation_categories(): - assert test_assembly.get_annotation(category).tolist() == \ - ref_assembly.get_annotation(category).tolist() - assert test_assembly.coord.flatten().tolist() == \ - approx(ref_assembly.coord.flatten().tolist(), abs=1e-3) + assert ( + test_assembly.get_annotation(category).tolist() + == ref_assembly.get_annotation(category).tolist() + ) + assert test_assembly.coord.flatten().tolist() == approx( + ref_assembly.coord.flatten().tolist(), abs=1e-3 + ) @pytest.mark.parametrize("hybrid36", [False, True]) @@ -179,9 +172,7 @@ def test_extra_fields(hybrid36): path = join(data_dir("structure"), "1l2y.pdb") pdb_file = pdb.PDBFile.read(path) stack1 = pdb_file.get_structure( - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] + extra_fields=["atom_id", "b_factor", "occupancy", "charge"] ) with pytest.raises(ValueError): @@ -196,9 +187,7 @@ def test_extra_fields(hybrid36): pdb_file.set_structure(stack1, hybrid36=hybrid36) stack2 = pdb_file.get_structure( - extra_fields=[ - "atom_id", "b_factor", "occupancy", "charge" - ] + extra_fields=["atom_id", "b_factor", "occupancy", "charge"] ) assert stack1.ins_code.tolist() == stack2.ins_code.tolist() @@ -218,7 +207,7 @@ def test_inferred_elements(): # Remove all elements removed_stack = stack.copy() - removed_stack.element[:] = '' + removed_stack.element[:] = "" # Save stack without elements to tmp file temp = TemporaryFile("w+") @@ -237,10 +226,7 @@ def test_inferred_elements(): @pytest.mark.parametrize( "path, model", - itertools.product( - glob.glob(join(data_dir("structure"), "*.pdb")), - [None, 1, -1] - ) + itertools.product(glob.glob(join(data_dir("structure"), "*.pdb")), [None, 1, -1]), ) def test_box_shape(path, model): pdb_file = pdb.PDBFile.read(path) @@ -266,14 +252,11 @@ def test_box_parsing(): path = join(data_dir("structure"), "1igy.pdb") pdb_file = pdb.PDBFile.read(path) a = pdb_file.get_structure() - expected_box = np.array([[ - [66.65, 0.00, 0.00], - [0.00, 190.66, 0.00], - [-24.59, 0.00, 68.84] - ]]) + expected_box = np.array( + [[[66.65, 0.00, 0.00], [0.00, 190.66, 0.00], [-24.59, 0.00, 68.84]]] + ) - assert expected_box.flatten().tolist() \ - == approx(a.box.flatten().tolist(), abs=1e-2) + assert expected_box.flatten().tolist() == approx(a.box.flatten().tolist(), abs=1e-2) def test_id_overflow(): @@ -283,7 +266,7 @@ def test_id_overflow(): a.coord = np.zeros(a.coord.shape) a.chain_id = np.full(length, "A") # Create residue IDs over 10000 - a.res_id = np.arange(1, length+1) + a.res_id = np.arange(1, length + 1) a.res_name = np.full(length, "GLY") a.hetero = np.full(length, False) a.atom_name = np.full(length, "CA") @@ -299,13 +282,13 @@ def test_id_overflow(): # Assert file can be read properly temp.seek(0) a2 = pdb.get_structure(pdb.PDBFile.read(temp)) - assert(a2.array_length() == a.array_length()) + assert a2.array_length() == a.array_length() # Manually check if the written atom id is correct temp.seek(0) last_line = temp.readlines()[-1] atom_id = int(last_line.split()[1]) - assert(atom_id == 1) + assert atom_id == 1 temp.close() @@ -321,9 +304,9 @@ def test_id_overflow(): temp.seek(0) last_line = temp.readlines()[-1] atom_id = last_line.split()[1] - assert(atom_id == "A0000") + assert atom_id == "A0000" res_id = last_line.split()[4][1:] - assert(res_id == "BXG0") + assert res_id == "BXG0" temp.close() @@ -353,38 +336,41 @@ def test_get_b_factor(model): if model is None: # The B-factor is an annotation category # -> it can only be extracted in a per-model basis - ref_b_factor = np.stack([ - pdb_file.get_structure( - model=m, extra_fields=["b_factor"] - ).b_factor - for m in range(1, pdb_file.get_model_count() + 1) - ]) + ref_b_factor = np.stack( + [ + pdb_file.get_structure(model=m, extra_fields=["b_factor"]).b_factor + for m in range(1, pdb_file.get_model_count() + 1) + ] + ) else: ref_b_factor = pdb_file.get_structure( model=model, extra_fields=["b_factor"] ).b_factor - test_b_factor= pdb_file.get_b_factor(model=model) + test_b_factor = pdb_file.get_b_factor(model=model) assert test_b_factor.shape == ref_b_factor.shape assert (test_b_factor == ref_b_factor).all() - np.random.seed(0) N = 200 LENGTHS = [3, 4, 5] + + @pytest.mark.parametrize( "number, length", zip( - list(itertools.chain(*[ - np.random.randint(0, hybrid36.max_hybrid36_number(length), N) - for length in LENGTHS - ])), - list(itertools.chain(*[ - [length] * N for length in LENGTHS - ])) - ) + list( + itertools.chain( + *[ + np.random.randint(0, hybrid36.max_hybrid36_number(length), N) + for length in LENGTHS + ] + ) + ), + list(itertools.chain(*[[length] * N for length in LENGTHS])), + ), ) def test_hybrid36_codec(number, length): """ @@ -401,7 +387,6 @@ def test_max_hybrid36_number(): assert hybrid36.max_hybrid36_number(5) == 87440031 - @pytest.mark.parametrize("hybrid36", [False, True]) def test_bond_records(hybrid36): """ @@ -420,7 +405,7 @@ def test_bond_records(hybrid36): np.random.seed(0) # Create random bonds four times the number of atoms - bond_array = np.random.randint(n_atoms, size=(4*n_atoms, 2)) + bond_array = np.random.randint(n_atoms, size=(4 * n_atoms, 2)) # Remove bonds of atoms to themselves bond_array = bond_array[bond_array[:, 0] != bond_array[:, 1]] ref_bonds = struc.BondList(n_atoms, bond_array) @@ -459,8 +444,8 @@ def test_get_symmetry_mates(model): Test generated symmetry mates on a known example with a simple space group and a single chain. """ - INVERSION_AXES = [(0,0,0), (0,0,1), (0,1,0), (1,0,0)] - TRANSLATION_AXES = [(0,0,0), (1,0,1), (0,1,1), (1,1,0)] + INVERSION_AXES = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0)] + TRANSLATION_AXES = [(0, 0, 0), (1, 0, 1), (0, 1, 1), (1, 1, 0)] path = join(data_dir("structure"), "1aki.pdb") pdb_file = pdb.PDBFile.read(path) @@ -475,8 +460,7 @@ def test_get_symmetry_mates(model): symmetry_mates = pdb_file.get_symmetry_mates(model=model) # Space group has 4 copies in a unit cell - assert symmetry_mates.array_length() \ - == original_structure.array_length() * 4 + assert symmetry_mates.array_length() == original_structure.array_length() * 4 if model is None: assert symmetry_mates.stack_depth() == original_structure.stack_depth() for chain, inv_axes, trans_axes in zip( @@ -490,10 +474,13 @@ def test_get_symmetry_mates(model): chain = struc.rotate(chain, angles) # Now both mates should be equal for category in original_structure.get_annotation_categories(): - assert chain.get_annotation(category).tolist() == \ - original_structure.get_annotation(category).tolist() - assert chain.coord.flatten().tolist() == \ - approx(original_structure.coord.flatten().tolist(), abs=1e-3) + assert ( + chain.get_annotation(category).tolist() + == original_structure.get_annotation(category).tolist() + ) + assert chain.coord.flatten().tolist() == approx( + original_structure.coord.flatten().tolist(), abs=1e-3 + ) @pytest.mark.parametrize( @@ -512,7 +499,7 @@ def test_get_symmetry_mates(model): ("occupancy", 1000, False), ("charge", -10, False), ("charge", 10, False), - ] + ], ) def test_setting_incompatible_structure(annotation, value, warning_only): """ @@ -535,7 +522,7 @@ def test_setting_incompatible_structure(annotation, value, warning_only): # Set one annotation to a value that exceeds the number of columns if annotation == "coord": - atoms.coord[0,0] = value + atoms.coord[0, 0] = value else: atoms.get_annotation(annotation)[0] = value diff --git a/tests/structure/test_pdbqt.py b/tests/structure/test_pdbqt.py index 1a7c2e049..2f18b496c 100644 --- a/tests/structure/test_pdbqt.py +++ b/tests/structure/test_pdbqt.py @@ -2,31 +2,30 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import warnings -from tempfile import TemporaryFile import glob +import warnings from os.path import join -import pytest +from tempfile import TemporaryFile import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdbqt as pdbqt import biotite.structure.io.pdbx as pdbx -from ..util import data_dir +from tests.util import data_dir @pytest.mark.parametrize( "path", [ - path for path in glob.glob(join(data_dir("structure"), "*.bcif")) + path + for path in glob.glob(join(data_dir("structure"), "*.bcif")) # Skip this PDB ID as it contains 5-character residue names if "7gsa" not in path - ] + ], ) def test_array_conversion(path): pdbx_file = pdbx.BinaryCIFFile.read(path) - ref_structure = pdbx.get_structure( - pdbx_file, model=1, extra_fields=["charge"] - ) + ref_structure = pdbx.get_structure(pdbx_file, model=1, extra_fields=["charge"]) ref_structure.bonds = struc.connect_via_residue_names(ref_structure) pdbqt_file = pdbqt.PDBQTFile() @@ -53,7 +52,7 @@ def test_array_conversion(path): try: assert np.array_equal( test_structure.get_annotation(category), - ref_structure.get_annotation(category) + ref_structure.get_annotation(category), ) except AssertionError: print(f"Inequality in '{category}' category") diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py index ec32bbd3a..8d5da7862 100644 --- a/tests/structure/test_pdbx.py +++ b/tests/structure/test_pdbx.py @@ -2,9 +2,9 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import warnings import glob import itertools +import warnings from os.path import join, splitext import numpy as np import pytest @@ -13,22 +13,20 @@ import biotite.sequence as seq import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -from ..util import data_dir +from tests.util import data_dir -@pytest.mark.parametrize("format", ["cif", "bcif", "legacy"]) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_get_model_count(format): """ Check of :func:`get_model_count()`gives the same number of models as :func:`get_structure()`. """ - base_path = join(data_dir("structure"), f"1l2y") + base_path = join(data_dir("structure"), "1l2y") if format == "cif": pdbx_file = pdbx.CIFFile.read(base_path + ".cif") - elif format == "bcif": - pdbx_file = pdbx.BinaryCIFFile.read(base_path + ".bcif") else: - pdbx_file = pdbx.PDBxFile.read(base_path + ".cif") + pdbx_file = pdbx.BinaryCIFFile.read(base_path + ".bcif") test_model_count = pdbx.get_model_count(pdbx_file) ref_model_count = pdbx.get_structure(pdbx_file).stack_depth() assert test_model_count == ref_model_count @@ -37,8 +35,17 @@ def test_get_model_count(format): @pytest.mark.parametrize( "string, looped", itertools.product( - ["", " ", " ", "te xt", "'", '"' ,"te\nxt", "\t",], - [False, True] + [ + "", + " ", + " ", + "te xt", + "'", + '"', + "te\nxt", + "\t", + ], + [False, True], ), ) def test_escape(string, looped): @@ -62,11 +69,11 @@ def test_escape(string, looped): @pytest.mark.parametrize( "cif_line, expected_fields", [ - ["'' 'embed'quote' ", ['', "embed'quote"]], - ['2 "embed"quote" "\t\n"', ['2', 'embed"quote', '\t\n']], - [" 3 '' \"\" 'spac e' 'embed\"quote'", ['3', '', '', 'spac e', 'embed"quote']], - ["''' \"\"\" ''quoted''", ["'", '"', "'quoted'"]] - ] + ["'' 'embed'quote' ", ["", "embed'quote"]], + ['2 "embed"quote" "\t\n"', ["2", 'embed"quote', "\t\n"]], + [" 3 '' \"\" 'spac e' 'embed\"quote'", ["3", "", "", "spac e", 'embed"quote']], + ["''' \"\"\" ''quoted''", ["'", '"', "'quoted'"]], + ], ) def test_split_one_line(cif_line, expected_fields): """ @@ -78,9 +85,7 @@ def test_split_one_line(cif_line, expected_fields): @pytest.mark.parametrize( "format, path, model", itertools.product( - ["cif", "bcif", "legacy"], - glob.glob(join(data_dir("structure"), "*.cif")), - [None, 1, -1] + ["cif", "bcif"], glob.glob(join(data_dir("structure"), "*.cif")), [None, 1, -1] ), ) def test_conversion(tmpdir, format, path, model): @@ -94,18 +99,13 @@ def test_conversion(tmpdir, format, path, model): if format == "cif": data_path = base_path + ".cif" File = pdbx.CIFFile - elif format == "bcif": + else: data_path = base_path + ".bcif" File = pdbx.BinaryCIFFile - else: - data_path = base_path + ".cif" - File = pdbx.PDBxFile pdbx_file = File.read(data_path) try: - ref_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + ref_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -122,14 +122,9 @@ def test_conversion(tmpdir, format, path, model): pdbx_file = File.read(file_path) # Remove one label section to test fallback to auth fields - if format == "legacy": - del pdbx_file.cif_file.block["atom_site"][DELETED_ANNOTATION] - else: - del pdbx_file.block["atom_site"][DELETED_ANNOTATION] + del pdbx_file.block["atom_site"][DELETED_ANNOTATION] with pytest.warns(UserWarning, match=f"'{DELETED_ANNOTATION}' not found"): - test_atoms = pdbx.get_structure( - pdbx_file, model=model, include_bonds=True - ) + test_atoms = pdbx.get_structure(pdbx_file, model=model, include_bonds=True) assert ref_atoms.array_length() > 0 if ref_atoms.box is not None: @@ -168,9 +163,7 @@ def test_bond_conversion(tmpdir, format, path): File = pdbx.BinaryCIFFile pdbx_file = File.read(data_path) - atoms = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ) + atoms = pdbx.get_structure(pdbx_file, model=1, include_bonds=True) ref_bonds = atoms.bonds pdbx_file = File() @@ -184,16 +177,12 @@ def test_bond_conversion(tmpdir, format, path): # i.e. the bonds can be properly read from ``chem_comp_bond`` with warnings.catch_warnings(): warnings.simplefilter("error") - test_bonds = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ).bonds + test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_extra_fields(tmpdir, format): path = join(data_dir("structure"), f"1l2y.{format}") if format == "cif": @@ -232,9 +221,7 @@ def test_intra_bond_residue_parsing(): """ cif_path = join(data_dir("structure"), "1l2y.cif") cif_file = pdbx.CIFFile.read(cif_path) - ref_bonds = pdbx.get_structure( - cif_file, model=1, include_bonds=True - ).bonds + ref_bonds = pdbx.get_structure(cif_file, model=1, include_bonds=True).bonds nextgen_cif_path = join( data_dir("structure"), "nextgen", "pdb_00001l2y_xyz-enrich.cif" @@ -251,9 +238,7 @@ def test_intra_bond_residue_parsing(): assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_any_bonds(tmpdir, format): """ Check if ``BondType.ANY`` bonds can be written and read from a PDBx @@ -290,16 +275,12 @@ def test_any_bonds(tmpdir, format): # i.e. the bonds can be properly read from ``chem_comp_bond`` with warnings.catch_warnings(): warnings.simplefilter("error") - test_bonds = pdbx.get_structure( - pdbx_file, model=1, include_bonds=True - ).bonds + test_bonds = pdbx.get_structure(pdbx_file, model=1, include_bonds=True).bonds assert test_bonds == ref_bonds -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_unequal_lengths(format): """ Check if setting columns with unequal lengths in the same category @@ -323,9 +304,7 @@ def test_setting_empty_column(): """ Check if setting an empty column raises an exception. """ - with pytest.raises( - ValueError, match="Array must contain at least one element" - ): + with pytest.raises(ValueError, match="Array must contain at least one element"): pdbx.CIFCategory({"foo": []}) @@ -348,9 +327,7 @@ def test_setting_empty_structure(): pdbx.set_structure(pdbx.CIFFile(), atoms, include_bonds=True) -@pytest.mark.parametrize( - "format", ["cif", "bcif"] -) +@pytest.mark.parametrize("format", ["cif", "bcif"]) def test_list_assemblies(format): """ Test the :func:`list_assemblies()` function based on a known @@ -375,11 +352,10 @@ def test_list_assemblies(format): } -@pytest.mark.parametrize("format, pdb_id, model", itertools.product( - ["cif", "bcif"], - ["1f2n", "5zng"], - [None, 1, -1] -)) +@pytest.mark.parametrize( + "format, pdb_id, model", + itertools.product(["cif", "bcif"], ["1f2n", "5zng"], [None, 1, -1]), +) def test_get_assembly(format, pdb_id, model): """ Test whether the :func:`get_assembly()` function produces the same @@ -400,13 +376,11 @@ def test_get_assembly(format, pdb_id, model): # Test each available assembly for id, ref_oligomer_count in zip( assembly_category["id"].as_array(str), - assembly_category["oligomeric_count"].as_array(int) + assembly_category["oligomeric_count"].as_array(int), ): print("Assembly ID:", id) try: - assembly = pdbx.get_assembly( - pdbx_file, assembly_id=id, model=model - ) + assembly = pdbx.get_assembly(pdbx_file, assembly_id=id, model=model) except biotite.InvalidFileError: if model is None: # The file cannot be parsed into an AtomArrayStack, @@ -433,8 +407,7 @@ def test_get_assembly(format, pdb_id, model): @pytest.mark.parametrize( "path, use_ideal_coord", itertools.product( - glob.glob(join(data_dir("structure"), "molecules", "*.cif")), - [False, True] + glob.glob(join(data_dir("structure"), "molecules", "*.cif")), [False, True] ), ) def test_component_conversion(tmpdir, path, use_ideal_coord): @@ -444,9 +417,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord): structure. """ cif_file = pdbx.CIFFile.read(path) - ref_atoms = pdbx.get_component( - cif_file, use_ideal_coord=use_ideal_coord - ) + ref_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord) cif_file = pdbx.CIFFile() pdbx.set_component(cif_file, ref_atoms, data_block="test") @@ -454,9 +425,7 @@ def test_component_conversion(tmpdir, path, use_ideal_coord): cif_file.write(file_path) cif_file = pdbx.CIFFile.read(path) - test_atoms = pdbx.get_component( - cif_file, use_ideal_coord=use_ideal_coord - ) + test_atoms = pdbx.get_component(cif_file, use_ideal_coord=use_ideal_coord) assert test_atoms == ref_atoms @@ -476,14 +445,14 @@ def test_get_sequence(format): sequences_1 = pdbx.get_sequence(pdbx_file) pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}")) sequences_2 = pdbx.get_sequence(pdbx_file) - assert str(sequences_1['T']) == "CCGACGGCGCATCAGC" - assert type(sequences_1['T']) is seq.NucleotideSequence - assert str(sequences_1['P']) == "GCTGATGCGCC" - assert type(sequences_1['P']) is seq.NucleotideSequence - assert str(sequences_1['D']) == "GTCGG" - assert type(sequences_1['D']) is seq.NucleotideSequence + assert str(sequences_1["T"]) == "CCGACGGCGCATCAGC" + assert type(sequences_1["T"]) is seq.NucleotideSequence + assert str(sequences_1["P"]) == "GCTGATGCGCC" + assert type(sequences_1["P"]) is seq.NucleotideSequence + assert str(sequences_1["D"]) == "GTCGG" + assert type(sequences_1["D"]) is seq.NucleotideSequence assert ( - str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" + str(sequences_1["A"]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN" "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD" "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD" "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS" @@ -491,14 +460,14 @@ def test_get_sequence(format): "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA" "GEPLPVDSEKDIFDYIQWKYREPKDRSE" ) - assert type(sequences_1['A']) is seq.ProteinSequence + assert type(sequences_1["A"]) is seq.ProteinSequence assert ( - str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" + str(sequences_2["A"]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA" "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC" "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT" "CCGGAGTCAGGAAACCTGCCTGCCGTC" ) - assert type(sequences_2['A']) is seq.NucleotideSequence + assert type(sequences_2["A"]) is seq.NucleotideSequence def test_bcif_encoding(): @@ -509,21 +478,20 @@ def test_bcif_encoding(): PDB_ID = "1aki" encodings_used = { - encoding: False for encoding in [ + encoding: False + for encoding in [ pdbx.ByteArrayEncoding, pdbx.FixedPointEncoding, # This encoding is not used in the test file - #pdbx.IntervalQuantizationEncoding, + # pdbx.IntervalQuantizationEncoding, pdbx.RunLengthEncoding, pdbx.DeltaEncoding, pdbx.IntegerPackingEncoding, - pdbx.StringArrayEncoding + pdbx.StringArrayEncoding, ] } - bcif_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), f"{PDB_ID}.bcif") - ) + bcif_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{PDB_ID}.bcif")) for category_name, category in bcif_file[PDB_ID.upper()].items(): for column_name in category.keys(): try: @@ -543,17 +511,15 @@ def test_bcif_encoding(): test_msgpack = column.serialize() assert test_msgpack == ref_msgpack - except: - raise Exception( - f"Encoding failed for '{category_name}.{column_name}'" - ) + except Exception: + raise Exception(f"Encoding failed for '{category_name}.{column_name}'") # Check if each encoding was used at least once # to ensure that the test was thorough for key, was_used in encodings_used.items(): try: assert was_used - except: + except Exception: raise Exception(f"Encoding {key} was not used") @@ -611,14 +577,17 @@ def test_bcif_cif_consistency(): if cif_column.mask is None: assert bcif_column.mask is None else: - assert cif_column.mask.array.tolist() \ + assert ( + cif_column.mask.array.tolist() == bcif_column.mask.array.tolist() + ) # In CIF format, all vales are strings # -> ensure consistency dtype = bcif_column.data.array.dtype - assert cif_column.as_array(dtype).tolist() \ - == pytest.approx(bcif_column.as_array(dtype).tolist()) - except: + assert cif_column.as_array(dtype).tolist() == pytest.approx( + bcif_column.as_array(dtype).tolist() + ) + except Exception: raise Exception( f"Comparison failed for '{category_name}.{column_name}'" ) @@ -630,7 +599,7 @@ def test_bcif_cif_consistency(): ("cif", None), ("bcif", False), ("bcif", True), - ] + ], ) def test_serialization_consistency(format, create_new_encoding): """ @@ -650,48 +619,21 @@ def test_serialization_consistency(format, create_new_encoding): for category_name, ref_category in file.block.items(): if format == "cif": - test_category = pdbx.CIFCategory.deserialize( - ref_category.serialize() - ) + test_category = pdbx.CIFCategory.deserialize(ref_category.serialize()) elif format == "bcif": # Access each column to force otherwise lazy deserialization for _ in ref_category.values(): pass if create_new_encoding: ref_category = _clear_encoding(ref_category) - test_category = pdbx.BinaryCIFCategory.deserialize( - ref_category.serialize() - ) + test_category = pdbx.BinaryCIFCategory.deserialize(ref_category.serialize()) try: for key in test_category.keys(): assert ref_category[key] == test_category[key] - except: + except Exception: raise Exception(f"Comparison failed for '{category_name}.{key}'") -def test_legacy_pdbx(): - PDB_ID = "1aki" - - path = join(data_dir("structure"), f"{PDB_ID}.cif") - ref_file = pdbx.CIFFile.read(path) - - test_file = pdbx.PDBxFile.read(path) - assert test_file.get_block_names() == [PDB_ID.upper()] - - for category_name, category in ref_file.block.items(): - test_category_dict = test_file.get_category( - category_name, PDB_ID.upper(), expect_looped=True - ) - for column_name, test_array in test_category_dict.items(): - try: - assert test_array.tolist() \ - == category[column_name].as_array(str).tolist() - except: - raise Exception( - f"Comparison failed for {category_name}.{column_name}" - ) - - def _clear_encoding(category): columns = {} for key, col in category.items(): diff --git a/tests/structure/test_pseudoknots.py b/tests/structure/test_pseudoknots.py index d7a594bcf..b263db5af 100644 --- a/tests/structure/test_pseudoknots.py +++ b/tests/structure/test_pseudoknots.py @@ -2,14 +2,13 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pytest import json +from os.path import join import numpy as np -import pickle as pkl +import pytest import biotite.structure as struc import biotite.structure.io as strucio -from os.path import join -from ..util import data_dir +from tests.util import data_dir @pytest.fixture @@ -19,6 +18,7 @@ def nuc_sample_array(): """ return strucio.load_structure(join(data_dir("structure"), "4p5j.cif")) + def test_pseudoknots(nuc_sample_array): """ Check the output of :func:`pseudoknots()`. @@ -26,11 +26,9 @@ def test_pseudoknots(nuc_sample_array): # Known base pairs with pseudoknot-order = 1: pseudoknot_order_one = [{2, 74}, {58, 72}, {59, 71}, {60, 70}] # Known base pairs that can either be of order one or two - pseudoknot_order_one_or_two = [{9, 48}, {10, 49}] - order_one_count = ( - len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two)/2) - ) - order_two_count = len(pseudoknot_order_one_or_two)/2 + pseudoknot_order_one_or_two = [{9, 48}, {10, 49}] + order_one_count = len(pseudoknot_order_one) + (len(pseudoknot_order_one_or_two) / 2) + order_two_count = len(pseudoknot_order_one_or_two) / 2 base_pairs = struc.base_pairs(nuc_sample_array) pseudoknot_order = struc.pseudoknots(base_pairs) @@ -51,15 +49,14 @@ def test_pseudoknots(nuc_sample_array): for base_pair, order in zip( nuc_sample_array[base_pairs].res_id, optimal_solution ): - if(order == 1): + if order == 1: assert ( - set(base_pair) in pseudoknot_order_one or - set(base_pair) in pseudoknot_order_one_or_two - ) - elif (order == 2): - assert ( - set(base_pair) in pseudoknot_order_one_or_two + set(base_pair) in pseudoknot_order_one + or set(base_pair) in pseudoknot_order_one_or_two ) + elif order == 2: + assert set(base_pair) in pseudoknot_order_one_or_two + def load_test(name): """ @@ -67,20 +64,19 @@ def load_test(name): """ # Base pairs as numpy array (input for `pseudoknots()`) with open( - join(data_dir("structure"), "pseudoknots", f"{name}_knotted.json"), - "r" + join(data_dir("structure"), "pseudoknots", f"{name}_knotted.json"), "r" ) as f: basepairs = np.array(json.load(f)) # List of solutions (set of tuples) with open( - join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"), - "rb" + join(data_dir("structure"), "pseudoknots", f"{name}_unknotted.json"), "rb" ) as f: solutions = json.load(f) for i, solution in enumerate(solutions): solutions[i] = set([tuple(pair) for pair in solution]) return basepairs, solutions + @pytest.mark.parametrize("name", [f"test{x}" for x in range(21)]) def test_pseudoknot_removal(name): """ @@ -116,6 +112,7 @@ def test_pseudoknot_removal(name): # Verify that the number of solutions matches the reference assert len(reference_solutions) == solutions_count + @pytest.mark.parametrize("seed", range(10)) def test_pseudoknot_orders(seed): """ @@ -136,7 +133,7 @@ def test_pseudoknot_orders(seed): for solution in solutions: # Number of base pairs in the previous order previous_order = -1 - for order in range(np.max(solution)+1): + for order in range(np.max(solution) + 1): # Ensure that the base pairs of the same order are unknotted assert (struc.pseudoknots(basepairs[solution == order]) == 0).all() @@ -148,9 +145,10 @@ def test_pseudoknot_orders(seed): assert this_order <= previous_order previous_order = this_order + def test_empty_base_pairs(): """ Assert than an empty array of base pairs generates an empty array of - pseudoknot orders. + pseudoknot orders. """ - assert struc.pseudoknots([]).shape == (1,0) \ No newline at end of file + assert struc.pseudoknots([]).shape == (1, 0) diff --git a/tests/structure/test_rdf.py b/tests/structure/test_rdf.py index bd072fbbe..ae419d730 100644 --- a/tests/structure/test_rdf.py +++ b/tests/structure/test_rdf.py @@ -1,148 +1,151 @@ -import itertools +import json from os.path import join import numpy as np import pytest +import biotite.structure.io.gro as gro +from biotite.structure.box import vectors_from_unitcell from biotite.structure.io import load_structure from biotite.structure.rdf import rdf -from biotite.structure.box import vectors_from_unitcell -from ..util import data_dir, cannot_import - +from tests.util import data_dir TEST_FILE = join(data_dir("structure"), "waterbox.gro") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -def test_rdf(): - """ General test to reproduce oxygen RDF for a box of water""" - test_file = TEST_FILE - stack = load_structure(test_file) +def test_rdf_consistency(): + """ + Check that oxygen RDF for a box of water reproduces results from MDTraj. + """ + INTERVAL = [0, 10] + N_BINS = 100 - # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] - interval = np.array([0, 10]) - n_bins = 100 - bins, g_r = rdf(oxygen[:, 0].coord, oxygen, interval=interval, - bins=n_bins, periodic=False) + # Load precomputed RDF from MDTraj + with open(join(data_dir("structure"), "misc", "rdf.json")) as file: + ref_data = json.load(file) + ref_bins = ref_data["bins"] + ref_g_r = ref_data["g_r"] - # Compare with MDTraj - import mdtraj - traj = mdtraj.load(TEST_FILE) - ow = [a.index for a in traj.topology.atoms if a.name == 'O'] - pairs = itertools.product([ow[0]], ow) - mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs), - r_range=interval/10, n_bins=n_bins, - periodic=False) + gro_file = gro.GROFile.read(TEST_FILE) + stack = gro_file.get_structure() + # Calculate oxygen RDF for water + oxygen = stack[:, stack.atom_name == "OW"] + test_bins, test_g_r = rdf( + oxygen[:, 0].coord, oxygen, interval=INTERVAL, bins=N_BINS, periodic=False + ) - assert np.allclose(bins, mdt_bins*10) - assert np.allclose(g_r, mdt_g_r, rtol=0.0001) + assert test_bins.tolist() == pytest.approx(ref_bins) + assert test_g_r.tolist() == pytest.approx(ref_g_r, rel=0.01) def test_rdf_bins(): - """ Test if RDF produce correct bin ranges """ + """ + Test if RDF produce correct bin ranges. + """ stack = load_structure(TEST_FILE) center = stack[:, 0] num_bins = 44 bin_range = (0, 11.7) bins, g_r = rdf(center, stack, bins=num_bins, interval=bin_range) - assert(len(bins) == num_bins) - assert(bins[0] > bin_range[0]) - assert(bins[1] < bin_range[1]) + assert len(bins) == num_bins + assert bins[0] > bin_range[0] + assert bins[1] < bin_range[1] def test_rdf_with_selection(): - """ Test if the selection argument of rdf function works as expected """ + """ + Test if the selection argument of rdf function works as expected. + """ stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - sele = (stack.atom_name == 'OW') & (stack.res_id >= 3) - bins, g_r = rdf(oxygen[:, 0].coord, stack, selection=sele, - interval=interval, bins=n_bins, periodic=False) - - nosel_bins, nosel_g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], - interval=interval, bins=n_bins, periodic=False) + sele = (stack.atom_name == "OW") & (stack.res_id >= 3) + bins, g_r = rdf( + oxygen[:, 0].coord, + stack, + selection=sele, + interval=interval, + bins=n_bins, + periodic=False, + ) + + nosel_bins, nosel_g_r = rdf( + oxygen[:, 0].coord, + oxygen[:, 1:], + interval=interval, + bins=n_bins, + periodic=False, + ) assert np.allclose(bins, nosel_bins) assert np.allclose(g_r, nosel_g_r) def test_rdf_atom_argument(): - """ Test if the first argument allows to use AtomArrayStack """ + """ + Test if the first argument allows to use AtomArrayStack. + """ stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 - bins, g_r = rdf(oxygen[:, 0], stack, interval=interval, - bins=n_bins, periodic=False) + bins, g_r = rdf(oxygen[:, 0], stack, interval=interval, bins=n_bins, periodic=False) - atom_bins, atoms_g_r = rdf(oxygen[:, 0].coord, stack, interval=interval, - bins=n_bins, periodic=False) + atom_bins, atoms_g_r = rdf( + oxygen[:, 0].coord, stack, interval=interval, bins=n_bins, periodic=False + ) assert np.allclose(g_r, atoms_g_r) def test_rdf_multiple_center(): - """ Test if the first argument allows to use multiple centers""" + """ + Test if the first argument allows to use multiple centers. + """ stack = load_structure(TEST_FILE) # calculate oxygen RDF for water with and without a selection - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 10]) n_bins = 100 # averaging individual calculations - bins1, g_r1 = rdf(oxygen[:, 1].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) - bins2, g_r2 = rdf(oxygen[:, 0].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) + bins1, g_r1 = rdf( + oxygen[:, 1].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) + bins2, g_r2 = rdf( + oxygen[:, 0].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) mean = np.mean([g_r1, g_r2], axis=0) # this should give the same result as averaging for oxygen 0 and 1 - bins, g_r = rdf(oxygen[:, 0:2].coord, oxygen[:, 2:], interval=interval, - bins=n_bins, periodic=False) + bins, g_r = rdf( + oxygen[:, 0:2].coord, + oxygen[:, 2:], + interval=interval, + bins=n_bins, + periodic=False, + ) assert np.allclose(g_r, mean, rtol=0.0001) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -def test_rdf_periodic(): - """ Test if the periodic argument gives the correct results""" - test_file = TEST_FILE - stack = load_structure(test_file) - - # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] - interval = np.array([0, 10]) - n_bins = 100 - bins, g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], interval=interval, - bins=n_bins, periodic=True) - - # Compare with MDTraj - import mdtraj - traj = mdtraj.load(TEST_FILE) - ow = [a.index for a in traj.topology.atoms if a.name == 'O'] - pairs = itertools.product([ow[0]], ow[1:]) - mdt_bins, mdt_g_r = mdtraj.compute_rdf(traj, list(pairs), - r_range=interval/10, n_bins=n_bins, - periodic=True) - - assert np.allclose(bins, mdt_bins*10) - assert np.allclose(g_r, mdt_g_r, rtol=0.0001) - - def test_rdf_box(): - """ Test correct use of simulation boxes """ + """ + Test correct use of simulation boxes. + """ stack = load_structure(TEST_FILE) box = vectors_from_unitcell(1, 1, 1, 90, 90, 90) box_stack = np.repeat(box[np.newaxis, :, :], len(stack), axis=0) @@ -169,16 +172,16 @@ def test_rdf_box(): def test_rdf_normalized(): - """ Assert that the RDF tail is normalized to 1""" + """ + Assert that the RDF tail is normalized to 1. + """ test_file = TEST_FILE stack = load_structure(test_file) # calculate oxygen RDF for water - oxygen = stack[:, stack.atom_name == 'OW'] + oxygen = stack[:, stack.atom_name == "OW"] interval = np.array([0, 5]) n_bins = 100 - bins, g_r = rdf(oxygen.coord, oxygen, interval=interval, - bins=n_bins, periodic=True) + bins, g_r = rdf(oxygen.coord, oxygen, interval=interval, bins=n_bins, periodic=True) assert np.allclose(g_r[-10:], np.ones(10), atol=0.1) - diff --git a/tests/structure/test_repair.py b/tests/structure/test_repair.py index 34ba9f622..13b6c45af 100644 --- a/tests/structure/test_repair.py +++ b/tests/structure/test_repair.py @@ -2,19 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io.pdbx as pdbx -import numpy as np from os.path import join -from ..util import data_dir +import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir @pytest.fixture def single_chain(): - pdbx_file = pdbx.BinaryCIFFile.read( - join(data_dir("structure"), "1l2y.bcif") - ) + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) return pdbx.get_structure(pdbx_file, model=1) @@ -40,35 +38,38 @@ def test_create_continuous_res_ids(multi_chain, restart_each_chain): test_res_ids, _ = struc.get_residues(multi_chain) if restart_each_chain: - assert test_res_ids.tolist() == np.concatenate( - [np.arange(len(test_res_ids) // 2) + 1] * 2 - ).tolist() + assert ( + test_res_ids.tolist() + == np.concatenate([np.arange(len(test_res_ids) // 2) + 1] * 2).tolist() + ) else: - assert test_res_ids.tolist() \ - == (np.arange(len(test_res_ids)) + 1).tolist() + assert test_res_ids.tolist() == (np.arange(len(test_res_ids)) + 1).tolist() +@pytest.mark.filterwarnings("ignore:Could not infer element") @pytest.mark.parametrize( "name,expected", - [("CA", "C"), - ("C", "C"), - ("CB", "C"), - ("OD1", "O"), - ("HD21", "H"), - ("1H", "H"), - #("CL", "CL"), # This is an edge case where inference is difficult - ("HE", "H"), - ("SD", "S"), - ("NA", "N"), - ("NX", "N"), - ("BE", "BE"), - ("BEA", "BE"), - ("K", "K"), - ("KA", "K"), - ("QWERT", "")] + [ + ("CA", "C"), + ("C", "C"), + ("CB", "C"), + ("OD1", "O"), + ("HD21", "H"), + ("1H", "H"), + # ("CL", "CL"), # This is an edge case where inference is difficult + ("HE", "H"), + ("SD", "S"), + ("NA", "N"), + ("NX", "N"), + ("BE", "BE"), + ("BEA", "BE"), + ("K", "K"), + ("KA", "K"), + ("QWERT", ""), + ], ) def test_infer_elements(name, expected): """ Check if elements are correctly guessed based on known examples. """ - assert struc.infer_elements([name])[0] == expected \ No newline at end of file + assert struc.infer_elements([name])[0] == expected diff --git a/tests/structure/test_residues.py b/tests/structure/test_residues.py index c3597a73f..024c3e696 100644 --- a/tests/structure/test_residues.py +++ b/tests/structure/test_residues.py @@ -2,12 +2,12 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import biotite.structure as struc -import biotite.structure.io as strucio -import numpy as np from os.path import join -from ..util import data_dir +import numpy as np import pytest +import biotite.structure as struc +import biotite.structure.io as strucio +from tests.util import data_dir @pytest.fixture @@ -17,11 +17,11 @@ def array(): def test_apply_residue_wise(array): data = struc.apply_residue_wise(array, np.ones(len(array)), np.sum) - assert data.tolist() == [len(array[array.res_id == i]) - for i in range(1, 21)] + assert data.tolist() == [len(array[array.res_id == i]) for i in range(1, 21)] + def test_spread_residue_wise(array): - input_data = np.arange(1,21) + input_data = np.arange(1, 21) output_data = struc.spread_residue_wise(array, input_data) assert output_data.tolist() == array.res_id.tolist() @@ -41,8 +41,7 @@ def test_get_residue_starts_for(array): np.random.seed(0) indices = np.random.randint(0, array.array_length(), SAMPLE_SIZE) ref_starts = np.array( - [np.where(mask)[0][0] for mask - in struc.get_residue_masks(array, indices)] + [np.where(mask)[0][0] for mask in struc.get_residue_masks(array, indices)] ) test_starts = struc.get_residue_starts_for(array, indices) assert test_starts.tolist() == ref_starts.tolist() @@ -51,16 +50,32 @@ def test_get_residue_starts_for(array): def test_get_residues(array): ids, names = struc.get_residues(array) assert ids.tolist() == list(range(1, 21)) - assert names.tolist() == ["ASN","LEU","TYR","ILE","GLN","TRP","LEU","LYS", - "ASP","GLY","GLY","PRO","SER","SER","GLY","ARG", - "PRO","PRO","PRO","SER"] + assert names.tolist() == [ + "ASN", + "LEU", + "TYR", + "ILE", + "GLN", + "TRP", + "LEU", + "LYS", + "ASP", + "GLY", + "GLY", + "PRO", + "SER", + "SER", + "GLY", + "ARG", + "PRO", + "PRO", + "PRO", + "SER", + ] assert len(ids) == struc.get_residue_count(array) def test_residue_iter(array): - centroid = [struc.centroid(res).tolist() - for res in struc.residue_iter(array)] - ref_centroid = struc.apply_residue_wise( - array, array.coord, np.average, axis=0 - ) - assert centroid == ref_centroid.tolist() \ No newline at end of file + centroid = [struc.centroid(res).tolist() for res in struc.residue_iter(array)] + ref_centroid = struc.apply_residue_wise(array, array.coord, np.average, axis=0) + assert centroid == ref_centroid.tolist() diff --git a/tests/structure/test_sasa.py b/tests/structure/test_sasa.py index 12827f533..88900b0c8 100644 --- a/tests/structure/test_sasa.py +++ b/tests/structure/test_sasa.py @@ -2,59 +2,45 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. +import json from os.path import join -import pytest import numpy as np +import pytest import biotite.structure as struc import biotite.structure.io.pdb as pdb import biotite.structure.io.pdbx as pdbx -from ..util import data_dir, cannot_import +from tests.util import data_dir -# Ignore warning about dummy unit cell vector -@pytest.mark.filterwarnings("ignore") -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"]) -def test_single(pdb_id): - file_name = join(data_dir("structure"), pdb_id+".pdb") +def test_sasa_consistency(pdb_id): + """ + Check that SASA computation for a single model reproduces results from MDTraj. + """ + # Load precomputed hydrogen bond triplets from MDTraj + with open(join(data_dir("structure"), "misc", "sasa.json")) as file: + ref_data = json.load(file) + ref_sasa = np.array(ref_data[pdb_id]) - # Single atom SASA, compare with MDTraj - file = pdb.PDBFile.read(file_name) + file = pdb.PDBFile.read(join(data_dir("structure"), pdb_id + ".pdb")) array = file.get_structure(model=1) - sasa = struc.sasa(array, vdw_radii="Single", point_number=5000) - - from biotite.structure.info.radii import _SINGLE_RADII as radii - import mdtraj - # Use the same atom radii - radii = {element.capitalize() : radius / 10 - for element, radius in radii.items()} - traj = mdtraj.load(file_name) - # Conversion from nm^2 to A^2 - sasa_exp = mdtraj.shrake_rupley( - traj, change_radii=radii, n_sphere_points=5000 - )[0] * 100 + test_sasa = struc.sasa(array, vdw_radii="Single", point_number=5000) - - # Assert that more than 90% of atoms - # have less than 10% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1e-1) - ) / len(sasa) > 0.9 - # Assert that more than 98% of atoms - # have less than 1% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-2, atol=1e-1) - ) / len(sasa) > 0.98 + # Assert that all atoms have less than 10% SASA difference + assert np.all(np.isclose(test_sasa, ref_sasa, rtol=1e-1, atol=1e-1)) + # Assert that more than 98% of atoms have less than 1% SASA difference + assert ( + np.count_nonzero(np.isclose(test_sasa, ref_sasa, rtol=1e-2, atol=1e-1)) + / len(test_sasa) + > 0.98 + ) @pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"]) def test_coarse_grained(pdb_id): # Multi atom SASA (ProtOr), compare with single atom SASA # on residue level - file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id+".bcif")) + file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), pdb_id + ".bcif")) array = pdbx.get_structure(file, model=1) array = array[struc.filter_amino_acids(array)] sasa = struc.apply_residue_wise( @@ -66,11 +52,13 @@ def test_coarse_grained(pdb_id): # Assert that more than 90% of atoms # have less than 10% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1) - ) / len(sasa) > 0.9 + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1)) / len(sasa) + > 0.9 + ) # Assert that more than 98% of atoms # have less than 40% SASA difference - assert np.count_nonzero( - np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1) - ) / len(sasa) > 0.98 \ No newline at end of file + assert ( + np.count_nonzero(np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1)) / len(sasa) + > 0.98 + ) diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py index 098958824..bcac9b9eb 100644 --- a/tests/structure/test_sequence.py +++ b/tests/structure/test_sequence.py @@ -5,16 +5,14 @@ import glob from os.path import join import pytest -import biotite.structure as struc import biotite.sequence as seq import biotite.sequence.align as align +import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -from ..util import data_dir +from tests.util import data_dir -@pytest.mark.parametrize( - "path", glob.glob(join(data_dir("structure"), "*.bcif")) -) +@pytest.mark.parametrize("path", glob.glob(join(data_dir("structure"), "*.bcif"))) def test_pdbx_sequence_consistency(path): """ Check if sequences created with :func:`to_sequence()` are equal to @@ -54,15 +52,14 @@ def _find_best_match(sequence, ref_sequences): best_alignment = None best_identity = 0.0 for ref_sequence in ref_sequences.values(): - if type(sequence) != type(ref_sequence): + if not isinstance(sequence, type(ref_sequence)): continue if isinstance(sequence, seq.ProteinSequence): matrix = align.SubstitutionMatrix.std_protein_matrix() else: matrix = align.SubstitutionMatrix.std_nucleotide_matrix() alignment = align.align_optimal( - sequence, ref_sequence, matrix, - terminal_penalty=False, max_number=1 + sequence, ref_sequence, matrix, terminal_penalty=False, max_number=1 )[0] # The 'shortest' identity is 1.0, if every residue in the # test sequence is aligned to an identical residue @@ -70,4 +67,4 @@ def _find_best_match(sequence, ref_sequences): if identity > best_identity: best_alignment = alignment best_identity = identity - return best_alignment, best_identity \ No newline at end of file + return best_alignment, best_identity diff --git a/tests/structure/test_sse.py b/tests/structure/test_sse.py index 2d2964e10..543175ff3 100644 --- a/tests/structure/test_sse.py +++ b/tests/structure/test_sse.py @@ -6,27 +6,10 @@ from os.path import join import numpy as np import pytest +import biotite.sequence.io.fasta as fasta import biotite.structure as struc import biotite.structure.io.pdbx as pdbx -import biotite.sequence.io.fasta as fasta -from ..util import data_dir - - -def test_sse_legacy(): - """ - Legacy test to assert that refactoring did not change behavior. - """ - array = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "3o5r.bcif")), - model = 1 - ) - test_sse = struc.annotate_sse(array, "A") - ref_sse = ( - "caaaaaacccccccccccccbbbbbccccccbbbbccccccccccccccc" - "ccccccccccccbbbbbbcccccccaaaaaaaaaccccccbbbbbccccc" - "ccccccccccccbbbbbbbccccccccc" - ) - assert "".join(test_sse.tolist()) == ref_sse +from tests.util import data_dir def test_sse(): @@ -40,18 +23,14 @@ def test_sse(): matches = 0 total = 0 - ref_psea_file = fasta.FastaFile.read( - join(data_dir("structure"), "psea.fasta") - ) + ref_psea_file = fasta.FastaFile.read(join(data_dir("structure"), "psea.fasta")) for pdb_id in ref_psea_file: ref_sse = np.array(list(ref_psea_file[pdb_id])) atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read( - join(data_dir("structure"), f"{pdb_id}.bcif") - ), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), f"{pdb_id}.bcif")), + model=1, ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] if atoms.array_length() == 0: @@ -68,9 +47,9 @@ def test_sse(): np.random.seed(0) -@pytest.mark.parametrize( - "discont_pos", np.random.randint(2, 105, size=100) -) + + +@pytest.mark.parametrize("discont_pos", np.random.randint(2, 105, size=100)) def test_sse_discontinuity(discont_pos): """ Check if discontinuities are properly handled by inserting a @@ -78,8 +57,7 @@ def test_sse_discontinuity(discont_pos): proximity becomes 'coil'. """ atoms = pdbx.get_structure( - pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")), - model=1 + pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1gya.bcif")), model=1 ) atoms = atoms[struc.filter_canonical_amino_acids(atoms)] @@ -89,7 +67,7 @@ def test_sse_discontinuity(discont_pos): assert len(struc.check_res_id_continuity(atoms)) == 0 # Introduce discontinuity res_starts = struc.get_residue_starts(atoms) - atoms.res_id[res_starts[discont_pos]:] += 1 + atoms.res_id[res_starts[discont_pos] :] += 1 test_sse = struc.annotate_sse(atoms) assert len(test_sse) == len(ref_sse) @@ -106,9 +84,7 @@ def test_sse_discontinuity(discont_pos): assert (test_sse[discont_proximity] == "c").all() -@pytest.mark.parametrize( - "file_name", glob.glob(join(data_dir("structure"), "*.bcif")) -) +@pytest.mark.parametrize("file_name", glob.glob(join(data_dir("structure"), "*.bcif"))) def test_sse_non_peptide(file_name): """ Test whether only amino acids get SSE annotated. @@ -118,9 +94,7 @@ def test_sse_non_peptide(file_name): # Special case for PDB 5EIL: # The residue BP5 is an amino acid, but has no CA # -> rename analogous atom - atoms.atom_name[ - (atoms.res_name == "BP5") & (atoms.atom_name == "C13") - ] = "CA" + atoms.atom_name[(atoms.res_name == "BP5") & (atoms.atom_name == "C13")] = "CA" sse = struc.annotate_sse(atoms) peptide_mask = struc.filter_amino_acids(atoms) @@ -128,4 +102,4 @@ def test_sse_non_peptide(file_name): peptide_mask = peptide_mask[struc.get_residue_starts(atoms)] assert np.all(np.isin(sse[peptide_mask], ["a", "b", "c"])) - assert np.all(sse[~peptide_mask] == "") \ No newline at end of file + assert np.all(sse[~peptide_mask] == "") diff --git a/tests/structure/test_superimpose.py b/tests/structure/test_superimpose.py index 1804e9583..694ad6446 100755 --- a/tests/structure/test_superimpose.py +++ b/tests/structure/test_superimpose.py @@ -9,9 +9,8 @@ import pytest import biotite.structure as struc import biotite.structure.io as strucio -import biotite.structure as struc from biotite.structure.superimpose import _multi_matmul as multi_matmul -from ..util import data_dir +from tests.util import data_dir def test_transform_as_matrix(): @@ -30,7 +29,7 @@ def test_transform_as_matrix(): # This is not really a rotation matrix, # but the same maths apply rotation=np.random.rand(N_MODELS, 3, 3), - target_translation=np.random.rand(N_MODELS, 3) + target_translation=np.random.rand(N_MODELS, 3), ) ref_coord = transform.apply(orig_coord) @@ -41,15 +40,13 @@ def test_transform_as_matrix(): test_coord_4 = multi_matmul(transform.as_matrix(), orig_coord_4) test_coord = test_coord_4[..., :3] - assert test_coord.flatten().tolist() \ - == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6) + assert test_coord.flatten().tolist() == pytest.approx( + ref_coord.flatten().tolist(), abs=1e-6 + ) @pytest.mark.parametrize( - "seed, multi_model", itertools.product( - range(10), - [False, True] - ) + "seed, multi_model", itertools.product(range(10), [False, True]) ) def test_restoration(seed, multi_model): """ @@ -70,8 +67,9 @@ def test_restoration(seed, multi_model): test_coord = _transform_random_affine(ref_coord) test_coord, _ = struc.superimpose(ref_coord, test_coord) - assert test_coord.flatten().tolist() \ - == pytest.approx(ref_coord.flatten().tolist(), abs=1e-6) + assert test_coord.flatten().tolist() == pytest.approx( + ref_coord.flatten().tolist(), abs=1e-6 + ) def test_rotation_matrix(): @@ -83,28 +81,23 @@ def test_rotation_matrix(): N_COORD = 100 # A rotation matrix that rotates 90 degrees around the z-axis - ref_rotation = np.array([ - [0, -1, 0], - [1, 0, 0], - [0, 0, 1] - ]) + ref_rotation = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) np.random.seed(0) original_coord = np.random.rand(N_COORD, 3) # Rotate about 90 degrees around z-axis - rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi/2)) + rotated_coord = struc.rotate(original_coord, angles=(0, 0, np.pi / 2)) _, transform = struc.superimpose(rotated_coord, original_coord) test_rotation = transform.rotation - assert test_rotation.flatten().tolist() \ - == pytest.approx(ref_rotation.flatten().tolist(), abs=1e-6) + assert test_rotation.flatten().tolist() == pytest.approx( + ref_rotation.flatten().tolist(), abs=1e-6 + ) @pytest.mark.parametrize( - "path, coord_only", itertools.product( - glob.glob(join(data_dir("structure"), "*.bcif")), - [False, True] - ) + "path, coord_only", + itertools.product(glob.glob(join(data_dir("structure"), "*.bcif")), [False, True]), ) def test_superimposition_array(path, coord_only): """ @@ -116,22 +109,20 @@ def test_superimposition_array(path, coord_only): fixed = strucio.load_structure(path, model=1) mobile = fixed.copy() - mobile = struc.rotate(mobile, (1,2,3)) - mobile = struc.translate(mobile, (1,2,3)) + mobile = struc.rotate(mobile, (1, 2, 3)) + mobile = struc.translate(mobile, (1, 2, 3)) if coord_only: fixed = fixed.coord mobile = mobile.coord - fitted, transformation = struc.superimpose( - fixed, mobile - ) + fitted, transformation = struc.superimpose(fixed, mobile) if coord_only: assert isinstance(fitted, np.ndarray) assert struc.rmsd(fixed, fitted) == pytest.approx(0, abs=6e-4) - fitted = struc.superimpose_apply(mobile, transformation) + fitted = transformation.apply(mobile) if coord_only: assert isinstance(fitted, np.ndarray) @@ -150,7 +141,7 @@ def test_superimposition_stack(ca_only): fixed = stack[0] mobile = stack[1:] if ca_only: - mask = (mobile.atom_name == "CA") + mask = mobile.atom_name == "CA" else: mask = None @@ -160,15 +151,13 @@ def test_superimposition_stack(ca_only): # The superimpositions are better for most cases than the # superimpositions in the structure file # -> Use average - assert np.mean(struc.rmsd(fixed, fitted)) \ - < np.mean(struc.rmsd(fixed, mobile)) + assert np.mean(struc.rmsd(fixed, fitted)) < np.mean(struc.rmsd(fixed, mobile)) else: # The superimpositions are better than the superimpositions # in the structure file assert (struc.rmsd(fixed, fitted) < struc.rmsd(fixed, mobile)).all() - @pytest.mark.parametrize("seed", range(5)) def test_masked_superimposition(seed): """ @@ -188,25 +177,19 @@ def test_masked_superimposition(seed): # The distance between the atom in both models should not be # already 0 prior to superimposition - assert struc.distance(fixed[mask], mobile[mask])[0] \ - != pytest.approx(0, abs=5e-4) + assert struc.distance(fixed[mask], mobile[mask])[0] != pytest.approx(0, abs=5e-4) - fitted, transformation = struc.superimpose( - fixed, mobile, mask - ) + fitted, transformation = struc.superimpose(fixed, mobile, mask) - assert struc.distance(fixed[mask], fitted[mask])[0] \ - == pytest.approx(0, abs=5e-4) + assert struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4) - fitted = struc.superimpose_apply(mobile, transformation) + fitted = transformation.apply(mobile) - struc.distance(fixed[mask], fitted[mask])[0] \ - == pytest.approx(0, abs=5e-4) + struc.distance(fixed[mask], fitted[mask])[0] == pytest.approx(0, abs=5e-4) @pytest.mark.parametrize( - "single_model, single_atom", - itertools.product([False, True], [False, True]) + "single_model, single_atom", itertools.product([False, True], [False, True]) ) def test_input_shapes(single_model, single_atom): """ @@ -227,7 +210,7 @@ def test_input_shapes(single_model, single_atom): fitted, _ = struc.superimpose(fixed, mobile) - assert type(fitted) == type(mobile) + assert isinstance(fitted, type(mobile)) assert fitted.coord.shape == mobile.coord.shape @@ -258,24 +241,25 @@ def test_outlier_detection(seed): superimposed_coord, _, anchors = struc.superimpose_without_outliers( # Increase the threshold a bit, # to ensure that no inlier is classified as outlier - fixed_coord, mobile_coord, outlier_threshold=3.0 + fixed_coord, + mobile_coord, + outlier_threshold=3.0, ) test_outlier_mask = np.full(N_COORD, True) test_outlier_mask[anchors] = False assert test_outlier_mask.tolist() == ref_outlier_mask.tolist() # Without the outliers, the RMSD should be in the noise range - assert struc.rmsd( - fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask] - ) < NOISE + assert ( + struc.rmsd( + fixed_coord[~ref_outlier_mask], superimposed_coord[~ref_outlier_mask] + ) + < NOISE + ) @pytest.mark.parametrize( - "multi_model, coord_only", - itertools.product( - [False, True], - [False, True] - ) + "multi_model, coord_only", itertools.product([False, True], [False, True]) ) def test_superimpose_without_outliers_inputs(multi_model, coord_only): """ @@ -289,11 +273,9 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only): if coord_only: atoms = atoms.coord - superimposed, transform, _ = struc.superimpose_without_outliers( - atoms, atoms - ) + superimposed, transform, _ = struc.superimpose_without_outliers(atoms, atoms) - assert type(superimposed) == type(atoms) + assert isinstance(superimposed, type(atoms)) assert superimposed.shape == atoms.shape transform_matrix = transform.as_matrix() if multi_model: @@ -313,7 +295,7 @@ def test_superimpose_without_outliers_inputs(multi_model, coord_only): ("1aki", "A", True), ("4gxy", "A", False), # is a nucleic acid ("4gxy", "A", True), - ] + ], ) def test_superimpose_homologs(pdb_id, chain_id, as_stack): """ @@ -342,8 +324,10 @@ def test_superimpose_homologs(pdb_id, chain_id, as_stack): ) # Check if corresponding residues were superimposed - assert fixed_atoms.res_id[fix_anchors].tolist() \ + assert ( + fixed_atoms.res_id[fix_anchors].tolist() == mobile_atoms.res_id[mob_anchors].tolist() + ) # If a stack, it only contains one model if as_stack: fixed_atoms = fixed_atoms[0] @@ -355,15 +339,14 @@ def test_superimpose_homologs(pdb_id, chain_id, as_stack): def _transform_random_affine(coord): coord = struc.translate(coord, np.random.rand(3)) - coord = struc.rotate(coord, np.random.uniform(low=0, high=2*np.pi, size=3)) + coord = struc.rotate(coord, np.random.uniform(low=0, high=2 * np.pi, size=3)) return coord def _delete_random_residues(atoms, p_conservation): residue_starts = struc.get_residue_starts(atoms) conserved_residue_starts = np.random.choice( - residue_starts, size=int(p_conservation * len(residue_starts)), - replace=False + residue_starts, size=int(p_conservation * len(residue_starts)), replace=False ) conservation_mask = np.any( struc.get_residue_masks(atoms, conserved_residue_starts), axis=0 diff --git a/tests/structure/test_trajectory.py b/tests/structure/test_trajectory.py index e4a9a1ba3..ed1db16d2 100644 --- a/tests/structure/test_trajectory.py +++ b/tests/structure/test_trajectory.py @@ -2,46 +2,34 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from tempfile import NamedTemporaryFile import itertools -import glob -from os.path import join, basename +from os.path import join +from tempfile import NamedTemporaryFile import numpy as np import pytest import biotite.structure as struc import biotite.structure.io as strucio -import biotite.structure.io.xtc as xtc -import biotite.structure.io.trr as trr -import biotite.structure.io.tng as tng import biotite.structure.io.dcd as dcd import biotite.structure.io.netcdf as netcdf -from ..util import data_dir, cannot_import +import biotite.structure.io.trr as trr +import biotite.structure.io.xtc as xtc +from tests.util import data_dir -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) -@pytest.mark.parametrize("format", ["trr", "xtc", "tng", "dcd", "netcdf"]) +@pytest.mark.parametrize("format", ["trr", "xtc", "dcd", "netcdf"]) def test_array_conversion(format): - template = strucio.load_structure( - join(data_dir("structure"), "1l2y.bcif") - )[0] + template = strucio.load_structure(join(data_dir("structure"), "1l2y.bcif"))[0] # Add fake box - template.box = np.diag([1,2,3]) + template.box = np.diag([1, 2, 3]) if format == "trr": traj_file_cls = trr.TRRFile if format == "xtc": traj_file_cls = xtc.XTCFile - if format == "tng": - traj_file_cls = tng.TNGFile if format == "dcd": traj_file_cls = dcd.DCDFile if format == "netcdf": traj_file_cls = netcdf.NetCDFFile - traj_file = traj_file_cls.read( - join(data_dir("structure"), f"1l2y.{format}") - ) + traj_file = traj_file_cls.read(join(data_dir("structure"), f"1l2y.{format}")) ref_array = traj_file.get_structure(template) traj_file = traj_file_cls() @@ -58,23 +46,19 @@ def test_array_conversion(format): assert ref_array.coord == pytest.approx(array.coord, abs=1e-2) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "format, start, stop, step, chunk_size", itertools.product( - ["trr", "xtc", "tng", "dcd", "netcdf"], + ["trr", "xtc", "dcd", "netcdf"], [None, 2], [None, 17], [None, 2], - [None, 3] - ) + [None, 3], + ), ) def test_bcif_consistency(format, start, stop, step, chunk_size): if format == "netcdf" and stop is not None and step is not None: - # Currently, there is an inconsistency in in MDTraj's + # Currently, there is an inconsistency in in biotraj's # NetCDFTrajectoryFile class: # In this class the number of frames in the output arrays # is dependent on the 'stride' parameter @@ -89,15 +73,16 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): traj_file_cls = trr.TRRFile if format == "xtc": traj_file_cls = xtc.XTCFile - if format == "tng": - traj_file_cls = tng.TNGFile if format == "dcd": traj_file_cls = dcd.DCDFile if format == "netcdf": traj_file_cls = netcdf.NetCDFFile traj_file = traj_file_cls.read( join(data_dir("structure"), f"1l2y.{format}"), - start, stop, step, chunk_size=chunk_size + start, + stop, + step, + chunk_size=chunk_size, ) test_traj = traj_file.get_structure(template) test_traj_time = traj_file.get_time() @@ -108,10 +93,9 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): # Shift to ensure time starts at 0 test_traj_time -= 1 start = start if start is not None else 0 - stop = stop if stop is not None else 38 # 38 models in 1l2y + stop = stop if stop is not None else 38 # 38 models in 1l2y step = step if step is not None else 1 - assert test_traj_time.astype(int).tolist() \ - == list(range(start, stop, step)) + assert test_traj_time.astype(int).tolist() == list(range(start, stop, step)) assert test_traj.stack_depth() == ref_traj.stack_depth() # 1l2y has no box @@ -121,19 +105,15 @@ def test_bcif_consistency(format, start, stop, step, chunk_size): assert test_traj.coord == pytest.approx(ref_traj.coord, abs=1e-2) -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "format, start, stop, step, stack_size", itertools.product( - ["trr", "xtc", "tng", "dcd", "netcdf"], + ["trr", "xtc", "dcd", "netcdf"], [None, 2], [None, 17], [None, 2], - [None, 2, 3] - ) + [None, 2, 3], + ), ) def test_read_iter(format, start, stop, step, stack_size): """ @@ -141,7 +121,7 @@ def test_read_iter(format, start, stop, step, stack_size): from a corresponding :class:`TrajectoryFile` object. """ if format == "netcdf" and step is not None: - # Currently, there is an inconsistency in in MDTraj's + # Currently, there is an inconsistency in in biotraj's # NetCDFTrajectoryFile class: # In this class the number of frames in the output arrays # is dependent on the 'stride' parameter @@ -151,8 +131,6 @@ def test_read_iter(format, start, stop, step, stack_size): traj_file_cls = trr.TRRFile if format == "xtc": traj_file_cls = xtc.XTCFile - if format == "tng": - traj_file_cls = tng.TNGFile if format == "dcd": traj_file_cls = dcd.DCDFile if format == "netcdf": @@ -176,7 +154,7 @@ def test_read_iter(format, start, stop, step, stack_size): # Convert list to NumPy array combination_func = np.stack if stack_size is None else np.concatenate - test_coord =combination_func(test_coord) + test_coord = combination_func(test_coord) if test_box[0] is not None: test_box = combination_func(test_box) else: @@ -197,19 +175,15 @@ def test_read_iter(format, start, stop, step, stack_size): assert test_time.tolist() == ref_time.tolist() -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "format, start, stop, step, stack_size", itertools.product( - ["trr", "xtc", "tng", "dcd", "netcdf"], + ["trr", "xtc", "dcd", "netcdf"], [None, 2], [None, 17], [None, 2], - [None, 2, 3] - ) + [None, 2, 3], + ), ) def test_read_iter_structure(format, start, stop, step, stack_size): """ @@ -218,7 +192,7 @@ def test_read_iter_structure(format, start, stop, step, stack_size): :class:`TrajectoryFile` object. """ if format == "netcdf" and step is not None: - # Currently, there is an inconsistency in in MDTraj's + # Currently, there is an inconsistency in in biotraj's # NetCDFTrajectoryFile class: # In this class the number of frames in the output arrays # is dependent on the 'stride' parameter @@ -230,8 +204,6 @@ def test_read_iter_structure(format, start, stop, step, stack_size): traj_file_cls = trr.TRRFile if format == "xtc": traj_file_cls = xtc.XTCFile - if format == "tng": - traj_file_cls = tng.TNGFile if format == "dcd": traj_file_cls = dcd.DCDFile if format == "netcdf": @@ -241,9 +213,12 @@ def test_read_iter_structure(format, start, stop, step, stack_size): traj_file = traj_file_cls.read(file_name, start, stop, step) ref_traj = traj_file.get_structure(template) - frames = [frame for frame in traj_file_cls.read_iter_structure( - file_name, template, start, stop, step, stack_size=stack_size - )] + frames = [ + frame + for frame in traj_file_cls.read_iter_structure( + file_name, template, start, stop, step, stack_size=stack_size + ) + ] if stack_size is None: assert isinstance(frames[0], struc.AtomArray) @@ -255,19 +230,15 @@ def test_read_iter_structure(format, start, stop, step, stack_size): assert test_traj == ref_traj -@pytest.mark.skipif( - cannot_import("mdtraj"), - reason="MDTraj is not installed" -) @pytest.mark.parametrize( "format, n_models, n_atoms, include_box, include_time", itertools.product( - ["trr", "xtc", "tng", "dcd", "netcdf"], + ["trr", "xtc", "dcd", "netcdf"], [1, 100], [1, 1000], [False, True], [False, True], - ) + ), ) def test_write_iter(format, n_models, n_atoms, include_box, include_time): """ @@ -277,12 +248,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): traj_file_cls = trr.TRRFile if format == "xtc": traj_file_cls = xtc.XTCFile - if format == "tng": - # TNG files do only write time when more than one frame is - # written to file; 'write_iter()' writes only one frame per - # 'write()' call, hence time is not written - traj_file_cls = tng.TNGFile - include_time = False if format == "dcd": traj_file_cls = dcd.DCDFile # DCD format does not support simulation time @@ -294,9 +259,7 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): np.random.seed(0) coord = np.random.rand(n_models, n_atoms, 3) * 100 box = np.random.rand(n_models, 3, 3) * 100 if include_box else None - # time is evenly spaced for TNG compatibility - time = np.linspace(0, 10, n_models) if include_time else None - + time = np.random.rand(n_models) * 10 if include_time else None ref_file = NamedTemporaryFile("w+b") traj_file = traj_file_cls() @@ -311,7 +274,6 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): ref_time = traj_file.get_time() ref_file.close() - test_file = NamedTemporaryFile("w+b") traj_file_cls.write_iter(test_file.name, coord, box, time) @@ -321,9 +283,8 @@ def test_write_iter(format, n_models, n_atoms, include_box, include_time): test_time = traj_file.get_time() test_file.close() - assert np.allclose(test_coord, ref_coord, atol=1e-2) if include_box: assert np.allclose(test_box, ref_box, atol=1e-2) if include_time: - assert np.allclose(test_time, ref_time, atol=1e-2) \ No newline at end of file + assert np.allclose(test_time, ref_time, atol=1e-2) diff --git a/tests/structure/test_transform.py b/tests/structure/test_transform.py index dffa552c4..de79f9429 100644 --- a/tests/structure/test_transform.py +++ b/tests/structure/test_transform.py @@ -7,29 +7,29 @@ import numpy as np import pytest import biotite.structure as struc -import biotite.structure.io.npz as npz -from ..util import data_dir +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir @pytest.fixture( params=itertools.product( - [1, 2, 3], # ndim - [False, True] # as_coord + [1, 2, 3], # ndim + [False, True], # as_coord ) ) def input_atoms(request): ndim, as_coord = request.param - file = npz.NpzFile.read(join(data_dir("structure"), "1l2y.npz")) - atoms = file.get_structure() - + pdbx_file = pdbx.BinaryCIFFile.read(join(data_dir("structure"), "1l2y.bcif")) + atoms = pdbx.get_structure(pdbx_file) + if ndim == 2: # Only one model atoms = atoms[0] elif ndim == 1: # Only one atom - atoms = atoms[0,0] - + atoms = atoms[0, 0] + if as_coord: return atoms.coord else: @@ -48,7 +48,7 @@ def test_translate(input_atoms, ndim, as_list, random_seed): # Cannot run tests if translation vector has more dimensions # as input coordinates/atoms return - + np.random.seed(random_seed) vectors = np.random.rand(*struc.coord(input_atoms).shape[-ndim:]) vectors *= 10 @@ -56,19 +56,17 @@ def test_translate(input_atoms, ndim, as_list, random_seed): if as_list: vectors = vectors.tolist() neg_vectors = neg_vectors.tolist() - + translated = struc.translate(input_atoms, vectors) restored = struc.translate(translated, neg_vectors) - assert type(restored) == type(input_atoms) + assert isinstance(restored, type(input_atoms)) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) @pytest.mark.parametrize("as_list", [False, True]) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) @pytest.mark.parametrize("centered", [False, True]) def test_rotate(input_atoms, as_list, axis, random_seed, centered): @@ -78,31 +76,29 @@ def test_rotate(input_atoms, as_list, axis, random_seed, centered): """ np.random.seed(random_seed) angles = np.zeros(3) - angles[axis] = np.random.rand() * 2*np.pi + angles[axis] = np.random.rand() * 2 * np.pi neg_angles = -angles if as_list: angles = angles.tolist() neg_angles = neg_angles.tolist() - + func = struc.rotate_centered if centered else struc.rotate rotated = func(input_atoms, angles) restored = func(rotated, neg_angles) - assert type(restored) == type(input_atoms) + assert isinstance(restored, type(input_atoms)) assert struc.coord(restored).shape == struc.coord(input_atoms).shape print(np.max(np.abs(struc.coord(restored) - struc.coord(input_atoms)))) - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) if centered and struc.coord(input_atoms).ndim > 1: assert np.allclose( struc.centroid(restored), struc.centroid(input_atoms), atol=1e-5 ) -@pytest.mark.parametrize("x", [0, 2*np.pi]) -@pytest.mark.parametrize("y", [0, 2*np.pi]) -@pytest.mark.parametrize("z", [0, 2*np.pi]) +@pytest.mark.parametrize("x", [0, 2 * np.pi]) +@pytest.mark.parametrize("y", [0, 2 * np.pi]) +@pytest.mark.parametrize("z", [0, 2 * np.pi]) @pytest.mark.parametrize("centered", [False, True]) def test_rotate_360(input_atoms, x, y, z, centered): """ @@ -111,12 +107,10 @@ def test_rotate_360(input_atoms, x, y, z, centered): """ func = struc.rotate_centered if centered else struc.rotate rotated = func(input_atoms, [x, y, z]) - - assert type(rotated) == type(input_atoms) + + assert isinstance(rotated, type(input_atoms)) assert struc.coord(rotated).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(rotated), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5) if centered and struc.coord(input_atoms).ndim > 1: assert np.allclose( struc.centroid(rotated), struc.centroid(input_atoms), atol=1e-5 @@ -129,7 +123,7 @@ def test_rotate_known(ndim): Rotate a vector at the Y-axis about the X-axis by 90 degrees and expect a rotated vector at the Z-axis. """ - shape = (1,) * (ndim-1) + (3,) + shape = (1,) * (ndim - 1) + (3,) vector = np.zeros(shape) vector[...] = [0, 1, 0] @@ -138,12 +132,12 @@ def test_rotate_known(ndim): # Rotation by 90 degrees test_rotated = struc.rotate(vector, [0.5 * np.pi, 0, 0]) - + assert test_rotated.shape == exp_rotated.shape assert np.allclose(test_rotated, exp_rotated, atol=1e-5) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) def test_rotate_measure(axis, random_seed): """ @@ -158,16 +152,15 @@ def test_rotate_measure(axis, random_seed): angles[axis] = ref_angle # The measured angle is only equal to the input angle, - # if the input coordinates have no component on the rotation axis + # if the input coordinates have no component on the rotation axis input_coord = np.ones(3) input_coord[axis] = 0 rotated = struc.rotate(input_coord, angles) test_angle = struc.angle(rotated, 0, input_coord) - + # Vector length should be unchanged - assert np.linalg.norm(rotated) \ - == pytest.approx(np.linalg.norm(input_coord)) + assert np.linalg.norm(rotated) == pytest.approx(np.linalg.norm(input_coord)) assert test_angle == pytest.approx(ref_angle) @@ -191,14 +184,12 @@ def test_rotate_about_axis(input_atoms, as_list, use_support, random_seed): rotated = struc.rotate_about_axis(input_atoms, axis, angle, support) restored = struc.rotate_about_axis(rotated, axis, neg_angle, support) - assert type(restored) == type(input_atoms) + assert isinstance(restored, type(input_atoms)) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) -@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z +@pytest.mark.parametrize("axis", [0, 1, 2]) # x, y, z @pytest.mark.parametrize("random_seed", np.arange(5)) def test_rotate_about_axis_consistency(input_atoms, axis, random_seed): """ @@ -207,7 +198,7 @@ def test_rotate_about_axis_consistency(input_atoms, axis, random_seed): """ np.random.seed(random_seed) angle = np.random.rand() * 2 * np.pi - + angles = np.zeros(3) angles[axis] = angle ref_rotated = struc.rotate(input_atoms, angles) @@ -215,13 +206,15 @@ def test_rotate_about_axis_consistency(input_atoms, axis, random_seed): rot_axis = np.zeros(3) # Length of axis should be irrelevant rot_axis[axis] = np.random.rand() - test_rotated = struc.rotate_about_axis(input_atoms, rot_axis, angle,) + test_rotated = struc.rotate_about_axis( + input_atoms, + rot_axis, + angle, + ) - assert type(test_rotated) == type(ref_rotated) + assert isinstance(test_rotated, type(ref_rotated)) assert struc.coord(test_rotated).shape == struc.coord(ref_rotated).shape - assert np.allclose( - struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5 - ) + assert np.allclose(struc.coord(test_rotated), struc.coord(ref_rotated), atol=1e-5) @pytest.mark.parametrize("random_seed", np.arange(5)) @@ -233,26 +226,27 @@ def test_rotate_about_axis_360(input_atoms, random_seed, use_support): """ np.random.seed(random_seed) axis = np.random.rand(3) - support = np.random.rand(3) if use_support else None - - rotated = struc.rotate_about_axis(input_atoms, axis, 2*np.pi, support) + support = np.random.rand(3) if use_support else None + + rotated = struc.rotate_about_axis(input_atoms, axis, 2 * np.pi, support) - assert type(rotated) == type(input_atoms) + assert isinstance(rotated, type(input_atoms)) assert struc.coord(rotated).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(rotated), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(rotated), struc.coord(input_atoms), atol=1e-5) @pytest.mark.parametrize("as_list", [False, True]) -@pytest.mark.parametrize("order", ( - np.array([0, 1, 2]), - np.array([0, 2, 1]), - np.array([1, 0, 2]), - np.array([2, 0, 1]), - np.array([2, 1, 0]), - np.array([1, 2, 0]), -)) +@pytest.mark.parametrize( + "order", + ( + np.array([0, 1, 2]), + np.array([0, 2, 1]), + np.array([1, 0, 2]), + np.array([2, 0, 1]), + np.array([2, 1, 0]), + np.array([1, 2, 0]), + ), +) def test_orient_principal_components(input_atoms, as_list, order): """ Orient atoms such that the variance in each axis is greatest @@ -270,7 +264,7 @@ def test_orient_principal_components(input_atoms, as_list, order): result = struc.orient_principal_components(input_atoms, order=order) neg_variance = -struc.coord(result).var(axis=0) - assert type(result) == type(input_atoms) + assert isinstance(result, type(input_atoms)) assert (neg_variance.argsort() == np.argsort(order)).all() @@ -295,35 +289,37 @@ def test_align_vectors(input_atoms, as_list, use_support, random_seed): source_direction = np.random.rand(3) target_direction = np.random.rand(3) if use_support: - source_position = np.random.rand(3) - target_position = np.random.rand(3) + source_position = np.random.rand(3) + target_position = np.random.rand(3) else: source_position = None target_position = None - + if as_list: source_direction = source_direction.tolist() target_direction = target_direction.tolist() if use_support: source_position = source_position.tolist() target_position = target_position.tolist() - + transformed = struc.align_vectors( input_atoms, - source_direction, target_direction, - source_position, target_position + source_direction, + target_direction, + source_position, + target_position, ) restored = struc.align_vectors( transformed, - target_direction, source_direction, - target_position, source_position + target_direction, + source_direction, + target_position, + source_position, ) - assert type(restored) == type(input_atoms) + assert isinstance(restored, type(input_atoms)) assert struc.coord(restored).shape == struc.coord(input_atoms).shape - assert np.allclose( - struc.coord(restored), struc.coord(input_atoms), atol=1e-5 - ) + assert np.allclose(struc.coord(restored), struc.coord(input_atoms), atol=1e-5) def test_align_vectors_non_vector_inputs(input_atoms): diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 28f349b78..8293210b6 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -5,15 +5,14 @@ __author__ = "Patrick Kunzmann" import doctest -from os.path import join import tempfile from importlib import import_module +from os.path import join import numpy as np import pytest -import biotite.structure.io as strucio import biotite.structure as struc -from .util import is_not_installed, cannot_import, cannot_connect_to - +import biotite.structure.io as strucio +from tests.util import cannot_connect_to, cannot_import, is_not_installed NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/" RCSB_URL = "https://www.rcsb.org/" @@ -23,137 +22,93 @@ # Keep test parameters in separate variable to generate IDs from them TEST_PARAMETERS = [ - pytest.param( - "biotite", - [] - ), - pytest.param( - "biotite.sequence", - [] - ), - pytest.param( - "biotite.sequence.align", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.phylo", - ["biotite.sequence"] - ), + pytest.param("biotite", []), + pytest.param("biotite.sequence", []), + pytest.param("biotite.sequence.align", ["biotite.sequence"]), + pytest.param("biotite.sequence.phylo", ["biotite.sequence"]), pytest.param( "biotite.sequence.graphics", ["biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_import("matplotlib"), reason="Matplotlib is not installed" - ) - ), - pytest.param( - "biotite.sequence.io", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.io.fasta", - ["biotite.sequence"] - ), - pytest.param( - "biotite.sequence.io.fastq", - ["biotite.sequence"] + ), ), + pytest.param("biotite.sequence.io", ["biotite.sequence"]), + pytest.param("biotite.sequence.io.fasta", ["biotite.sequence"]), + pytest.param("biotite.sequence.io.fastq", ["biotite.sequence"]), pytest.param( "biotite.sequence.io.genbank", ["biotite.sequence", "biotite.database.entrez"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available" - ) + ), ), pytest.param( "biotite.sequence.io.gff", ["biotite.sequence", "biotite.sequence.io.fasta"], - marks = pytest.mark.filterwarnings("ignore:") + marks=pytest.mark.filterwarnings("ignore:"), ), pytest.param( - "biotite.structure", - ["biotite.structure.io", "biotite.structure.info"] + "biotite.structure", ["biotite.structure.io", "biotite.structure.info"] ), pytest.param( "biotite.structure.graphics", ["biotite.structure"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_import("matplotlib"), reason="Matplotlib is not installed" ), ), + pytest.param("biotite.structure.io", ["biotite.structure"]), + pytest.param("biotite.structure.io.pdb", ["biotite.structure", "biotite"]), + pytest.param("biotite.structure.io.pdbx", ["biotite.structure"]), pytest.param( - "biotite.structure.io", - ["biotite.structure"] - ), - pytest.param( - "biotite.structure.io.pdb", - ["biotite.structure", "biotite"] - ), - pytest.param( - "biotite.structure.io.pdbx", - ["biotite.structure"] - ), - pytest.param( - "biotite.structure.io.pdbqt", - ["biotite.structure", "biotite.structure.info"] + "biotite.structure.io.pdbqt", ["biotite.structure", "biotite.structure.info"] ), pytest.param( - "biotite.structure.io.npz", - ["biotite.structure"] - ), - pytest.param( - "biotite.structure.io.mmtf", - ["biotite.structure"] - ), - pytest.param( - "biotite.structure.io.mol", - ["biotite.structure", "biotite.structure.info"] - ), - pytest.param( - "biotite.structure.info", - ["biotite.structure"] + "biotite.structure.io.mol", ["biotite.structure", "biotite.structure.info"] ), + pytest.param("biotite.structure.info", ["biotite.structure"]), pytest.param( "biotite.database.entrez", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(NCBI_URL), reason="NCBI Entrez is not available" - ) + ), ), pytest.param( "biotite.database.rcsb", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" - ) + ), ), pytest.param( "biotite.database.uniprot", [], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(UNIPROT_URL), reason="UniProt is not available" - ) + ), ), pytest.param( "biotite.database.pubchem", ["biotite.structure.info"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( cannot_connect_to(PUBCHEM_URL), reason="PubChem is not available" - ) + ), ), pytest.param( "biotite.application", ["biotite.application.clustalo", "biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("clustalo"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.blast", [], ), # Do not test Muscle due to version clash - #pytest.param( + # pytest.param( # "biotite.application.muscle", # ["biotite.sequence"], # marks = pytest.mark.skipif( @@ -162,50 +117,52 @@ pytest.param( "biotite.application.clustalo", ["biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("clustalo"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.mafft", ["biotite.sequence"], - marks = pytest.mark.skipif( - is_not_installed("mafft"), reason="Software is not installed") + marks=pytest.mark.skipif( + is_not_installed("mafft"), reason="Software is not installed" ), + ), pytest.param( - "biotite.application.sra", ["biotite.sequence"], - marks = pytest.mark.skipif( - is_not_installed("fasterq-dump"), - reason="Software is not installed" - ) + "biotite.application.sra", + ["biotite.sequence"], + marks=pytest.mark.skipif( + is_not_installed("fasterq-dump"), reason="Software is not installed" + ), ), pytest.param( "biotite.application.tantan", ["biotite.sequence"], - marks = pytest.mark.skipif( - is_not_installed("tantan"), reason="Software is not installed") + marks=pytest.mark.skipif( + is_not_installed("tantan"), reason="Software is not installed" ), + ), pytest.param( "biotite.application.viennarna", ["biotite.sequence"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("RNAfold") | is_not_installed("RNAplot"), - reason="Software is not installed" - ) + reason="Software is not installed", + ), ), pytest.param( "biotite.application.dssp", ["biotite.structure"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("mkdssp"), reason="Software is not installed" - ) + ), ), pytest.param( "biotite.application.autodock", ["biotite.structure", "biotite.structure.info"], - marks = pytest.mark.skipif( + marks=pytest.mark.skipif( is_not_installed("vina"), reason="Software is not installed" - ) + ), ), ] @@ -213,7 +170,7 @@ @pytest.mark.parametrize( "package_name, context_package_names", TEST_PARAMETERS, - ids=[param.values[0] for param in TEST_PARAMETERS] + ids=[param.values[0] for param in TEST_PARAMETERS], ) def test_doctest(package_name, context_package_names): """ @@ -222,18 +179,17 @@ def test_doctest(package_name, context_package_names): # Collect all attributes of this package and its subpackages # as globals for the doctests globs = {} - #The package itself is also used as context + # The package itself is also used as context for name in context_package_names + [package_name]: context_package = import_module(name) globs.update( - {attr : getattr(context_package, attr) - for attr in dir(context_package)} + {attr: getattr(context_package, attr) for attr in dir(context_package)} ) # Add fixed names for certain paths - globs["path_to_directory"] = tempfile.gettempdir() + globs["path_to_directory"] = tempfile.gettempdir() globs["path_to_structures"] = join(".", "tests", "structure", "data") - globs["path_to_sequences"] = join(".", "tests", "sequence", "data") + globs["path_to_sequences"] = join(".", "tests", "sequence", "data") # Add frequently used modules globs["np"] = np # Add frequently used objects @@ -253,14 +209,14 @@ def test_doctest(package_name, context_package_names): # More information below package = import_module(package_name) runner = doctest.DocTestRunner( - verbose = False, - optionflags = - doctest.ELLIPSIS | - doctest.REPORT_ONLY_FIRST_FAILURE | - doctest.NORMALIZE_WHITESPACE + verbose=False, + optionflags=doctest.ELLIPSIS + | doctest.REPORT_ONLY_FIRST_FAILURE + | doctest.NORMALIZE_WHITESPACE, ) for test in doctest.DocTestFinder(exclude_empty=False).find( - package, package.__name__, + package, + package.__name__, # It is necessary to set 'module' to 'False', as otherwise # Cython functions and classes would be falsely identified # as members of an external module by 'DocTestFinder._find()' @@ -271,7 +227,7 @@ def test_doctest(package_name, context_package_names): # ('__init__.py' modules) should only contain attributes, that # are part of the package itself. module=False, - extraglobs=globs + extraglobs=globs, ): runner.run(test) results = doctest.TestResults(runner.failures, runner.tries) @@ -279,4 +235,4 @@ def test_doctest(package_name, context_package_names): assert results.failed == 0 except AssertionError: print(f"Failing doctest in module {package}") - raise \ No newline at end of file + raise diff --git a/tests/test_init.py b/tests/test_init.py deleted file mode 100644 index 644659ce9..000000000 --- a/tests/test_init.py +++ /dev/null @@ -1,13 +0,0 @@ -# This source code is part of the Biotite package and is distributed -# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further -# information. - -__author__ = "Daniel Bauer" - -import biotite -import pytest - - -def test_version_number(): - version = biotite.__version__ - assert hasattr(biotite, "__version__") \ No newline at end of file diff --git a/tests/test_modname.py b/tests/test_modname.py index 808625f4b..39a394d8b 100644 --- a/tests/test_modname.py +++ b/tests/test_modname.py @@ -2,11 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -import pkgutil -from os.path import dirname, join, isdir, splitext import importlib +import pkgutil +from os.path import dirname, join import pytest -from .util import cannot_import +from tests.util import cannot_import def find_all_modules(package_name, src_dir): @@ -18,25 +18,21 @@ def find_all_modules(package_name, src_dir): for _, module_name, is_package in pkgutil.iter_modules([src_dir]): full_module_name = f"{package_name}.{module_name}" if is_package: - module_names.extend(find_all_modules( - full_module_name, - join(src_dir, module_name) - )) + module_names.extend( + find_all_modules(full_module_name, join(src_dir, module_name)) + ) else: module_names.append(full_module_name) return module_names @pytest.mark.skipif( - cannot_import("matplotlib") | cannot_import("mdtraj"), - reason="Optional dependencies are not met" + cannot_import("matplotlib"), + reason="Optional dependencies are not met", ) @pytest.mark.parametrize( "module_name", - find_all_modules( - "biotite", - join(dirname(dirname(__file__)), "src", "biotite") - ) + find_all_modules("biotite", join(dirname(dirname(__file__)), "src", "biotite")), ) def test_module_name(module_name): """ @@ -55,4 +51,4 @@ def test_module_name(module_name): # Autogenerated module from hatch-vcs # # It contains no '__name__' attribute on purpose return - assert module.__name__ == package_name \ No newline at end of file + assert module.__name__ == package_name diff --git a/tests/test_repr.py b/tests/test_repr.py index 5f9714af8..f8bf319c4 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -2,51 +2,85 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from biotite.sequence import NucleotideSequence -from biotite.sequence import ProteinSequence -from biotite.sequence import Alphabet -from biotite.sequence import GeneralSequence -from biotite.sequence import LetterAlphabet -from biotite.sequence import Location -from biotite.sequence import Feature -from biotite.sequence import Annotation -from biotite.sequence import AnnotatedSequence -from biotite.sequence.align import Alignment -from biotite.structure import Atom import numpy as np -from numpy import float32, int32 -from biotite.sequence import CodonTable -from biotite.sequence.align import SubstitutionMatrix -from biotite.sequence import SequenceProfile import pytest +from numpy import float32, int32 # noqa: F401 +from biotite.sequence import ( + Alphabet, + AnnotatedSequence, + Annotation, + CodonTable, + Feature, + GeneralSequence, + LetterAlphabet, + Location, + NucleotideSequence, + ProteinSequence, + SequenceProfile, +) +from biotite.sequence.align import Alignment, SubstitutionMatrix +from biotite.structure import Atom __author__ = "Maximilian Greil" -@pytest.mark.parametrize("repr_object", - [NucleotideSequence("AACTGCTA"), - NucleotideSequence("AACTGCTA", ambiguous=True), - ProteinSequence("BIQTITE"), - Alphabet(["X", "Y", "Z"]), - GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), - LetterAlphabet(["X", "Y", "Z"]), - Location(98, 178), - Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}), - Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), - AnnotatedSequence(Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), - NucleotideSequence("AACTGCTA")), - Alignment([NucleotideSequence("CGTCAT", ambiguous=False), - NucleotideSequence("TCATGC", ambiguous=False)], - np.array([[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]]), - score=-20), - Atom([1, 2, 3], chain_id="A"), - CodonTable.default_table(), - SubstitutionMatrix(Alphabet(["foo", "bar"]), Alphabet([1, 2, 3]), - {("foo", 1): 5, ("foo", 2): 10, ("foo", 3): 15, ("bar", 1): 42, - ("bar", 2): 42, ("bar", 3): 42}), - SequenceProfile(np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 2], [0, 2, 0, 0], - [2, 0, 0, 0], [0, 0, 0, 2], [0, 0, 1, 0], [0, 1, 0, 0]]), - np.array([1, 1, 0, 0, 0, 0, 1, 1]), - Alphabet(["A", "C", "G", "T"]))]) +@pytest.mark.parametrize( + "repr_object", + [ + NucleotideSequence("AACTGCTA"), + NucleotideSequence("AACTGCTA", ambiguous=True), + ProteinSequence("BIQTITE"), + Alphabet(["X", "Y", "Z"]), + GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), + LetterAlphabet(["X", "Y", "Z"]), + Location(98, 178), + Feature("CDS", [Location(98, 178)], qual={"gene": "test1"}), + Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), + AnnotatedSequence( + Annotation([Feature("CDS", [Location(98, 178)], qual={"gene": "test1"})]), + NucleotideSequence("AACTGCTA"), + ), + Alignment( + [ + NucleotideSequence("CGTCAT", ambiguous=False), + NucleotideSequence("TCATGC", ambiguous=False), + ], + np.array( + [[0, -1], [1, -1], [2, 0], [3, 1], [4, 2], [5, 3], [-1, 4], [-1, 5]] + ), + score=-20, + ), + Atom([1, 2, 3], chain_id="A"), + CodonTable.default_table(), + SubstitutionMatrix( + Alphabet(["foo", "bar"]), + Alphabet([1, 2, 3]), + { + ("foo", 1): 5, + ("foo", 2): 10, + ("foo", 3): 15, + ("bar", 1): 42, + ("bar", 2): 42, + ("bar", 3): 42, + }, + ), + SequenceProfile( + np.array( + [ + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 2], + [0, 2, 0, 0], + [2, 0, 0, 0], + [0, 0, 0, 2], + [0, 0, 1, 0], + [0, 1, 0, 0], + ] + ), + np.array([1, 1, 0, 0, 0, 0, 1, 1]), + Alphabet(["A", "C", "G", "T"]), + ), + ], +) def test_repr(repr_object): assert eval(repr(repr_object)) == repr_object diff --git a/tests/test_version.py b/tests/test_version.py index 5f11daa2a..ec7bca6f9 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -6,4 +6,4 @@ def test_version(): """ Check if version imported from version.py is correct. """ - assert biotite.__version__ == version("biotite") \ No newline at end of file + assert biotite.__version__ == version("biotite") diff --git a/tests/util.py b/tests/util.py index e72cc5cb5..99cf24741 100644 --- a/tests/util.py +++ b/tests/util.py @@ -2,11 +2,11 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. -from os.path import join, dirname, realpath -import urllib.error -import urllib.request import importlib import shutil +import urllib.error +import urllib.request +from os.path import dirname, join, realpath def data_dir(subdir): @@ -16,6 +16,8 @@ def data_dir(subdir): ### Functions for conditional test skips ### tested_urls = {} + + def cannot_connect_to(url): if url not in tested_urls: try: @@ -25,8 +27,10 @@ def cannot_connect_to(url): tested_urls[url] = True return tested_urls[url] + def cannot_import(module): return importlib.util.find_spec(module) is None + def is_not_installed(program): - return shutil.which(program) is None \ No newline at end of file + return shutil.which(program) is None